Mirror of https://github.com/sbrl/Pepperminty-Wiki.git, synced 2024-11-22 16:33:00 +00:00
Seriously optimise the search system via some profiling.
commit 3d3b6c491a
parent 67648199d7
4 changed files with 50 additions and 42 deletions
@@ -397,7 +397,7 @@ if($settings->sessionprefix == "auto")
 /////////////////////////////////////////////////////////////////////////////
 /** The version of Pepperminty Wiki currently running. */
 $version = "v0.17-dev";
-$commit = "75b6b6c55fa9710d82b6623971581db7c6c5309b";
+$commit = "67648199d7ebd8a1b2ec400af0192dc0bb94b233";
 /// Environment ///
 /** Holds information about the current request environment. */
 $env = new stdClass();
@@ -792,7 +792,7 @@ function starts_with($haystack, $needle)
 function mb_stripos_all($haystack, $needle) {
     $s = 0; $i = 0;
     while(is_integer($i)) {
-        $i = function_exists("mb_stripos") ? mb_stripos($haystack, $needle, $s) : stripos($haystack, $needle, $s);
+        $i = mb_stripos($haystack, $needle, $s);
         if(is_integer($i)) {
             $aStrPos[] = $i;
             $s = $i + (function_exists("mb_strlen") ? mb_strlen($needle) : strlen($needle));
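The diff only shows the top of mb_stripos_all(), the helper the old context extractor relied on. For reference, a minimal self-contained sketch of a find-all-occurrences helper in the same spirit; the name and the false return when nothing matches are assumptions (the latter inferred from the `if($all_offsets === false)` check in the old extract_context() further down).

<?php
// Sketch only: collect every case-insensitive occurrence of $needle in $haystack.
// Returning false when there are no matches is an assumption, not confirmed upstream behaviour.
function mb_stripos_all_sketch(string $haystack, string $needle) {
    $positions = [];
    $offset = 0;
    while(($pos = mb_stripos($haystack, $needle, $offset)) !== false) {
        $positions[] = $pos;                  // Record this occurrence
        $offset = $pos + mb_strlen($needle);  // Resume the search after it
    }
    return count($positions) > 0 ? $positions : false;
}

// Example usage: prints array(2) { [0]=> int(0) [1]=> int(16) }
var_dump(mb_stripos_all_sketch("Cheese is nice. cheese is great.", "cheese"));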
@@ -1269,9 +1269,11 @@ class ids
     {
         global $idindex;
 
+        $pagename_norm = Normalizer::normalize($pagename, Normalizer::FORM_C);
         foreach ($idindex as $id => $entry)
         {
-            if(Normalizer::normalize($entry, Normalizer::FORM_C) == Normalizer::normalize($pagename, Normalizer::FORM_C))
+            // We don't need to normalise here because we normralise when assigning ids
+            if($entry == $pagename_norm)
                 return $id;
         }
 
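The change above hoists Normalizer::normalize() out of the loop: the requested page name is normalised once, and each index entry is compared directly, since entries are already normalised when their ids are assigned. A condensed sketch of the resulting lookup, assuming $idindex maps id => page name; the not-found return value is illustrative only.

<?php
// Sketch of the optimised id lookup over an id => pagename index.
function getid_sketch(array $idindex, string $pagename) {
    // Normalise the requested name once, instead of once per iteration
    $pagename_norm = Normalizer::normalize($pagename, Normalizer::FORM_C);
    foreach($idindex as $id => $entry) {
        if($entry == $pagename_norm) // Entries are assumed to be pre-normalised
            return $id;
    }
    return false; // Illustrative; the real method handles the not-found case itself
}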
@@ -3782,12 +3784,16 @@ register_module([
         header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms");
         header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
 
+        $start = microtime(true);
         foreach($results as &$result) {
             $result["context"] = search::extract_context(
+                $invindex, ids::getid($result["pagename"]),
                 $_GET["query"],
                 file_get_contents($env->storage_prefix . $result["pagename"] . ".md")
             );
         }
+        $env->perfdata->context_generation_time = round((microtime(true) - $start)*1000, 3);
+        header("x-context-generation-time: {$env->perfdata->context_generation_time}ms");
 
         $env->perfdata->search_time = round((microtime(true) - $search_start)*1000, 3);
 
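The added lines apply the same profiling pattern already used for the other x-*-time headers in this action: sample microtime(true) before the work, convert the elapsed time to milliseconds, stash it on $env->perfdata, and emit it as a response header. A standalone sketch of that pattern; the usleep() stands in for the real work (context generation).

<?php
// Sketch: time a block of work and expose the result as a perfdata field
// plus an x-*-time response header, as the diff above does.
$env = new stdClass();
$env->perfdata = new stdClass();

$start = microtime(true);
usleep(5000); // Placeholder for the timed work (generating result contexts)
$env->perfdata->context_generation_time = round((microtime(true) - $start) * 1000, 3);

header("x-context-generation-time: {$env->perfdata->context_generation_time}ms");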
@@ -4208,7 +4214,7 @@ class search
     public static function tokenize($source)
     {
         /** Normalises input characters for searching & indexing */
-        static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
+        static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
 
         // We don't need to normalise here because the transliterator handles
         // this for us. Also, we can't move the literator to a static variable
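The subtlety here is that the old line declared $literator static but still rebuilt it unconditionally, so the expensive Transliterator::createFromRules() call ran on every invocation; the added null guard makes it run once per request. A minimal sketch of the memoisation pattern with a simplified tokeniser around it (the whitespace split is illustrative, not the project's tokenisation).

<?php
// Sketch: memoise an expensive Transliterator in a static local variable.
// Without the null check, createFromRules() would run on every call even
// though $literator is declared static.
function tokenize_sketch(string $source): array {
    static $literator;
    if($literator == null)
        $literator = Transliterator::createFromRules(
            ':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;',
            Transliterator::FORWARD
        );
    // Transliterate, then split on whitespace (simplified for the sketch)
    return preg_split('/\s+/', $literator->transliterate($source), -1, PREG_SPLIT_NO_EMPTY);
}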
@@ -4237,9 +4243,6 @@ class search
     {
         global $pageindex, $env, $paths, $settings;
 
-        /** Normalises input characters for searching & indexing */
-        static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
-
         if($output) {
             header("content-type: text/event-stream");
             ob_end_flush();
@@ -4415,7 +4418,7 @@ class search
         global $settings, $pageindex;
 
         /** Normalises input characters for searching & indexing */
-        static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
+        static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
 
         $query_terms = self::tokenize($query);
         $matching_pages = [];
@@ -4436,7 +4439,7 @@ class search
         if(isset($invindex[$qterm]))
         {
             // Loop over each page in the inverted index entry
-            reset($invindex); // Reset array/object pointer
+            reset($invindex[$qterm]); // Reset array/object pointer
             foreach($invindex[$qterm] as $pageid => $page_entry)
             {
                 // Create an entry in the matching pages array if it doesn't exist
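The fix above points reset() at the per-term sub-array that the following foreach actually iterates, rather than at the outer inverted index. A small illustration of the difference, assuming the nested $invindex[term][pageid] shape used throughout this module; the "freq" field is made up for the example.

<?php
// Illustration: reset() only rewinds the pointer of the array it is given.
$invindex = [
    "cheese" => [ 4 => ["freq" => 2], 7 => ["freq" => 1] ],
    "wiki"   => [ 2 => ["freq" => 5] ],
];
$qterm = "cheese";

reset($invindex);         // Rewinds only the outer term => pages array
reset($invindex[$qterm]); // Rewinds the sub-array the loop below walks

foreach($invindex[$qterm] as $pageid => $page_entry)
    echo "'$qterm' occurs {$page_entry['freq']} time(s) on page $pageid\n";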
@@ -4486,7 +4489,7 @@ class search
             }
         }
 
-
+        reset($matching_pages);
         foreach($matching_pages as $pageid => &$pagedata)
         {
             $pagedata["pagename"] = ids::getpagename($pageid);
@@ -4495,6 +4498,7 @@ class search
             $pageOffsets = [];
 
             // Loop over each search term found on this page
+            reset($pagedata["nterms"]);
             foreach($pagedata["nterms"] as $pterm => $entry)
             {
                 // Add the number of occurrences of this search term to the ranking
@@ -4554,23 +4558,23 @@ class search
      * @param string $source The page source to extract the context from.
      * @return string The generated context string.
      */
-    public static function extract_context($query, $source)
+    public static function extract_context($invindex, $pageid, $query, $source)
     {
         global $settings;
 
         $nterms = self::tokenize($query);
         $matches = [];
-        // Loop over each nterm and find it in the source
-        foreach($nterms as $nterm)
-        {
-            if(in_array($nterm, static::$stop_words))
+
+        foreach($nterms as $nterm) {
+            // Skip over words that don't appear in the inverted index (e.g. stop words)
+            if(!isset($invindex[$nterm]))
                 continue;
-            $all_offsets = mb_stripos_all($source, $nterm);
-            // Skip over adding matches if there aren't any
-            if($all_offsets === false)
+            // Skip if the page isn't found in the inverted index for this word
+            if(!isset($invindex[$nterm][$pageid]))
                 continue;
-            foreach($all_offsets as $offset)
-                $matches[] = [ $nterm, $offset ];
+
+            foreach($invindex[$nterm][$pageid]["offsets"] as $next_offset)
+                $matches[] = [ $nterm, $next_offset ];
         }
 
         // Sort the matches by offset
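This hunk is the heart of the optimisation: extract_context() no longer re-scans the raw page source with mb_stripos_all() for every term, but reads the precomputed match offsets straight out of the inverted index the search already holds in memory. A condensed, self-contained sketch of that lookup; the ["offsets"] layout comes from the diff, while the function name and everything else is illustrative.

<?php
// Sketch: gather [term, offset] matches from an inverted index shaped like
// $invindex[term][pageid]["offsets"] = [offset, offset, ...].
function collect_matches_sketch(array $invindex, $pageid, array $nterms): array {
    $matches = [];
    foreach($nterms as $nterm) {
        // Skips terms missing from the index (e.g. stop words) and terms
        // that do not occur on this particular page.
        if(!isset($invindex[$nterm][$pageid]))
            continue;
        foreach($invindex[$nterm][$pageid]["offsets"] as $next_offset)
            $matches[] = [ $nterm, $next_offset ];
    }
    return $matches;
}

Avoiding the per-term scan over the whole page source is presumably where the profiling showed the time going, since the index lookup is effectively free by comparison.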
core.php (6 lines changed)
@@ -405,7 +405,7 @@ function starts_with($haystack, $needle)
 function mb_stripos_all($haystack, $needle) {
     $s = 0; $i = 0;
     while(is_integer($i)) {
-        $i = function_exists("mb_stripos") ? mb_stripos($haystack, $needle, $s) : stripos($haystack, $needle, $s);
+        $i = mb_stripos($haystack, $needle, $s);
        if(is_integer($i)) {
             $aStrPos[] = $i;
             $s = $i + (function_exists("mb_strlen") ? mb_strlen($needle) : strlen($needle));
@@ -882,9 +882,11 @@ class ids
     {
         global $idindex;
 
+        $pagename_norm = Normalizer::normalize($pagename, Normalizer::FORM_C);
         foreach ($idindex as $id => $entry)
         {
-            if(Normalizer::normalize($entry, Normalizer::FORM_C) == Normalizer::normalize($pagename, Normalizer::FORM_C))
+            // We don't need to normalise here because we normralise when assigning ids
+            if($entry == $pagename_norm)
                 return $id;
         }
 
@@ -104,7 +104,7 @@
     "author": "Starbeamrainbowlabs",
     "description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
     "id": "feature-search",
-    "lastupdate": 1529968213,
+    "lastupdate": 1530018727,
     "optional": false
 },
 {
@@ -138,12 +138,16 @@ register_module([
         header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms");
         header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
 
+        $start = microtime(true);
         foreach($results as &$result) {
             $result["context"] = search::extract_context(
+                $invindex, ids::getid($result["pagename"]),
                 $_GET["query"],
                 file_get_contents($env->storage_prefix . $result["pagename"] . ".md")
             );
         }
+        $env->perfdata->context_generation_time = round((microtime(true) - $start)*1000, 3);
+        header("x-context-generation-time: {$env->perfdata->context_generation_time}ms");
 
         $env->perfdata->search_time = round((microtime(true) - $search_start)*1000, 3);
 
@@ -564,7 +568,7 @@ class search
     public static function tokenize($source)
     {
         /** Normalises input characters for searching & indexing */
-        static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
+        static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
 
         // We don't need to normalise here because the transliterator handles
         // this for us. Also, we can't move the literator to a static variable
@@ -593,9 +597,6 @@ class search
     {
         global $pageindex, $env, $paths, $settings;
 
-        /** Normalises input characters for searching & indexing */
-        static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
-
         if($output) {
             header("content-type: text/event-stream");
             ob_end_flush();
@@ -771,7 +772,7 @@ class search
         global $settings, $pageindex;
 
         /** Normalises input characters for searching & indexing */
-        static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
+        static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
 
         $query_terms = self::tokenize($query);
         $matching_pages = [];
@@ -792,7 +793,7 @@ class search
         if(isset($invindex[$qterm]))
         {
             // Loop over each page in the inverted index entry
-            reset($invindex); // Reset array/object pointer
+            reset($invindex[$qterm]); // Reset array/object pointer
             foreach($invindex[$qterm] as $pageid => $page_entry)
             {
                 // Create an entry in the matching pages array if it doesn't exist
@@ -842,7 +843,7 @@ class search
             }
         }
 
-
+        reset($matching_pages);
         foreach($matching_pages as $pageid => &$pagedata)
         {
             $pagedata["pagename"] = ids::getpagename($pageid);
@@ -851,6 +852,7 @@ class search
             $pageOffsets = [];
 
             // Loop over each search term found on this page
+            reset($pagedata["nterms"]);
             foreach($pagedata["nterms"] as $pterm => $entry)
             {
                 // Add the number of occurrences of this search term to the ranking
@@ -910,23 +912,23 @@ class search
      * @param string $source The page source to extract the context from.
      * @return string The generated context string.
      */
-    public static function extract_context($query, $source)
+    public static function extract_context($invindex, $pageid, $query, $source)
     {
         global $settings;
 
         $nterms = self::tokenize($query);
         $matches = [];
-        // Loop over each nterm and find it in the source
-        foreach($nterms as $nterm)
-        {
-            if(in_array($nterm, static::$stop_words))
+
+        foreach($nterms as $nterm) {
+            // Skip over words that don't appear in the inverted index (e.g. stop words)
+            if(!isset($invindex[$nterm]))
                 continue;
-            $all_offsets = mb_stripos_all($source, $nterm);
-            // Skip over adding matches if there aren't any
-            if($all_offsets === false)
+            // Skip if the page isn't found in the inverted index for this word
+            if(!isset($invindex[$nterm][$pageid]))
                 continue;
-            foreach($all_offsets as $offset)
-                $matches[] = [ $nterm, $offset ];
+
+            foreach($invindex[$nterm][$pageid]["offsets"] as $next_offset)
+                $matches[] = [ $nterm, $next_offset ];
         }
 
         // Sort the matches by offset
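Both copies of extract_context() break off just before the "// Sort the matches by offset" step. For completeness, a hedged sketch of how sorting the [term, offset] pairs gathered above might look; the usort() comparison is illustrative and not necessarily the project's exact code.

<?php
// Sketch: order collected [term, offset] matches by offset so neighbouring
// matches can later be merged into context snippets.
$matches = [ ["wiki", 57], ["cheese", 3], ["cheese", 120] ];

usort($matches, function($a, $b) {
    return $a[1] - $b[1]; // Compare on the offset component
});

print_r($matches); // [["cheese", 3], ["wiki", 57], ["cheese", 120]]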