1
0
Fork 0
mirror of https://github.com/sbrl/Pepperminty-Wiki.git synced 2024-11-22 16:33:00 +00:00

Seriously optimise the search system via some profiling.

This commit is contained in:
Starbeamrainbowlabs 2018-06-26 14:15:19 +01:00
parent 67648199d7
commit 3d3b6c491a
Signed by: sbrl
GPG key ID: 1BE5172E637709C2
4 changed files with 50 additions and 42 deletions

View file

@ -397,7 +397,7 @@ if($settings->sessionprefix == "auto")
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
/** The version of Pepperminty Wiki currently running. */ /** The version of Pepperminty Wiki currently running. */
$version = "v0.17-dev"; $version = "v0.17-dev";
$commit = "75b6b6c55fa9710d82b6623971581db7c6c5309b"; $commit = "67648199d7ebd8a1b2ec400af0192dc0bb94b233";
/// Environment /// /// Environment ///
/** Holds information about the current request environment. */ /** Holds information about the current request environment. */
$env = new stdClass(); $env = new stdClass();
@ -792,7 +792,7 @@ function starts_with($haystack, $needle)
function mb_stripos_all($haystack, $needle) { function mb_stripos_all($haystack, $needle) {
$s = 0; $i = 0; $s = 0; $i = 0;
while(is_integer($i)) { while(is_integer($i)) {
$i = function_exists("mb_stripos") ? mb_stripos($haystack, $needle, $s) : stripos($haystack, $needle, $s); $i = mb_stripos($haystack, $needle, $s);
if(is_integer($i)) { if(is_integer($i)) {
$aStrPos[] = $i; $aStrPos[] = $i;
$s = $i + (function_exists("mb_strlen") ? mb_strlen($needle) : strlen($needle)); $s = $i + (function_exists("mb_strlen") ? mb_strlen($needle) : strlen($needle));
@ -1269,9 +1269,11 @@ class ids
{ {
global $idindex; global $idindex;
$pagename_norm = Normalizer::normalize($pagename, Normalizer::FORM_C);
foreach ($idindex as $id => $entry) foreach ($idindex as $id => $entry)
{ {
if(Normalizer::normalize($entry, Normalizer::FORM_C) == Normalizer::normalize($pagename, Normalizer::FORM_C)) // We don't need to normalise here because we normalise when assigning ids
if($entry == $pagename_norm)
return $id; return $id;
} }
@ -3782,12 +3784,16 @@ register_module([
header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms"); header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms");
header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms"); header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
$start = microtime(true);
foreach($results as &$result) { foreach($results as &$result) {
$result["context"] = search::extract_context( $result["context"] = search::extract_context(
$invindex, ids::getid($result["pagename"]),
$_GET["query"], $_GET["query"],
file_get_contents($env->storage_prefix . $result["pagename"] . ".md") file_get_contents($env->storage_prefix . $result["pagename"] . ".md")
); );
} }
$env->perfdata->context_generation_time = round((microtime(true) - $start)*1000, 3);
header("x-context-generation-time: {$env->perfdata->context_generation_time}ms");
$env->perfdata->search_time = round((microtime(true) - $search_start)*1000, 3); $env->perfdata->search_time = round((microtime(true) - $search_start)*1000, 3);
@ -4208,7 +4214,7 @@ class search
public static function tokenize($source) public static function tokenize($source)
{ {
/** Normalises input characters for searching & indexing */ /** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD); static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
// We don't need to normalise here because the transliterator handles // We don't need to normalise here because the transliterator handles
// this for us. Also, we can't move the literator to a static variable // this for us. Also, we can't move the literator to a static variable
@ -4237,9 +4243,6 @@ class search
{ {
global $pageindex, $env, $paths, $settings; global $pageindex, $env, $paths, $settings;
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
if($output) { if($output) {
header("content-type: text/event-stream"); header("content-type: text/event-stream");
ob_end_flush(); ob_end_flush();
@ -4415,7 +4418,7 @@ class search
global $settings, $pageindex; global $settings, $pageindex;
/** Normalises input characters for searching & indexing */ /** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD); static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
$query_terms = self::tokenize($query); $query_terms = self::tokenize($query);
$matching_pages = []; $matching_pages = [];
@ -4436,7 +4439,7 @@ class search
if(isset($invindex[$qterm])) if(isset($invindex[$qterm]))
{ {
// Loop over each page in the inverted index entry // Loop over each page in the inverted index entry
reset($invindex); // Reset array/object pointer reset($invindex[$qterm]); // Reset array/object pointer
foreach($invindex[$qterm] as $pageid => $page_entry) foreach($invindex[$qterm] as $pageid => $page_entry)
{ {
// Create an entry in the matching pages array if it doesn't exist // Create an entry in the matching pages array if it doesn't exist
@ -4486,7 +4489,7 @@ class search
} }
} }
reset($matching_pages);
foreach($matching_pages as $pageid => &$pagedata) foreach($matching_pages as $pageid => &$pagedata)
{ {
$pagedata["pagename"] = ids::getpagename($pageid); $pagedata["pagename"] = ids::getpagename($pageid);
@ -4495,6 +4498,7 @@ class search
$pageOffsets = []; $pageOffsets = [];
// Loop over each search term found on this page // Loop over each search term found on this page
reset($pagedata["nterms"]);
foreach($pagedata["nterms"] as $pterm => $entry) foreach($pagedata["nterms"] as $pterm => $entry)
{ {
// Add the number of occurrences of this search term to the ranking // Add the number of occurrences of this search term to the ranking
@ -4554,23 +4558,23 @@ class search
* @param string $source The page source to extract the context from. * @param string $source The page source to extract the context from.
* @return string The generated context string. * @return string The generated context string.
*/ */
public static function extract_context($query, $source) public static function extract_context($invindex, $pageid, $query, $source)
{ {
global $settings; global $settings;
$nterms = self::tokenize($query); $nterms = self::tokenize($query);
$matches = []; $matches = [];
// Loop over each nterm and find it in the source
foreach($nterms as $nterm) foreach($nterms as $nterm) {
{ // Skip over words that don't appear in the inverted index (e.g. stop words)
if(in_array($nterm, static::$stop_words)) if(!isset($invindex[$nterm]))
continue; continue;
$all_offsets = mb_stripos_all($source, $nterm); // Skip if the page isn't found in the inverted index for this word
// Skip over adding matches if there aren't any if(!isset($invindex[$nterm][$pageid]))
if($all_offsets === false)
continue; continue;
foreach($all_offsets as $offset)
$matches[] = [ $nterm, $offset ]; foreach($invindex[$nterm][$pageid]["offsets"] as $next_offset)
$matches[] = [ $nterm, $next_offset ];
} }
// Sort the matches by offset // Sort the matches by offset

View file

@ -405,7 +405,7 @@ function starts_with($haystack, $needle)
function mb_stripos_all($haystack, $needle) { function mb_stripos_all($haystack, $needle) {
$s = 0; $i = 0; $s = 0; $i = 0;
while(is_integer($i)) { while(is_integer($i)) {
$i = function_exists("mb_stripos") ? mb_stripos($haystack, $needle, $s) : stripos($haystack, $needle, $s); $i = mb_stripos($haystack, $needle, $s);
if(is_integer($i)) { if(is_integer($i)) {
$aStrPos[] = $i; $aStrPos[] = $i;
$s = $i + (function_exists("mb_strlen") ? mb_strlen($needle) : strlen($needle)); $s = $i + (function_exists("mb_strlen") ? mb_strlen($needle) : strlen($needle));
@ -882,9 +882,11 @@ class ids
{ {
global $idindex; global $idindex;
$pagename_norm = Normalizer::normalize($pagename, Normalizer::FORM_C);
foreach ($idindex as $id => $entry) foreach ($idindex as $id => $entry)
{ {
if(Normalizer::normalize($entry, Normalizer::FORM_C) == Normalizer::normalize($pagename, Normalizer::FORM_C)) // We don't need to normalise here because we normalise when assigning ids
if($entry == $pagename_norm)
return $id; return $id;
} }

View file

@ -104,7 +104,7 @@
"author": "Starbeamrainbowlabs", "author": "Starbeamrainbowlabs",
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.", "description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
"id": "feature-search", "id": "feature-search",
"lastupdate": 1529968213, "lastupdate": 1530018727,
"optional": false "optional": false
}, },
{ {

View file

@ -138,12 +138,16 @@ register_module([
header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms"); header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms");
header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms"); header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
$start = microtime(true);
foreach($results as &$result) { foreach($results as &$result) {
$result["context"] = search::extract_context( $result["context"] = search::extract_context(
$invindex, ids::getid($result["pagename"]),
$_GET["query"], $_GET["query"],
file_get_contents($env->storage_prefix . $result["pagename"] . ".md") file_get_contents($env->storage_prefix . $result["pagename"] . ".md")
); );
} }
$env->perfdata->context_generation_time = round((microtime(true) - $start)*1000, 3);
header("x-context-generation-time: {$env->perfdata->context_generation_time}ms");
$env->perfdata->search_time = round((microtime(true) - $search_start)*1000, 3); $env->perfdata->search_time = round((microtime(true) - $search_start)*1000, 3);
@ -564,7 +568,7 @@ class search
public static function tokenize($source) public static function tokenize($source)
{ {
/** Normalises input characters for searching & indexing */ /** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD); static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
// We don't need to normalise here because the transliterator handles // We don't need to normalise here because the transliterator handles
// this for us. Also, we can't move the literator to a static variable // this for us. Also, we can't move the literator to a static variable
@ -593,9 +597,6 @@ class search
{ {
global $pageindex, $env, $paths, $settings; global $pageindex, $env, $paths, $settings;
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
if($output) { if($output) {
header("content-type: text/event-stream"); header("content-type: text/event-stream");
ob_end_flush(); ob_end_flush();
@ -771,7 +772,7 @@ class search
global $settings, $pageindex; global $settings, $pageindex;
/** Normalises input characters for searching & indexing */ /** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD); static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
$query_terms = self::tokenize($query); $query_terms = self::tokenize($query);
$matching_pages = []; $matching_pages = [];
@ -792,7 +793,7 @@ class search
if(isset($invindex[$qterm])) if(isset($invindex[$qterm]))
{ {
// Loop over each page in the inverted index entry // Loop over each page in the inverted index entry
reset($invindex); // Reset array/object pointer reset($invindex[$qterm]); // Reset array/object pointer
foreach($invindex[$qterm] as $pageid => $page_entry) foreach($invindex[$qterm] as $pageid => $page_entry)
{ {
// Create an entry in the matching pages array if it doesn't exist // Create an entry in the matching pages array if it doesn't exist
@ -842,7 +843,7 @@ class search
} }
} }
reset($matching_pages);
foreach($matching_pages as $pageid => &$pagedata) foreach($matching_pages as $pageid => &$pagedata)
{ {
$pagedata["pagename"] = ids::getpagename($pageid); $pagedata["pagename"] = ids::getpagename($pageid);
@ -851,6 +852,7 @@ class search
$pageOffsets = []; $pageOffsets = [];
// Loop over each search term found on this page // Loop over each search term found on this page
reset($pagedata["nterms"]);
foreach($pagedata["nterms"] as $pterm => $entry) foreach($pagedata["nterms"] as $pterm => $entry)
{ {
// Add the number of occurrences of this search term to the ranking // Add the number of occurrences of this search term to the ranking
@ -910,23 +912,23 @@ class search
* @param string $source The page source to extract the context from. * @param string $source The page source to extract the context from.
* @return string The generated context string. * @return string The generated context string.
*/ */
public static function extract_context($query, $source) public static function extract_context($invindex, $pageid, $query, $source)
{ {
global $settings; global $settings;
$nterms = self::tokenize($query); $nterms = self::tokenize($query);
$matches = []; $matches = [];
// Loop over each nterm and find it in the source
foreach($nterms as $nterm) foreach($nterms as $nterm) {
{ // Skip over words that don't appear in the inverted index (e.g. stop words)
if(in_array($nterm, static::$stop_words)) if(!isset($invindex[$nterm]))
continue; continue;
$all_offsets = mb_stripos_all($source, $nterm); // Skip if the page isn't found in the inverted index for this word
// Skip over adding matches if there aren't any if(!isset($invindex[$nterm][$pageid]))
if($all_offsets === false)
continue; continue;
foreach($all_offsets as $offset)
$matches[] = [ $nterm, $offset ]; foreach($invindex[$nterm][$pageid]["offsets"] as $next_offset)
$matches[] = [ $nterm, $next_offset ];
} }
// Sort the matches by offset // Sort the matches by offset