Seriously optimise the search system via some profiling.

This commit is contained in:
Starbeamrainbowlabs 2018-06-26 14:15:19 +01:00
parent 67648199d7
commit 3d3b6c491a
Signed by: sbrl
GPG Key ID: 1BE5172E637709C2
4 changed files with 50 additions and 42 deletions

View File

@ -397,7 +397,7 @@ if($settings->sessionprefix == "auto")
/////////////////////////////////////////////////////////////////////////////
/** The version of Pepperminty Wiki currently running. */
$version = "v0.17-dev";
$commit = "75b6b6c55fa9710d82b6623971581db7c6c5309b";
$commit = "67648199d7ebd8a1b2ec400af0192dc0bb94b233";
/// Environment ///
/** Holds information about the current request environment. */
$env = new stdClass();
@ -792,7 +792,7 @@ function starts_with($haystack, $needle)
function mb_stripos_all($haystack, $needle) {
$s = 0; $i = 0;
while(is_integer($i)) {
$i = function_exists("mb_stripos") ? mb_stripos($haystack, $needle, $s) : stripos($haystack, $needle, $s);
$i = mb_stripos($haystack, $needle, $s);
if(is_integer($i)) {
$aStrPos[] = $i;
$s = $i + (function_exists("mb_strlen") ? mb_strlen($needle) : strlen($needle));
@ -1268,10 +1268,12 @@ class ids
public static function getid($pagename)
{
global $idindex;
$pagename_norm = Normalizer::normalize($pagename, Normalizer::FORM_C);
foreach ($idindex as $id => $entry)
{
if(Normalizer::normalize($entry, Normalizer::FORM_C) == Normalizer::normalize($pagename, Normalizer::FORM_C))
// We don't need to normalise here because we normalise when assigning ids
if($entry == $pagename_norm)
return $id;
}
@ -3782,12 +3784,16 @@ register_module([
header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms");
header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
$start = microtime(true);
foreach($results as &$result) {
$result["context"] = search::extract_context(
$invindex, ids::getid($result["pagename"]),
$_GET["query"],
file_get_contents($env->storage_prefix . $result["pagename"] . ".md")
);
}
$env->perfdata->context_generation_time = round((microtime(true) - $start)*1000, 3);
header("x-context-generation-time: {$env->perfdata->context_generation_time}ms");
$env->perfdata->search_time = round((microtime(true) - $search_start)*1000, 3);
@ -4208,7 +4214,7 @@ class search
public static function tokenize($source)
{
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
// We don't need to normalise here because the transliterator handles
// this for us. Also, we can't move the literator to a static variable
@ -4237,9 +4243,6 @@ class search
{
global $pageindex, $env, $paths, $settings;
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
if($output) {
header("content-type: text/event-stream");
ob_end_flush();
@ -4415,7 +4418,7 @@ class search
global $settings, $pageindex;
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
$query_terms = self::tokenize($query);
$matching_pages = [];
@ -4436,7 +4439,7 @@ class search
if(isset($invindex[$qterm]))
{
// Loop over each page in the inverted index entry
reset($invindex); // Reset array/object pointer
reset($invindex[$qterm]); // Reset array/object pointer
foreach($invindex[$qterm] as $pageid => $page_entry)
{
// Create an entry in the matching pages array if it doesn't exist
@ -4486,7 +4489,7 @@ class search
}
}
reset($matching_pages);
foreach($matching_pages as $pageid => &$pagedata)
{
$pagedata["pagename"] = ids::getpagename($pageid);
@ -4495,6 +4498,7 @@ class search
$pageOffsets = [];
// Loop over each search term found on this page
reset($pagedata["nterms"]);
foreach($pagedata["nterms"] as $pterm => $entry)
{
// Add the number of occurrences of this search term to the ranking
@ -4554,23 +4558,23 @@ class search
* @param string $source The page source to extract the context from.
* @return string The generated context string.
*/
public static function extract_context($query, $source)
public static function extract_context($invindex, $pageid, $query, $source)
{
global $settings;
$nterms = self::tokenize($query);
$matches = [];
// Loop over each nterm and find it in the source
foreach($nterms as $nterm)
{
if(in_array($nterm, static::$stop_words))
foreach($nterms as $nterm) {
// Skip over words that don't appear in the inverted index (e.g. stop words)
if(!isset($invindex[$nterm]))
continue;
$all_offsets = mb_stripos_all($source, $nterm);
// Skip over adding matches if there aren't any
if($all_offsets === false)
// Skip if the page isn't found in the inverted index for this word
if(!isset($invindex[$nterm][$pageid]))
continue;
foreach($all_offsets as $offset)
$matches[] = [ $nterm, $offset ];
foreach($invindex[$nterm][$pageid]["offsets"] as $next_offset)
$matches[] = [ $nterm, $next_offset ];
}
// Sort the matches by offset

View File

@ -405,7 +405,7 @@ function starts_with($haystack, $needle)
function mb_stripos_all($haystack, $needle) {
$s = 0; $i = 0;
while(is_integer($i)) {
$i = function_exists("mb_stripos") ? mb_stripos($haystack, $needle, $s) : stripos($haystack, $needle, $s);
$i = mb_stripos($haystack, $needle, $s);
if(is_integer($i)) {
$aStrPos[] = $i;
$s = $i + (function_exists("mb_strlen") ? mb_strlen($needle) : strlen($needle));
@ -881,10 +881,12 @@ class ids
public static function getid($pagename)
{
global $idindex;
$pagename_norm = Normalizer::normalize($pagename, Normalizer::FORM_C);
foreach ($idindex as $id => $entry)
{
if(Normalizer::normalize($entry, Normalizer::FORM_C) == Normalizer::normalize($pagename, Normalizer::FORM_C))
// We don't need to normalise here because we normalise when assigning ids
if($entry == $pagename_norm)
return $id;
}

View File

@ -104,7 +104,7 @@
"author": "Starbeamrainbowlabs",
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
"id": "feature-search",
"lastupdate": 1529968213,
"lastupdate": 1530018727,
"optional": false
},
{

View File

@ -138,12 +138,16 @@ register_module([
header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms");
header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
$start = microtime(true);
foreach($results as &$result) {
$result["context"] = search::extract_context(
$invindex, ids::getid($result["pagename"]),
$_GET["query"],
file_get_contents($env->storage_prefix . $result["pagename"] . ".md")
);
}
$env->perfdata->context_generation_time = round((microtime(true) - $start)*1000, 3);
header("x-context-generation-time: {$env->perfdata->context_generation_time}ms");
$env->perfdata->search_time = round((microtime(true) - $search_start)*1000, 3);
@ -564,7 +568,7 @@ class search
public static function tokenize($source)
{
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
// We don't need to normalise here because the transliterator handles
// this for us. Also, we can't move the literator to a static variable
@ -593,9 +597,6 @@ class search
{
global $pageindex, $env, $paths, $settings;
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
if($output) {
header("content-type: text/event-stream");
ob_end_flush();
@ -771,7 +772,7 @@ class search
global $settings, $pageindex;
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
$query_terms = self::tokenize($query);
$matching_pages = [];
@ -792,7 +793,7 @@ class search
if(isset($invindex[$qterm]))
{
// Loop over each page in the inverted index entry
reset($invindex); // Reset array/object pointer
reset($invindex[$qterm]); // Reset array/object pointer
foreach($invindex[$qterm] as $pageid => $page_entry)
{
// Create an entry in the matching pages array if it doesn't exist
@ -842,7 +843,7 @@ class search
}
}
reset($matching_pages);
foreach($matching_pages as $pageid => &$pagedata)
{
$pagedata["pagename"] = ids::getpagename($pageid);
@ -851,6 +852,7 @@ class search
$pageOffsets = [];
// Loop over each search term found on this page
reset($pagedata["nterms"]);
foreach($pagedata["nterms"] as $pterm => $entry)
{
// Add the number of occurrences of this search term to the ranking
@ -910,23 +912,23 @@ class search
* @param string $source The page source to extract the context from.
* @return string The generated context string.
*/
public static function extract_context($query, $source)
public static function extract_context($invindex, $pageid, $query, $source)
{
global $settings;
$nterms = self::tokenize($query);
$matches = [];
// Loop over each nterm and find it in the source
foreach($nterms as $nterm)
{
if(in_array($nterm, static::$stop_words))
foreach($nterms as $nterm) {
// Skip over words that don't appear in the inverted index (e.g. stop words)
if(!isset($invindex[$nterm]))
continue;
$all_offsets = mb_stripos_all($source, $nterm);
// Skip over adding matches if there aren't any
if($all_offsets === false)
// Skip if the page isn't found in the inverted index for this word
if(!isset($invindex[$nterm][$pageid]))
continue;
foreach($all_offsets as $offset)
$matches[] = [ $nterm, $offset ];
foreach($invindex[$nterm][$pageid]["offsets"] as $next_offset)
$matches[] = [ $nterm, $next_offset ];
}
// Sort the matches by offset