mirror of
https://github.com/sbrl/Pepperminty-Wiki.git
synced 2024-12-22 13:45:02 +00:00
Seriously optimise the search system via some profiling.
This commit is contained in:
parent
67648199d7
commit
3d3b6c491a
4 changed files with 50 additions and 42 deletions
|
@ -397,7 +397,7 @@ if($settings->sessionprefix == "auto")
|
|||
/////////////////////////////////////////////////////////////////////////////
|
||||
/** The version of Pepperminty Wiki currently running. */
|
||||
$version = "v0.17-dev";
|
||||
$commit = "75b6b6c55fa9710d82b6623971581db7c6c5309b";
|
||||
$commit = "67648199d7ebd8a1b2ec400af0192dc0bb94b233";
|
||||
/// Environment ///
|
||||
/** Holds information about the current request environment. */
|
||||
$env = new stdClass();
|
||||
|
@ -792,7 +792,7 @@ function starts_with($haystack, $needle)
|
|||
function mb_stripos_all($haystack, $needle) {
|
||||
$s = 0; $i = 0;
|
||||
while(is_integer($i)) {
|
||||
$i = function_exists("mb_stripos") ? mb_stripos($haystack, $needle, $s) : stripos($haystack, $needle, $s);
|
||||
$i = mb_stripos($haystack, $needle, $s);
|
||||
if(is_integer($i)) {
|
||||
$aStrPos[] = $i;
|
||||
$s = $i + (function_exists("mb_strlen") ? mb_strlen($needle) : strlen($needle));
|
||||
|
@ -1268,10 +1268,12 @@ class ids
|
|||
public static function getid($pagename)
|
||||
{
|
||||
global $idindex;
|
||||
|
||||
|
||||
$pagename_norm = Normalizer::normalize($pagename, Normalizer::FORM_C);
|
||||
foreach ($idindex as $id => $entry)
|
||||
{
|
||||
if(Normalizer::normalize($entry, Normalizer::FORM_C) == Normalizer::normalize($pagename, Normalizer::FORM_C))
|
||||
// We don't need to normalise here because we normalise when assigning ids
|
||||
if($entry == $pagename_norm)
|
||||
return $id;
|
||||
}
|
||||
|
||||
|
@ -3782,12 +3784,16 @@ register_module([
|
|||
header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms");
|
||||
header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
|
||||
|
||||
$start = microtime(true);
|
||||
foreach($results as &$result) {
|
||||
$result["context"] = search::extract_context(
|
||||
$invindex, ids::getid($result["pagename"]),
|
||||
$_GET["query"],
|
||||
file_get_contents($env->storage_prefix . $result["pagename"] . ".md")
|
||||
);
|
||||
}
|
||||
$env->perfdata->context_generation_time = round((microtime(true) - $start)*1000, 3);
|
||||
header("x-context-generation-time: {$env->perfdata->context_generation_time}ms");
|
||||
|
||||
$env->perfdata->search_time = round((microtime(true) - $search_start)*1000, 3);
|
||||
|
||||
|
@ -4208,7 +4214,7 @@ class search
|
|||
public static function tokenize($source)
|
||||
{
|
||||
/** Normalises input characters for searching & indexing */
|
||||
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||
|
||||
// We don't need to normalise here because the transliterator handles
|
||||
// this for us. Also, we can't move the literator to a static variable
|
||||
|
@ -4237,9 +4243,6 @@ class search
|
|||
{
|
||||
global $pageindex, $env, $paths, $settings;
|
||||
|
||||
/** Normalises input characters for searching & indexing */
|
||||
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||
|
||||
if($output) {
|
||||
header("content-type: text/event-stream");
|
||||
ob_end_flush();
|
||||
|
@ -4415,7 +4418,7 @@ class search
|
|||
global $settings, $pageindex;
|
||||
|
||||
/** Normalises input characters for searching & indexing */
|
||||
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||
|
||||
$query_terms = self::tokenize($query);
|
||||
$matching_pages = [];
|
||||
|
@ -4436,7 +4439,7 @@ class search
|
|||
if(isset($invindex[$qterm]))
|
||||
{
|
||||
// Loop over each page in the inverted index entry
|
||||
reset($invindex); // Reset array/object pointer
|
||||
reset($invindex[$qterm]); // Reset array/object pointer
|
||||
foreach($invindex[$qterm] as $pageid => $page_entry)
|
||||
{
|
||||
// Create an entry in the matching pages array if it doesn't exist
|
||||
|
@ -4486,7 +4489,7 @@ class search
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
reset($matching_pages);
|
||||
foreach($matching_pages as $pageid => &$pagedata)
|
||||
{
|
||||
$pagedata["pagename"] = ids::getpagename($pageid);
|
||||
|
@ -4495,6 +4498,7 @@ class search
|
|||
$pageOffsets = [];
|
||||
|
||||
// Loop over each search term found on this page
|
||||
reset($pagedata["nterms"]);
|
||||
foreach($pagedata["nterms"] as $pterm => $entry)
|
||||
{
|
||||
// Add the number of occurrences of this search term to the ranking
|
||||
|
@ -4554,23 +4558,23 @@ class search
|
|||
* @param string $source The page source to extract the context from.
|
||||
* @return string The generated context string.
|
||||
*/
|
||||
public static function extract_context($query, $source)
|
||||
public static function extract_context($invindex, $pageid, $query, $source)
|
||||
{
|
||||
global $settings;
|
||||
|
||||
$nterms = self::tokenize($query);
|
||||
$matches = [];
|
||||
// Loop over each nterm and find it in the source
|
||||
foreach($nterms as $nterm)
|
||||
{
|
||||
if(in_array($nterm, static::$stop_words))
|
||||
|
||||
foreach($nterms as $nterm) {
|
||||
// Skip over words that don't appear in the inverted index (e.g. stop words)
|
||||
if(!isset($invindex[$nterm]))
|
||||
continue;
|
||||
$all_offsets = mb_stripos_all($source, $nterm);
|
||||
// Skip over adding matches if there aren't any
|
||||
if($all_offsets === false)
|
||||
// Skip if the page isn't found in the inverted index for this word
|
||||
if(!isset($invindex[$nterm][$pageid]))
|
||||
continue;
|
||||
foreach($all_offsets as $offset)
|
||||
$matches[] = [ $nterm, $offset ];
|
||||
|
||||
foreach($invindex[$nterm][$pageid]["offsets"] as $next_offset)
|
||||
$matches[] = [ $nterm, $next_offset ];
|
||||
}
|
||||
|
||||
// Sort the matches by offset
|
||||
|
|
8
core.php
8
core.php
|
@ -405,7 +405,7 @@ function starts_with($haystack, $needle)
|
|||
function mb_stripos_all($haystack, $needle) {
|
||||
$s = 0; $i = 0;
|
||||
while(is_integer($i)) {
|
||||
$i = function_exists("mb_stripos") ? mb_stripos($haystack, $needle, $s) : stripos($haystack, $needle, $s);
|
||||
$i = mb_stripos($haystack, $needle, $s);
|
||||
if(is_integer($i)) {
|
||||
$aStrPos[] = $i;
|
||||
$s = $i + (function_exists("mb_strlen") ? mb_strlen($needle) : strlen($needle));
|
||||
|
@ -881,10 +881,12 @@ class ids
|
|||
public static function getid($pagename)
|
||||
{
|
||||
global $idindex;
|
||||
|
||||
|
||||
$pagename_norm = Normalizer::normalize($pagename, Normalizer::FORM_C);
|
||||
foreach ($idindex as $id => $entry)
|
||||
{
|
||||
if(Normalizer::normalize($entry, Normalizer::FORM_C) == Normalizer::normalize($pagename, Normalizer::FORM_C))
|
||||
// We don't need to normalise here because we normalise when assigning ids
|
||||
if($entry == $pagename_norm)
|
||||
return $id;
|
||||
}
|
||||
|
||||
|
|
|
@ -104,7 +104,7 @@
|
|||
"author": "Starbeamrainbowlabs",
|
||||
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
|
||||
"id": "feature-search",
|
||||
"lastupdate": 1529968213,
|
||||
"lastupdate": 1530018727,
|
||||
"optional": false
|
||||
},
|
||||
{
|
||||
|
|
|
@ -138,12 +138,16 @@ register_module([
|
|||
header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms");
|
||||
header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
|
||||
|
||||
$start = microtime(true);
|
||||
foreach($results as &$result) {
|
||||
$result["context"] = search::extract_context(
|
||||
$invindex, ids::getid($result["pagename"]),
|
||||
$_GET["query"],
|
||||
file_get_contents($env->storage_prefix . $result["pagename"] . ".md")
|
||||
);
|
||||
}
|
||||
$env->perfdata->context_generation_time = round((microtime(true) - $start)*1000, 3);
|
||||
header("x-context-generation-time: {$env->perfdata->context_generation_time}ms");
|
||||
|
||||
$env->perfdata->search_time = round((microtime(true) - $search_start)*1000, 3);
|
||||
|
||||
|
@ -564,7 +568,7 @@ class search
|
|||
public static function tokenize($source)
|
||||
{
|
||||
/** Normalises input characters for searching & indexing */
|
||||
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||
|
||||
// We don't need to normalise here because the transliterator handles
|
||||
// this for us. Also, we can't move the literator to a static variable
|
||||
|
@ -593,9 +597,6 @@ class search
|
|||
{
|
||||
global $pageindex, $env, $paths, $settings;
|
||||
|
||||
/** Normalises input characters for searching & indexing */
|
||||
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||
|
||||
if($output) {
|
||||
header("content-type: text/event-stream");
|
||||
ob_end_flush();
|
||||
|
@ -771,7 +772,7 @@ class search
|
|||
global $settings, $pageindex;
|
||||
|
||||
/** Normalises input characters for searching & indexing */
|
||||
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||
|
||||
$query_terms = self::tokenize($query);
|
||||
$matching_pages = [];
|
||||
|
@ -792,7 +793,7 @@ class search
|
|||
if(isset($invindex[$qterm]))
|
||||
{
|
||||
// Loop over each page in the inverted index entry
|
||||
reset($invindex); // Reset array/object pointer
|
||||
reset($invindex[$qterm]); // Reset array/object pointer
|
||||
foreach($invindex[$qterm] as $pageid => $page_entry)
|
||||
{
|
||||
// Create an entry in the matching pages array if it doesn't exist
|
||||
|
@ -842,7 +843,7 @@ class search
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
reset($matching_pages);
|
||||
foreach($matching_pages as $pageid => &$pagedata)
|
||||
{
|
||||
$pagedata["pagename"] = ids::getpagename($pageid);
|
||||
|
@ -851,6 +852,7 @@ class search
|
|||
$pageOffsets = [];
|
||||
|
||||
// Loop over each search term found on this page
|
||||
reset($pagedata["nterms"]);
|
||||
foreach($pagedata["nterms"] as $pterm => $entry)
|
||||
{
|
||||
// Add the number of occurrences of this search term to the ranking
|
||||
|
@ -910,23 +912,23 @@ class search
|
|||
* @param string $source The page source to extract the context from.
|
||||
* @return string The generated context string.
|
||||
*/
|
||||
public static function extract_context($query, $source)
|
||||
public static function extract_context($invindex, $pageid, $query, $source)
|
||||
{
|
||||
global $settings;
|
||||
|
||||
$nterms = self::tokenize($query);
|
||||
$matches = [];
|
||||
// Loop over each nterm and find it in the source
|
||||
foreach($nterms as $nterm)
|
||||
{
|
||||
if(in_array($nterm, static::$stop_words))
|
||||
|
||||
foreach($nterms as $nterm) {
|
||||
// Skip over words that don't appear in the inverted index (e.g. stop words)
|
||||
if(!isset($invindex[$nterm]))
|
||||
continue;
|
||||
$all_offsets = mb_stripos_all($source, $nterm);
|
||||
// Skip over adding matches if there aren't any
|
||||
if($all_offsets === false)
|
||||
// Skip if the page isn't found in the inverted index for this word
|
||||
if(!isset($invindex[$nterm][$pageid]))
|
||||
continue;
|
||||
foreach($all_offsets as $offset)
|
||||
$matches[] = [ $nterm, $offset ];
|
||||
|
||||
foreach($invindex[$nterm][$pageid]["offsets"] as $next_offset)
|
||||
$matches[] = [ $nterm, $next_offset ];
|
||||
}
|
||||
|
||||
// Sort the matches by offset
|
||||
|
|
Loading…
Reference in a new issue