1
0
Fork 0
mirror of https://github.com/sbrl/Pepperminty-Wiki.git synced 2024-11-22 04:23:01 +00:00
Pepperminty-Wiki/modules/lib-search-engine.php
Starbeamrainbowlabs d131666ff5
Squash all the error messages, but it's not working as intended.
All that seems to have happened is that searches are taking longer and 
not doing anything different.....
2020-03-15 18:10:23 +00:00

997 lines
36 KiB
PHP

<?php
// Register this library with the module system. It depends on lib-storage-box
// and only contributes the search class defined below.
register_module([
"name" => "Library: Search engine",
"version" => "0.13",
"author" => "Starbeamrainbowlabs",
"description" => "A library module that provides the backend to the search engine module.",
"id" => "lib-search-engine",
"depends" => [ "lib-storage-box" ],
"code" => function() {
// Nothing to do here: the callback is empty, and search::init() is called at the bottom of this file instead.
}
]);
/*
███████ ███████ █████ ██████ ██████ ██ ██
██ ██ ██ ██ ██ ██ ██ ██ ██
███████ █████ ███████ ██████ ██ ███████
██ ██ ██ ██ ██ ██ ██ ██ ██
███████ ███████ ██ ██ ██ ██ ██████ ██ ██
*/
/**
* Holds a collection to methods to manipulate various types of search index.
* @package search
*/
class search
{
/**
* Words that we should exclude from the inverted index.
* index_generate() skips these when building a page's index, and
* stas_parse() gives them a weight of -1 when they appear in a query.
* NOTE(review): "above" is listed twice, and "amoungst" / "thickv" look like
* typos inherited from the upstream list - confirm before changing them,
* since altering this list changes what gets indexed.
* @source http://xpo6.com/list-of-english-stop-words/
* @var string[]
*/
public static $stop_words = [
"a", "about", "above", "above", "across", "after", "afterwards", "again",
"against", "all", "almost", "alone", "along", "already", "also",
"although", "always", "am", "among", "amongst", "amoungst", "amount",
"an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway",
"anywhere", "are", "around", "as", "at", "back", "be", "became",
"because", "become", "becomes", "becoming", "been", "before",
"beforehand", "behind", "being", "below", "beside", "besides",
"between", "beyond", "bill", "both", "bottom", "but", "by", "call",
"can", "can't", "cannot", "co", "con", "could", "couldnt", "cry", "de",
"describe", "detail", "do", "done", "down", "due", "during", "each",
"eg", "eight", "either", "eleven", "else", "elsewhere", "empty",
"enough", "etc", "even", "ever", "every", "everyone", "everything",
"everywhere", "except", "few", "fill", "find",
"fire", "first", "five", "for", "former", "formerly", "found",
"four", "from", "front", "full", "further", "get", "give", "go", "had",
"has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
"hereby", "herein", "hereupon", "hers", "herself", "him", "himself",
"his", "how", "however", "ie", "if", "in", "inc", "indeed",
"interest", "into", "is", "it", "its", "it's", "itself", "keep", "last",
"latter", "latterly", "least", "less", "ltd", "made", "many", "may",
"me", "meanwhile", "might", "mine", "more", "moreover", "most",
"mostly", "move", "much", "must", "my", "myself", "name", "namely",
"neither", "never", "nevertheless", "next", "nine", "no", "none",
"nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
"once", "one", "only", "onto", "or", "other", "others", "otherwise",
"our", "ours", "ourselves", "out", "over", "own", "part", "per",
"perhaps", "please", "put", "rather", "re", "same", "see", "seem",
"seemed", "seeming", "seems", "serious", "several", "she", "should",
"show", "side", "since", "sincere", "six", "sixty", "so", "some",
"somehow", "someone", "something", "sometime", "sometimes",
"somewhere", "still", "such", "system", "take", "ten", "than", "that",
"the", "their", "them", "themselves", "then", "thence", "there",
"thereafter", "thereby", "therefore", "therein", "thereupon", "these",
"they", "thickv", "thin", "third", "this", "those", "though", "three",
"through", "throughout", "thru", "thus", "to", "together", "too", "top",
"toward", "towards", "twelve", "twenty", "two", "un", "under", "until",
"up", "upon", "us", "very", "via", "was", "we", "well", "were", "what",
"whatever", "when", "whence", "whenever", "where", "whereafter",
"whereas", "whereby", "wherein", "whereupon", "wherever", "whether",
"which", "while", "whither", "who", "whoever", "whole", "whom", "whose",
"why", "will", "with", "within", "without", "would", "yet", "you",
"your", "yours", "yourself", "yourselves"
];
/**
* The StorageBox that contains the inverted index.
* Null until invindex_load() assigns a StorageBox instance to it.
* @var StorageBox
*/
private static $invindex = null;
/**
* The 'did you mean?' index for typo correction.
* Only populated if the feature-search-didyoumean module is present.
* Assigned by didyoumean_load(); didyoumean_rebuild() resets it to null
* again once it has closed the index.
* @var BkTree
*/
private static $didyoumeanindex = null;
/**
* The transliterator that can be used to transliterate strings.
* Transliterated strings are more suitable for use with the search index.
* Note that this is no longer wrapped in a function as of v0.21 for
* performance reasons.
* Created by init().
* @var Transliterator
*/
public static $literator = null;
/**
* Sorter for sorting lists of *transliterated* strings.
* Should work for non-transliterated strings too.
* Created by init() with an empty locale string.
* @var Collator
*/
private static $sorter;
/**
 * Prepares the shared helpers that the rest of the search system relies on:
 * the transliterator (search::$literator) and the collator used for sorting.
 * Don't call this yourself - it is invoked once at the bottom of this file.
 */
public static function init() {
	$transliteration_rules = ':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;';
	self::$sorter = new Collator("");
	self::$literator = Transliterator::createFromRules(
		$transliteration_rules,
		Transliterator::FORWARD
	);
}
/**
 * Emits a progress message, formatted for the current environment.
 * On the command line the message is printed as plain text; otherwise it is
 * written as a "data: " line followed by a blank line, and the output is flushed.
 * @param	string	$message	The message to log.
 * @param	bool	$sameline	CLI only: end the message with a carriage return instead of a newline.
 */
private static function log_progress(string $message, bool $sameline = false) : void {
	if(!is_cli()) {
		echo("data: $message\n\n");
		flush();
		return;
	}
	$line_ending = $sameline ? "\r" : "\n";
	echo("$message$line_ending");
}
/**
 * Loads the didyoumean index.
 * Call this before making any search queries if didyoumean typo correction
 * is enabled. The index location and seed word are read from the global
 * $paths and $settings objects, and the edit costs from $settings.
 * Calling it again once the index is loaded does nothing and returns true.
 * @return	bool	True if the index is loaded, or false if the feature-search-didyoumean module is not present.
 */
public static function didyoumean_load() : bool {
	global $settings, $paths;
	if(!module_exists("feature-search-didyoumean"))
		return false;
	// Already loaded? Then there's nothing more to do
	if(self::$didyoumeanindex instanceof BkTree)
		return true;
	$tree = new BkTree(
		$paths->didyoumeanindex,
		$settings->search_didyoumean_seed_word
	);
	self::$didyoumeanindex = $tree;
	$tree->set_costs(
		$settings->search_didyoumean_cost_insert,
		$settings->search_didyoumean_cost_delete,
		$settings->search_didyoumean_cost_replace
	);
	return true;
}
/**
 * Returns a correction for a given word according to the didyoumean index.
 * Note that this is quite an expensive call.
 * Check that the word exists in the regular search index first, and that
 * it's not a stop word before calling this function.
 * The time spent here is accumulated (in milliseconds) in
 * $env->perfdata->didyoumean_correction, whether or not a correction is found.
 * @param	string	$term	The term to correct.
 * @return	string|null		The first candidate after sorting with the class collator, or null if none could be located (or the feature is unavailable / disabled).
 */
public static function didyoumean_correct(string $term) : ?string {
	global $settings, $env;
	$start_time = microtime(true);
	// Only bother if the feature is installed and switched on
	if(!module_exists("feature-search-didyoumean") || !$settings->search_didyoumean_enabled)
		return null;
	// If it's not loaded already, load the didyoumean index on-demand
	if(self::$didyoumeanindex == null)
		self::didyoumean_load();
	$results = self::$didyoumeanindex->lookup(
		$term,
		$settings->search_didyoumean_editdistance
	);
	if(!empty($results)) {
		// This class has no compare() method of its own - the collator created in init() does the comparing
		usort($results, function($a, $b) : int {
			// Collator::compare() returns false on failure; treat that as "equal"
			return (int) self::$sorter->compare($a, $b);
		});
	}
	// Record the time taken even when nothing was found
	if(!isset($env->perfdata->didyoumean_correction))
		$env->perfdata->didyoumean_correction = 0;
	$env->perfdata->didyoumean_correction += (microtime(true) - $start_time) * 1000;
	return empty($results) ? null : $results[0];
}
/**
 * Rebuilds the didyoumean index from the terms in the inverted index.
 * Every key that the inverted index returns from get_keys("|") is added to a
 * freshly cleared didyoumean index, which is then closed (saving it to disk).
 * The time taken (in seconds) is stored in $env->perfdata->didyoumean_rebuild.
 * @param	bool	$output	Whether to send progress information (plain text on the CLI, text/event-stream otherwise).
 * @throws	Exception	If the didyoumean index can't be loaded because the feature-search-didyoumean module is missing.
 */
public static function didyoumean_rebuild(bool $output = true) : void {
	global $env;
	if($output && !is_cli()) {
		header("content-type: text/event-stream");
		// ob_end_flush() raises a notice if there's no buffer to flush
		if(ob_get_level() > 0) ob_end_flush();
	}
	$env->perfdata->didyoumean_rebuild = microtime(true);
	if($output) self::log_progress("Beginning didyoumean index rebuild");
	if($output) self::log_progress("Loading indexes");
	self::invindex_load();
	if(!self::didyoumean_load())
		throw new Exception("Error: Can't rebuild the didyoumean index because it couldn't be loaded.");
	if($output) self::log_progress("Populating index");
	self::$didyoumeanindex->clear();
	$i = 0;
	foreach(self::$invindex->get_keys("|") as $key) {
		$key = $key["key"];
		// add() returning null is reported as the key being too long
		if(self::$didyoumeanindex->add($key) === null && $output)
			self::log_progress("[$i] Skipping '$key' as it's too long");
		elseif($output && $i % 1500 == 0) self::log_progress("[$i] Added '$key'", true);
		$i++;
	}
	if($output) self::log_progress(""); // Blank newline
	if($output) self::log_progress("Syncing to disk...");
	// Closing = saving, but we can't use it afterwards
	self::$didyoumeanindex->close();
	// Just in case it's loaded again later
	self::$didyoumeanindex = null;
	$env->perfdata->didyoumean_rebuild = round(microtime(true) - $env->perfdata->didyoumean_rebuild, 4);
	if($output) self::log_progress("didyoumean index rebuild complete in {$env->perfdata->didyoumean_rebuild}s");
}
/**
 * Converts a source string into an index of search terms that can be
 * merged into an inverted index.
 * HTML entities are decoded first; transliteration is handled by
 * self::tokenize(). Stop words are left out of the result.
 * @param	string	$source	The source string to index.
 * @return	array	A map of term => [ "freq" => int, "offsets" => int[] ] representing the specified string.
 */
public static function index_generate(string $source) : array {
	// We don't need to normalise or transliterate here because self::tokenize() does this for us
	$source = html_entity_decode($source, ENT_QUOTES);
	$index = [];
	foreach(self::tokenize($source, true) as [ $term, $offset ]) {
		// Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
		if(in_array($term, self::$stop_words, true)) continue;
		if(!isset($index[$term]))
			$index[$term] = [ "freq" => 0, "offsets" => [] ];
		$index[$term]["freq"]++;
		$index[$term]["offsets"][] = $offset;
	}
	return $index;
}
/**
 * Breaks a source string up into raw tokens.
 * The string is transliterated, the characters [ ] | { } / are turned into
 * spaces, and the result is split on whitespace (plus any punctuation
 * touching it), leading / trailing punctuation, and pipes.
 * @param	string	$source				The source string to process.
 * @param	bool	$capture_offsets	Whether to capture & return the offsets of the tokens detected. If true, then each token returned will be an array in the form [ token, offset ].
 * @return	array	An array of raw tokens extracted from the specified source string.
 */
public static function tokenize(string $source, bool $capture_offsets = false) : array {
	// Never return empty items; optionally ask for offsets too
	$split_flags = $capture_offsets
		? PREG_SPLIT_NO_EMPTY | PREG_SPLIT_OFFSET_CAPTURE
		: PREG_SPLIT_NO_EMPTY;
	// We don't need to normalise here because the transliterator handles that for us
	$prepared = self::$literator->transliterate($source);
	$prepared = preg_replace('/[\[\]\|\{\}\/]/u', " ", $prepared);
	return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $prepared, -1, $split_flags);
}
/**
 * Strips (most) markdown markup out of a string.
 * Specifically, it removes double quotes, asterisks, underscores, square
 * brackets, backticks, and hyphens that have a space on both sides.
 * Stripped strings are not suitable for indexing!
 * @param	string	$source	The source string to process.
 * @return	string	The stripped string.
 */
public static function strip_markup(string $source) : string {
	$markup_pattern = '/([\"*_\[\]]| - |`)/u';
	$stripped = preg_replace($markup_pattern, "", $source);
	return $stripped;
}
/**
 * Rebuilds the master inverted index and clears the page id index.
 * Every page in the global page index is read from disk, indexed, and merged
 * into a freshly cleared inverted index, which is then closed.
 * The time taken (in seconds) is stored in $env->perfdata->invindex_rebuild.
 * @param	bool	$output	Whether to send progress information (plain text on the CLI, text/event-stream otherwise). When false, nothing is echoed at all.
 */
public static function invindex_rebuild(bool $output = true) : void {
	global $pageindex, $env, $paths, $settings;
	$env->perfdata->invindex_rebuild = microtime(true);
	if($output && !is_cli()) {
		header("content-type: text/event-stream");
		// ob_end_flush() raises a notice if there's no buffer to flush
		if(ob_get_level() > 0) ob_end_flush();
	}
	// Clear the id index out
	ids::clear();
	// Clear the existing inverted index out
	if(self::$invindex == null)
		self::invindex_load();
	self::$invindex->clear();
	self::$invindex->set("|termlist|", []);
	// Reindex each page in turn
	$i = 0; $max = count(get_object_vars($pageindex));
	$missing_files = 0;
	foreach($pageindex as $pagename => $pagedetails)
	{
		$progress = "[" . ($i + 1) . " / $max]";
		$i++;
		$page_filename = $env->storage_prefix . $pagedetails->filename;
		if(!file_exists($page_filename)) {
			$missing_files++;
			// log_progress terminates the event-stream message properly (blank line) for us
			if($output) self::log_progress("$progress Error: Can't find $page_filename");
			continue;
		}
		// We do not transliterate or normalise here because the indexer will take care of this for us
		$index = self::index_generate(file_get_contents($page_filename));
		$pageid = ids::getid($pagename);
		self::invindex_merge($pageid, $index);
		if($output)
			self::log_progress("$progress Added $pagename (id #$pageid) to the new search index.", true);
	}
	if($output) self::log_progress("Syncing to disk....", true);
	self::invindex_close();
	$env->perfdata->invindex_rebuild = round(microtime(true) - $env->perfdata->invindex_rebuild, 4);
	if(!$output) return;
	if(is_cli()) {
		// The leading newline moves us off the line that the \r-terminated progress messages were using
		echo("\nSearch index rebuilding complete in {$env->perfdata->invindex_rebuild}s.\n");
		return;
	}
	self::log_progress("Search index rebuilding complete in {$env->perfdata->invindex_rebuild}s.");
	self::log_progress("Couldn't find $missing_files pages on disk. If $settings->sitename couldn't find some pages on disk, then you might need to manually correct $settings->sitename's page index (stored in pageindex.json).");
	self::log_progress("Done! Saving new search index to '$paths->searchindex'.");
}
/**
 * Sorts an index by its keys, in place, using a Collator created with an
 * empty locale string.
 * This allows us to do a binary search instead of a regular
 * sequential search.
 * @param	array	$index	The index to sort (passed by reference).
 */
public static function index_sort(&$index) {
	$collator = new Collator("");
	$by_key = function($left, $right) use($collator) : int {
		return $collator->compare($left, $right);
	};
	uksort($index, $by_key);
}
/**
 * Works out the differences between two *regular* indexes.
 * Nothing is returned - the results are appended to the two arrays that are
 * passed in by reference.
 * @param	array	$oldindex	The old index.
 * @param	array	$newindex	The new index.
 * @param	array	$changed	Receives nterm => new entry for every entry that is new, or differs from the old one.
 * @param	array	$removed	Receives the nterm of every entry that is in the old index but not the new one.
 */
public static function index_compare($oldindex, $newindex, &$changed, &$removed) {
	// Anything the new index no longer has was removed
	foreach($oldindex as $old_term => $old_entry) {
		if(isset($newindex[$old_term])) continue;
		$removed[] = $old_term;
	}
	// Anything that's brand new or no longer identical has changed
	foreach($newindex as $new_term => $new_entry) {
		$is_new = !isset($oldindex[$new_term]);
		if($is_new || $new_entry !== $oldindex[$new_term])
			$changed[$new_term] = $new_entry;
	}
}
/**
 * Opens the inverted index as a StorageBox at $paths->searchindex and keeps
 * it for later use by the other invindex_* methods.
 * The time taken (in milliseconds) is stored in $env->perfdata->searchindex_load_time.
 */
public static function invindex_load() {
	global $env, $paths;
	$began_at = microtime(true);
	self::$invindex = new StorageBox($paths->searchindex);
	$elapsed_ms = (microtime(true) - $began_at) * 1000;
	$env->perfdata->searchindex_load_time = round($elapsed_ms, 3);
}
/**
 * Closes the inverted index that is currently open.
 * The time taken (in milliseconds) is stored in $env->perfdata->searchindex_close_time.
 */
public static function invindex_close() {
	global $env;
	$began_at = microtime(true);
	self::$invindex->close();
	$elapsed_ms = (microtime(true) - $began_at) * 1000;
	$env->perfdata->searchindex_close_time = round($elapsed_ms, 3);
}
/**
 * Merge an index into an inverted index.
 * For every term, the inverted index holds a list of page ids under the key
 * "term", and that page's entry under the key "term|pageid". The list of all
 * known terms is kept under the key "|termlist|".
 * @param	int		$pageid		The id of the page to assign to the index that's being merged.
 * @param	array	$index		The regular index to merge.
 * @param	array	$removals	An array of index entries to remove from the inverted index. Useful for applying changes to an inverted index instead of deleting and remerging an entire page's index.
 * @throws	Exception	If the inverted index hasn't been loaded yet.
 */
public static function invindex_merge($pageid, &$index, &$removals = []) : void {
	if(self::$invindex == null)
		throw new Exception("Error: Can't merge into an inverted index that isn't loaded.");
	if(!self::$invindex->has("|termlist|"))
		self::$invindex->set("|termlist|", []);
	$termlist = self::$invindex->get("|termlist|");
	// Remove all the subentries that were removed since last time
	foreach($removals as $nterm) {
		// Delete the offsets
		self::$invindex->delete("$nterm|$pageid");
		// Delete the item from the list of pageids containing this term
		$nterm_pageids = self::$invindex->get_arr_simple($nterm);
		$pageid_loc = array_search($pageid, $nterm_pageids);
		// Only splice if the page id is really there: a false offset would be
		// treated as 0 and remove somebody else's page id instead
		if($pageid_loc !== false)
			array_splice($nterm_pageids, $pageid_loc, 1);
		if(empty($nterm_pageids)) { // No need to keep the pageid list if there's nothing in it
			self::$invindex->delete($nterm);
			// Update the termlist if we're deleting the term completely
			$termlist_loc = array_search($nterm, $termlist);
			if($termlist_loc !== false) array_splice($termlist, $termlist_loc, 1);
		}
		else
			self::$invindex->set_arr_simple($nterm, $nterm_pageids);
	}
	// Merge all the new / changed index entries into the inverted index
	foreach($index as $nterm => $newentry) {
		if(!self::$invindex->has($nterm)) {
			self::$invindex->set_arr_simple($nterm, []);
			$termlist[] = $nterm;
		}
		// Update the nterm pageid list
		$nterm_pageids = self::$invindex->get_arr_simple($nterm);
		if(array_search($pageid, $nterm_pageids) === false) {
			$nterm_pageids[] = $pageid;
			self::$invindex->set_arr_simple($nterm, $nterm_pageids);
		}
		// Store the offset list
		self::$invindex->set("$nterm|$pageid", $newentry);
	}
	self::$invindex->set("|termlist|", $termlist);
}
/**
 * Deletes the given pageid from the currently loaded inverted index.
 * Every term in the termlist is checked. Terms that no longer appear on any
 * page afterwards are deleted completely, and dropped from the termlist.
 * @param	int	$pageid	The pageid to remove.
 */
public static function invindex_delete(int $pageid) {
	$termlist = self::$invindex->get("|termlist|");
	// The termlist is rebuilt as we go rather than searched & spliced: a loose
	// array_search() can land on a different term that merely compares equal
	// numerically (e.g. 1000 and "1e3")
	$remaining_terms = [];
	foreach($termlist as $nterm) {
		$nterm_pageids = self::$invindex->get_arr_simple($nterm);
		$nterm_loc = array_search($pageid, $nterm_pageids);
		// If this page doesn't appear in the list, we leave the term alone
		if($nterm_loc === false) {
			$remaining_terms[] = $nterm;
			continue;
		}
		// Delete it from the nterm list
		array_splice($nterm_pageids, $nterm_loc, 1);
		// Delete the offset list
		self::$invindex->delete("$nterm|$pageid");
		// If this term doesn't appear in any other documents, delete it
		if(count($nterm_pageids) === 0) {
			self::$invindex->delete($nterm);
			continue;
		}
		// Save the document id list back, since it still contains other pageids
		self::$invindex->set_arr_simple($nterm, $nterm_pageids);
		$remaining_terms[] = $nterm;
	}
	// Save the termlist back to the store
	self::$invindex->set("|termlist|", $remaining_terms);
}
/*
* ███████ ████████ █████ ███████
* ██ ██ ██ ██ ██
* ███████ ██ ███████ ███████
* ██ ██ ██ ██ ██
* ███████ ██ ██ ██ ███████
*/
/**
 * Splits a query string into tokens. Does not require that the input string be transliterated.
 * Was based on my earlier explode_adv: https://starbeamrainbowlabs.com/blog/article.php?article=posts/081-PHP-String-Splitting.html
 * Now improved to be strtok-based, since it's much faster.
 * Example I used when writing this: https://www.php.net/manual/en/function.strtok.php#94463
 * Tokens are separated by whitespace. A token that opens a quote without
 * closing it is extended up to the next matching quote character, so that
 * "quoted phrases" stay together in a single token.
 * This is static because the rest of this class calls it as self::stas_split().
 * NOTE(review): An apostrophe inside a word (e.g. don't) counts as an opening
 * single quote, just as it did before - confirm whether that's intended.
 * @param	string	$query	The query string to split.
 * @return	string[]	The tokens found. An empty query results in an empty array.
 */
public static function stas_split($query) {
	$query = self::$literator->transliterate($query);
	$terms = [];
	$whitespace = " \r\n\t";
	$next_token = strtok($query, $whitespace);
	// strtok returns false when it runs out of tokens (straight away for an empty query)
	while($next_token !== false) {
		foreach([ '"', "'" ] as $quote) {
			// An even number of quotes means that everything that was opened has been closed again
			if(substr_count($next_token, $quote) % 2 === 0) continue;
			// Pull in everything up to the closing quote, which strtok swallows
			$rest = strtok($quote);
			$next_token .= ($rest === false ? "" : " $rest") . $quote;
		}
		$terms[] = $next_token;
		$next_token = strtok($whitespace);
	}
	return $terms;
}
/**
 * Parses an array of query tokens into an associative array of search directives.
 * Supported syntax derived from these sources:
 * https://help.duckduckgo.com/duckduckgo-help-pages/results/syntax/
 * https://docs.microsoft.com/en-us/windows/win32/lwef/-search-2x-wds-aqsreference
 *
 * Supported syntax:
 *   -term              exclude a term
 *   +term              double the weighting of a term
 *   terms !dest terms  redirect entire query (minus the !bang) to interwiki with registered shortcut dest
 *   prefix:term        apply prefix operator to term
 *   "term"             exactly this term (don't try and correct)
 *
 * Colon directives (. = implemented, * = not implemented):
 *   . intitle  search only page titles for term
 *   . intags   search only tags for term
 *   . inbody   search page body only for term
 *   * before   search only pages that were last modified before term
 *   * after    search only pages that were last modified after term
 *   * size     search only pages that match the size spec term (e.g. 1k+ -> more than 1k bytes, 2k- -> less than 2k bytes, >5k -> more than 5k bytes, <10k -> less than 10k bytes)
 * Any other directive is collected as-is under a key named after the directive.
 *
 * Tokens are processed from last to first. If typo correction is available
 * and enabled, terms that aren't exact, aren't stop words, and aren't in the
 * inverted index are corrected, and gain the keys "corrected" (bool) and,
 * if corrected, "term_before".
 * This is static because the rest of this class calls it as self::stas_parse().
 * @param	string[]	$tokens	The array of query tokens to parse.
 * @return	array	An array with the keys "terms" (a list of [ term, weight, location, exact ] arrays), "exclude" (string[]), and "interwiki" (string|null).
 */
public static function stas_parse($tokens) {
	global $settings;
	$result = [
		"terms" => [],
		"exclude" => [],
		"interwiki" => null
	];
	for($i = count($tokens) - 1; $i >= 0; $i--) {
		$token = $tokens[$i];
		// We index into the token below, so it has to be a non-empty string
		if(!is_string($token) || $token === "") continue;
		$unprefixed = (string) substr($token, 1);
		
		// Look for excludes
		if($token[0] == "-") {
			if(in_array($unprefixed, self::$stop_words, true)) {
				// NOTE(review): This is stored under "tokens" rather than "terms", which nothing in this class reads - confirm whether "terms" was intended.
				$result["tokens"][] = [
					"term" => $unprefixed,
					"weight" => -1,
					"location" => "all",
					"exact" => false
				];
			}
			elseif($unprefixed !== "") // FUTURE: Correct excludes too
				$result["exclude"][] = $unprefixed; // A lone "-" is dropped: an empty exclude would match everything
			continue;
		}
		
		// Look for weighted terms
		if($token[0] == "+") {
			if(in_array($unprefixed, self::$stop_words, true)) {
				// Append in the same shape as the exclude branch does, instead of overwriting the whole list
				$result["tokens"][] = [
					"term" => $unprefixed,
					"weight" => -1,
					"location" => "all",
					"exact" => false
				];
			}
			else {
				$term = trim($unprefixed, '"');
				$result["terms"][] = [
					"term" => $term,
					"weight" => 2,
					"location" => "all",
					// if it's different, then there were quotes
					"exact" => $unprefixed != $term
				];
			}
			continue;
		}
		
		// Look for interwiki searches
		// You can only go to 1 interwiki destination at once, so we replace any previous finding with this one
		// Note that such a token is still processed as a directive / regular term below
		if($token[0] == "!" || substr($token, -1) == "!")
			$result["interwiki"] = trim($token, "!");
		
		// Look for colon directives in the form directive:term
		// Also supports prefix:"quoted term with spaces", quotes stripped automatically
		if(strpos($token, ":") !== false) {
			$parts = explode(":", $token, 2);
			$term = trim($parts[1], '"');
			// If we trim off quotes, then it must be because it should be exact
			$exact = $parts[1] != $term;
			switch($parts[0]) {
				case "intitle": // BUG: What if a normal word is found in a title?
					$result["terms"][] = [
						"term" => $term,
						"weight" => $settings->search_title_matches_weighting * mb_strlen($parts[1]),
						"location" => "title",
						"exact" => $exact
					];
					break;
				case "intags":
					$result["terms"][] = [
						"term" => $term,
						"weight" => $settings->search_tags_matches_weighting * mb_strlen($parts[1]),
						"location" => "tags",
						"exact" => $exact
					];
					break;
				case "inbody":
					$result["terms"][] = [
						"term" => $term,
						"weight" => 1,
						"location" => "body",
						"exact" => $exact
					];
					break;
				default:
					if(!isset($result[$parts[0]]))
						$result[$parts[0]] = [];
					$result[$parts[0]][] = $term;
					break;
			}
			continue;
		}
		
		// Doesn't appear to be particularly special *shrugs*
		$term = trim($token, '"');
		$result["terms"][] = [
			"term" => $term,
			// Set the weight to -1 if it's a stop word
			"weight" => in_array($token, self::$stop_words, true) ? -1 : 1,
			"location" => "all",
			// If we trimmed off quotes, then it must be exact & we shouldn't try to autocorrect it
			"exact" => $token != $term
		];
	}
	
	// Correct typos, but only if that's enabled
	if(module_exists("feature-search-didyoumean") && $settings->search_didyoumean_enabled) {
		// Iterate by reference: we're updating the terms in place
		foreach($result["terms"] as &$term_data) {
			if($term_data["exact"] || // Skip exact-only
				$term_data["weight"] < 1 || // Skip stop & irrelevant words
				self::invindex_term_exists($term_data["term"])) continue;
			// It's not a stop word or in the index, try and correct it
			// self::didyoumean_correct auto-loads the didyoumean index on-demand
			$correction = self::didyoumean_correct($term_data["term"]);
			// Make a note if we fail to correct a term
			if(!is_string($correction)) {
				$term_data["corrected"] = false;
				continue;
			}
			$term_data["term_before"] = $term_data["term"];
			$term_data["term"] = $correction;
			$term_data["corrected"] = true;
		}
		unset($term_data); // Break the reference so that nothing can write through it by accident later
	}
	return $result;
}
/**
 * Checks whether a term is present in the currently loaded inverted search
 * index.
 * Note that this only checks for precisely $term. See
 * search::didyoumean_correct() for typo correction.
 * @param	string	$term	The term to search for.
 * @return	bool	Whether term exists in the inverted index or not.
 */
public static function invindex_term_exists(string $term) {
	// A term that's in the index has a list of page ids stored under its
	// own name, so all we need to do is ask whether that key exists
	$is_present = self::$invindex->has($term);
	return $is_present;
}
/**
 * Searches the currently loaded inverted index (plus page titles & tags) for
 * the specified search query.
 * Each result is an array in this form:
 * [
 *     nterms       : [ nterm => frequency * weight, .... ],
 *     offsets_body : int[],
 *     rank_title   : int,
 *     rank_tags    : int,
 *     pagename     : string,
 *     rank         : int
 * ]
 * @param	string	$query	The search query.
 * @return	array	An array of matching pages, keyed by page id and sorted by rank (highest first).
 */
public static function invindex_query($query)
{
	global $pageindex;
	$query_stas = self::stas_parse(
		self::stas_split(self::$literator->transliterate($query))
	);
	$matching_pages = [];
	$match_template = [
		"nterms" => [],
		"offsets_body" => [],
		"rank_title" => 0,
		"rank_tags" => 0
	];
	
	// Query the inverted index
	foreach($query_stas["terms"] as $term_def) {
		if($term_def["weight"] == -1)
			continue; // Skip stop words
		// stas_parse marks body-only terms with the location "body"
		if(!in_array($term_def["location"], ["all", "body"], true))
			continue; // Skip terms we shouldn't search the page body for
		if(!self::$invindex->has($term_def["term"]))
			continue; // Skip if it's not in the index
		// For each page that contains this term.....
		$term_pageids = self::$invindex->get_arr_simple($term_def["term"]);
		foreach($term_pageids as $pageid) {
			// Check to see if it contains any words we should exclude
			$skip = false;
			foreach($query_stas["exclude"] as $excl_term) {
				if(self::$invindex->has("$excl_term|$pageid")) {
					$skip = true;
					break;
				}
			}
			if($skip) continue;
			// Get the list of offsets
			$page_offsets = self::$invindex->get("{$term_def["term"]}|$pageid");
			if(!isset($matching_pages[$pageid]))
				$matching_pages[$pageid] = $match_template; // Arrays are assigned by copy in php
			// Add it to the appropriate $matching_pages entry, not forgetting to apply the weighting
			$matching_pages[$pageid]["offsets_body"] = array_merge(
				$matching_pages[$pageid]["offsets_body"],
				$page_offsets->offsets
			);
			$matching_pages[$pageid]["nterms"][$term_def["term"]] = $page_offsets->freq * $term_def["weight"];
		}
	}
	
	// Query page titles & tags
	foreach($query_stas["terms"] as $term_def) {
		// No need to skip stop words here, since we're doing a normal
		// sequential search anyway
		if(!in_array($term_def["location"], ["all", "title", "tags"], true))
			continue; // Skip terms we shouldn't search the titles / tags for
		// Loop over the pageindex and search the titles / tags
		foreach($pageindex as $pagename => $pagedata) {
			$pageid = null; // Cache the page id
			$lit_title = self::$literator->transliterate($pagename);
			$lit_tags = isset($pagedata->tags) ? self::$literator->transliterate(implode(" ", $pagedata->tags)) : null;
			
			// Make sure that the title & tags don't contain a term we should exclude
			$skip = false;
			foreach($query_stas["exclude"] as $excl_term) {
				if($excl_term === "") continue; // An empty needle would match every page
				$in_title = mb_strpos($lit_title, $excl_term) !== false;
				$in_tags = $lit_tags != null && mb_strpos($lit_tags, $excl_term) !== false;
				if(!$in_title && !$in_tags) continue;
				$skip = true;
				// Delete it from the candidate matches (the excluded term might be present in the tags / title but not the body)
				// Candidates are keyed by page id, not by term
				unset($matching_pages[ids::getid($pagename)]);
				break;
			}
			if($skip) continue;
			
			// Consider matches in the page title
			if(in_array($term_def["location"], ["all", "title"], true)) {
				// FUTURE: We may be able to optimise this further by using preg_match_all + preg_quote instead of mb_stripos_all. Experimentation / benchmarking is required to figure out which one is faster
				$title_matches = mb_stripos_all($lit_title, $term_def["term"]);
				$title_matches_count = $title_matches !== false ? count($title_matches) : 0;
				if($title_matches_count > 0) {
					$pageid = ids::getid($pagename); // Fetch the page id
					// We found the qterm in the title
					if(!isset($matching_pages[$pageid]))
						$matching_pages[$pageid] = $match_template; // Assign by copy
					$matching_pages[$pageid]["rank_title"] += $title_matches_count * $term_def["weight"];
				}
			}
			
			// If this page doesn't have any tags, skip it
			if($lit_tags == null)
				continue;
			if(!in_array($term_def["location"], ["all", "tags"], true))
				continue; // If we shouldn't search the tags, no point in continuing
			
			// Consider matches in the page's tags
			$tag_matches = mb_stripos_all($lit_tags, $term_def["term"]);
			$tag_matches_count = $tag_matches !== false ? count($tag_matches) : 0;
			if($tag_matches_count > 0) { // And we found the qterm in the tags
				if($pageid === null) // Fill out the page id if it hasn't been already
					$pageid = ids::getid($pagename);
				if(!isset($matching_pages[$pageid]))
					$matching_pages[$pageid] = $match_template; // Assign by copy
				$matching_pages[$pageid]["rank_tags"] += $tag_matches_count * $term_def["weight"];
			}
		}
	}
	
	// TODO: Implement the rest of STAS here
	
	// Work out the overall rank of each candidate
	foreach($matching_pages as $pageid => &$match) {
		$match["pagename"] = ids::getpagename($pageid);
		$match["rank"] = 0;
		// Loop over each search term found on this page
		foreach($match["nterms"] as $pterm => $frequency) {
			// Add the number of occurrences of this search term to the ranking
			// Multiply it by the length of the word
			// (numeric terms come back out of the array as integers, hence the cast)
			$match["rank"] += $frequency * strlen((string) $pterm);
		}
		// Consider matches in the title / tags
		$match["rank"] += $match["rank_title"] + $match["rank_tags"];
		// TODO: Consider implementing kernel density estimation here.
		// https://en.wikipedia.org/wiki/Kernel_density_estimation
		// We want it to have more of an effect the more words that are present in the query. Maybe a logarithmic function would be worth investigating here?
		// TODO: Remove items if the computed rank is below a threshold
	}
	unset($match); // Break the reference before we sort
	
	// Highest rank first
	uasort($matching_pages, function($a, $b) : int {
		return $b["rank"] <=> $a["rank"];
	});
	return $matching_pages;
}
/**
 * Extracts a context string given a search query that could be displayed
 * in a list of search results.
 * The offsets of the query's terms on the page are looked up in the inverted
 * index. A window of $settings->search_characters_context characters is taken
 * either side of each one, overlapping windows are merged, and the total is
 * capped at $settings->search_characters_context_total characters.
 * Snippets are joined with an ellipsis, and an ellipsis is added at either
 * end if the context doesn't reach the start / end of the page.
 * @param	string	$pagename	The name of the page that this source belongs to. Used when consulting the inverted index.
 * @param	string	$query		The search query to generate the context for.
 * @param	string	$source		The page source to extract the context from.
 * @return	string	The generated context string. Empty if none of the terms were found on the page.
 */
public static function extract_context($pagename, $query, $source)
{
	global $settings;
	$pageid = ids::getid($pagename);
	$nterms = self::stas_parse(self::stas_split($query))["terms"];
	
	// Query the inverted index for offsets
	$matches = [];
	foreach($nterms as $nterm) {
		$offsets_key = "{$nterm["term"]}|$pageid";
		// Skip if the page isn't found in the inverted index for this word
		if(!self::$invindex->has($offsets_key))
			continue;
		foreach(self::$invindex->get($offsets_key)->offsets as $next_offset)
			$matches[] = [ $nterm["term"], $next_offset ];
	}
	// Sort the matches by offset
	usort($matches, function($a, $b) : int {
		return $a[1] <=> $b[1];
	});
	
	$sourceLength = mb_strlen($source);
	$contexts = [];
	$total_context_length = 0;
	foreach($matches as $match) {
		$next_context = [
			"from" => max(0, $match[1] - $settings->search_characters_context),
			"to" => min($sourceLength, $match[1] + mb_strlen($match[0]) + $settings->search_characters_context)
		];
		$last = count($contexts) - 1;
		if($last >= 0 && $contexts[$last]["to"] > $next_context["from"]) {
			// This next context overlaps with the previous one
			// Extend the last one instead of adding a new one - but never shrink it
			$extended_to = max($contexts[$last]["to"], $next_context["to"]);
			// Update the total context length counter appropriately
			$total_context_length += $extended_to - $contexts[$last]["to"];
			$contexts[$last]["to"] = $extended_to;
		}
		else { // No overlap here! Business as usual.
			$contexts[] = $next_context;
			$last++;
			// Update the total context length counter as normal
			$total_context_length += $next_context["to"] - $next_context["from"];
		}
		if($total_context_length > $settings->search_characters_context_total) {
			// We've reached the limit on the number of characters this context should contain. Trim off the context to fit and break out
			$overflow = $total_context_length - $settings->search_characters_context_total;
			// Don't let the end slip back past the start
			$contexts[$last]["to"] = max($contexts[$last]["from"], $contexts[$last]["to"] - $overflow);
			break;
		}
	}
	
	$contexts_text = [];
	foreach($contexts as $context) {
		// The bounds above are measured in characters (mb_strlen), so cut by character too
		$contexts_text[] = mb_substr($source, $context["from"], $context["to"] - $context["from"]);
	}
	// BUG: Make sure that a snippet is centred on the word in question if we have to cut it short
	$result = implode(" … ", $contexts_text);
	if(!empty($contexts)) {
		// If we're not at the very end of the page, add an extra ellipsis
		if($contexts[count($contexts) - 1]["to"] < $sourceLength) $result .= "… ";
		// Prepend an ellipsis if the context doesn't start at the beginning of a page
		if($contexts[0]["from"] > 0) $result = " …$result";
	}
	return $result;
}
/**
 * Highlights the keywords of a context string.
 * Every (case-insensitive) occurrence of a query term that isn't a stop word
 * is wrapped in <strong class='search-term-highlight'>.
 * All the terms are highlighted in a single pass, so that a term can never
 * match inside the markup that was inserted for another one.
 * @param	string	$query		The query to use when highlighting.
 * @param	string	$context	The context string to highlight.
 * @return	string	The highlighted (HTML) string.
 */
public static function highlight_context($query, $context)
{
	$qterms = self::stas_parse(self::stas_split($query))["terms"];
	$needles = [];
	foreach($qterms as $qterm) {
		// Stop words are marked by STAS
		if($qterm["weight"] == -1)
			continue;
		$term = (string) $qterm["term"];
		// An empty pattern would match in between every single character
		if($term === "")
			continue;
		$needles[] = $term;
	}
	if(empty($needles))
		return $context;
	
	// Longest first, so that the longest alternative wins when one term is a prefix of another
	$needles = array_unique($needles);
	usort($needles, function($a, $b) : int {
		return mb_strlen($b) <=> mb_strlen($a);
	});
	$alternatives = [];
	foreach($needles as $needle)
		$alternatives[] = preg_quote($needle, "/");
	
	$highlighted = preg_replace(
		"/" . implode("|", $alternatives) . "/iu",
		"<strong class='search-term-highlight'>$0</strong>",
		$context
	);
	// preg_replace returns null on failure (e.g. invalid UTF-8) - return the context untouched in that case
	return $highlighted ?? $context;
}
}
// Run the init function as soon as this file is loaded, so that the
// transliterator & collator exist before any other search method is called
search::init();
?>