Integrate didyoumean into the main search engine, but it's crashing.

We're getting there though!
This commit is contained in:
Starbeamrainbowlabs 2020-03-15 17:54:27 +00:00
parent 67ea09b5cf
commit f632c0907c
Signed by: sbrl
GPG Key ID: 1BE5172E637709C2
8 changed files with 267 additions and 50 deletions

View File

@ -69,8 +69,7 @@ function register_module($settings)
$module_count = count($modules);
$i = 1;
foreach($modules as $filename)
{
foreach($modules as $filename) {
echo("[$i / $module_count] Processing $filename \r");
require($filename);
$i++;

View File

@ -42,6 +42,8 @@ $paths = new stdClass();
$paths->pageindex = "pageindex.json";
/** The inverted index used for searching. Use the `search` class to interact with this - otherwise your brain might explode :P */
$paths->searchindex = "invindex.sqlite";
/** The didyoumean index for typo correction. Used by the search class - which also exposes an interface for interacting with it directly. */
$paths->didyoumeanindex = "didyoumeanindex.sqlite";
/** The index that maps ids to page names. Use the `ids` class to interact with it :-) */
$paths->idindex = "idindex.json";
/** The cache of the most recently calculated statistics. */

View File

@ -4,10 +4,72 @@ register_module([
"version" => "0.1",
"author" => "Starbeamrainbowlabs",
"description" => "Ever searched for something but couldn't find it because you couldn't spell it correctly? This module is for you! It adds spelling correction for search queries based on the words in the inverted search index.",
"id" => "lib-storage-box",
"depends" => [ "lib-search-engine" ],
"id" => "feature-search-didyoumean",
"depends" => [ "lib-search-engine", "lib-storage-box" ],
"code" => function() {
/*
██████ ███████ ██████ ██ ██ ██ ██ ██████
██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
██████ █████ ██████ ██ ██ ██ ██ ██ ██
██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
██ ██ ███████ ██████ ██████ ██ ███████ ██████
*/
/*
 * Action: didyoumean-rebuild
 * Rebuilds the didyoumean index. Allowed for admins, or for any request
 * whose `secret` POST parameter matches $settings->secret. Everyone else
 * receives a 401 error page.
 */
add_action("didyoumean-rebuild", function() {
	global $env, $settings;
	
	// The secret comes from an untrusted POST body, so it may be missing or
	// an array. hash_equals() requires two strings, and compares them in
	// constant time so the secret can't be recovered via response timing.
	$supplied_secret = $_POST["secret"] ?? "";
	$secret_matches = !empty($supplied_secret)
		&& is_string($supplied_secret)
		&& is_string($settings->secret)
		&& hash_equals($settings->secret, $supplied_secret);
	
	if(!$env->is_admin && !$secret_matches) {
		http_response_code(401);
		exit(page_renderer::render_main("Error - didyoumean index regenerator - $settings->sitename", "<p>Error: You aren't allowed to regenerate the didyoumean index. Try logging in as an admin, or setting the <code>secret</code> POST parameter to $settings->sitename's secret - which can be found in $settings->sitename's <code>peppermint.json</code> file.</p>"));
	}
	
	search::didyoumean_rebuild();
});
/*
* ██████ ██ ██
* ██ ██ ██
* ██ ██ ██
* ██ ██ ██
* ██████ ███████ ██
*/
// Only register the CLI command when the CLI module is present
if(module_exists("feature-cli")) {
	/*
	 * CLI command: didyoumean
	 * $args[0] is the subcommand and $args[1] (for "correct") the word to
	 * look up. Returns the process exit code: 0 on success, 1 on bad usage.
	 */
	cli_register("didyoumean", "Query and manipulate the didyoumean index", function(array $args) : int {
		// No subcommand: print the help text
		if(count($args) < 1) {
			echo("didyoumean: query and manipulate the didyoumean index
Usage:
didyoumean {subcommand}
Subcommands:
rebuild Rebuilds the didyoumean index
correct {word} Corrects {word} using the didyoumean index (careful: it's case-sensitive and operates on transliterated text *only*)
");
			return 0;
		}
		
		switch($args[0]) {
			case "rebuild":
				search::didyoumean_rebuild();
				break;
			
			case "correct":
				// Validate the arguments first so that a usage error
				// doesn't pay the cost of opening the index
				if(count($args) < 2) {
					echo("Error: Not enough arguments\n");
					return 1;
				}
				search::didyoumean_load();
				echo("Correction: ".search::didyoumean_correct($args[1])."\n");
				break;
			
			default:
				// Previously an unknown subcommand did nothing and
				// reported success
				echo("Error: Unknown subcommand '{$args[0]}'\n");
				return 1;
		}
		return 0;
	});
}
}
]);
@ -50,17 +112,20 @@ class BkTree {
private $cost_replace = 1;
/**
 * Creates a new BkTree instance backed by a StorageBox.
 * The root node is created by init() if the store doesn't contain one yet.
 * @param	string	$filename	The filename handed to the StorageBox that holds the tree.
 * @param	string	$seed_word	The word to store in the root node of the tree.
 */
public function __construct(string $filename, string $seed_word) {
	// Open the backing store exactly once. Creating a JsonStorageBox first
	// and then immediately replacing it only wasted the first instance.
	$this->box = new StorageBox($filename);
	$this->seed_word = $seed_word;
	
	$this->init();
}
/**
 * Makes sure that the root node ("node|0") exists in the backing store.
 * If it is missing, a new root node holding the seed word and an empty set
 * of children is stored, and the node count is incremented.
 * Does nothing if the root node is already present.
 * @return	void
 */
private function init() : void {
	// Nothing to do if the tree already has a root
	if($this->box->has("node|0"))
		return;
	
	$root = new stdClass();
	$root->value = $this->seed_word;
	$root->children = new stdClass(); // [ "id" => int, "distance" => int ]
	
	// Store the node object itself, not just the bare seed word
	$this->box->set("node|0", $root);
	$this->increment_node_count();
	// Note: no return value here - returning one from a void function is a
	// compile-time fatal error in PHP.
}
@ -122,6 +187,9 @@ class BkTree {
// Can't add the seed word to the tree
if($string == $this->seed_word)
return null;
// PHP's levenshtein() function only works on strings up to 255 chars, apparently
if(strlen($string) > 255)
return null;
if(!$this->box->has("node|$starting_node_id"))
throw new Exception("Error: Failed to find node with id $starting_node_id to begin insertion");
@ -317,7 +385,7 @@ class BkTree {
* If the tree isn't balanced, you may need to insert items in a different order.
* @return array An array of statistics about this BK-Tree.
*/
public function stats() array {
public function stats() : array {
$result = [
"depth_max" => 0,
"depth_min_leaf" => INF,
@ -416,6 +484,11 @@ class BkTree {
}
}
/**
 * Empties the tree.
 * Clears the backing storage box, and then re-runs init() - which stores a
 * fresh root node, since the old one was just removed.
 * @return	void
 */
public function clear() : void {
	$this->box->clear();
	// Must come after the clear: init() only creates the root if it's missing
	$this->init();
}
/**
* Saves changes to the tree back to disk.
* @return void

View File

@ -123,9 +123,12 @@ register_module([
global $settings, $env, $pageindex, $paths;
// Create the inverted index if it doesn't exist.
// todo In the future perhaps a CLI for this would be good?
if(!file_exists($paths->searchindex))
search::invindex_rebuild(false);
// Create the didyoumean index if it doesn't exist.
if(module_exists("feature-search-didyoumean") && !file_exists($paths->didyoumeanindex))
search::didyoumean_rebuild(false);
if(!isset($_GET["query"]))
exit(page_renderer::render("No Search Terms - Error - $settings->sitename", "<p>You didn't specify any search terms. Try typing some into the box above.</p>"));
@ -332,6 +335,19 @@ register_module([
add_action("stas-parse", function() {
global $settings;
if(!isset($_GET["query"])) {
http_response_code(400);
header("x-status: failed");
header("x-problem: no-query-specified");
exit(page_renderer::render_main("Error - STAS Query Analysis - $settings->sitename", "<p>No query was present in the <code>query</code> GET parameter.</p>"));
}
// The indexes are only needed if didyoumean is enabled
if(module_exists("feature-search-didyoumean") && $settings->search_didyoumean_enabled) {
search::invindex_load();
search::didyoumean_load();
}
$tokens = search::stas_split($_GET["query"]);
$stas_query = search::stas_parse($tokens);
@ -342,7 +358,6 @@ register_module([
continue;
}
$term = null;
$token_part = $token;
if($token_part[0] == "+") $token_part = substr($token_part, 1);

View File

@ -113,21 +113,47 @@ class search
self::$sorter = new Collator("");
}
/**
 * Writes a progress message to the output, formatted for the current
 * environment.
 * On the CLI the message is echoed followed by a line terminator. Otherwise
 * it is echoed as a "data: " line followed by a blank line, and the output
 * is flushed straight away.
 * @param	string	$message	The message to log.
 * @param	bool	$sameline	CLI only: end the message with a carriage return instead of a newline, so that the next message overwrites it. Ignored outside the CLI.
 * @return	void
 */
private static function log_progress(string $message, bool $sameline = false) : void {
	if(!is_cli()) {
		echo("data: $message\n\n");
		flush();
		return;
	}
	
	$terminator = $sameline ? "\r" : "\n";
	echo($message . $terminator);
}
/**
* Loads the didyoumean index.
* Don't forget to call this before making any search queries if didyoumean
* typoy correction is enabled.
* typo correction is enabled.
* Note that calling it multiple times has no effect. Returns true if the
* didyoumean index is already loaded.
* @param string $filename The filename of the didyoumean index.
* @param string $seed_word The seed word. If this changes, the index must be rebuilt.
* @return bool Whether the index was loaded successfully or not. Returns false if the feature-search-didyoumean module is not present.
*/
public static function didyoumean_load(string $filename, string $seed_word) : bool {
global $settings;
public static function didyoumean_load() : bool {
global $settings, $paths;
if(!module_exists("feature-search-didyoumean"))
return false;
$this->didyoumeanindex = new BkTree($filename, $seed_word);
$this->didyoumeanindex->set_costs(
// Avoid loading twice
if(is_a(self::$didyoumeanindex, BkTree::class))
return true;
self::$didyoumeanindex = new BkTree(
$paths->didyoumeanindex,
$settings->search_didyoumean_seed_word
);
self::$didyoumeanindex->set_costs(
$settings->search_didyoumean_cost_insert,
$settings->search_didyoumean_cost_delete,
$settings->search_didyoumean_cost_replace
@ -145,7 +171,7 @@ class search
*/
public static function didyoumean_correct(string $term) : ?string {
global $settings;
$results = $this->didyoumeanindex->lookup(
$results = self::$didyoumeanindex->lookup(
$term,
$settings->search_didyoumean_editdistance
);
@ -156,6 +182,46 @@ class search
return $results[0];
}
/**
 * Rebuilds the didyoumean index from scratch.
 * Loads the inverted index and the didyoumean index, clears the latter, and
 * then adds every key returned by self::$invindex->get_keys("|") to it.
 * Afterwards the didyoumean index is closed (which saves it) and unloaded.
 * The time taken is recorded in $env->perfdata->didyoumean_rebuild.
 * @param	bool	$output	Whether to emit progress messages. Outside the CLI this also sends a text/event-stream content-type header. When false, nothing is written to the output at all.
 * @return	void
 */
public static function didyoumean_rebuild(bool $output = true) : void {
	global $env;
	
	if($output && !is_cli()) {
		header("content-type: text/event-stream");
		// ob_end_flush() raises a notice if there's no active output buffer
		if(ob_get_level() > 0) ob_end_flush();
	}
	
	$env->perfdata->didyoumean_rebuild = microtime(true);
	
	if($output) {
		self::log_progress("Beginning didyoumean index rebuild");
		self::log_progress("Loading indexes");
	}
	
	self::invindex_load();
	self::didyoumean_load();
	
	if($output) self::log_progress("Populating index");
	
	self::$didyoumeanindex->clear();
	
	$i = 0;
	foreach(self::$invindex->get_keys("|") as $row) {
		$key = $row["key"];
		
		// add() returns null when it declines to insert the key
		$added = self::$didyoumeanindex->add($key) !== null;
		if($output) {
			if(!$added)
				self::log_progress("[$i] Skipping '$key' as it's too long");
			elseif($i % 1500 == 0)
				self::log_progress("[$i] Added '$key'", true);
		}
		$i++;
	}
	
	if($output) {
		// This used to run even when $output was false, which leaked a stray
		// "data:" line into the response of silent rebuilds.
		self::log_progress(""); // Blank newline
		self::log_progress("Syncing to disk...");
	}
	
	// Closing = saving, but we can't use it afterwards
	self::$didyoumeanindex->close();
	// Just in case it's loaded again later
	self::$didyoumeanindex = null;
	
	$env->perfdata->didyoumean_rebuild = round(microtime(true) - $env->perfdata->didyoumean_rebuild, 4);
	if($output) self::log_progress("didyoumean index rebuild complete in {$env->perfdata->didyoumean_rebuild}s");
}
/**
* Converts a source string into an index of search terms that can be
* merged into an inverted index.
@ -281,7 +347,6 @@ class search
echo("data: Done! Saving new search index to '$paths->searchindex'.\n\n");
}
if(is_cli()) echo("\nSearch index rebuilding complete in {$env->perfdata->invindex_rebuild}s.\n");
// No need to save, it's an SQLite DB backend
}
/**
@ -318,13 +383,11 @@ class search
/**
 * Loads a connection to an inverted index.
 * The index is opened as a StorageBox and stored in self::$invindex. The
 * time taken (in milliseconds) is recorded in
 * $env->perfdata->searchindex_load_time.
 * @param	string|null	$invindex_filename	Optional. The path to the inverted index to load. Defaults to $paths->searchindex when omitted or null.
 * @todo Remove this function and make everything streamable
 */
public static function invindex_load(?string $invindex_filename = null) {
	global $env, $paths;
	
	$start_time = microtime(true);
	// Callers may still pass an explicit filename; everyone else gets the
	// configured search index.
	self::$invindex = new StorageBox($invindex_filename ?? $paths->searchindex);
	$env->perfdata->searchindex_load_time = round((microtime(true) - $start_time)*1000, 3);
}
@ -445,7 +508,6 @@ class search
$terms = [];
$next_token = strtok($query, " \r\n\t");
while(true) {
if(strpos($next_token, '"') !== false)
$next_token .= " " . strtok('"') . '"';
if(strpos($next_token, "'") !== false)
@ -477,15 +539,13 @@ class search
* +term double the weighting of a term
* terms !dest terms redirect entire query (minus the !bang) to interwiki with registered shortcut dest
* prefix:term apply prefix operator to term
* "term" exactly this term (don't try and correct)
*/
// var_dump($tokens);
$result = [
"terms" => [],
"exclude" => [],
"interwiki" => null
];
// foreach($operators as $op)
// $result[$op] = [];
$count = count($tokens);
@ -496,10 +556,11 @@ class search
$result["tokens"][] = [
"term" => substr($tokens[$i], 1),
"weight" => -1,
"location" => "all"
"location" => "all",
"exact" => false
];
}
else
else // FUTURE: Correct excludes too
$result["exclude"][] = substr($tokens[$i], 1);
continue;
@ -511,21 +572,23 @@ class search
$result["tokens"] = [ "term" => substr($tokens[$i], 1), "weight" => -1, "location" => "all" ];
}
else {
$term = trim(substr($tokens[$i], 1), '"');
$result["terms"][] = [
"term" => substr($tokens[$i], 1),
"term" => $term,
"weight" => 2,
"location" => "all"
"location" => "all",
// if it's different, then there were quotes
"exact" => substr($tokens[$i], 1) != $term
];
}
continue;
}
// Look for interwiki searches
if($tokens[$i][0] == "!" || substr($tokens[$i], -1) == "!") {
// You can only go to 1 interwiki destination at once, so we replace any previous finding with this one
// You can only go to 1 interwiki destination at once, so we replace any previous finding with this one
if($tokens[$i][0] == "!" || substr($tokens[$i], -1) == "!")
$result["interwiki"] = trim($tokens[$i], "!");
}
// Look for colon directives in the form directive:term
// Also supports prefix:"quoted term with spaces", quotes stripped automatically
/*** Example directives *** (. = implemented, * = not implemented)
@ -538,50 +601,99 @@ class search
**************************/
if(strpos($tokens[$i], ":") !== false) {
$parts = explode(":", $tokens[$i], 2);
if(!isset($result[$parts[0]]))
$result[$parts[0]] = [];
$exact = false;
$term = trim($parts[1], '"');
// If we trim off quotes, then it must be because it should be exact
if($parts[1] != $term) $exact = true;
switch($parts[0]) {
case "intitle": // BUG: What if a normal word is found in a title?
$result["terms"][] = [
"term" => $parts[1],
"term" => $term,
"weight" => $settings->search_title_matches_weighting * mb_strlen($parts[1]),
"location" => "title"
"location" => "title",
"exact" => $exact
];
break;
case "intags":
$result["terms"][] = [
"term" => $parts[1],
"term" => $term,
"weight" => $settings->search_tags_matches_weighting * mb_strlen($parts[1]),
"location" => "tags"
"location" => "tags",
"exact" => $exact
];
break;
case "inbody":
$result["terms"][] = [
"term" => $parts[1],
"term" => $term,
"weight" => 1,
"location" => "body"
"location" => "body",
"exact" => $exact
];
break;
default:
$result[$parts[0]][] = trim($parts[1], '"');
if(!isset($result[$parts[0]]))
$result[$parts[0]] = [];
$result[$parts[0]][] = $term;
break;
}
continue;
}
$exact = false;
$term = trim($tokens[$i], '"');
// If we trim off quotes, then it must be because it should be exact
if($tokens[$i] != $term) $exact = true;
// Doesn't appear to be particularly special *shrugs*
// Set the weight to -1 if it's a stop word
$result["terms"][] = [
"term" => $tokens[$i],
"term" => $term,
"weight" => in_array($tokens[$i], self::$stop_words) ? -1 : 1,
"location" => "all"
"location" => "all",
"exact" => $exact // If true then we shouldn't try to autocorrect it
];
}
// Correct typos, but only if that's enabled
if(module_exists("feature-search-didyoumean") && $settings->search_didyoumean_enabled) {
	// Iterate by reference: the corrections are written back into
	// $result["terms"], and a by-value loop would only alter a throwaway copy.
	foreach($result["terms"] as &$term_data) {
		if(($term_data["exact"] ?? false) || // Skip exact-only
			$term_data["weight"] < 1 || // Skip stop & irrelevant words
			self::invindex_term_exists($term_data["term"])) continue;
		
		// It's not a stop word or in the index, try and correct it
		$correction = self::didyoumean_correct($term_data["term"]);
		// Make a note if we fail to correct a term
		if(!is_string($correction)) {
			$term_data["corrected"] = false;
			continue;
		}
		
		$term_data["term_before"] = $term_data["term"];
		$term_data["term"] = $correction;
		$term_data["corrected"] = true;
	}
	// Break the reference so that a later write to $term_data can't
	// silently overwrite the last term
	unset($term_data);
}
return $result;
}
/**
 * Determines whether a term exists in the currently loaded inverted search
 * index.
 * Note that this only checks for precisely $term. See
 * search::didyoumean_correct() for typo correction.
 * The inverted index must have been loaded (see search::invindex_load())
 * before calling this.
 * @param	string	$term	The term to search for.
 * @return	bool	Whether the term exists in the inverted index or not.
 */
public static function invindex_term_exists(string $term) : bool {
	// In the inverted index $term should have a list of page names stored
	// under it if the term exists in the index, and won't exist if not
	return self::$invindex->has($term);
}
/**
* Searches the given inverted index for the specified search terms.
* @param string $query The search query.

View File

@ -20,6 +20,7 @@ register_module([
/**
* Represents a key-value data store.
*
*/
class StorageBox {
const MODE_JSON = 0;
@ -83,6 +84,18 @@ class StorageBox {
)->fetchColumn() > 0;
}
/**
 * Returns an iterable that returns all the keys that do not contain the given string.
 * The string is matched literally: any LIKE wildcards (% and _) in it are escaped.
 * @param	string	$exclude	The string to search for when excluding keys.
 * @return	PDOStatement	The iterable. Use a foreach loop on it - each row has a "key" column.
 */
public function get_keys(string $exclude) : \PDOStatement {
	// Escape the LIKE metacharacters so that $exclude is treated as plain text
	$escaped = strtr($exclude, [
		"\\" => "\\\\",
		"%" => "\\%",
		"_" => "\\_"
	]);
	
	// NOT LIKE: we want the keys that *don't* contain the string. A plain
	// LIKE returned precisely the keys that were supposed to be left out.
	return $this->query(
		"SELECT key FROM store WHERE key NOT LIKE :containing ESCAPE '\\';",
		[ "containing" => "%$escaped%" ]
	);
}
/**
* Gets a value from the store.
* @param string $key The key value is stored under.

View File

@ -115,9 +115,9 @@ log_str("Scanning for dependencies...\n");
$module_count = count($module_list);
for($i = 0; $i < $module_count; $i++) {
foreach($module_list[$i]->depends as $dependency) {
// echo("scanning {$module_list[$i]->id}: $dependency\n");
log_str("scanning {$module_list[$i]->id}: $dependency\n");
if(!module_list_search($module_list, $dependency)) {
log_str("Adding missing dependency $dependency for {$module_list[$i]->id}\n");
log_str("Adding missing dependency $dependency for {$module_list[$i]->id}\n\n");
$missing_dependency = module_list_find($module_index, $dependency);
if($missing_dependency == null) {
if(php_sapi_name() != "cli") header("content-type: text/plain");
@ -128,6 +128,9 @@ for($i = 0; $i < $module_count; $i++) {
$module_list[] = $missing_dependency;
$module_count++;
}
else {
log_str("present, no action needed\n");
}
}
}

View File

@ -227,7 +227,7 @@
"search_characters_context_total": { "type": "number", "description": "The total number of characters that a search result context should display at most.", "default": 250 },
"search_title_matches_weighting": { "type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10 },
"search_tags_matches_weighting": { "type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3 },
"search_didyoumean_enable": { "type": "checkbox", "description": "Whether to enable the did you mean? search query typo correction engine.", "default": true },
"search_didyoumean_enabled": { "type": "checkbox", "description": "Whether to enable the 'did you mean?' search query typo correction engine.", "default": true },
"search_didyoumean_editdistance": { "type": "number", "description": "The maximum edit distance to search when checking for typos. Increasing this number causes an exponential increase in the amount of computing power required to correct all spellings.", "default": 2 },
"search_didyoumean_cost_insert": { "type": "number", "description": "The insert cost to use when calculating levenshtein distances. If this value is changed then the did you mean index must be rebuilt.", "default": 1 },
"search_didyoumean_cost_delete": { "type": "number", "description": "The delete cost to use when calculating levenshtein distances. If this value is changed then the did you mean index must be rebuilt.", "default": 1 },