From 0829e7630b09dfea0a90e73b3199d2f1cf91825e Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Sun, 15 Mar 2020 14:41:14 +0000 Subject: [PATCH] Start implementing didyoumean abstraction in lib-search-engine, but its not finished yet. Next up: - Updating the index - Adding words to the index in the first place - Saving the index if changes were made - Tighter integration into the existing search system to avoid changes to other modules --- modules/feature-search-didyoumean.php | 87 ++++++++++++++++++++------- modules/lib-search-engine.php | 57 ++++++++++++++++++ peppermint.guiconfig.json | 12 +++- 3 files changed, 132 insertions(+), 24 deletions(-) diff --git a/modules/feature-search-didyoumean.php b/modules/feature-search-didyoumean.php index 10a108d..2f04ec1 100644 --- a/modules/feature-search-didyoumean.php +++ b/modules/feature-search-didyoumean.php @@ -34,14 +34,58 @@ function standard_deviation(array $array): float { class BkTree { private $box = null; - // private $touched_ids = []; + /** + * The seed word of the tree. + * This word is the root node of the tree, and has a number of special properties: + * - It's never removed + * - It can't be added + * - It is never returned as a suggestion + * This is essential because we can't delete the root node of the tree without effectively rebuilding the entire thing, because the root node of the tree doesn't have a parent. 
+ * @var string + */ + private $seed_word = null; private $cost_insert = 1; private $cost_delete = 1; private $cost_replace = 1; - public function __construct($filename) { + public function __construct(string $filename, string $seed_word) { $this->box = new JsonStorageBox($filename); + $this->seed_word = $seed_word; + + if(!$this->box->has("node|0")) { + // If the root node of the tree doesn't exist, create it from the seed word + $new = new stdClass(); + $new->value = $this->seed_word; + $new->children = new stdClass(); // [ "id" => int, "distance" => int ] + $this->box->set("node|0", $new); + $this->increment_node_count(); + return; + } + } + + /** + * Set the levenshtein insert/delete/replace costs. + * Note that if these values change, the entire tree needs to be rebuilt. + * @param int $insert The insert cost. + * @param int $delete The cost to delete a character. + * @param int $replace The cost to replace a character. + */ + public function set_costs(int $insert, int $delete, int $replace) : void { + $this->cost_insert = $insert; + $this->cost_delete = $delete; + $this->cost_replace = $replace; + } + /** + * Get the current levenshtein costs. + * @return stdClass The current levenshtein insert/delete/replace costs. + */ + public function get_costs() : stdClass { + return (object) [ + "insert" => $this->cost_insert, + "delete" => $this->cost_delete, + "replace" => $this->cost_replace + ]; } /** @@ -60,31 +104,25 @@ class BkTree { $this->set_node_count(0); return $this->box->get("node_count"); } - private function set_node_count(int $value) { + private function set_node_count(int $value) : void { $this->box->set("node_count", $value); } - private function increment_node_count() { + private function increment_node_count() : void { $this->box->set("node_count", $this->box->get("node_count") + 1); } /** * Adds a string to the tree. + * Note that duplicates can be added if you're not careful! * @param string $string The string to add. 
* @param int $starting_node_id The id fo node to start insertion from. Defaults to 0 - for internal use only. * @return int The depth at which the new node was added. */ - public function add(string $string, int $starting_node_id = 0) : int { - if(!$this->box->has("node|0")) { - // If the root node of the tree doesn't exist, create it - $new = new stdClass(); - $new->value = $string; - $new->children = new stdClass(); // [ "id" => int, "distance" => int ] - $this->box->set("node|0", $new); - $this->touched_ids[] = 0; - $this->increment_node_count(); - return 0; - } - + public function add(string $string, int $starting_node_id = 0) : ?int { + // Can't add the seed word to the tree + if($string == $this->seed_word) + return null; + if(!$this->box->has("node|$starting_node_id")) throw new Exception("Error: Failed to find node with id $starting_node_id to begin insertion"); @@ -130,6 +168,12 @@ class BkTree { * @return bool Whether the removal was successful. */ public function remove(string $string) : bool { + // Not allowed to remove the seed word + if($string == $this->seed_word) { + error_log("[PeppermintyWiki/DidYouMean-BkTree] Blocked an attempt to remove the seed word $this->seed_word"); + return false; + } + $stack = [ [ "node" => $this->box->get("node|0"), "id" => 0 ] ]; $node_target = $stack[0]["node"]; $node_target_id = 0; @@ -190,7 +234,7 @@ class BkTree { return true; } - public function trace(string $string) { + public function trace(string $string) : array { $stack = [ (object) [ "node" => $this->box->get("node|0"), "id" => 0 ] ]; @@ -218,7 +262,7 @@ class BkTree { * @param integer $distance The maximum edit distance to search. * @return string|null The first matching string, or null if no results were found. 
*/ - public function lookup_one(string $string, int $distance = 1) { + public function lookup_one(string $string, int $distance = 1) : ?string { $result = $this->lookup($string, $distance, 1); if(empty($result)) return null; return $result[0]; @@ -232,7 +276,7 @@ class BkTree { * @param integer $count The number of results to return. 0 = All results found. Note that results will be in a random order. * @return array Similar resultant strings from the BK-Tree. */ - public function lookup(string $string, int $max_distance = 1, int $count = 0) { - if($this->get_node_count() == 0) return null; + public function lookup(string $string, int $max_distance = 1, int $count = 0) : array { + if($this->get_node_count() == 0) return []; $result = []; $result_count = 0; @@ -249,7 +293,8 @@ class BkTree { $distance = levenshtein($string, $node_current->value, $this->cost_insert, $this->cost_replace, $this->cost_delete); // If the edit distance from the target string to this node is within the tolerance, store it - if($distance <= $max_distance) { + // If it's the seed word, then we shouldn't return it either + if($distance <= $max_distance && $node_current->value != $this->seed_word) { $result[] = $node_current->value; $result_count++; if($count != 0 && $result_count >= $count) break; @@ -272,7 +317,7 @@ class BkTree { * If the tree isn't balanced, you may need to insert items in a different order. * @return array An array of statistics about this BK-Tree. */ - public function stats() { + public function stats() : array { $result = [ "depth_max" => 0, "depth_min_leaf" => INF, diff --git a/modules/lib-search-engine.php b/modules/lib-search-engine.php index 72da16c..058a24f 100644 --- a/modules/lib-search-engine.php +++ b/modules/lib-search-engine.php @@ -81,6 +81,13 @@ class search * @var StorageBox */ private static $invindex = null; + /** + * The 'did you mean?' index for typo correction. + * Only populated if the feature-search-didyoumean module is present. 
+ * @var BkTree + */ + private static $didyoumeanindex = null; + /** * The transliterator that can be used to transliterate strings. * Transliterated strings are more suitable for use with the search index. @@ -90,14 +97,64 @@ class search */ public static $literator = null; + /** + * Sorter for sorting lists of *transliterated* strings. + * Should work for non-transliterated strings too. + * @var Collator + */ + private static $sorter; + /** * Initialises the search system. * Do not call this function! It is called automatically. */ public static function init() { self::$literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD); + self::$sorter = new Collator(""); } + /** + * Loads the didyoumean index. + * Don't forget to call this before making any search queries if didyoumean + * typo correction is enabled. + * @param string $filename The filename of the didyoumean index. + * @param string $seed_word The seed word. If this changes, the index must be rebuilt. + * @return bool Whether the index was loaded successfully or not. Returns false if the feature-search-didyoumean module is not present. + */ + public static function didyoumean_load(string $filename, string $seed_word) : bool { + global $settings; + if(!module_exists("feature-search-didyoumean")) + return false; + + self::$didyoumeanindex = new BkTree($filename, $seed_word); + self::$didyoumeanindex->set_costs( + $settings->search_didyoumean_cost_insert, + $settings->search_didyoumean_cost_delete, + $settings->search_didyoumean_cost_replace + ); + return true; + } + + /** + * Returns a correction for a given word according to the didyoumean index. + * Note that this is quite an expensive call. + * Check that the word exists in the regular search index first, and that + * it's not a stop word before calling this function. + * @param string $term The term to correct. 
+ * @return string|null The closest correction found, or null if none could be located. + */ + public static function didyoumean_correct(string $term) : ?string { + global $settings; + $results = self::$didyoumeanindex->lookup( + $term, + $settings->search_didyoumean_editdistance + ); + if(empty($results)) return null; + usort($results, function($a, $b) : int { + return self::compare($a, $b); + }); + return $results[0]; + } /** * Converts a source string into an index of search terms that can be diff --git a/peppermint.guiconfig.json b/peppermint.guiconfig.json index b1139e5..e4239a3 100644 --- a/peppermint.guiconfig.json +++ b/peppermint.guiconfig.json @@ -224,9 +224,15 @@ "avatars_show": { "type": "checkbox", "description": "Whether or not to show avatars requires the 'user-preferences' and 'upload' modules, though uploads themselvess can be turned off so long as all avatars have already been uploaded - it's only the 'preview' action that's actually used.", "default": true}, "avatars_size": { "type": "number", "description": "The image size to render avatars at. Does not affect the size they're stored at - only the inline rendered size (e.g. 
on the recent changes page etc.)", "default": 32}, "search_characters_context": { "type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 75}, - "search_characters_context_total": { "type": "number", "description": "The total number of characters that a search result context should display at most.", "default": 250}, - "search_title_matches_weighting": { "type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10}, - "search_tags_matches_weighting": { "type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3}, + "search_characters_context_total": { "type": "number", "description": "The total number of characters that a search result context should display at most.", "default": 250 }, + "search_title_matches_weighting": { "type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10 }, + "search_tags_matches_weighting": { "type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3 }, + "search_didyoumean_enable": { "type": "checkbox", "description": "Whether to enable the did you mean? search query typo correction engine.", "default": true }, + "search_didyoumean_editdistance": { "type": "number", "description": "The maximum edit distance to search when checking for typos. Increasing this number causes an exponential increase in the amount of computing power required to correct all spellings.", "default": 2 }, + "search_didyoumean_cost_insert": { "type": "number", "description": "The insert cost to use when calculating levenshtein distances. 
If this value is changed then the did you mean index must be rebuilt.", "default": 1 }, + "search_didyoumean_cost_delete": { "type": "number", "description": "The delete cost to use when calculating levenshtein distances. If this value is changed then the did you mean index must be rebuilt.", "default": 1 }, + "search_didyoumean_cost_replace": { "type": "number", "description": "The replace cost to use when calculating levenshtein distances. If this value is changed then the did you mean index must be rebuilt.", "default": 1 }, + "search_didyoumean_seed_word": { "type": "text", "description": "The seed word for the didyoumean index tree. Has a number of special properties:Since words are transliterated to lowercase ascii before indexing, it's recommended to set this to a word that contains characters that will never be present after transliteration.", "default": ":peppermint:" }, "dynamic_page_suggestion_count": { "type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't really improve performance. Set to 0 to disable.", "default": 7 }, "defaultaction": { "type": "text", "description": "The default action. This action will be performed if no other action is specified. It is recommended you set this to \"view\" - that way the user automatically views the default page (see above).", "default": "view" }, "email_debug_dontsend": { "type": "checkbox", "description": "If set to true, emails are logged to the standard error instead of being actually sent.", "default": false },