Start implementing didyoumean abstraction in lib-search-engine, but it's not finished yet.

Next up:
 - Updating the index
 - Adding words to the index in the first place
 - Saving the index if changes were made
 - Tighter integration into the existing search system to avoid changes 
to other modules
This commit is contained in:
Starbeamrainbowlabs 2020-03-15 14:41:14 +00:00
parent be8595bc44
commit 0829e7630b
Signed by: sbrl
GPG Key ID: 1BE5172E637709C2
3 changed files with 132 additions and 24 deletions

View File

@ -34,14 +34,58 @@ function standard_deviation(array $array): float {
class BkTree {
private $box = null;
// private $touched_ids = [];
/**
* The seed word of the tree.
* This word is the root node of the tree, and has a number of special properties:
* - It's never removed
* - It can't be added
* - It is never returned as a suggestion
* This is essential because we can't delete the root node of the tree without effectively rebuilding the entire thing, because the root node of the tree doesn't have a parent.
* @var string
*/
private $seed_word = null;
private $cost_insert = 1;
private $cost_delete = 1;
private $cost_replace = 1;
public function __construct($filename) {
/**
 * Opens the BK-Tree stored in the given file, creating the root node if it doesn't exist yet.
 * The root node always holds the seed word.
 * @param	string	$filename	The filename passed to the underlying JsonStorageBox.
 * @param	string	$seed_word	The seed word to use as the value of the root node.
 */
public function __construct(string $filename, string $seed_word) {
	$this->box = new JsonStorageBox($filename);
	$this->seed_word = $seed_word;
	
	// If the root node already exists, there's nothing more to do
	if($this->box->has("node|0"))
		return;
	
	// Otherwise create it. The root node is a regular node object (value +
	// children) whose value is the seed word - not the bare seed word string,
	// since the rest of the class reads ->value from every node.
	$new = new stdClass();
	$new->value = $this->seed_word;
	$new->children = new stdClass(); // [ "id" => int, "distance" => int ]
	$this->box->set("node|0", $new);
	$this->increment_node_count();
}
/**
 * Configures the per-operation costs used when calculating levenshtein distances.
 * Note that if any of these values change, the entire tree needs to be rebuilt.
 * @param	int	$insert		The cost to insert a character.
 * @param	int	$delete		The cost to delete a character.
 * @param	int	$replace	The cost to replace a character.
 */
public function set_costs(int $insert, int $delete, int $replace) : void {
	[ $this->cost_insert, $this->cost_delete, $this->cost_replace ] = [ $insert, $delete, $replace ];
}
/**
 * Fetches the levenshtein costs currently in use.
 * @return	stdClass	An object with the properties insert, delete, and replace, holding the respective costs.
 */
public function get_costs() : stdClass {
	$costs = new stdClass();
	$costs->insert = $this->cost_insert;
	$costs->delete = $this->cost_delete;
	$costs->replace = $this->cost_replace;
	return $costs;
}
/**
@ -60,31 +104,25 @@ class BkTree {
$this->set_node_count(0);
return $this->box->get("node_count");
}
private function set_node_count(int $value) {
private function set_node_count(int $value) : void {
// Store the given value under the "node_count" key in the storage box
$this->box->set("node_count", $value);
}
private function increment_node_count() {
private function increment_node_count() : void {
	// Read the stored count, then write it back one higher
	$current_count = $this->box->get("node_count");
	$this->box->set("node_count", $current_count + 1);
}
/**
* Adds a string to the tree.
* Note that duplicates can be added if you're not careful!
* @param string $string The string to add.
* @param int $starting_node_id The id of the node to start insertion from. Defaults to 0 - for internal use only.
* @return int|null The depth at which the new node was added, or null if the string is the seed word (which can't be added).
*/
public function add(string $string, int $starting_node_id = 0) : int {
if(!$this->box->has("node|0")) {
// If the root node of the tree doesn't exist, create it
$new = new stdClass();
$new->value = $string;
$new->children = new stdClass(); // [ "id" => int, "distance" => int ]
$this->box->set("node|0", $new);
$this->touched_ids[] = 0;
$this->increment_node_count();
return 0;
}
public function add(string $string, int $starting_node_id = 0) : ?int {
// Can't add the seed word to the tree
if($string == $this->seed_word)
return null;
if(!$this->box->has("node|$starting_node_id"))
throw new Exception("Error: Failed to find node with id $starting_node_id to begin insertion");
@ -130,6 +168,12 @@ class BkTree {
* @return bool Whether the removal was successful.
*/
public function remove(string $string) : bool {
// Not allowed to remove the seed word
if($string == $this->seed_word) {
error_log("[PeppermintyWiki/DidYouMean-BkTree] Blocked an attempt to remove the seed word $this->seed_word");
return false;
}
$stack = [ [ "node" => $this->box->get("node|0"), "id" => 0 ] ];
$node_target = $stack[0]["node"];
$node_target_id = 0;
@ -190,7 +234,7 @@ class BkTree {
return true;
}
public function trace(string $string) {
public function trace(string $string) : array {
$stack = [
(object) [ "node" => $this->box->get("node|0"), "id" => 0 ]
];
@ -218,7 +262,7 @@ class BkTree {
* @param integer $distance The maximum edit distance to search.
* @return string|null The first matching string, or null if no results were found.
*/
public function lookup_one(string $string, int $distance = 1) {
public function lookup_one(string $string, int $distance = 1) : ?string {
	// Ask the main lookup routine for at most a single match
	$matches = $this->lookup($string, $distance, 1);
	return empty($matches) ? null : $matches[0];
}
@ -232,7 +276,7 @@ class BkTree {
* @param integer $count The number of results to return. 0 = All results found. Note that results will be in a random order.
* @return array<string> Similar resultant strings from the BK-Tree.
*/
public function lookup(string $string, int $max_distance = 1, int $count = 0) {
public function lookup(string $string, int $max_distance = 1, int $count = 0) : array {
if($this->get_node_count() == 0) return null;
$result = []; $result_count = 0;
@ -249,7 +293,8 @@ class BkTree {
$distance = levenshtein($string, $node_current->value, $this->cost_insert, $this->cost_replace, $this->cost_delete);
// If the edit distance from the target string to this node is within the tolerance, store it
if($distance <= $max_distance) {
// If it's the seed word, then we shouldn't return it either
if($distance <= $max_distance && $node_current->value != $this->seed_word) {
$result[] = $node_current->value;
$result_count++;
if($count != 0 && $result_count >= $count) break;
@ -272,7 +317,7 @@ class BkTree {
* If the tree isn't balanced, you may need to insert items in a different order.
* @return array An array of statistics about this BK-Tree.
*/
public function stats() {
public function stats() array {
$result = [
"depth_max" => 0,
"depth_min_leaf" => INF,

View File

@ -81,6 +81,13 @@ class search
* @var StorageBox
*/
private static $invindex = null;
/**
* The 'did you mean?' index for typo correction.
* Only populated if the feature-search-didyoumean module is present.
* @var BkTree
*/
private static $didyoumeanindex = null;
/**
* The transliterator that can be used to transliterate strings.
* Transliterated strings are more suitable for use with the search index.
@ -90,14 +97,64 @@ class search
*/
public static $literator = null;
/**
* Sorter for sorting lists of *transliterated* strings.
* Should work for non-transliterated strings too.
* @var Collator
*/
private static $sorter;
/**
 * Sets up the static transliterator and sorter used by the search system.
 * Do not call this function! It is called automatically.
 */
public static function init() {
	// Rule chain handed to the transliterator that self::$literator holds
	$rules = ':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;';
	self::$literator = Transliterator::createFromRules($rules, Transliterator::FORWARD);
	
	$collator = new Collator("");
	self::$sorter = $collator;
}
/**
 * Loads the didyoumean index.
 * Don't forget to call this before making any search queries if didyoumean
 * typo correction is enabled.
 * The levenshtein costs are read from the global settings
 * (search_didyoumean_cost_insert / _delete / _replace).
 * @param	string	$filename	The filename of the didyoumean index.
 * @param	string	$seed_word	The seed word. If this changes, the index must be rebuilt.
 * @return	bool	Whether the index was loaded successfully or not. Returns false if the feature-search-didyoumean module is not present.
 */
public static function didyoumean_load(string $filename, string $seed_word) : bool {
	global $settings;
	if(!module_exists("feature-search-didyoumean"))
		return false;
	
	// This is a static method, so there is no $this - the index lives in a
	// static property and must be accessed via self::
	self::$didyoumeanindex = new BkTree($filename, $seed_word);
	self::$didyoumeanindex->set_costs(
		$settings->search_didyoumean_cost_insert,
		$settings->search_didyoumean_cost_delete,
		$settings->search_didyoumean_cost_replace
	);
	return true;
}
/**
 * Returns a correction for a given word according to the didyoumean index.
 * Note that this is quite an expensive call.
 * Check that the word exists in the regular search index first, and that
 * it's not a stop word before calling this function.
 * The maximum edit distance searched is read from the global setting
 * search_didyoumean_editdistance.
 * @param	string	$term	The term to correct.
 * @return	string|null	The closest correction found, or null if none could be located (or if the didyoumean index hasn't been loaded).
 */
public static function didyoumean_correct(string $term) : ?string {
	global $settings;
	
	// The index is only populated by didyoumean_load(), so without it there
	// is nothing to suggest from
	if(self::$didyoumeanindex === null)
		return null;
	
	// Static method: the index is a static property, so use self:: rather than $this
	$results = self::$didyoumeanindex->lookup(
		$term,
		$settings->search_didyoumean_editdistance
	);
	if(empty($results)) return null;
	
	// Order the candidates with the class's own comparison function and take the first
	usort($results, function($a, $b) : int {
		return self::compare($a, $b);
	});
	return $results[0];
}
/**
* Converts a source string into an index of search terms that can be

View File

@ -224,9 +224,15 @@
"avatars_show": { "type": "checkbox", "description": "Whether or not to show avatars. Requires the 'user-preferences' and 'upload' modules, though uploads themselves can be turned off so long as all avatars have already been uploaded - it's only the 'preview' action that's actually used.", "default": true},
"avatars_size": { "type": "number", "description": "The image size to render avatars at. Does not affect the size they're stored at - only the inline rendered size (e.g. on the recent changes page etc.)", "default": 32},
"search_characters_context": { "type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 75},
"search_characters_context_total": { "type": "number", "description": "The total number of characters that a search result context should display at most.", "default": 250},
"search_title_matches_weighting": { "type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10},
"search_tags_matches_weighting": { "type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3},
"search_characters_context_total": { "type": "number", "description": "The total number of characters that a search result context should display at most.", "default": 250 },
"search_title_matches_weighting": { "type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10 },
"search_tags_matches_weighting": { "type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3 },
"search_didyoumean_enable": { "type": "checkbox", "description": "Whether to enable the did you mean? search query typo correction engine.", "default": true },
"search_didyoumean_editdistance": { "type": "number", "description": "The maximum edit distance to search when checking for typos. Increasing this number causes an exponential increase in the amount of computing power required to correct all spellings.", "default": 2 },
"search_didyoumean_cost_insert": { "type": "number", "description": "The insert cost to use when calculating levenshtein distances. If this value is changed then the did you mean index must be rebuilt.", "default": 1 },
"search_didyoumean_cost_delete": { "type": "number", "description": "The delete cost to use when calculating levenshtein distances. If this value is changed then the did you mean index must be rebuilt.", "default": 1 },
"search_didyoumean_cost_replace": { "type": "number", "description": "The replace cost to use when calculating levenshtein distances. If this value is changed then the did you mean index must be rebuilt.", "default": 1 },
"search_didyoumean_seed_word": { "type": "text", "description": "The seed word for the didyoumean index tree. Has a number of special properties:<ul><li>Can't be added to the index</li><li>Can't be removed from the index</li><li>Is never suggested</li></ul>Since words are transliterated to lowercase ascii before indexing, it's recommended to set this to a word that contains characters that will never be present after transliteration.", "default": ":peppermint:" },
"dynamic_page_suggestion_count": { "type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 },
"defaultaction": { "type": "text", "description": "The default action. This action will be performed if no other action is specified. It is recommended you set this to \"view\" - that way the user automatically views the default page (see above).", "default": "view" },
"email_debug_dontsend": { "type": "checkbox", "description": "If set to true, emails are logged to the standard error instead of being actually sent.", "default": false },