mirror of
https://github.com/sbrl/Pepperminty-Wiki.git
synced 2024-11-25 17:23:00 +00:00
Start implementing didyoumean abstraction in lib-search-engine, but it's not finished yet.
Next up: - Updating the index - Adding words to the index in the first place - Saving the index if changes were made - Tighter integration into the existing search system to avoid changes to other modules
This commit is contained in:
parent
be8595bc44
commit
0829e7630b
3 changed files with 132 additions and 24 deletions
|
@ -34,14 +34,58 @@ function standard_deviation(array $array): float {
|
||||||
class BkTree {
|
class BkTree {
|
||||||
private $box = null;
|
private $box = null;
|
||||||
|
|
||||||
// private $touched_ids = [];
|
/**
|
||||||
|
* The seed word of the tree.
|
||||||
|
* This word is the root node of the tree, and has a number of special properties:
|
||||||
|
* - It's never removed
|
||||||
|
* - It can't be added
|
||||||
|
* - It is never returned as a suggestion
|
||||||
|
* This is essential because we can't delete the root node of the tree without effectively rebuilding the entire thing, because the root node of the tree doesn't have a parent.
|
||||||
|
* @var string
|
||||||
|
*/
|
||||||
|
private $seed_word = null;
|
||||||
|
|
||||||
private $cost_insert = 1;
|
private $cost_insert = 1;
|
||||||
private $cost_delete = 1;
|
private $cost_delete = 1;
|
||||||
private $cost_replace = 1;
|
private $cost_replace = 1;
|
||||||
|
|
||||||
/**
 * Opens the BK-Tree backed by the given file.
 * If the storage box doesn't contain a root node ("node|0") yet, one is
 * created that holds the seed word.
 * @param	string	$filename	The filename passed to JsonStorageBox to back the tree.
 * @param	string	$seed_word	The seed word to store in the root node of the tree.
 */
public function __construct(string $filename, string $seed_word) {
	$this->box = new JsonStorageBox($filename);
	$this->seed_word = $seed_word;
	
	if(!$this->box->has("node|0")) {
		// If the root node of the tree doesn't exist, create it
		$new = new stdClass();
		// The root node holds the seed word (there is no $string variable in the constructor)
		$new->value = $this->seed_word;
		$new->children = new stdClass(); // [ "id" => int, "distance" => int ]
		// Store the node object itself rather than the bare seed word, because
		// nodes are read back as objects with a ->value property during lookups
		$this->box->set("node|0", $new);
		$this->increment_node_count();
		// Deliberately no return value: a constructor can't hand one back to its caller
	}
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the levenshtein insert/delete/replace costs.
|
||||||
|
* Note that if these values change, the entire tree needs to be rebuilt.
|
||||||
|
* @param int $insert The insert cost.
|
||||||
|
* @param int $delete The cost to delete a character.
|
||||||
|
* @param int $replace The cost to replace a character.
|
||||||
|
*/
|
||||||
|
public function set_costs(int $insert, int $delete, int $replace) : void {
	// Overwrite all three levenshtein weights in a single assignment
	[ $this->cost_insert, $this->cost_delete, $this->cost_replace ] = [ $insert, $delete, $replace ];
}
|
||||||
|
/**
|
||||||
|
* Get the current levenshtein costs.
|
||||||
|
* @return stdClass The current levenshtein insert/delete/replace costs.
|
||||||
|
*/
|
||||||
|
public function get_costs() : stdClass {
	// Build the result object property by property: insert, delete, then replace
	$costs = new stdClass();
	$costs->insert = $this->cost_insert;
	$costs->delete = $this->cost_delete;
	$costs->replace = $this->cost_replace;
	return $costs;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -60,30 +104,24 @@ class BkTree {
|
||||||
$this->set_node_count(0);
|
$this->set_node_count(0);
|
||||||
return $this->box->get("node_count");
|
return $this->box->get("node_count");
|
||||||
}
|
}
|
||||||
private function set_node_count(int $value) {
|
private function set_node_count(int $value) : void {
|
||||||
$this->box->set("node_count", $value);
|
$this->box->set("node_count", $value);
|
||||||
}
|
}
|
||||||
private function increment_node_count() {
|
private function increment_node_count() : void {
|
||||||
$this->box->set("node_count", $this->box->get("node_count") + 1);
|
$this->box->set("node_count", $this->box->get("node_count") + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Adds a string to the tree.
|
* Adds a string to the tree.
|
||||||
|
* Note that duplicates can be added if you're not careful!
|
||||||
* @param string $string The string to add.
|
* @param string $string The string to add.
|
||||||
* @param int $starting_node_id The id fo node to start insertion from. Defaults to 0 - for internal use only.
|
* @param int $starting_node_id The id fo node to start insertion from. Defaults to 0 - for internal use only.
|
||||||
* @return int The depth at which the new node was added.
|
* @return int The depth at which the new node was added.
|
||||||
*/
|
*/
|
||||||
public function add(string $string, int $starting_node_id = 0) : int {
|
public function add(string $string, int $starting_node_id = 0) : ?int {
|
||||||
if(!$this->box->has("node|0")) {
|
// Can't add the seed word to the tree
|
||||||
// If the root node of the tree doesn't exist, create it
|
if($string == $this->seed_word)
|
||||||
$new = new stdClass();
|
return null;
|
||||||
$new->value = $string;
|
|
||||||
$new->children = new stdClass(); // [ "id" => int, "distance" => int ]
|
|
||||||
$this->box->set("node|0", $new);
|
|
||||||
$this->touched_ids[] = 0;
|
|
||||||
$this->increment_node_count();
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(!$this->box->has("node|$starting_node_id"))
|
if(!$this->box->has("node|$starting_node_id"))
|
||||||
throw new Exception("Error: Failed to find node with id $starting_node_id to begin insertion");
|
throw new Exception("Error: Failed to find node with id $starting_node_id to begin insertion");
|
||||||
|
@ -130,6 +168,12 @@ class BkTree {
|
||||||
* @return bool Whether the removal was successful.
|
* @return bool Whether the removal was successful.
|
||||||
*/
|
*/
|
||||||
public function remove(string $string) : bool {
|
public function remove(string $string) : bool {
|
||||||
|
// Not allowed to remove the seed word
|
||||||
|
if($string == $this->seed_word) {
|
||||||
|
error_log("[PeppermintyWiki/DidYouMean-BkTree] Blocked an attempt to remove the seed word $this->seed_word");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
$stack = [ [ "node" => $this->box->get("node|0"), "id" => 0 ] ];
|
$stack = [ [ "node" => $this->box->get("node|0"), "id" => 0 ] ];
|
||||||
$node_target = $stack[0]["node"];
|
$node_target = $stack[0]["node"];
|
||||||
$node_target_id = 0;
|
$node_target_id = 0;
|
||||||
|
@ -190,7 +234,7 @@ class BkTree {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function trace(string $string) {
|
public function trace(string $string) : array {
|
||||||
$stack = [
|
$stack = [
|
||||||
(object) [ "node" => $this->box->get("node|0"), "id" => 0 ]
|
(object) [ "node" => $this->box->get("node|0"), "id" => 0 ]
|
||||||
];
|
];
|
||||||
|
@ -218,7 +262,7 @@ class BkTree {
|
||||||
* @param integer $distance The maximum edit distance to search.
|
* @param integer $distance The maximum edit distance to search.
|
||||||
* @return string|null The first matching string, or null if no results were found.
|
* @return string|null The first matching string, or null if no results were found.
|
||||||
*/
|
*/
|
||||||
public function lookup_one(string $string, int $distance = 1) {
|
public function lookup_one(string $string, int $distance = 1) : ?string {
|
||||||
$result = $this->lookup($string, $distance, 1);
|
$result = $this->lookup($string, $distance, 1);
|
||||||
if(empty($result)) return null;
|
if(empty($result)) return null;
|
||||||
return $result[0];
|
return $result[0];
|
||||||
|
@ -232,7 +276,7 @@ class BkTree {
|
||||||
* @param integer $count The number of results to return. 0 = All results found. Note that results will be in a random order.
|
* @param integer $count The number of results to return. 0 = All results found. Note that results will be in a random order.
|
||||||
* @return array<string> Similar resultant strings from the BK-Tree.
|
* @return array<string> Similar resultant strings from the BK-Tree.
|
||||||
*/
|
*/
|
||||||
public function lookup(string $string, int $max_distance = 1, int $count = 0) {
|
public function lookup(string $string, int $max_distance = 1, int $count = 0) : array {
|
||||||
if($this->get_node_count() == 0) return null;
|
if($this->get_node_count() == 0) return null;
|
||||||
|
|
||||||
$result = []; $result_count = 0;
|
$result = []; $result_count = 0;
|
||||||
|
@ -249,7 +293,8 @@ class BkTree {
|
||||||
$distance = levenshtein($string, $node_current->value, $this->cost_insert, $this->cost_replace, $this->cost_delete);
|
$distance = levenshtein($string, $node_current->value, $this->cost_insert, $this->cost_replace, $this->cost_delete);
|
||||||
|
|
||||||
// If the edit distance from the target string to this node is within the tolerance, store it
|
// If the edit distance from the target string to this node is within the tolerance, store it
|
||||||
if($distance <= $max_distance) {
|
// If it's the seed word, then we shouldn't return it either
|
||||||
|
if($distance <= $max_distance && $node_current->value != $this->seed_word) {
|
||||||
$result[] = $node_current->value;
|
$result[] = $node_current->value;
|
||||||
$result_count++;
|
$result_count++;
|
||||||
if($count != 0 && $result_count >= $count) break;
|
if($count != 0 && $result_count >= $count) break;
|
||||||
|
@ -272,7 +317,7 @@ class BkTree {
|
||||||
* If the tree isn't balanced, you may need to insert items in a different order.
|
* If the tree isn't balanced, you may need to insert items in a different order.
|
||||||
* @return array An array of statistics about this BK-Tree.
|
* @return array An array of statistics about this BK-Tree.
|
||||||
*/
|
*/
|
||||||
public function stats() {
|
public function stats() array {
|
||||||
$result = [
|
$result = [
|
||||||
"depth_max" => 0,
|
"depth_max" => 0,
|
||||||
"depth_min_leaf" => INF,
|
"depth_min_leaf" => INF,
|
||||||
|
|
|
@ -81,6 +81,13 @@ class search
|
||||||
* @var StorageBox
|
* @var StorageBox
|
||||||
*/
|
*/
|
||||||
private static $invindex = null;
|
private static $invindex = null;
|
||||||
|
/**
|
||||||
|
* The 'did you mean?' index for typo correction.
|
||||||
|
* Only populated if the feature-search-didyoumean module is present.
|
||||||
|
* @var BkTree
|
||||||
|
*/
|
||||||
|
private static $didyoumeanindex = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The transliterator that can be used to transliterate strings.
|
* The transliterator that can be used to transliterate strings.
|
||||||
* Transliterated strings are more suitable for use with the search index.
|
* Transliterated strings are more suitable for use with the search index.
|
||||||
|
@ -90,14 +97,64 @@ class search
|
||||||
*/
|
*/
|
||||||
public static $literator = null;
|
public static $literator = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sorter for sorting lists of *transliterated* strings.
|
||||||
|
* Should work for non-transliterated strings too.
|
||||||
|
* @var Collator
|
||||||
|
*/
|
||||||
|
private static $sorter;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialises the search system.
|
* Initialises the search system.
|
||||||
* Do not call this function! It is called automatically.
|
* Do not call this function! It is called automatically.
|
||||||
*/
|
*/
|
||||||
public static function init() {
|
public static function init() {
|
||||||
self::$literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
self::$literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||||
|
self::$sorter = new Collator("");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads the didyoumean index.
|
||||||
|
* Don't forget to call this before making any search queries if didyoumean
|
||||||
|
* typo correction is enabled.
|
||||||
|
* @param string $filename The filename of the didyoumean index.
|
||||||
|
* @param string $seed_word The seed word. If this changes, the index must be rebuilt.
|
||||||
|
* @return bool Whether the index was loaded successfully or not. Returns false if the feature-search-didyoumean module is not present.
|
||||||
|
*/
|
||||||
|
public static function didyoumean_load(string $filename, string $seed_word) : bool {
	global $settings;
	// Without the didyoumean module there is nothing to load
	if(!module_exists("feature-search-didyoumean"))
		return false;
	
	// This is a static method, so $this doesn't exist here - the index is
	// kept in the static $didyoumeanindex property instead
	self::$didyoumeanindex = new BkTree($filename, $seed_word);
	// Apply the configured levenshtein costs to the freshly loaded tree
	self::$didyoumeanindex->set_costs(
		$settings->search_didyoumean_cost_insert,
		$settings->search_didyoumean_cost_delete,
		$settings->search_didyoumean_cost_replace
	);
	return true;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a correction for a given word according to the didyoumean index.
|
||||||
|
* Note that this is quite an expensive call.
|
||||||
|
* Check that the word exists in the regular search index first, and that
|
||||||
|
* it's not a stop word before calling this function.
|
||||||
|
* @param string $term The term to correct.
|
||||||
|
* @return string|null The closest correction found, or null if none could be located.
|
||||||
|
*/
|
||||||
|
public static function didyoumean_correct(string $term) : ?string {
	global $settings;
	// No index loaded (e.g. didyoumean_load() wasn't called or returned false),
	// so no correction can be offered
	if(self::$didyoumeanindex === null)
		return null;
	
	// This is a static method, so the index has to be reached via self:: and not $this
	$results = self::$didyoumeanindex->lookup(
		$term,
		$settings->search_didyoumean_editdistance
	);
	if(empty($results)) return null;
	// NOTE(review): self::compare() isn't defined in the visible part of this
	// class - confirm that it exists (self::$sorter may have been the intended comparator)
	usort($results, function($a, $b) : int {
		return self::compare($a, $b);
	});
	return $results[0];
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts a source string into an index of search terms that can be
|
* Converts a source string into an index of search terms that can be
|
||||||
|
|
|
@ -227,6 +227,12 @@
|
||||||
"search_characters_context_total": { "type": "number", "description": "The total number of characters that a search result context should display at most.", "default": 250 },
|
"search_characters_context_total": { "type": "number", "description": "The total number of characters that a search result context should display at most.", "default": 250 },
|
||||||
"search_title_matches_weighting": { "type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10 },
|
"search_title_matches_weighting": { "type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10 },
|
||||||
"search_tags_matches_weighting": { "type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3 },
|
"search_tags_matches_weighting": { "type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3 },
|
||||||
|
"search_didyoumean_enable": { "type": "checkbox", "description": "Whether to enable the did you mean? search query typo correction engine.", "default": true },
|
||||||
|
"search_didyoumean_editdistance": { "type": "number", "description": "The maximum edit distance to search when checking for typos. Increasing this number causes an exponential increase in the amount of computing power required to correct all spellings.", "default": 2 },
|
||||||
|
"search_didyoumean_cost_insert": { "type": "number", "description": "The insert cost to use when calculating levenshtein distances. If this value is changed then the did you mean index must be rebuilt.", "default": 1 },
|
||||||
|
"search_didyoumean_cost_delete": { "type": "number", "description": "The delete cost to use when calculating levenshtein distances. If this value is changed then the did you mean index must be rebuilt.", "default": 1 },
|
||||||
|
"search_didyoumean_cost_replace": { "type": "number", "description": "The replace cost to use when calculating levenshtein distances. If this value is changed then the did you mean index must be rebuilt.", "default": 1 },
|
||||||
|
"search_didyoumean_seed_word": { "type": "text", "description": "The seed word for the didyoumean index tree. Has a number of special properties:<ul><li>Can't be added to the index</li><li>Can't be removed from the index</li><li>Is never suggested</li></ul>Since words are transliterated to lowercase ascii before indexing, it's recommended to set this to a word that contains characters that will never be present after transliteration.", "default": ":peppermint:" },
|
||||||
"dynamic_page_suggestion_count": { "type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 },
|
"dynamic_page_suggestion_count": { "type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 },
|
||||||
"defaultaction": { "type": "text", "description": "The default action. This action will be performed if no other action is specified. It is recommended you set this to \"view\" - that way the user automatically views the default page (see above).", "default": "view" },
|
"defaultaction": { "type": "text", "description": "The default action. This action will be performed if no other action is specified. It is recommended you set this to \"view\" - that way the user automatically views the default page (see above).", "default": "view" },
|
||||||
"email_debug_dontsend": { "type": "checkbox", "description": "If set to true, emails are logged to the standard error instead of being actually sent.", "default": false },
|
"email_debug_dontsend": { "type": "checkbox", "description": "If set to true, emails are logged to the standard error instead of being actually sent.", "default": false },
|
||||||
|
|
Loading…
Reference in a new issue