<?php require_once("JsonStorageBox.php"); /** * Calculates the standard deviation of an array of numbers. * @source https://stackoverflow.com/a/57694168/1460422 * @param array $array The array of numbers to calculate the standard deviation of. * @return float The standard deviation of the numbers in the given array. */ function standard_deviation(array $array): float { $size = count($array); $mean = array_sum($array) / $size; $squares = array_map(function ($x) use ($mean) { return pow($x - $mean, 2); }, $array); return sqrt(array_sum($squares) / ($size - 1)); } /** * A serialisable BK-Tree Implementation. * Ref: https://nullwords.wordpress.com/2013/03/13/the-bk-tree-a-data-structure-for-spell-checking/ */ class BkTree { private $box = null; private $nodes = []; // private $touched_ids = []; private $cost_insert = 1; private $cost_delete = 1; private $cost_replace = 1; public function __construct($filename) { $this->box = new JsonStorageBox($filename); } /** * A utility function for calculating edit distance. * Warning: Do not use this internally! It is *slow*. It's much faster to do this directly. This exists only for external use. * @param string $a The first string. * @param string $b The second string to compare against. * @return int The computed edit distance. */ public function edit_distance(string $a, string $b) : int { return levenshtein($a, $b, $this->cost_insert, $this->cost_replace, $this->cost_delete); } private function get_node_count() : int { if(!$this->box->has("node_count")) $this->set_node_count(0); return $this->box->get("node_count"); } private function set_node_count(int $value) { $this->box->set("node_count", $value); } private function increment_node_count() { $this->box->set("node_count", $this->box->get("node_count") + 1); } /** * Adds a string to the tree. * @param string $string The string to add. * @param int $starting_node_id The id fo node to start insertion from. Defaults to 0 - for internal use only. * @return int The depth at which the new node was added. */ public function add(string $string, int $starting_node_id = 0) : int { // FUTURE: When we support deletes, we'll need to ensure that the root node is handled correctly if(!$this->box->has("node|0")) { // If the root node of the tree doesn't exist, create it $new = new stdClass(); $new->value = $string; $new->children = new stdClass(); // [ "id" => int, "distance" => int ] $this->box->set("node|0", $new); $this->touched_ids[] = 0; $this->increment_node_count(); return 0; } if(!$this->box->has("node|$starting_node_id")) throw new Exception("Error: Failed to find node with id $starting_node_id to begin insertion"); // if($string == "bunny") echo("\nStart $string\n"); $next_node = $this->box->get("node|$starting_node_id"); // Grab the root to start with $next_node_id = $starting_node_id; $depth = 0; $visted = 0; while(true) { $visted++; $distance = levenshtein($string, $next_node->value, $this->cost_insert, $this->cost_replace, $this->cost_delete); // if($string == "bunny") echo("$visted: Visiting $next_node->value, distance $distance (child distances ".implode(", ", array_map(function($el) { return $el->distance; }, $next_node->children)).")\n"); if(isset($next_node->children->$distance)) { $child_id = $next_node->children->$distance; $next_node = $this->box->get("node|$child_id"); $next_node_id = $child_id; // if($string == "cake") echo("Identical distance as {$next_node["value"]}, restarting loop\n"); $depth++; continue; // Continue on the outer while loop } // if($string == "bunny") echo("Inserting on $next_node->value\n"); // If we got here, then no existing children have the same edit distance // Note that here we don't push to avoid the overhead from either array_push() (considerable) or count() (also considerable). // Create the new child node $new_id = $this->get_node_count(); $this->box->set("node|$new_id", (object) [ "value" => $string, "children" => new stdClass() ]); // Create the edge that points from the existing node to the new node $next_node->children->$distance = $new_id; $this->box->set("node|$next_node_id", $next_node); $this->increment_node_count(); break; } return $depth; } /** * Removes a string from the tree. * @param string $string The string to remove. * @return bool Whether the removal was successful. */ public function remove(string $string) : bool { $stack = [ [ "node" => $this->box->get("node|0"), "id" => 0 ] ]; $node_target = $stack[0]["node"]; $node_target_id = 0; while($node_target->value !== $string) { $distance = levenshtein($string, $node_target->value, $this->cost_insert, $this->cost_replace, $this->cost_delete); // Failed to recurse to find the node with the value in question if(!isset($node_target->children->$distance)) return false; $node_target_id = $node_target->children->$distance; $node_target = $this->box->get("node|$node_target_id"); $stack[] = [ "node" => $node_target, "id" => $node_target_id ]; } // The last item but 1 on the stack is the parent node $parent = $stack[count($stack) - 2]; // 1. Delete the connection from parent -> target foreach($parent["node"]->children as $distance => $id) { if($id == $node_target_id) { unset($parent["node"]->children->$distance); break; } } // Save the parent node's back to disk // Note that we do this *before* sorting out the orphans, since it's possible that $this->add() will modify it further $this->box->set("node|{$parent["id"]}", $parent["node"]); // 2. Iterate over the target's children (if any) and re-hang them from the parent // NOTE: We need to be careful that the characteristics of the tree are preserved. We should test this by tracing a node's location in the tree and purposefully removing nodes in the chain and see if the results returned as still the same // // Hang the now orphaned children and all their decendants from the parent foreach($node_target->children as $distance => $id) { $orphan = $this->box->get("node|$id"); $substack = [ [ "node" => $orphan, "id" => $id ] ]; $substack_top = 0; while($substack_top >= 0) { $next = $substack[$substack_top]; unset($substack[$substack_top]); $substack_top--; $this->box->delete("node|{$next["id"]}"); // Delete the orphan node $this->add($next["node"]->value, $parent["id"]); // Re-hang it from the parent foreach($next["node"]->children as $distance => $sub_id) { $substack[++$substack_top] = [ "node" => $this->box->get("node|$sub_id"), "id" => $sub_id ]; } } } // Delete the target node $this->box->delete("node|$node_target_id"); return true; } public function trace(string $string) { $stack = [ (object) [ "node" => $this->box->get("node|0"), "id" => 0 ] ]; $node_target = $stack[0]->node; while($node_target->value !== $string) { $distance = levenshtein($string, $node_target->value, $this->cost_insert, $this->cost_replace, $this->cost_delete); var_dump($node_target); // Failed to recurse to find the node with the value in question if(!isset($node_target->children->$distance)) return null; $node_target_id = $node_target->children->$distance; $node_target = $this->box->get("node|$node_target_id"); $stack[] = (object) [ "node" => $node_target, "id" => $node_target_id ]; } return $stack; } /** * Convenience function that returns just the first result when looking up a string. * @param string $string The string to lookup * @param integer $distance The maximum edit distance to search. * @return string|null The first matching string, or null if no results were found. */ public function lookup_one(string $string, int $distance = 1) { $result = $this->lookup($string, $distance, 1); if(empty($result)) return null; return $result[0]; // foreach($this->lookup($string, $distance) as $item) // return $item; } /** * Generator that walks the BK-Tree and iteratively yields results. * TODO: Refactor this to use an array, since generators are ~ * @param string $string The search string. * @param integer $max_distance The maximum edit distance to search. * @param integer $count The number of results to return. 0 = All results found. Note that results will be in a random order. * @return Generator<string> Iteratively yielded similar resultant strings from the BK-Tree. */ public function lookup(string $string, int $max_distance = 1, int $count = 0) { if($this->get_node_count() == 0) return null; $result = []; $result_count = 0; $stack = [ $this->box->get("node|0") ]; $stack_top = 0; // https://softwareengineering.stackexchange.com/a/226162/58491 while($stack_top >= 0) { // Take the topmost node off the stack $node_current = $stack[$stack_top]; unset($stack[$stack_top]); $stack_top--; $distance = levenshtein($string, $node_current->value, $this->cost_insert, $this->cost_replace, $this->cost_delete); /* echo("[lookup] Visiting $node_current->value (distance $distance, child distances ".implode(", ", array_map(function($el) { return $el->distance; }, $node_current->children)).")\n"); if(in_array($node_current->value, ["worlds", "domicil", "mealiest", "stopgaps", "pibroch", "upwardly", "nontruth", "vizoring"])) { echo("[lookup] Children: ".implode(", ", array_map(function($el) { return "$el->distance: ".$this->box->get("node|$el->id")->value; }, $node_current->children))."\n"); } if($node_current->value == "bunny") exit(); */ // If the edit distance from the target string to this node is within the tolerance, yield it if($distance <= $max_distance) { // readline("press any key to continue"); $result[] = $node_current->value; if($count != 0 && $result_count >= $count) break; // yield $node_current["value"]; } // Adding the key here speeds it up, apparently // Ref: https://phpbench.com/ for($child_distance = $distance - $max_distance; $child_distance <= $distance + $max_distance; $child_distance++) { if(!isset($node_current->children->$child_distance)) continue; // echo("[lookup] Recursing on child ".$this->box->get("node|$child->id")->value." (distance $child->distance)\n"); // Push the node onto the stack // Note that it doesn't actually matter that the stack isn't an accurate representation of ancestor nodes at any given time here. The stack is really a hybrid between a stack and a queue, having features of both. $stack[++$stack_top] = $this->box->get("node|{$node_current->children->$child_distance}"); } } return $result; } /** * Calculate statistics about the BK-Tree. * Useful for analysing a tree's structure. * If the tree isn't balanced, you may need to insert items in a different order. * @return array An array of statistics about this BK-Tree. */ public function stats() { $result = [ "depth_max" => 0, "depth_min_leaf" => INF, "depth_average" => 0, "depth_average_noleaf" => 0, "depth_standard_deviation" => [], "child_count_average" => 0, "child_count_max" => 0, "nodes" => $this->get_node_count(), "leaves" => 0, "non_leaves" => 0 ]; $start_time = microtime(true); $stack = [ [ "node" => $this->box->get("node|0"), "depth" => 0 ] ]; // https://softwareengineering.stackexchange.com/a/226162/58491 while(!empty($stack)) { // Take the top-most node off the stack $current = array_pop($stack); // echo("Visiting "); var_dump($current); // Operate on the node $result["depth_standard_deviation"][] = $current["depth"]; $result["depth_average"] += $current["depth"]; if($current["depth"] > $result["depth_max"]) $result["depth_max"] = $current["depth"]; if(empty($current["node"]->children) && $current["depth"] < $result["depth_min_leaf"]) $result["depth_min_leaf"] = $current["depth"]; $child_count = count((array)($current["node"]->children)); $result["child_count_average"] += $child_count; if($child_count > $result["child_count_max"]) $result["child_count_max"] = $child_count; if($child_count > 0) { $result["depth_average_noleaf"] += $current["depth"]; $result["non_leaves"]++; } else $result["leaves"]++; // Iterate over the child nodes foreach($current["node"]->children as $child_distance => $child_id) { $stack[] = [ "node" => $this->box->get("node|$child_id"), "depth" => $current["depth"] + 1 ]; } } $result["depth_average"] /= $result["nodes"]; $result["depth_average_noleaf"] /= $result["non_leaves"]; $result["child_count_average"] /= $result["nodes"]; $result["depth_standard_deviation"] = standard_deviation($result["depth_standard_deviation"]); $result["time_taken"] = microtime(true) - $start_time; return $result; } public function walk() { $stack = [ (object)[ "id" => 0, "node" => $this->box->get("node|0"), "parent_id" => -1, "parent" => null, "depth" => 0 ] ]; $stack_top = 0; // https://softwareengineering.stackexchange.com/a/226162/58491 while(!empty($stack)) { // Take the topmost node off the stack $current = $stack[$stack_top]; unset($stack[$stack_top]); $stack_top--; // echo("Visiting "); var_dump($current); yield $current; // Iterate over the child nodes foreach($current->node->children as $child_distance => $child_id) { $stack_top++; $stack[$stack_top] = (object) [ "id" => $child_id, "node" => $this->box->get("node|{$current->node->children->$child_distance}"), "parent_id" => $current->id, "parent" => $current->node, "depth" => $current->depth + 1 ]; } } } /** * Saves changes to the tree back to disk. * @return void */ public function close() { $this->box->close(); } }