Pepperminty-Wiki/experiments/bktree/BkTree.php

402 lines
14 KiB
PHP

<?php
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
require_once("JsonStorageBox.php");
/**
* Calculates the standard deviation of an array of numbers.
* @source https://stackoverflow.com/a/57694168/1460422
* @param array $array The array of numbers to calculate the standard deviation of.
* @return float The standard deviation of the numbers in the given array.
*/
function standard_deviation(array $array): float {
$size = count($array);
$mean = array_sum($array) / $size;
$squares = array_map(function ($x) use ($mean) {
return pow($x - $mean, 2);
}, $array);
return sqrt(array_sum($squares) / ($size - 1));
}
/**
* A serialisable BK-Tree Implementation.
* Ref: https://nullwords.wordpress.com/2013/03/13/the-bk-tree-a-data-structure-for-spell-checking/
*/
class BkTree
{
private $box = null;
private $nodes = [];
// private $touched_ids = [];
private $cost_insert = 1;
private $cost_delete = 1;
private $cost_replace = 1;
public function __construct($filename) {
$this->box = new JsonStorageBox($filename);
}
/**
* A utility function for calculating edit distance.
* Warning: Do not use this internally! It is *slow*. It's much faster to do this directly. This exists only for external use.
* @param string $a The first string.
* @param string $b The second string to compare against.
* @return int The computed edit distance.
*/
public function edit_distance(string $a, string $b) : int {
return levenshtein($a, $b, $this->cost_insert, $this->cost_replace, $this->cost_delete);
}
private function get_node_count() : int {
if(!$this->box->has("node_count"))
$this->set_node_count(0);
return $this->box->get("node_count");
}
private function set_node_count(int $value) {
$this->box->set("node_count", $value);
}
private function increment_node_count() {
$this->box->set("node_count", $this->box->get("node_count") + 1);
}
/**
* Adds a string to the tree.
* @param string $string The string to add.
* @param int $starting_node_id The id fo node to start insertion from. Defaults to 0 - for internal use only.
* @return int The depth at which the new node was added.
*/
public function add(string $string, int $starting_node_id = 0) : int {
// FUTURE: When we support deletes, we'll need to ensure that the root node is handled correctly
if(!$this->box->has("node|0")) {
// If the root node of the tree doesn't exist, create it
$new = new stdClass();
$new->value = $string;
$new->children = new stdClass(); // [ "id" => int, "distance" => int ]
$this->box->set("node|0", $new);
$this->touched_ids[] = 0;
$this->increment_node_count();
return 0;
}
if(!$this->box->has("node|$starting_node_id"))
throw new Exception("Error: Failed to find node with id $starting_node_id to begin insertion");
// if($string == "bunny") echo("\nStart $string\n");
$next_node = $this->box->get("node|$starting_node_id"); // Grab the root to start with
$next_node_id = $starting_node_id;
$depth = 0; $visted = 0;
while(true) {
$visted++;
$distance = levenshtein($string, $next_node->value, $this->cost_insert, $this->cost_replace, $this->cost_delete);
// if($string == "bunny") echo("$visted: Visiting $next_node->value, distance $distance (child distances ".implode(", ", array_map(function($el) { return $el->distance; }, $next_node->children)).")\n");
if(isset($next_node->children->$distance)) {
$child_id = $next_node->children->$distance;
$next_node = $this->box->get("node|$child_id");
$next_node_id = $child_id;
// if($string == "cake") echo("Identical distance as {$next_node["value"]}, restarting loop\n");
$depth++;
continue; // Continue on the outer while loop
}
// if($string == "bunny") echo("Inserting on $next_node->value\n");
// If we got here, then no existing children have the same edit distance
// Note that here we don't push to avoid the overhead from either array_push() (considerable) or count() (also considerable).
// Create the new child node
$new_id = $this->get_node_count();
$this->box->set("node|$new_id", (object) [
"value" => $string,
"children" => new stdClass()
]);
// Create the edge that points from the existing node to the new node
$next_node->children->$distance = $new_id;
$this->box->set("node|$next_node_id", $next_node);
$this->increment_node_count();
break;
}
return $depth;
}
/**
* Removes a string from the tree.
* @param string $string The string to remove.
* @return bool Whether the removal was successful.
*/
public function remove(string $string) : bool {
$stack = [ [ "node" => $this->box->get("node|0"), "id" => 0 ] ];
$node_target = $stack[0]["node"];
$node_target_id = 0;
while($node_target->value !== $string) {
$distance = levenshtein($string, $node_target->value, $this->cost_insert, $this->cost_replace, $this->cost_delete);
// Failed to recurse to find the node with the value in question
if(!isset($node_target->children->$distance))
return false;
$node_target_id = $node_target->children->$distance;
$node_target = $this->box->get("node|$node_target_id");
$stack[] = [ "node" => $node_target, "id" => $node_target_id ];
}
// The last item but 1 on the stack is the parent node
$parent = $stack[count($stack) - 2];
// 1. Delete the connection from parent -> target
foreach($parent["node"]->children as $distance => $id) {
if($id == $node_target_id) {
unset($parent["node"]->children->$distance);
break;
}
}
// Save the parent node's back to disk
// Note that we do this *before* sorting out the orphans, since it's possible that $this->add() will modify it further
$this->box->set("node|{$parent["id"]}", $parent["node"]);
// 2. Iterate over the target's children (if any) and re-hang them from the parent
// NOTE: We need to be careful that the characteristics of the tree are preserved. We should test this by tracing a node's location in the tree and purposefully removing nodes in the chain and see if the results returned as still the same
//
// Hang the now orphaned children and all their decendants from the parent
foreach($node_target->children as $distance => $id) {
$orphan = $this->box->get("node|$id");
$substack = [ [ "node" => $orphan, "id" => $id ] ]; $substack_top = 0;
while($substack_top >= 0) {
$next = $substack[$substack_top];
unset($substack[$substack_top]);
$substack_top--;
$this->box->delete("node|{$next["id"]}"); // Delete the orphan node
$this->add($next["node"]->value, $parent["id"]); // Re-hang it from the parent
foreach($next["node"]->children as $distance => $sub_id) {
$substack[++$substack_top] = [
"node" => $this->box->get("node|$sub_id"),
"id" => $sub_id
];
}
}
}
// Delete the target node
$this->box->delete("node|$node_target_id");
return true;
}
public function trace(string $string) {
$stack = [
(object) [ "node" => $this->box->get("node|0"), "id" => 0 ]
];
$node_target = $stack[0]->node;
while($node_target->value !== $string) {
$distance = levenshtein($string, $node_target->value, $this->cost_insert, $this->cost_replace, $this->cost_delete);
var_dump($node_target);
// Failed to recurse to find the node with the value in question
if(!isset($node_target->children->$distance))
return null;
$node_target_id = $node_target->children->$distance;
$node_target = $this->box->get("node|$node_target_id");
$stack[] = (object) [ "node" => $node_target, "id" => $node_target_id ];
}
return $stack;
}
/**
* Convenience function that returns just the first result when looking up a string.
* @param string $string The string to lookup
* @param integer $distance The maximum edit distance to search.
* @return string|null The first matching string, or null if no results were found.
*/
public function lookup_one(string $string, int $distance = 1) {
$result = $this->lookup($string, $distance, 1);
if(empty($result)) return null;
return $result[0];
// foreach($this->lookup($string, $distance) as $item)
// return $item;
}
/**
* Generator that walks the BK-Tree and iteratively yields results.
* TODO: Refactor this to use an array, since generators are ~
* @param string $string The search string.
* @param integer $max_distance The maximum edit distance to search.
* @param integer $count The number of results to return. 0 = All results found. Note that results will be in a random order.
* @return Generator<string> Iteratively yielded similar resultant strings from the BK-Tree.
*/
public function lookup(string $string, int $max_distance = 1, int $count = 0) {
if($this->get_node_count() == 0) return null;
$result = []; $result_count = 0;
$stack = [ $this->box->get("node|0") ];
$stack_top = 0;
// https://softwareengineering.stackexchange.com/a/226162/58491
while($stack_top >= 0) {
// Take the topmost node off the stack
$node_current = $stack[$stack_top];
unset($stack[$stack_top]);
$stack_top--;
$distance = levenshtein($string, $node_current->value, $this->cost_insert, $this->cost_replace, $this->cost_delete);
/*
echo("[lookup] Visiting $node_current->value (distance $distance, child distances ".implode(", ", array_map(function($el) { return $el->distance; }, $node_current->children)).")\n");
if(in_array($node_current->value, ["worlds", "domicil", "mealiest", "stopgaps", "pibroch", "upwardly", "nontruth", "vizoring"])) {
echo("[lookup] Children: ".implode(", ", array_map(function($el) {
return "$el->distance: ".$this->box->get("node|$el->id")->value;
}, $node_current->children))."\n");
}
if($node_current->value == "bunny") exit();
*/
// If the edit distance from the target string to this node is within the tolerance, yield it
if($distance <= $max_distance) {
// readline("press any key to continue");
$result[] = $node_current->value;
if($count != 0 && $result_count >= $count) break;
// yield $node_current["value"];
}
// Adding the key here speeds it up, apparently
// Ref: https://phpbench.com/
for($child_distance = $distance - $max_distance; $child_distance <= $distance + $max_distance; $child_distance++) {
if(!isset($node_current->children->$child_distance))
continue;
// echo("[lookup] Recursing on child ".$this->box->get("node|$child->id")->value." (distance $child->distance)\n");
// Push the node onto the stack
// Note that it doesn't actually matter that the stack isn't an accurate representation of ancestor nodes at any given time here. The stack is really a hybrid between a stack and a queue, having features of both.
$stack[++$stack_top] = $this->box->get("node|{$node_current->children->$child_distance}");
}
}
return $result;
}
/**
* Calculate statistics about the BK-Tree.
* Useful for analysing a tree's structure.
* If the tree isn't balanced, you may need to insert items in a different order.
* @return array An array of statistics about this BK-Tree.
*/
public function stats() {
$result = [
"depth_max" => 0,
"depth_min_leaf" => INF,
"depth_average" => 0,
"depth_average_noleaf" => 0,
"depth_standard_deviation" => [],
"child_count_average" => 0,
"child_count_max" => 0,
"nodes" => $this->get_node_count(),
"leaves" => 0,
"non_leaves" => 0
];
$start_time = microtime(true);
$stack = [ [ "node" => $this->box->get("node|0"), "depth" => 0 ] ];
// https://softwareengineering.stackexchange.com/a/226162/58491
while(!empty($stack)) {
// Take the top-most node off the stack
$current = array_pop($stack);
// echo("Visiting "); var_dump($current);
// Operate on the node
$result["depth_standard_deviation"][] = $current["depth"];
$result["depth_average"] += $current["depth"];
if($current["depth"] > $result["depth_max"])
$result["depth_max"] = $current["depth"];
if(empty($current["node"]->children) && $current["depth"] < $result["depth_min_leaf"])
$result["depth_min_leaf"] = $current["depth"];
$child_count = count((array)($current["node"]->children));
$result["child_count_average"] += $child_count;
if($child_count > $result["child_count_max"])
$result["child_count_max"] = $child_count;
if($child_count > 0) {
$result["depth_average_noleaf"] += $current["depth"];
$result["non_leaves"]++;
}
else
$result["leaves"]++;
// Iterate over the child nodes
foreach($current["node"]->children as $child_distance => $child_id) {
$stack[] = [
"node" => $this->box->get("node|$child_id"),
"depth" => $current["depth"] + 1
];
}
}
$result["depth_average"] /= $result["nodes"];
$result["depth_average_noleaf"] /= $result["non_leaves"];
$result["child_count_average"] /= $result["nodes"];
$result["depth_standard_deviation"] = standard_deviation($result["depth_standard_deviation"]);
$result["time_taken"] = microtime(true) - $start_time;
return $result;
}
public function walk() {
$stack = [ (object)[
"id" => 0,
"node" => $this->box->get("node|0"),
"parent_id" => -1,
"parent" => null,
"depth" => 0
] ];
$stack_top = 0;
// https://softwareengineering.stackexchange.com/a/226162/58491
while(!empty($stack)) {
// Take the topmost node off the stack
$current = $stack[$stack_top];
unset($stack[$stack_top]);
$stack_top--;
// echo("Visiting "); var_dump($current);
yield $current;
// Iterate over the child nodes
foreach($current->node->children as $child_distance => $child_id) {
$stack_top++;
$stack[$stack_top] = (object) [
"id" => $child_id,
"node" => $this->box->get("node|{$current->node->children->$child_distance}"),
"parent_id" => $current->id,
"parent" => $current->node,
"depth" => $current->depth + 1
];
}
}
}
/**
* Saves changes to the tree back to disk.
* @return void
*/
public function close() {
$this->box->close();
}
}