2020-03-04 01:57:13 +00:00
< ? php
2020-09-23 22:22:39 +00:00
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
2020-03-04 01:57:13 +00:00
require_once ( " JsonStorageBox.php " );
2020-03-14 17:49:26 +00:00
/**
 * Calculates the sample standard deviation of an array of numbers.
 * The sum of squared deviations from the mean is divided by (n - 1).
 * @source	https://stackoverflow.com/a/57694168/1460422
 * @param	array	$array	The array of numbers to calculate the standard deviation of.
 * @return	float	The standard deviation of the numbers in the given array, or 0.0 if it contains fewer than 2 numbers.
 */
function standard_deviation(array $array) : float {
    $size = count($array);
    // With 0 or 1 samples both the mean and the (n - 1) divisor would divide
    // by zero, so report "no spread" instead of raising an error.
    if ($size < 2)
        return 0.0;

    $mean = array_sum($array) / $size;

    $sum_of_squares = 0.0;
    foreach ($array as $value) {
        $delta = $value - $mean;
        $sum_of_squares += $delta * $delta;
    }

    return sqrt($sum_of_squares / ($size - 1));
}
/**
 * A serialisable BK-Tree Implementation.
 * Ref: https://nullwords.wordpress.com/2013/03/13/the-bk-tree-a-data-structure-for-spell-checking/
 *
 * Everything is kept in the underlying storage box under these keys:
 *  - "node|<id>":  An object with a `value` (the string) and `children`
 *                  (an object mapping edit distance => child node id).
 *                  The root node always has the id 0.
 *  - "node_count": An integer used to allocate the id of the next new node.
 *                  It only ever increases, so after a remove() it is NOT the
 *                  number of nodes currently in the tree.
 */
class BkTree
{
    private $box = null;

    private $cost_insert = 1;
    private $cost_delete = 1;
    private $cost_replace = 1;

    /**
     * Creates a new BK-Tree.
     * @param	string	$filename	Passed as-is to the JsonStorageBox that holds the tree's nodes.
     */
    public function __construct($filename) {
        $this->box = new JsonStorageBox($filename);
    }

    /**
     * A utility function for calculating edit distance.
     * Warning: Do not use this internally! It is *slow*. It's much faster to do this directly. This exists only for external use.
     * @param	string	$a	The first string.
     * @param	string	$b	The second string to compare against.
     * @return	int		The computed edit distance.
     */
    public function edit_distance(string $a, string $b) : int {
        return levenshtein($a, $b, $this->cost_insert, $this->cost_replace, $this->cost_delete);
    }

    /**
     * Fetches the node id counter, initialising it to 0 if it doesn't exist yet.
     * @return	int	The id that will be given to the next node created.
     */
    private function get_node_count() : int {
        if (!$this->box->has("node_count"))
            $this->set_node_count(0);
        return $this->box->get("node_count");
    }
    /**
     * Overwrites the node id counter.
     * @param	int	$value	The new value of the counter.
     */
    private function set_node_count(int $value) {
        $this->box->set("node_count", $value);
    }
    /**
     * Increments the node id counter by 1.
     * Goes through get_node_count() so that the counter is initialised first if this is the very first node.
     */
    private function increment_node_count() {
        $this->set_node_count($this->get_node_count() + 1);
    }

    /**
     * Adds a string to the tree.
     * @param	string	$string				The string to add.
     * @param	int		$starting_node_id	The id of the node to start insertion from. Defaults to 0 - for internal use only.
     * @return	int		The depth (relative to the starting node) at which the new node was added.
     * @throws	Exception	If the tree has a root, but no node with the id $starting_node_id exists.
     */
    public function add(string $string, int $starting_node_id = 0) : int {
        // FUTURE: When we support deleting the root node, we'll need to ensure that it is handled correctly here
        if (!$this->box->has("node|0")) {
            // If the root node of the tree doesn't exist, create it
            $this->box->set("node|0", (object) [
                "value" => $string,
                "children" => new stdClass() // { edit distance => child node id }
            ]);
            $this->increment_node_count();
            return 0;
        }

        if (!$this->box->has("node|$starting_node_id"))
            throw new Exception("Error: Failed to find node with id $starting_node_id to begin insertion");

        $next_node = $this->box->get("node|$starting_node_id");
        $next_node_id = $starting_node_id;
        $depth = 0;

        // Descend for as long as the current node already has a child at the
        // same edit distance as the string we're inserting
        while (true) {
            $distance = levenshtein($string, $next_node->value, $this->cost_insert, $this->cost_replace, $this->cost_delete);
            if (!isset($next_node->children->$distance))
                break;

            $next_node_id = $next_node->children->$distance;
            $next_node = $this->box->get("node|$next_node_id");
            $depth++;
        }

        // No existing child has this edit distance, so create the new child node here
        $new_id = $this->get_node_count();
        $this->box->set("node|$new_id", (object) [
            "value" => $string,
            "children" => new stdClass()
        ]);
        // Create the edge that points from the existing node to the new node
        $next_node->children->$distance = $new_id;
        $this->box->set("node|$next_node_id", $next_node);
        $this->increment_node_count();

        return $depth;
    }

    /**
     * Removes a string from the tree.
     * All the descendants of the removed node are deleted and re-inserted
     * starting from the removed node's parent (they get new node ids).
     * Removing the root node is not currently supported.
     * @param	string	$string	The string to remove.
     * @return	bool	Whether the removal was successful. False if the tree is empty, the string wasn't found, or it is the value of the root node.
     */
    public function remove(string $string) : bool {
        if (!$this->box->has("node|0"))
            return false; // Empty tree, so there's nothing to remove

        $stack = [ [ "node" => $this->box->get("node|0"), "id" => 0 ] ];
        $node_target = $stack[0]["node"];
        $node_target_id = 0;
        $edge_distance = null; // The key of the edge parent -> target

        while ($node_target->value !== $string) {
            $edge_distance = levenshtein($string, $node_target->value, $this->cost_insert, $this->cost_replace, $this->cost_delete);

            // Failed to recurse to find the node with the value in question
            if (!isset($node_target->children->$edge_distance))
                return false;

            $node_target_id = $node_target->children->$edge_distance;
            $node_target = $this->box->get("node|$node_target_id");
            $stack[] = [ "node" => $node_target, "id" => $node_target_id ];
        }

        // The target is the root node: it has no parent to re-hang its children from
        if (count($stack) < 2)
            return false;

        // The last item but 1 on the stack is the parent node
        $parent = $stack[count($stack) - 2];

        // 1. Delete the connection from parent -> target
        unset($parent["node"]->children->$edge_distance);
        // Save the parent node back to disk
        // Note that we do this *before* sorting out the orphans, since $this->add() will modify it further
        $this->box->set("node|{$parent["id"]}", $parent["node"]);

        // 2. Hang the now orphaned children and all their descendants from the parent
        // Every descendant of the target has the same edit distance to the
        // parent as the target did, so re-inserting from the parent preserves
        // the characteristics of the tree.
        foreach ($node_target->children as $orphan_id) {
            $substack = [ [ "node" => $this->box->get("node|$orphan_id"), "id" => $orphan_id ] ];
            while (!empty($substack)) {
                $next = array_pop($substack);

                $this->box->delete("node|{$next["id"]}"); // Delete the orphan node
                $this->add($next["node"]->value, $parent["id"]); // Re-hang it from the parent

                foreach ($next["node"]->children as $sub_id) {
                    $substack[] = [
                        "node" => $this->box->get("node|$sub_id"),
                        "id" => $sub_id
                    ];
                }
            }
        }

        // 3. Delete the target node itself
        $this->box->delete("node|$node_target_id");

        return true;
    }

    /**
     * Traces the path from the root node to the node holding the given string.
     * NOTE(review): This var_dump()s every node visited on the way, which looks like a debugging aid - confirm it is intentional before using this outside of debugging.
     * @param	string	$string	The string to find.
     * @return	array|null	A list of objects (each with `node` and `id` properties) from the root to the target node inclusive, or null if the string isn't in the tree.
     */
    public function trace(string $string) {
        if (!$this->box->has("node|0"))
            return null; // Empty tree

        $stack = [
            (object) [ "node" => $this->box->get("node|0"), "id" => 0 ]
        ];
        $node_target = $stack[0]->node;

        while ($node_target->value !== $string) {
            $distance = levenshtein($string, $node_target->value, $this->cost_insert, $this->cost_replace, $this->cost_delete);

            var_dump($node_target);

            // Failed to recurse to find the node with the value in question
            if (!isset($node_target->children->$distance))
                return null;

            $node_target_id = $node_target->children->$distance;
            $node_target = $this->box->get("node|$node_target_id");
            $stack[] = (object) [ "node" => $node_target, "id" => $node_target_id ];
        }
        return $stack;
    }

    /**
     * Convenience function that returns just the first result when looking up a string.
     * @param	string	$string		The string to lookup
     * @param	int		$distance	The maximum edit distance to search.
     * @return	string|null	The first matching string, or null if no results were found.
     */
    public function lookup_one(string $string, int $distance = 1) {
        $result = $this->lookup($string, $distance, 1);
        if (empty($result)) return null;
        return $result[0];
    }

    /**
     * Walks the BK-Tree and collects all the strings within a given edit distance of a search string.
     * @param	string	$string			The search string.
     * @param	int		$max_distance	The maximum edit distance to search.
     * @param	int		$count			The maximum number of results to return. 0 = All results found. Note that results are in tree traversal order, not sorted by distance.
     * @return	string[]|null	The similar strings found in the BK-Tree (possibly an empty array), or null if the tree is empty.
     */
    public function lookup(string $string, int $max_distance = 1, int $count = 0) {
        if ($this->get_node_count() == 0) return null;

        $result = []; $result_count = 0;
        $stack = [ $this->box->get("node|0") ];
        $stack_top = 0;

        // https://softwareengineering.stackexchange.com/a/226162/58491
        while ($stack_top >= 0) {
            // Take the topmost node off the stack
            $node_current = $stack[$stack_top];
            unset($stack[$stack_top]);
            $stack_top--;

            $distance = levenshtein($string, $node_current->value, $this->cost_insert, $this->cost_replace, $this->cost_delete);

            // If the edit distance from the target string to this node is within the tolerance, keep it
            if ($distance <= $max_distance) {
                $result[] = $node_current->value;
                $result_count++;
                // Stop as soon as we have as many results as were asked for
                if ($count > 0 && $result_count >= $count) break;
            }

            // Only children whose edge distance is within $max_distance of
            // this node's distance can contain matches. Edit distances are
            // never negative, so there's no point probing below 0.
            $child_distance_max = $distance + $max_distance;
            for ($child_distance = max(0, $distance - $max_distance); $child_distance <= $child_distance_max; $child_distance++) {
                if (!isset($node_current->children->$child_distance))
                    continue;

                // Push the child node onto the stack
                // Note that it doesn't actually matter that the stack isn't an accurate representation of ancestor nodes at any given time here.
                $stack[++$stack_top] = $this->box->get("node|{$node_current->children->$child_distance}");
            }
        }

        return $result;
    }

    /**
     * Calculate statistics about the BK-Tree.
     * Useful for analysing a tree's structure.
     * If the tree isn't balanced, you may need to insert items in a different order.
     * "nodes" is the number of nodes actually found by walking the tree.
     * For an empty tree every statistic is 0.
     * @return	array	An array of statistics about this BK-Tree.
     */
    public function stats() {
        $result = [
            "depth_max" => 0,
            "depth_min_leaf" => INF,
            "depth_average" => 0,
            "depth_average_noleaf" => 0,
            "depth_standard_deviation" => 0.0,
            "child_count_average" => 0,
            "child_count_max" => 0,
            "nodes" => 0,
            "leaves" => 0,
            "non_leaves" => 0
        ];

        $start_time = microtime(true);

        $depths = []; // The depth of every node, for the standard deviation
        $stack = [];
        if ($this->box->has("node|0"))
            $stack[] = [ "node" => $this->box->get("node|0"), "depth" => 0 ];

        // https://softwareengineering.stackexchange.com/a/226162/58491
        while (!empty($stack)) {
            // Take the top-most node off the stack
            $current = array_pop($stack);
            $depth = $current["depth"];
            // children is an object, so it has to be cast before it can be counted
            // (empty() is always false for an object, even one with no properties)
            $child_count = count((array) $current["node"]->children);

            $result["nodes"]++;
            $depths[] = $depth;
            $result["depth_average"] += $depth;
            if ($depth > $result["depth_max"])
                $result["depth_max"] = $depth;

            $result["child_count_average"] += $child_count;
            if ($child_count > $result["child_count_max"])
                $result["child_count_max"] = $child_count;

            if ($child_count > 0) {
                $result["depth_average_noleaf"] += $depth;
                $result["non_leaves"]++;
            }
            else {
                $result["leaves"]++;
                if ($depth < $result["depth_min_leaf"])
                    $result["depth_min_leaf"] = $depth;
            }

            // Queue up the child nodes
            foreach ($current["node"]->children as $child_id) {
                $stack[] = [
                    "node" => $this->box->get("node|$child_id"),
                    "depth" => $depth + 1
                ];
            }
        }

        // Turn the totals into averages, taking care not to divide by zero
        if ($result["nodes"] > 0) {
            $result["depth_average"] /= $result["nodes"];
            $result["child_count_average"] /= $result["nodes"];
        }
        else
            $result["depth_min_leaf"] = 0; // Empty tree: there are no leaves at all
        if ($result["non_leaves"] > 0)
            $result["depth_average_noleaf"] /= $result["non_leaves"];
        // The sample standard deviation is undefined for fewer than 2 values
        if (count($depths) > 1)
            $result["depth_standard_deviation"] = standard_deviation($depths);

        $result["time_taken"] = microtime(true) - $start_time;

        return $result;
    }

    /**
     * Generator that walks every node in the tree, depth first.
     * Yields nothing if the tree is empty.
     * @return	Generator	Yields an object for each node with the properties `id`, `node`, `parent_id` (-1 for the root), `parent` (null for the root), and `depth`.
     */
    public function walk() {
        if (!$this->box->has("node|0"))
            return;

        $stack = [ (object) [
            "id" => 0,
            "node" => $this->box->get("node|0"),
            "parent_id" => -1,
            "parent" => null,
            "depth" => 0
        ] ];

        // https://softwareengineering.stackexchange.com/a/226162/58491
        while (!empty($stack)) {
            // Take the topmost node off the stack
            $current = array_pop($stack);

            yield $current;

            // Queue up the child nodes
            foreach ($current->node->children as $child_id) {
                $stack[] = (object) [
                    "id" => $child_id,
                    "node" => $this->box->get("node|$child_id"),
                    "parent_id" => $current->id,
                    "parent" => $current->node,
                    "depth" => $current->depth + 1
                ];
            }
        }
    }

    /**
     * Saves changes to the tree back to disk.
     * @return void
     */
    public function close() {
        $this->box->close();
    }
}