Commit BkTree & Nilsimsa experiments. It's about time!

...I was getting increasingly nervous about not committing these to git. 
Hopefully at some point soon I'll be able to integrate the BkTree into 
Pepperminty Wiki properly - but I still need to implement word removal 
first before I can do that.

Also, feature-search is getting big. It's refactoring time to be sure, 
but I'm uncertain at this stage precisely _how_ I want to go about that. 
I've got 3 ideas:

1. Refactor the engine and the storage box into separate "library 
modules"
2. Refactor them into their own repository/ies or something, and include 
them as extra data
3. Extend the extra data system to support local files and include them 
in the main Pepperminty Wiki repository

Thought is required. If anyone actually reads this message, do get in 
touch with your thoughts!
This commit is contained in:
Starbeamrainbowlabs 2020-03-04 01:57:13 +00:00
parent 7af2ad9bb3
commit 593f16dfb9
Signed by: sbrl
GPG Key ID: 1BE5172E637709C2
6 changed files with 986 additions and 0 deletions

2
.gitignore vendored
View File

@ -1,3 +1,5 @@
enable1*.txt
*.sqlite
module_index.json
.serenata
*.backup

View File

@ -0,0 +1,302 @@
<?php
require_once("JsonStorageBox.php");
if (!function_exists('stats_standard_deviation')) {
	/**
	 * User-land stand-in for the `stats` extension's stats_standard_deviation().
	 * It is only defined when the extension has not already provided the function.
	 *
	 * Failure cases raise an E_USER_WARNING and return false:
	 *  - an empty array, or
	 *  - a single-element array when a sample deviation is requested.
	 *
	 * @param	array	$a		The values to analyse.
	 * @param	bool	$sample	[optional] True for the sample deviation (divide by n-1), false (default) for the population deviation (divide by n).
	 * @return	float|bool		The standard deviation, or false on error.
	 */
	function stats_standard_deviation(array $a, $sample = false) {
		$count = count($a);
		if ($count === 0) {
			trigger_error("The array has zero elements", E_USER_WARNING);
			return false;
		}
		if ($sample && $count === 1) {
			trigger_error("The array has only 1 element", E_USER_WARNING);
			return false;
		}
		
		$mean = array_sum($a) / $count;
		// Accumulate the squared deviations from the mean, in array order
		$squared_sum = array_reduce($a, function($total, $item) use ($mean) {
			$delta = ((float) $item) - $mean;
			return $total + $delta * $delta;
		}, 0.0);
		
		$divisor = $sample ? $count - 1 : $count;
		return sqrt($squared_sum / $divisor);
	}
}
/**
 * A serialisable BK-Tree Implementation.
 * Ref: https://nullwords.wordpress.com/2013/03/13/the-bk-tree-a-data-structure-for-spell-checking/
 *
 * Storage layout (inside the JsonStorageBox):
 *  - "node_count"	The number of nodes in the tree; also used as the id of the next node to be created.
 *  - "node|<id>"	An object of the form { value: string, children: { <edit distance>: <child node id> } }.
 *  				The root node always has id 0.
 */
class BkTree
{
	/**
	 * The storage box that holds the nodes of the tree.
	 * @var JsonStorageBox
	 */
	private $box = null;
	
	/** @var int The cost of an insertion when computing the edit distance. */
	private $cost_insert = 1;
	/** @var int The cost of a deletion when computing the edit distance. */
	private $cost_delete = 1;
	/** @var int The cost of a replacement when computing the edit distance. */
	private $cost_replace = 1;
	
	/**
	 * Opens (or creates) a BK-Tree backed by the given storage file.
	 * @param	string	$filename	The filename of the storage box to keep the tree in.
	 */
	public function __construct($filename) {
		$this->box = new JsonStorageBox($filename);
	}
	
	/**
	 * A utility function for calculating edit distance.
	 * Warning: Do not use this internally! It is *slow*. It's much faster to do this directly. This exists only for external use.
	 * @param	string	$a	The first string.
	 * @param	string	$b	The second string to compare against.
	 * @return	int		The computed edit distance.
	 */
	public function edit_distance(string $a, string $b) : int {
		return levenshtein($a, $b, $this->cost_insert, $this->cost_replace, $this->cost_delete);
	}
	
	/**
	 * Returns the number of nodes in the tree, initialising the counter to 0 if it doesn't exist yet.
	 * @return	int
	 */
	private function get_node_count() : int {
		if(!$this->box->has("node_count"))
			$this->set_node_count(0);
		return $this->box->get("node_count");
	}
	/**
	 * Overwrites the stored node counter.
	 * @param	int	$value	The new node count.
	 */
	private function set_node_count(int $value) {
		$this->box->set("node_count", $value);
	}
	/**
	 * Adds 1 to the stored node counter.
	 * Goes via get_node_count() so that a missing counter is treated as 0 explicitly rather than relying on null + 1.
	 */
	private function increment_node_count() {
		$this->set_node_count($this->get_node_count() + 1);
	}
	
	/**
	 * Adds a string to the tree.
	 * @param	string	$string		The string to add.
	 * @return	int		The depth of the node that the new node was attached to (0 if it became the root or a child of the root).
	 */
	public function add(string $string) : int {
		// FUTURE: When we support deletes, we'll need to ensure that the root node is handled correctly
		if(!$this->box->has("node|0")) {
			$new = new stdClass();
			$new->value = $string;
			$new->children = new stdClass(); // Map of edit distance => child node id
			$this->box->set("node|0", $new);
			$this->increment_node_count();
			return 0;
		}
		
		$next_node = $this->box->get("node|0"); // Grab the root to start with
		$next_node_id = 0;
		$depth = 0;
		while(true) {
			$distance = levenshtein($string, $next_node->value, $this->cost_insert, $this->cost_replace, $this->cost_delete);
			
			// If a child already hangs off this node at the same distance, descend into it and try again
			if(isset($next_node->children->$distance)) {
				$child_id = $next_node->children->$distance;
				$next_node = $this->box->get("node|$child_id");
				$next_node_id = $child_id;
				$depth++;
				continue;
			}
			
			// If we got here, then no existing children have the same edit distance
			// Create the new child node. The current node count doubles as the next free id.
			$new_id = $this->get_node_count();
			$this->box->set("node|$new_id", (object) [
				"value" => $string,
				"children" => new stdClass()
			]);
			// Create the edge that points from the existing node to the new node
			$next_node->children->$distance = $new_id;
			$this->box->set("node|$next_node_id", $next_node);
			$this->increment_node_count();
			break;
		}
		return $depth;
	}
	
	/**
	 * Removes a string from the tree.
	 * Not implemented yet: this currently always throws.
	 * @param	string	$string	The string to remove.
	 * @return	bool	Whether the removal was successful.
	 * @throws	Error	Always, since removal is not implemented.
	 */
	public function remove(string $string) : bool {
		throw new Error("Error: Not implemented");
		// TODO: Remove a node from the tree.
		// 1. Delete the connection from parent -> target
		// 2. Iterate over the target's children (if any) and re-hang them from the parent
		// NOTE: We need to be careful that the characteristics of the tree are preserved. We should test this by tracing a node's location in the tree and purposefully removing nodes in the chain and see if the results returned are still the same
	}
	
	/**
	 * Convenience function that returns just the first result when looking up a string.
	 * @param	string	$string		The string to lookup
	 * @param	integer	$distance	The maximum edit distance to search.
	 * @return	string|null			The first matching string, or null if no results were found.
	 */
	public function lookup_one(string $string, int $distance = 1) {
		$result = $this->lookup($string, $distance, 1);
		if(empty($result)) return null;
		return $result[0];
	}
	
	/**
	 * Walks the BK-Tree and collects the strings within a given edit distance of the search string.
	 * @param	string	$string			The search string.
	 * @param	integer	$max_distance	The maximum edit distance to search.
	 * @param	integer	$count			The maximum number of results to return. 0 = All results found. Note that results are in tree traversal order, NOT sorted by distance.
	 * @return	string[]	The similar strings found in the BK-Tree. Empty if the tree is empty or nothing matched.
	 */
	public function lookup(string $string, int $max_distance = 1, int $count = 0) {
		// An empty tree yields an empty result set, so callers can always count() / sort the return value
		if($this->get_node_count() == 0) return [];
		
		$result = []; $result_count = 0;
		$stack = [ $this->box->get("node|0") ];
		$stack_top = 0;
		
		// https://softwareengineering.stackexchange.com/a/226162/58491
		while($stack_top >= 0) {
			// Take the topmost node off the stack
			$node_current = $stack[$stack_top];
			unset($stack[$stack_top]);
			$stack_top--;
			
			$distance = levenshtein($string, $node_current->value, $this->cost_insert, $this->cost_replace, $this->cost_delete);
			
			// If the edit distance from the target string to this node is within the tolerance, collect it
			if($distance <= $max_distance) {
				$result[] = $node_current->value;
				$result_count++;
				// Stop early once we have as many results as were asked for
				if($count != 0 && $result_count >= $count) break;
			}
			
			// Only children whose edge distance lies within [distance - max, distance + max] can contain matches.
			// Edge distances are never negative, so clamp the lower bound at 0.
			$child_distance_min = max(0, $distance - $max_distance);
			$child_distance_max = $distance + $max_distance;
			for($child_distance = $child_distance_min; $child_distance <= $child_distance_max; $child_distance++) {
				if(!isset($node_current->children->$child_distance))
					continue;
				
				// Push the node onto the stack
				// Note that it doesn't actually matter that the stack isn't an accurate representation of ancestor nodes at any given time here. The stack is really a hybrid between a stack and a queue, having features of both.
				$stack_top++;
				$stack[$stack_top] = $this->box->get("node|{$node_current->children->$child_distance}");
			}
		}
		
		return $result;
	}
	
	/**
	 * Calculate statistics about the BK-Tree.
	 * Useful for analysing a tree's structure.
	 * If the tree isn't balanced, you may need to insert items in a different order.
	 * For an empty tree all the counters are 0 (including depth_min_leaf).
	 * @return	array	An array of statistics about this BK-Tree.
	 */
	public function stats() {
		$result = [
			"depth_max" => 0,
			"depth_min_leaf" => INF,
			"depth_average" => 0,
			"depth_average_noleaf" => 0,
			"depth_standard_deviation" => [],
			"child_count_average" => 0,
			"child_count_max" => 0,
			"nodes" => $this->get_node_count(),
			"leaves" => 0,
			"non_leaves" => 0
		];
		$start_time = microtime(true);
		
		// There's nothing to walk in an empty tree, and the averages below would divide by zero
		if($result["nodes"] === 0) {
			$result["depth_min_leaf"] = 0;
			$result["depth_standard_deviation"] = 0.0;
			$result["time_taken"] = microtime(true) - $start_time;
			return $result;
		}
		
		$stack = [ [ "node" => $this->box->get("node|0"), "depth" => 0 ] ];
		
		// https://softwareengineering.stackexchange.com/a/226162/58491
		while(!empty($stack)) {
			// Take the top-most node off the stack
			$current = array_pop($stack);
			
			// Operate on the node
			$result["depth_standard_deviation"][] = $current["depth"];
			$result["depth_average"] += $current["depth"];
			if($current["depth"] > $result["depth_max"])
				$result["depth_max"] = $current["depth"];
			
			// children is an object, so it has to be cast to an array to be counted.
			// (empty() is always false for objects, so it can't be used to detect leaves.)
			$child_count = count((array)($current["node"]->children));
			$result["child_count_average"] += $child_count;
			if($child_count > $result["child_count_max"])
				$result["child_count_max"] = $child_count;
			
			if($child_count > 0) {
				$result["depth_average_noleaf"] += $current["depth"];
				$result["non_leaves"]++;
			}
			else {
				$result["leaves"]++;
				if($current["depth"] < $result["depth_min_leaf"])
					$result["depth_min_leaf"] = $current["depth"];
			}
			
			// Iterate over the child nodes
			foreach($current["node"]->children as $child_distance => $child_id) {
				$stack[] = [
					"node" => $this->box->get("node|$child_id"),
					"depth" => $current["depth"] + 1
				];
			}
		}
		
		$result["depth_average"] /= $result["nodes"];
		// A tree consisting of just the root has no non-leaf nodes at all
		if($result["non_leaves"] > 0)
			$result["depth_average_noleaf"] /= $result["non_leaves"];
		$result["child_count_average"] /= $result["nodes"];
		$result["depth_standard_deviation"] = stats_standard_deviation($result["depth_standard_deviation"]);
		
		$result["time_taken"] = microtime(true) - $start_time;
		
		return $result;
	}
	
	/**
	 * Saves changes to the tree back to disk.
	 * @return	void
	 */
	public function close() {
		$this->box->close();
	}
}

View File

@ -0,0 +1,181 @@
<?php
/**
 * Resolves a relative path against a given base directory.
 * '.' and '..' components are collapsed purely lexically - the filesystem is not consulted, and symlinks are not resolved.
 * '..' components that would climb above the root of an absolute path are ignored (as the OS does), so the result stays absolute.
 * @apiVersion	0.20.0
 * @source		https://stackoverflow.com/a/44312137/1460422
 * @param		string		$path		The relative path to resolve. Paths that already start with a directory separator are only normalised.
 * @param		string|null	$basePath	The base directory to resolve against. Null = the current working directory; an empty string leaves relative paths relative.
 * @return		string		An absolute path (unless $basePath was an empty string and $path was relative).
 */
function path_resolve(string $path, ?string $basePath = null) {
	// Make absolute path
	if (substr($path, 0, 1) !== DIRECTORY_SEPARATOR) {
		if ($basePath === null) {
			// Get PWD first to avoid getcwd() resolving symlinks if in symlinked folder
			$path = (getenv('PWD') ?: getcwd()).DIRECTORY_SEPARATOR.$path;
		} elseif (strlen($basePath)) {
			$path = $basePath.DIRECTORY_SEPARATOR.$path;
		}
	}
	$is_absolute = substr($path, 0, 1) === DIRECTORY_SEPARATOR;
	
	// Resolve '.' and '..'
	$components = [];
	foreach(explode(DIRECTORY_SEPARATOR, rtrim($path, DIRECTORY_SEPARATOR)) as $name) {
		if ($name === '..') {
			// For absolute paths the first component is the empty string that represents the root.
			// It must never be popped, otherwise '/a/../../b' would turn into the relative path 'b'.
			if (!$is_absolute || count($components) > 1)
				array_pop($components);
		} elseif ($name !== '.' && !(count($components) && $name === '')) {
			// … && !(count($components) && $name === '') - we want to keep initial '/' for abs paths
			$components[] = $name;
		}
	}
	
	$resolved = implode(DIRECTORY_SEPARATOR, $components);
	// The root directory itself collapses to a lone empty component, which would otherwise implode to ''
	if ($is_absolute && $resolved === '')
		return DIRECTORY_SEPARATOR;
	return $resolved;
}
/*
███████ ████████ ██████ ██████ █████ ██████ ███████ ██████ ██████ ██ ██
██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
███████ ██ ██ ██ ██████ ███████ ██ ███ █████ ██████ ██ ██ ███
██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
███████ ██ ██████ ██ ██ ██ ██ ██████ ███████ ██████ ██████ ██ ██
*/
/**
 * Represents a key-value data store.
 * Values are JSON-encoded and kept in a single SQLite table.
 * Reads are cached in memory, and writes are buffered in that cache until close() is called.
 * @license Apache 2.0
 */
class JsonStorageBox {
	/**
	 * The SQLite database connection. Null once the store has been closed.
	 * @var \PDO|null
	 */
	private $db;
	
	/**
	 * A cache of values.
	 * Maps key => [ "modified" => bool, "value" => mixed ].
	 * Only keys that exist (on disk, or as a pending write) are ever cached.
	 * @var array[]
	 */
	private $cache = [];
	
	/**
	 * A cache of prepared SQL statements.
	 * @var \PDOStatement[]
	 */
	private $query_cache = [];
	
	/**
	 * Initialises a new store connection.
	 * @param	string	$filename	The filename that the store is located in. Relative paths are resolved against the directory this file lives in.
	 */
	function __construct(string $filename) {
		// Resolve the path exactly once, so that the file we open is always the file we initialise
		$filepath = path_resolve($filename, __DIR__);
		$this->db = new \PDO("sqlite:" . $filepath);
		$this->db->setAttribute(\PDO::ATTR_ERRMODE, \PDO::ERRMODE_EXCEPTION);
		// IF NOT EXISTS instead of a file_exists() check: it also copes with a file that exists but has no table yet
		$this->db->exec("CREATE TABLE IF NOT EXISTS store (key TEXT UNIQUE NOT NULL, value TEXT)");
	}
	
	/**
	 * Makes a query against the database.
	 * @param	string	$sql		The (potentially parametised) query to make.
	 * @param	array	$variables	Optional. The variables to substitute into the SQL query.
	 * @return	\PDOStatement		The result of the query, as a PDOStatement.
	 * @throws	Exception			If the store has already been closed.
	 */
	private function query(string $sql, array $variables = []) {
		if($this->db === null)
			throw new Exception("Error: The storage box has already been closed.");
		
		// Add to the query cache if it doesn't exist
		if(!isset($this->query_cache[$sql]))
			$this->query_cache[$sql] = $this->db->prepare($sql);
		$this->query_cache[$sql]->execute($variables);
		return $this->query_cache[$sql]; // fetchColumn(), fetchAll(), etc. are defined on the statement, not the return value of execute()
	}
	
	/**
	 * Determines if the given key exists in the store or not.
	 * Keys with pending (not yet saved) values count as existing.
	 * @param	string	$key	The key to test.
	 * @return	bool	Whether the key exists in the store or not.
	 */
	public function has(string $key) : bool {
		if(isset($this->cache[$key]))
			return true;
		return $this->query(
			"SELECT COUNT(key) FROM store WHERE key = :key;",
			[ "key" => $key ]
		)->fetchColumn() > 0;
	}
	
	/**
	 * Gets a value from the store.
	 * @param	string	$key	The key value is stored under.
	 * @return	mixed	The stored value, or null if the key doesn't exist.
	 */
	public function get(string $key) {
		// If it's not in the cache, insert it
		if(!isset($this->cache[$key])) {
			$raw = $this->query(
				"SELECT value FROM store WHERE key = :key;",
				[ "key" => $key ]
			)->fetchColumn();
			// fetchColumn() returns false if there's no such row.
			// Misses are deliberately NOT cached, as otherwise has() would report that the key exists.
			if($raw === false || $raw === null)
				return null;
			$this->cache[$key] = [ "modified" => false, "value" => json_decode($raw) ];
		}
		return $this->cache[$key]["value"];
	}
	
	/**
	 * Sets a value in the data store.
	 * Note that this does NOT save changes to disk until you close the connection!
	 * @param	string	$key	The key to set the value of.
	 * @param	mixed	$value	The value to store.
	 */
	public function set(string $key, $value) : void {
		$this->cache[$key] = [ "modified" => true, "value" => $value ];
	}
	
	/**
	 * Deletes an item from the data store.
	 * @param	string	$key	The key of the item to delete.
	 * @return	bool	Whether it was really deleted or not. Note that if it doesn't exist, then it can't be deleted.
	 */
	public function delete(string $key) : bool {
		// Remove it from the cache. A value may exist only here if it hasn't been saved yet.
		$was_cached = isset($this->cache[$key]);
		unset($this->cache[$key]);
		// Remove it from disk
		$was_on_disk = $this->query(
			"DELETE FROM store WHERE key = :key;",
			[ "key" => $key ]
		)->rowCount() > 0;
		return $was_cached || $was_on_disk;
	}
	
	/**
	 * Empties the store.
	 */
	public function clear() : void {
		// Empty the cache;
		$this->cache = [];
		// Empty the disk
		$this->query("DELETE FROM store;");
	}
	
	/**
	 * Syncs changes to disk and closes the PDO connection.
	 * All modified values are written in a single transaction, which is rolled back if any write fails.
	 * Calling this on a store that is already closed does nothing.
	 */
	public function close() : void {
		if($this->db === null) return;
		
		$this->db->beginTransaction();
		try {
			foreach($this->cache as $key => $value_data) {
				// If it wasn't modified, there's no point in saving it, is there?
				if(!$value_data["modified"])
					continue;
				$this->query(
					"INSERT OR REPLACE INTO store(key, value) VALUES(:key, :value)",
					[
						"key" => $key,
						"value" => json_encode($value_data["value"])
					]
				);
			}
			$this->db->commit();
		}
		catch(\Throwable $error) {
			$this->db->rollBack();
			throw $error;
		}
		
		// The prepared statements hold references to the connection, so they have to go too for it to actually close
		$this->query_cache = [];
		$this->db = null;
	}
}

View File

@ -0,0 +1,134 @@
<?php
require("BkTree.php");
/**
 * Calls the given callable and measures how long the call takes.
 * @param	callable	$callable	The callable to invoke (with no arguments).
 * @return	array		An array of the form [ "value" => the callable's return value, "time" => seconds elapsed as a float ].
 */
function time_callable($callable) {
	$started_at = microtime(true);
	$return_value = $callable();
	$elapsed = microtime(true) - $started_at;
	
	return [
		"value" => $return_value,
		"time" => $elapsed
	];
}
/**
 * Builds a new BkTree (stored in bktree.sqlite) from the words in enable1.shuf.txt, 1 word per line.
 * Blank lines are skipped. Progress & timing information is echoed to the standard output.
 * Note that the tree is not saved to disk until it is closed.
 * @return	BkTree		The newly populated tree.
 * @throws	Exception	If the wordlist can't be opened.
 */
function tree_create() {
	$tree = new BkTree("bktree.sqlite");
	
	echo("Populating tree - ");
	$time = microtime(true);
	
	$handle = fopen("enable1.shuf.txt", "r");
	// Without this check a missing wordlist results in either a TypeError or an endless loop, depending on the PHP version
	if($handle === false)
		throw new Exception("Error: Failed to open enable1.shuf.txt for reading.");
	
	while(($line = fgets($handle)) !== false) {
		$line = trim($line);
		if($line === "") continue; // A blank line isn't a word
		$tree->add($line);
	}
	echo("done in ".round((microtime(true) - $time) * 1000, 2)."ms\n");
	fclose($handle);
	
	return $tree;
}
/**
 * Announces the save on the standard output, then closes the given tree.
 * @param	BkTree	$tree	The tree to close.
 */
function tree_save(BkTree $tree) {
	echo "Saving tree\n";
	$tree->close();
}
/**
 * Opens the BkTree stored in bktree.sqlite.
 * @return	BkTree	The opened tree.
 */
function tree_load() {
	$tree = new BkTree("bktree.sqlite");
	return $tree;
}
/**
 * Baseline benchmark: linearly scans enable1.shuf.txt for words within an edit distance of 2 from "cakke".
 * Echoes every match and the total time taken, and then terminates the script.
 * @throws	Exception	If the wordlist can't be opened.
 */
function test_search_linear() {
	$start_time = microtime(true);
	$handle = fopen("enable1.shuf.txt", "r");
	if($handle === false)
		throw new Exception("Error: Failed to open enable1.shuf.txt for reading.");
	
	while(($line = fgets($handle)) !== false) {
		$word = trim($line);
		if(levenshtein("cakke", $word) > 2) continue;
		echo("linear match: $word\n");
	}
	fclose($handle);
	
	echo("done in ".round((microtime(true) - $start_time) * 1000, 2)."ms\n");
	exit();
}
// Reuse the tree on disk if there is one; otherwise build a fresh one from the wordlist.
// Either way the operation is timed, and the timing message below always says "created".
$tree_source = file_exists("bktree.sqlite") ? "tree_load" : "tree_create";
$tree = time_callable($tree_source);

echo("Tree created in ".($tree["time"]*1000)."ms\n");
// Unwrap the tree itself from the timing result
$tree = $tree["value"];

echo("Tree stats: ");
var_dump($tree->stats());
/**
 * Non-interactive benchmark: looks up "cakke" (maximum edit distance 2) in the global $tree,
 * echoes how long the lookup took & how many results were found, and then terminates the script.
 */
function test_auto() {
	global $tree;
	
	$iterations = 1;
	$run = 0;
	while($run < $iterations) {
		$started_at = microtime(true);
		$matches = $tree->lookup("cakke", 2);
		$elapsed_ms = round((microtime(true) - $started_at)*1000, 2);
		echo("Lookup complete in ".$elapsed_ms."ms (".count($matches)." results found)\n");
		$run++;
	}
	
	exit();
}
// NOTE: test_auto() calls exit(), so the interactive CLI below only runs if this call is commented out.
test_auto();

echo("BkTree Test CLI\n");
echo("Exit with .exit\n");
echo("This ensures the tree is saved to disk\n");

/**
 * Saves the tree, reports how long that took, and terminates the script.
 * @param	BkTree	$tree	The tree to save.
 */
function cli_save_and_exit(BkTree $tree) {
	$result = time_callable(function() use ($tree) {
		tree_save($tree);
	});
	echo("Serialised tree in ".round($result["time"] * 1000, 2)."ms\n");
	exit("exit\n");
}

while(true) {
	$line = readline("> "); // Newline is removed automatically
	
	// readline() returns false on end-of-file (e.g. Ctrl + D or exhausted piped input).
	// Treat it like .exit - otherwise we'd spin here forever.
	if($line === false) {
		echo("\n");
		cli_save_and_exit($tree);
	}
	
	if(strlen($line) == 0) continue;
	
	readline_add_history($line);
	
	// Lines starting with a dot are commands rather than search strings
	if($line[0] == ".") {
		switch ($line) {
			case ".quit":
			case ".exit":
				cli_save_and_exit($tree);
			default:
				echo("Unknown command $line\n");
				break;
		}
		continue;
	}
	
	$time = microtime(true);
	// Guard against a null return so that an empty tree doesn't blow up usort() below
	$results = $tree->lookup($line, 2) ?? [];
	$time = round((microtime(true) - $time)*1000, 2);
	
	$time_sort = microtime(true);
	// Sort the results so that the closest matches come first
	// Note that adding a cache here doesn't make a significant different to performance
	// The overhead of calling a function far outweighs that of calling levenshtein(), apparently
	usort($results, function($a, $b) use ($line, $tree) {
		return $tree->edit_distance($a, $line) <=> $tree->edit_distance($b, $line);
	});
	$time_sort = round((microtime(true) - $time_sort)*1000, 2);
	
	$i = 0;
	foreach($results as $result) {
		echo(
			str_pad($i, 5, " ", STR_PAD_LEFT).": ".
			str_pad($result, 20).
			" dist ".$tree->edit_distance($result, $line).
			"\n"
		);
		$i++;
	}
	
	echo("Found $i results in {$time}ms (+{$time_sort}ms sort)\n");
}

View File

@ -0,0 +1,2 @@
#!/usr/bin/env bash
# Downloads the enable1 wordlist and writes a shuffled copy to enable1.shuf.txt.
# The download goes to a temporary file first, so a failed download never
# leaves an empty or truncated enable1.shuf.txt behind.
set -o errexit -o nounset -o pipefail;

temp_file="$(mktemp)";
trap 'rm -f "${temp_file}"' EXIT;

# --fail makes curl exit with a non-zero code on HTTP errors (e.g. 404) instead of saving the error page
curl --fail -L https://starbeamrainbowlabs.com/wordlists/enable1.txt | shuf >"${temp_file}";
mv "${temp_file}" enable1.shuf.txt;

View File

@ -0,0 +1,365 @@
<?php
/**
* PHP Library to calculate and compare Nilsimsa digests.
*
* The Nilsimsa hash is a locality sensitive hash function. Generally similar
* documents will have similar Nilsimsa digests. The Hamming distance between
* the digests can be used to approximate the similarity between documents. For
* further information consult http://en.wikipedia.org/wiki/Nilsimsa_Hash and
* the references (particularly Damiani et al.)
*
* Implementation details:
* The Nilsimsa class takes in a data parameter which is the string of the
* document to digest. Calling the methods hexDigest() and digest() gives the
* Nilsimsa digest in hex or array format. The helper function compare_digests
* takes in two digests and computes the Nilsimsa score. You can also use
* compare_files() and compare_strings() to compare files and strings directly.
* (Note: these comparison helpers are not present in this copy of the file.)
*
* This code is a port of py-nilsimsa located at
* https://code.google.com/p/py-nilsimsa/
*/
/**
* The MIT License (MIT)
*
* Copyright (c) 2015 Bill Eager
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the 'Software'), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/**
 * Calculates Nilsimsa digests with a configurable digest length.
 * Feed data in via the constructor and/or process(), then call digest() or hexDigest().
 * Alternatively, use the static Nilsimsa::hash() convenience method.
 */
class Nilsimsa
{
	/**
	 * Tran53 hash constant
	 * @var array
	 */
	const TRAN = [
		0x02,0xd6,0x9e,0x6f,0xf9,0x1d,0x04,0xab,0xd0,0x22,0x16,0x1f,0xd8,0x73,0xa1,0xac,
		0x3b,0x70,0x62,0x96,0x1e,0x6e,0x8f,0x39,0x9d,0x05,0x14,0x4a,0xa6,0xbe,0xae,0x0e,
		0xcf,0xb9,0x9c,0x9a,0xc7,0x68,0x13,0xe1,0x2d,0xa4,0xeb,0x51,0x8d,0x64,0x6b,0x50,
		0x23,0x80,0x03,0x41,0xec,0xbb,0x71,0xcc,0x7a,0x86,0x7f,0x98,0xf2,0x36,0x5e,0xee,
		0x8e,0xce,0x4f,0xb8,0x32,0xb6,0x5f,0x59,0xdc,0x1b,0x31,0x4c,0x7b,0xf0,0x63,0x01,
		0x6c,0xba,0x07,0xe8,0x12,0x77,0x49,0x3c,0xda,0x46,0xfe,0x2f,0x79,0x1c,0x9b,0x30,
		0xe3,0x00,0x06,0x7e,0x2e,0x0f,0x38,0x33,0x21,0xad,0xa5,0x54,0xca,0xa7,0x29,0xfc,
		0x5a,0x47,0x69,0x7d,0xc5,0x95,0xb5,0xf4,0x0b,0x90,0xa3,0x81,0x6d,0x25,0x55,0x35,
		0xf5,0x75,0x74,0x0a,0x26,0xbf,0x19,0x5c,0x1a,0xc6,0xff,0x99,0x5d,0x84,0xaa,0x66,
		0x3e,0xaf,0x78,0xb3,0x20,0x43,0xc1,0xed,0x24,0xea,0xe6,0x3f,0x18,0xf3,0xa0,0x42,
		0x57,0x08,0x53,0x60,0xc3,0xc0,0x83,0x40,0x82,0xd7,0x09,0xbd,0x44,0x2a,0x67,0xa8,
		0x93,0xe0,0xc2,0x56,0x9f,0xd9,0xdd,0x85,0x15,0xb4,0x8a,0x27,0x28,0x92,0x76,0xde,
		0xef,0xf8,0xb2,0xb7,0xc9,0x3d,0x45,0x94,0x4b,0x11,0x0d,0x65,0xd5,0x34,0x8b,0x91,
		0x0c,0xfa,0x87,0xe9,0x7c,0x5b,0xb1,0x4d,0xe5,0xd4,0xcb,0x10,0xa2,0x17,0x89,0xbc,
		0xdb,0xb0,0xe2,0x97,0x88,0x52,0xf7,0x48,0xd3,0x61,0x2c,0x3a,0x2b,0xd1,0x8c,0xfb,
		0xf1,0xcd,0xe4,0x6a,0xe7,0xa9,0xfd,0xc4,0x37,0xc8,0xd2,0xf6,0xdf,0x58,0x72,0x4e,
	];
	
	/**
	 * Stores whether the digest is complete
	 *
	 * @var boolean
	 */
	private $digestComputed;
	
	/**
	 * Stores the number of characters in the string digested
	 *
	 * @var int
	 */
	private $numChar;
	
	/**
	 * Stores the accumulator: 1 counter per bit of the digest
	 *
	 * @var array
	 */
	private $acc;
	
	/**
	 * Stores the active window used in {process} for hashing.
	 * Holds up to the 4 most recently seen bytes, most recent first.
	 *
	 * @var array
	 */
	private $window;
	
	/**
	 * The computed digest, as an array of byte values. Only valid when $digestComputed is true.
	 * @var mixed
	 */
	private $digest;
	
	/**
	 * The target length of the hash, in bits.
	 * @var int
	 */
	private $length = 256;
	
	/**
	 * Constructor
	 *
	 * @param string   $data   The data to process
	 * @param int|null $length The length of the digest in bits. Must be a power of 2 that's at least 8. Defaults to 256.
	 *
	 * @throws InvalidArgumentException If the length isn't a power of 2 that's at least 8.
	 */
	public function __construct($data = null, $length = null)
	{
		if ($length !== null) {
			$length = (int) $length;
			// tranHash() masks with (length - 1) and the digest packs 8 bits per byte,
			// so both only work when the length is a power of 2 that's at least 8.
			if ($length < 8 || ($length & ($length - 1)) !== 0) {
				throw new InvalidArgumentException("Error: The digest length must be a power of 2 that is at least 8 (got $length).");
			}
			$this->length = $length;
		}
		$this->digestComputed = false;
		$this->numChar = 0;
		$this->acc = array_fill(0, $this->length, 0);
		$this->window = [];
		// Explicit comparison rather than a truthiness check, since the string "0" is falsy but still data
		if ($data !== null && $data !== '') {
			$this->process($data);
		}
	}
	
	/**
	 * Computes the hash of all of the trigrams in the chunk using a window of
	 * length 5. May be called multiple times to feed in data incrementally.
	 *
	 * @param string $chunk The chunk to process
	 */
	public function process($chunk)
	{
		$chunk = (string) $chunk;
		$chunkLength = strlen($chunk);
		if ($chunkLength === 0) {
			return;
		}
		// New data invalidates any digest that was computed previously
		$this->digestComputed = false;
		
		// Iterating by byte offset instead of str_split(), since str_split("") yields [ "" ] before PHP 8.2
		for ($offset = 0; $offset < $chunkLength; $offset++) {
			$this->numChar++;
			$c = ord($chunk[$offset]);
			$windowLength = count($this->window);
			if ($windowLength > 1) {
				// seen at least three characters
				$this->acc[$this->tranHash($c, $this->window[0], $this->window[1], 0)] += 1;
			}
			if ($windowLength > 2) {
				// seen at least four characters
				$this->acc[$this->tranHash($c, $this->window[0], $this->window[2], 1)] += 1;
				$this->acc[$this->tranHash($c, $this->window[1], $this->window[2], 2)] += 1;
			}
			if ($windowLength > 3) {
				// have a full window
				$this->acc[$this->tranHash($c, $this->window[0], $this->window[3], 3)] += 1;
				$this->acc[$this->tranHash($c, $this->window[1], $this->window[3], 4)] += 1;
				$this->acc[$this->tranHash($c, $this->window[2], $this->window[3], 5)] += 1;
				// duplicate hashes, used to maintain 8 trigrams per character
				$this->acc[$this->tranHash($this->window[3], $this->window[0], $c, 6)] += 1;
				$this->acc[$this->tranHash($this->window[3], $this->window[2], $c, 7)] += 1;
			}
			
			// add current character to the window, remove the previous character
			array_unshift($this->window, $c);
			if (count($this->window) > 4) {
				array_pop($this->window);
			}
		}
	}
	
	/**
	 * Implementation of the Tran53 hash algorithm
	 *
	 * @param int $a Input A
	 * @param int $b Input B
	 * @param int $c Input C
	 * @param int $n Input N
	 *
	 * @return int The index of the accumulator bucket, in the range 0 to (length - 1).
	 */
	public function tranHash($a, $b, $c, $n)
	{
		// Note that * binds tighter than ^ here: TRAN[a + n] ^ (TRAN[b] * (2n + 1))
		return ((
			(self::TRAN[($a + $n) & 255] ^ self::TRAN[$b] * ($n + $n + 1)) +
			self::TRAN[($c) ^ self::TRAN[$n]]
		) & ($this->length-1)); // Was 255
	}
	
	/**
	 * Returns the digest as a hex string. Computes it if it isn't computed
	 * already.
	 *
	 * @return string The digest
	 */
	public function hexDigest()
	{
		if ( ! $this->digestComputed) {
			$this->computeDigest();
		}
		$output = '';
		foreach ($this->digest as $i) {
			$output .= sprintf('%02x', $i);
		}
		return $output;
	}
	
	/**
	 * Returns the digest as an array. Computes it if it isn't computed already.
	 *
	 * @return array The digest
	 */
	public function digest()
	{
		if ( ! $this->digestComputed) {
			$this->computeDigest();
		}
		return $this->digest;
	}
	
	/**
	 * Using a threshold (mean of the accumulator), computes the nilsimsa
	 * digest after completion. Sets complete flag to true and stores result in
	 * $this->digest
	 */
	public function computeDigest()
	{
		$numTrigrams = 0;
		if ($this->numChar == 3) {
			// 3 chars -> 1 trigram
			$numTrigrams = 1;
		}
		else if ($this->numChar == 4) {
			// 4 chars -> 4 trigrams
			$numTrigrams = 4;
		}
		else if ($this->numChar > 4) {
			// > 4 chars -> 8 for each CHAR
			$numTrigrams = 8 * $this->numChar - 28;
		}
		
		// threshold is the mean of the acc buckets
		$threshold = $numTrigrams / $this->length;
		
		$digest = array_fill(0, intdiv($this->length, 8), 0);
		// NOTE(review): the top 2 buckets are never considered. This looks like it was carried over
		// from a fixed bound in the 256-bit original - confirm before changing it, as doing so alters the hashes.
		for ($i = 0; $i < ($this->length-2); $i++) {
			if ($this->acc[$i] > $threshold) {
				// Set bit (i mod 8) of byte floor(i / 8)
				$digest[$i >> 3] += 1 << ($i & 7);
			}
		}
		
		// set flag to true
		$this->digestComputed = true;
		// store result in digest, reversed
		$this->digest = array_reverse($digest);
	}
	
	/**
	 * Convenience method that hashes the given data in 1 go.
	 *
	 * @param string $data   The data to hash.
	 * @param int    $length The length of the digest in bits. Must be a power of 2 that's at least 8.
	 *
	 * @return string The digest, as a hex string.
	 */
	public static function hash($data, $length = 256) {
		$hasher = new self($data, $length);
		return $hasher->hexDigest();
	}
}
/**
 * Counts the number of lines in a file, starting from the beginning of the file.
 * Note that this moves the file pointer: it is left at the end of the file afterwards.
 * @param	resource	$handle	A readable, seekable file handle.
 * @return	int			The number of lines that fgets() was able to read.
 */
function lines_count($handle) {
	rewind($handle);
	
	$total = 0;
	while(true) {
		if(fgets($handle) === false) break;
		$total++;
	}
	return $total;
}
// Command-line entry point. The mode is taken from the first argument; with no argument the help text is shown.
$mode = $argv[1] ?? "help";
switch($mode) {
	case "typos":
		// For each hash size, count how many of the pairs in typos.csv hash to exactly the same digest.
		// The 1st line is treated as a header; each following line is split on its 1st comma into a pair.
		$handle = fopen("typos.csv", "r");
		if($handle === false) {
			fwrite(STDERR, "Error: Failed to open typos.csv for reading.\n");
			exit(1);
		}
		$line_count = lines_count($handle);
		echo("$line_count lines total\n");
		$sizes = [ 256, 128, 64, 32, 16, 8 ];
		foreach($sizes as $size) {
			fseek($handle, 0); fgets($handle); // Skip the first line since it's the header
			$count = 0; $count_same = 0; $skipped = 0;
			$same = []; $not_same = [];
			while(($line = fgets($handle)) !== false) {
				$parts = explode(",", trim($line), 2);
				// Skip rows that don't have 2 columns, or whose 2nd column is shorter than 3 characters
				if(count($parts) < 2 || strlen($parts[1]) < 3) {
					$skipped++;
					continue;
				}
				$hash_a = Nilsimsa::hash($parts[0], $size);
				$hash_b = Nilsimsa::hash($parts[1], $size);
				
				$count++;
				if($hash_a == $hash_b) {
					$count_same++;
					$same[] = $parts;
				}
				else $not_same[] = $parts;
				// \r returns to the start of the line, so that the progress counter overwrites itself
				echo("$count_same / $count ($skipped skipped)\r");
			}
			
			file_put_contents("$size-same.csv", implode("\n", array_map(function ($el) {
				return implode(",", $el);
			}, $same)));
			file_put_contents("$size-not-same.csv", implode("\n", array_map(function ($el) {
				return implode(",", $el);
			}, $not_same)));
			
			// Avoid dividing by zero if every row was skipped
			$percentage = $count > 0 ? round(($count_same/$count)*100, 2) : 0;
			echo(str_pad($size, 10)."$count_same / $count ($percentage%), $skipped skipped\n");
		}
		fclose($handle);
		break;
	
	case "helloworld":
		// Show the digest of the same string at every supported hash size
		foreach([ 256, 128, 64, 32, 16, 8 ] as $size) {
			echo(str_pad($size, 10).Nilsimsa::hash("hello, world!", $size));
			echo("\n");
		}
		break;
	
	case "help":
	default:
		// Only complain about the mode if the user actually asked for something we don't know
		if($mode !== "help")
			echo("Mode $mode not recognised. ");
		echo("Available modes:\n");
		echo("    helloworld    Show different hash sizes\n");
		echo("    typos         Compare typos in typos.csv and calculate statistics\n");
		break;
}
/*
* TODO: Explore BK-Trees. SymSpell might be orders of magnitude faster, but that's compared to a regular BK-Tree - and it's *much* more complicated.
* If we instead use Nilsimsa + the hamming distance comparison function (which we removed & will need to reinstate), is it faster than doing lots of `levenshtein()` calls?
* Experimentation is needed.
* See also gmp_hamdist() (which requires the gmp PHP extension).
*/