length = $length;
        $this->digestComputed = false;
        $this->numChar = 0;
        // One accumulator bucket per possible tranHash() result, all zeroed.
        $this->acc = array_fill(0, $this->length, 0);
        // Holds up to four previously seen byte values, most recent first.
        $this->window = [];

        if ($data) {
            $this->process($data);
        }
    }

    /**
     * Feeds a chunk of input into the accumulator.
     *
     * Every byte is combined with pairs of the (up to four) preceding bytes
     * kept in the sliding window, i.e. a window of length 5 including the
     * current byte. Each resulting trigram is hashed with tranHash() and the
     * matching accumulator bucket is incremented. Once the window is full this
     * gives 8 bucket increments per byte.
     *
     * The window and counters persist between calls, so the input may be fed
     * in several chunks. An empty chunk is a no-op.
     *
     * @param string $chunk The chunk to process
     */
    public function process($chunk)
    {
        // Before PHP 8.2 str_split('') returns [''], which would be counted as
        // a phantom NUL byte and pushed into the window. Bail out so an empty
        // chunk behaves identically on every PHP version.
        if ($chunk === null || $chunk === '') {
            return;
        }

        foreach (str_split($chunk) as $char) {
            $this->numChar++;
            $c = ord($char);

            // Snapshot of the window *before* the current byte is added;
            // $w[0] is the most recent previous byte.
            $w = $this->window;
            $seen = count($w);

            if ($seen > 1) {
                // seen at least three characters
                $this->acc[$this->tranHash($c, $w[0], $w[1], 0)] += 1;
            }

            if ($seen > 2) {
                // seen at least four characters
                $this->acc[$this->tranHash($c, $w[0], $w[2], 1)] += 1;
                $this->acc[$this->tranHash($c, $w[1], $w[2], 2)] += 1;
            }

            if ($seen > 3) {
                // have a full window
                $this->acc[$this->tranHash($c, $w[0], $w[3], 3)] += 1;
                $this->acc[$this->tranHash($c, $w[1], $w[3], 4)] += 1;
                $this->acc[$this->tranHash($c, $w[2], $w[3], 5)] += 1;
                // duplicate hashes, used to maintain 8 trigrams per character
                $this->acc[$this->tranHash($w[3], $w[0], $c, 6)] += 1;
                $this->acc[$this->tranHash($w[3], $w[2], $c, 7)] += 1;
            }

            // Add the current byte to the front of the window and drop the
            // oldest one once the window already held four bytes.
            array_unshift($this->window, $c);
            if ($seen >= 4) {
                $this->window = array_slice($this->window, 0, 4);
            }
        }
    }

    /**
     * Implementation of the Tran53 hash algorithm
     *
     * Mixes three byte values and a salt through the TRAN lookup table and
     * masks the result with ($this->length - 1) so it can be used as an index
     * into the accumulator. The mask keeps the result below $this->length for
     * any length, but only covers every bucket when the length is a power of
     * two.
     *
     * @param int $a Input A
     * @param int $b Input B
     * @param int $c Input C
     * @param int $n Input N
     *
     * @return int
     */
    public function tranHash($a, $b, $c, $n)
    {
        // Multiplication binds tighter than XOR; the parentheses only make the
        // original evaluation order explicit.
        $mixed = self::TRAN[($a + $n) & 255] ^ (self::TRAN[$b] * ($n + $n + 1));
        $salted = self::TRAN[$c ^ self::TRAN[$n]];

        return ($mixed + $salted) & ($this->length - 1); // Was 255
    }

    /**
     * Returns the digest as a hex string. Computes it if it isn't computed
     * already.
*
     * @return string The digest
     */
    public function hexDigest()
    {
        if (! $this->digestComputed) {
            $this->computeDigest();
        }

        // Start from '' rather than null so the documented string return type
        // holds even if the digest contains no bytes.
        $output = '';
        foreach ($this->digest as $byte) {
            $output .= sprintf('%02x', $byte);
        }

        return $output;
    }

    /**
     * Returns the digest as an array. Computes it if it isn't computed already.
     *
     * @return array The digest
     */
    public function digest()
    {
        if (! $this->digestComputed) {
            $this->computeDigest();
        }

        return $this->digest;
    }

    /**
     * Using a threshold (mean of the accumulator), computes the nilsimsa
     * digest after completion. Sets complete flag to true and stores result in
     * $this->digest
     */
    public function computeDigest()
    {
        $numTrigrams = 0;
        if ($this->numChar == 3) {
            // 3 chars -> 1 trigram
            $numTrigrams = 1;
        } elseif ($this->numChar == 4) {
            // 4 chars -> 4 trigrams
            $numTrigrams = 4;
        } elseif ($this->numChar > 4) {
            // > 4 chars -> 8 for each CHAR
            $numTrigrams = 8 * $this->numChar - 28;
        }

        // threshold is the mean of the acc buckets
        $threshold = $numTrigrams / $this->length;

        // One digest byte per 8 buckets, rounded up. intdiv() keeps the count
        // an integer, so a length that is not a multiple of 8 no longer passes
        // a fractional float to array_fill() or writes to missing keys below.
        $digest = array_fill(0, intdiv($this->length + 7, 8), 0);

        // NOTE(review): the bound stops two buckets short of $this->length, so
        // the last two buckets never contribute a bit. This looks like an
        // off-by-two, but changing it would alter every digest produced so
        // far - confirm against a reference implementation before touching.
        for ($i = 0; $i < ($this->length - 2); $i++) {
            if ($this->acc[$i] > $threshold) {
                // equivalent to i/8, 2**(i mod 8)
                $digest[$i >> 3] += 1 << ($i & 7);
            }
        }

        // set flag to true
        $this->digestComputed = true;
        // store result in digest, reversed
        $this->digest = array_reverse($digest);
    }

    /**
     * Convenience wrapper: hashes $data in one call and returns the hex digest.
     *
     * @param string $data   The data to hash
     * @param int    $length Passed to the constructor as the hash length
     *
     * @return string The digest as a hex string
     */
    public static function hash($data, $length = 256)
    {
        $hasher = new self($data, $length);

        return $hasher->hexDigest();
    }
}

/**
 * Counts the lines in an open file by rewinding it and reading to the end.
 *
 * The file pointer is left at the end of the file afterwards.
 *
 * @param resource $handle An open, seekable file handle
 *
 * @return int The number of lines read
 */
function lines_count($handle)
{
    fseek($handle, 0);

    $count = 0;
    while (fgets($handle) !== false) {
        $count++;
    }

    return $count;
}

$mode = $argv[1] ?? "help";

switch ($mode) {
    case "typos":
        $handle = fopen("typos.csv", "r");
        if ($handle === false) {
            // Without this check every later fseek()/fgets() would fail on a
            // boolean instead of a stream.
            fwrite(STDERR, "Error: could not open typos.csv for reading\n");
            exit(1);
        }

        $line_count = lines_count($handle);
        echo("$line_count lines total\n");

        $sizes = [ 256, 128, 64, 32, 16, 8 ];
        foreach ($sizes as $size) {
            // Rewind and skip the first line since it's the header
            fseek($handle, 0);
            fgets($handle);

            $count = 0;
            $count_same = 0;
            $skipped = 0;
            $same = [];
            $not_same = [];

            while (($line = fgets($handle)) !== false) {
                $parts = explode(",", trim($line), 2);

                // A line without a comma (e.g. a blank line) has no second
                // column; skip it just like a too-short second column.
                if (count($parts) < 2 || strlen($parts[1]) < 3) {
                    $skipped++;
                    continue;
                }

                $hash_a = Nilsimsa::hash($parts[0], $size);
                $hash_b = Nilsimsa::hash($parts[1], $size);
                $count++;

                // Strict comparison is required: with ==, two digests that
                // both look numeric (such as "0e12" and "0e99") are compared
                // as numbers and would be counted as identical.
                if ($hash_a === $hash_b) {
                    $count_same++;
                    $same[] = $parts;
                } else {
                    $not_same[] = $parts;
                }

                echo("$count_same / $count ($skipped skipped)\r");
            }

            file_put_contents("$size-same.csv", implode("\n", array_map(function ($el) {
                return implode(",", $el);
            }, $same)));
            file_put_contents("$size-not-same.csv", implode("\n", array_map(function ($el) {
                return implode(",", $el);
            }, $not_same)));

            // Guard against a file in which every data line was skipped.
            $percent = $count > 0 ? round(($count_same / $count) * 100, 2) : 0;
            echo(str_pad((string) $size, 10)."→ $count_same / $count (".$percent."%), $skipped skipped\n");
        }

        fclose($handle);
        break;

    case "helloworld":
        foreach ([ 256, 128, 64, 32, 16, 8 ] as $size) {
            echo(str_pad((string) $size, 10).Nilsimsa::hash("hello, world!", $size));
            // echo(str_pad($size, 10).Nilsimsa::hash("pinnapple", $size));
            echo("\n");
        }
        break;

    case "help":
    default:
        // Only complain when the user asked for a mode that does not exist;
        // an explicit or implicit "help" just lists the modes.
        if ($mode !== "help") {
            echo("Mode $mode not recognised. ");
        }
        echo("Available modes:\n");
        echo("    helloworld    Show different hash sizes\n");
        echo("    typos         Compare typos in typos.csv and calculate statistics\n");
        break;
}

/*
 * TODO: Explore BK-Trees. SymSpell might be orders of magnitude faster, but that's compared to a regular BK-Tree - and it's *much* more complicated.
 * If we instead use Nilsimsa + the hamming distance comparison function (which we removed & will need to reinstate), is it faster than doing lots of `levenshtein()` calls?
 * Experimentation is needed.
 * See also gmp_hamdist() (which requires the gmp PHP extension).
 */