<?php
/**
 * PHP Library to calculate Nilsimsa digests.
 *
 * The Nilsimsa hash is a locality sensitive hash function. Generally similar
 * documents will have similar Nilsimsa digests. The Hamming distance between
 * the digests can be used to approximate the similarity between documents. For
 * further information consult http://en.wikipedia.org/wiki/Nilsimsa_Hash and
 * the references (particularly Damiani et al.)
 *
 * Implementation details:
 * The Nilsimsa class takes in a data parameter which is the string of the
 * document to digest. Calling the methods hexDigest() and digest() gives the
 * Nilsimsa digest in hex or array format. Unlike the original port, the target
 * digest length (in bits) is configurable, and the digest comparison helpers
 * (compare_digests(), compare_files(), compare_strings()) are not defined in
 * this file.
 *
 * Besides the class, this file contains a small command-line experiment
 * (see the bottom of the file) that runs whenever the file is executed.
 *
 * This code is a port of py-nilsimsa located at
 * https://code.google.com/p/py-nilsimsa/
 */

/**
 * The MIT License (MIT)
 *
 * Copyright (c) 2015 Bill Eager
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the 'Software'), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

class Nilsimsa
{
    /**
     * Tran53 hash constant (256-entry substitution table)
     * @var array
     */
    const TRAN = [
        0x02,0xd6,0x9e,0x6f,0xf9,0x1d,0x04,0xab,0xd0,0x22,0x16,0x1f,0xd8,0x73,0xa1,0xac,
        0x3b,0x70,0x62,0x96,0x1e,0x6e,0x8f,0x39,0x9d,0x05,0x14,0x4a,0xa6,0xbe,0xae,0x0e,
        0xcf,0xb9,0x9c,0x9a,0xc7,0x68,0x13,0xe1,0x2d,0xa4,0xeb,0x51,0x8d,0x64,0x6b,0x50,
        0x23,0x80,0x03,0x41,0xec,0xbb,0x71,0xcc,0x7a,0x86,0x7f,0x98,0xf2,0x36,0x5e,0xee,
        0x8e,0xce,0x4f,0xb8,0x32,0xb6,0x5f,0x59,0xdc,0x1b,0x31,0x4c,0x7b,0xf0,0x63,0x01,
        0x6c,0xba,0x07,0xe8,0x12,0x77,0x49,0x3c,0xda,0x46,0xfe,0x2f,0x79,0x1c,0x9b,0x30,
        0xe3,0x00,0x06,0x7e,0x2e,0x0f,0x38,0x33,0x21,0xad,0xa5,0x54,0xca,0xa7,0x29,0xfc,
        0x5a,0x47,0x69,0x7d,0xc5,0x95,0xb5,0xf4,0x0b,0x90,0xa3,0x81,0x6d,0x25,0x55,0x35,
        0xf5,0x75,0x74,0x0a,0x26,0xbf,0x19,0x5c,0x1a,0xc6,0xff,0x99,0x5d,0x84,0xaa,0x66,
        0x3e,0xaf,0x78,0xb3,0x20,0x43,0xc1,0xed,0x24,0xea,0xe6,0x3f,0x18,0xf3,0xa0,0x42,
        0x57,0x08,0x53,0x60,0xc3,0xc0,0x83,0x40,0x82,0xd7,0x09,0xbd,0x44,0x2a,0x67,0xa8,
        0x93,0xe0,0xc2,0x56,0x9f,0xd9,0xdd,0x85,0x15,0xb4,0x8a,0x27,0x28,0x92,0x76,0xde,
        0xef,0xf8,0xb2,0xb7,0xc9,0x3d,0x45,0x94,0x4b,0x11,0x0d,0x65,0xd5,0x34,0x8b,0x91,
        0x0c,0xfa,0x87,0xe9,0x7c,0x5b,0xb1,0x4d,0xe5,0xd4,0xcb,0x10,0xa2,0x17,0x89,0xbc,
        0xdb,0xb0,0xe2,0x97,0x88,0x52,0xf7,0x48,0xd3,0x61,0x2c,0x3a,0x2b,0xd1,0x8c,0xfb,
        0xf1,0xcd,0xe4,0x6a,0xe7,0xa9,0xfd,0xc4,0x37,0xc8,0xd2,0xf6,0xdf,0x58,0x72,0x4e,
    ];

    /**
     * Digest length (in bits) used when the constructor is not given one.
     * Matches the default of the static hash() helper.
     * @var int
     */
    const DEFAULT_LENGTH = 256;

    /**
     * Stores whether the digest is complete
     *
     * @var boolean
     */
    private $digestComputed;

    /**
     * Stores the number of characters in the string digested
     *
     * @var int
     */
    private $numChar;

    /**
     * Stores the accumulator: one counter per bucket, $length buckets in total
     *
     * @var array
     */
    private $acc;

    /**
     * Stores the active window used in {process} for hashing
     * (most recent byte first, at most 4 entries)
     *
     * @var array
     */
    private $window;

    /**
     * The computed digest as an array of byte values, or null until
     * computeDigest() has run.
     *
     * @var array|null
     */
    private $digest;

    /**
     * The target length of the hash, in bits.
     * @var int
     */
    private $length;

    /**
     * Constructor
     *
     * @param string|null $data   The data to process (optional; more data can
     *                            be fed in later through process())
     * @param int|null    $length Target digest length in bits. Must be a power
     *                            of two and at least 8, because buckets are
     *                            selected with a `length - 1` bit mask and the
     *                            digest is packed into `length / 8` bytes.
     *                            Defaults to DEFAULT_LENGTH when null.
     *
     * @throws InvalidArgumentException If $length is not a power of two >= 8
     */
    public function __construct($data = null, $length = null)
    {
        if ($length === null) {
            // Previously the length stayed unset here, which produced an
            // empty accumulator and an empty digest.
            $length = self::DEFAULT_LENGTH;
        }

        $bits = filter_var($length, FILTER_VALIDATE_INT);
        if ($bits === false || $bits < 8 || ($bits & ($bits - 1)) !== 0) {
            throw new InvalidArgumentException(
                'Nilsimsa length must be a power of two >= 8, got ' . var_export($length, true)
            );
        }

        $this->length = $bits;
        $this->digestComputed = false;
        $this->digest = null;
        $this->numChar = 0;
        $this->acc = array_fill(0, $this->length, 0);
        $this->window = [];

        // Explicit comparison: a plain truthiness check would skip the string "0".
        if ($data !== null && $data !== '') {
            $this->process($data);
        }
    }

    /**
     * Computes the hash of all of the trigrams in the chunk using a window of
     * length 5 (the current byte plus the 4 previous ones).
     *
     * May be called repeatedly to feed data incrementally; any previously
     * computed digest is invalidated.
     *
     * @param string $chunk The chunk to process
     */
    public function process($chunk)
    {
        $chunk = (string) $chunk;
        if ($chunk === '') {
            // str_split('') yields [''] before PHP 8.2, which would be
            // counted as a spurious NUL byte.
            return;
        }

        // The accumulator is about to change, so a cached digest is stale.
        $this->digestComputed = false;

        foreach (str_split($chunk) as $char) {
            $this->numChar++;
            $c = ord($char);
            $windowLength = count($this->window);

            if ($windowLength > 1) {
                // seen at least three characters
                $this->countTrigram($c, $this->window[0], $this->window[1], 0);
            }
            if ($windowLength > 2) {
                // seen at least four characters
                $this->countTrigram($c, $this->window[0], $this->window[2], 1);
                $this->countTrigram($c, $this->window[1], $this->window[2], 2);
            }
            if ($windowLength > 3) {
                // have a full window
                $this->countTrigram($c, $this->window[0], $this->window[3], 3);
                $this->countTrigram($c, $this->window[1], $this->window[3], 4);
                $this->countTrigram($c, $this->window[2], $this->window[3], 5);
                // duplicate hashes, used to maintain 8 trigrams per character
                $this->countTrigram($this->window[3], $this->window[0], $c, 6);
                $this->countTrigram($this->window[3], $this->window[2], $c, 7);
            }

            // add current character to the front of the window and drop the
            // oldest one once the window holds more than 4 entries
            array_unshift($this->window, $c);
            if (count($this->window) > 4) {
                array_pop($this->window);
            }
        }
    }

    /**
     * Increments the accumulator bucket selected by tranHash() for one trigram.
     *
     * @param int $a Input A
     * @param int $b Input B
     * @param int $c Input C
     * @param int $n Trigram slot number (0-7)
     */
    private function countTrigram($a, $b, $c, $n)
    {
        $this->acc[$this->tranHash($a, $b, $c, $n)]++;
    }

    /**
     * Implementation of the Tran53 hash algorithm
     *
     * @param int $a Input A (byte value)
     * @param int $b Input B (byte value)
     * @param int $c Input C (byte value)
     * @param int $n Input N (trigram slot number, 0-7 in process())
     *
     * @return int Bucket index in the range 0 .. length - 1
     */
    public function tranHash($a, $b, $c, $n)
    {
        // `*` binds tighter than `^`; the parentheses only make that explicit.
        $mixed = self::TRAN[($a + $n) & 255] ^ (self::TRAN[$b] * ($n + $n + 1));

        // The mask was 255 in the fixed-size original; length - 1 keeps the
        // result inside the accumulator for any power-of-two length.
        return ($mixed + self::TRAN[$c ^ self::TRAN[$n]]) & ($this->length - 1);
    }

    /**
     * Returns the digest as a hex string. Computes it if it isn't computed
     * already.
     *
     * @return string The digest, two lowercase hex characters per byte
     */
    public function hexDigest()
    {
        $hex = '';
        foreach ($this->digest() as $byte) {
            $hex .= sprintf('%02x', $byte);
        }

        return $hex;
    }

    /**
     * Returns the digest as an array. Computes it if it isn't computed already.
     *
     * @return array The digest, as length / 8 byte values
     */
    public function digest()
    {
        if (!$this->digestComputed) {
            $this->computeDigest();
        }

        return $this->digest;
    }

    /**
     * Using a threshold (mean of the accumulator), computes the nilsimsa
     * digest after completion. Sets complete flag to true and stores result in
     * $this->digest
     */
    public function computeDigest()
    {
        if ($this->numChar > 4) {
            // > 4 chars -> 8 for each CHAR (minus the 28 missing from the warm-up)
            $numTrigrams = 8 * $this->numChar - 28;
        } elseif ($this->numChar === 4) {
            // 4 chars -> 4 trigrams
            $numTrigrams = 4;
        } elseif ($this->numChar === 3) {
            // 3 chars -> 1 trigram
            $numTrigrams = 1;
        } else {
            $numTrigrams = 0;
        }

        // threshold is the mean of the acc buckets
        $threshold = $numTrigrams / $this->length;

        $digest = array_fill(0, intdiv($this->length, 8), 0);

        // NOTE(review): the last two buckets are never examined (this was a
        // literal 254 for the 256-bit case, presumably inherited from the
        // upstream port - confirm). Deliberately left as-is: changing the
        // bound would alter every digest this class has ever produced.
        $limit = $this->length - 2;
        for ($i = 0; $i < $limit; $i++) {
            if ($this->acc[$i] > $threshold) {
                // byte index is i / 8, bit within the byte is i mod 8
                $digest[$i >> 3] |= 1 << ($i & 7);
            }
        }

        // store result in digest, reversed
        $this->digest = array_reverse($digest);
        // set flag to true
        $this->digestComputed = true;
    }

    /**
     * Convenience helper: hashes $data in one call.
     *
     * @param string $data   The data to hash
     * @param int    $length Target digest length in bits (power of two >= 8)
     *
     * @return string The digest as a hex string
     */
    public static function hash($data, $length = 256)
    {
        $hasher = new self($data, $length);

        return $hasher->hexDigest();
    }
}

/**
 * Counts the lines in an open file by reading it from the start.
 *
 * The handle is rewound before counting and is left positioned at the end of
 * the file afterwards.
 *
 * @param resource $handle A readable, seekable file handle
 *
 * @return int The number of lines read
 */
function lines_count($handle)
{
    fseek($handle, 0);
    $count = 0;
    while (fgets($handle) !== false) {
        $count++;
    }

    return $count;
}

/**
 * Serialises a list of [a, b] pairs as comma-separated lines.
 *
 * @param array $pairs List of arrays of strings
 *
 * @return string One "a,b" line per pair, joined with newlines
 */
function pairs_to_csv(array $pairs)
{
    $lines = [];
    foreach ($pairs as $pair) {
        $lines[] = implode(",", $pair);
    }

    return implode("\n", $lines);
}

$mode = $argv[1] ?? "help";
switch ($mode) {
    case "typos":
        $handle = fopen("typos.csv", "r");
        if ($handle === false) {
            fwrite(STDERR, "Error: could not open typos.csv\n");
            exit(1);
        }
        $line_count = lines_count($handle);
        echo("$line_count lines total\n");

        $sizes = [ 256, 128, 64, 32, 16, 8 ];
        foreach ($sizes as $size) {
            fseek($handle, 0);
            fgets($handle); // Skip the first line since it's the header

            $count = 0;
            $count_same = 0;
            $skipped = 0;
            $same = [];
            $not_same = [];
            while (($line = fgets($handle)) !== false) {
                $parts = explode(",", trim($line), 2);
                // Rows without a second column, or whose second column is
                // too short to contain a trigram, are skipped.
                if (count($parts) < 2 || strlen($parts[1]) < 3) {
                    $skipped++;
                    continue;
                }
                $hash_a = Nilsimsa::hash($parts[0], $size);
                $hash_b = Nilsimsa::hash($parts[1], $size);

                $count++;
                if ($hash_a === $hash_b) {
                    $count_same++;
                    $same[] = $parts;
                } else {
                    $not_same[] = $parts;
                }
                echo("$count_same / $count ($skipped skipped)\r");
            }

            file_put_contents("$size-same.csv", pairs_to_csv($same));
            file_put_contents("$size-not-same.csv", pairs_to_csv($not_same));

            // Guard against a file in which every row was skipped.
            $percent = $count > 0 ? round(($count_same / $count) * 100, 2) : 0;
            echo(str_pad((string) $size, 10)."→ $count_same / $count (".$percent."%), $skipped skipped\n");
        }
        fclose($handle);
        break;

    case "helloworld":
        foreach ([ 256, 128, 64, 32, 16, 8 ] as $size) {
            echo(str_pad((string) $size, 10).Nilsimsa::hash("hello, world!", $size));
            echo("\n");
        }
        break;

    case "help":
    default:
        echo("Mode $mode not recognised. Available modes:\n");
        echo("    helloworld    Show different hash sizes\n");
        echo("    typos         Compare typos in typos.csv and calculate statistics\n");
        break;
}

/*
 * TODO: Explore BK-Trees. SymSpell might be orders of magnitude faster, but that's compared to a regular BK-Tree - and it's *much* more complicated.
 * If we instead use Nilsimsa + the hamming distance comparison function (which we removed & will need to reinstate), is it faster than doing lots of `levenshtein()` calls?
 * Experimentation is needed.
 * See also gmp_hamdist() (which requires the gmp PHP extension).
 */