1
0
Fork 0
mirror of https://github.com/sbrl/Pepperminty-Wiki.git synced 2024-11-26 05:32:59 +00:00

Save the character offset, not the token offset in the inverted index

This commit is contained in:
Starbeamrainbowlabs 2018-06-30 11:19:38 +01:00
parent 1b3bc1cee7
commit 8955d6d131
Signed by: sbrl
GPG key ID: 1BE5172E637709C2

View file

@@ -542,26 +542,17 @@ class search
$index = []; $index = [];
$terms = self::tokenize($source); $terms = self::tokenize($source, true);
$i = 0;
foreach($terms as $term) foreach($terms as $term)
{ {
$nterm = $term;
// Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words) // Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
if(in_array($nterm, self::$stop_words)) { $i++; continue; } if(in_array($term[0], self::$stop_words)) continue;
if(!isset($index[$nterm])) if(!isset($index[$term[0]]))
{ $index[$term[0]] = [ "freq" => 0, "offsets" => [] ];
$index[$nterm] = [ "freq" => 0, "offsets" => [] ];
}
// FIXME: Here we use the index of the token in the array, when we want the number of characters into the page! $index[$term[0]]["freq"]++;
$index[$nterm]["freq"]++; $index[$term[0]]["offsets"][] = $term[1];
$index[$nterm]["offsets"][] = $i;
$i++;
} }
return $index; return $index;