mirror of
https://github.com/sbrl/Pepperminty-Wiki.git
synced 2024-11-22 04:23:01 +00:00
Save the character offset, not the token offset, in the inverted index
This commit is contained in:
parent
1b3bc1cee7
commit
8955d6d131
1 changed file with 6 additions and 15 deletions
|
@ -542,26 +542,17 @@ class search
|
||||||
|
|
||||||
$index = [];
|
$index = [];
|
||||||
|
|
||||||
$terms = self::tokenize($source);
|
$terms = self::tokenize($source, true);
|
||||||
$i = 0;
|
|
||||||
foreach($terms as $term)
|
foreach($terms as $term)
|
||||||
{
|
{
|
||||||
$nterm = $term;
|
|
||||||
|
|
||||||
|
|
||||||
// Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
|
// Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
|
||||||
if(in_array($nterm, self::$stop_words)) { $i++; continue; }
|
if(in_array($term[0], self::$stop_words)) continue;
|
||||||
|
|
||||||
if(!isset($index[$nterm]))
|
if(!isset($index[$term[0]]))
|
||||||
{
|
$index[$term[0]] = [ "freq" => 0, "offsets" => [] ];
|
||||||
$index[$nterm] = [ "freq" => 0, "offsets" => [] ];
|
|
||||||
}
|
|
||||||
|
|
||||||
// FIXME: Here we use the index of the token in the array, when we want the number of characters into the page!
|
$index[$term[0]]["freq"]++;
|
||||||
$index[$nterm]["freq"]++;
|
$index[$term[0]]["offsets"][] = $term[1];
|
||||||
$index[$nterm]["offsets"][] = $i;
|
|
||||||
|
|
||||||
$i++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return $index;
|
return $index;
|
||||||
|
|
Loading…
Reference in a new issue