mirror of
https://github.com/sbrl/Pepperminty-Wiki.git
synced 2024-12-22 13:45:02 +00:00
Continue working on indexer. Why is it converting to HTML entities...?
This commit is contained in:
parent
db7bf0f7ec
commit
725452a172
3 changed files with 35 additions and 75 deletions
|
@ -1268,47 +1268,27 @@ register_module([
|
|||
|
||||
$index = [];
|
||||
|
||||
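// Regex from (source link missing): splits on whitespace together with any adjacent punctuation, and on punctuation at the very start or end of the text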
|
||||
$terms = preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))/", $source, -1, PREG_SPLIT_NO_EMPTY);
|
||||
$i = 0;
|
||||
foreach($terms as $term)
|
||||
{
|
||||
$nterm = strtolower($term);
|
||||
if(!isset($index[$nterm]))
|
||||
{
|
||||
$index[$nterm] = [ "freq" => 0, "offsets" => [] ];
|
||||
}
|
||||
|
||||
$index[$nterm]["freq"]++;
|
||||
$index[$nterm]["offsets"][] = $i;
|
||||
|
||||
$i++;
|
||||
}
|
||||
|
||||
var_dump($env->page);
|
||||
var_dump($source);
|
||||
echo("source length: $source_length\n");
|
||||
|
||||
$basepos = 0;
|
||||
$scanpos = $basepos;
|
||||
while($basepos < $source_length)
|
||||
{
|
||||
$word = "";
|
||||
do {
|
||||
// Break if we reach the end of the source text
|
||||
if($scanpos >= $source_length) break;
|
||||
$word .= $source[$scanpos];
|
||||
$scanpos++;
|
||||
} while(strpos($breakable_chars, $source[$scanpos]) === false);
|
||||
|
||||
// Move the base position up to the scan position (plus one to
|
||||
// skip over the breakable character), saving the old base
|
||||
// position for later
|
||||
$word_start_pos = $basepos;
|
||||
$basepos = $scanpos + 1;
|
||||
// Continue if the word is empty
|
||||
if(strlen($word) === 0) continue;
|
||||
// Normalise the word to be lowercase
|
||||
$word = strtolower($word);
|
||||
|
||||
var_dump($word);
|
||||
|
||||
// Initialise the entry in the index if it doesn't exist
|
||||
if(!isset($index[$word]))
|
||||
{
|
||||
$index[$word] = [
|
||||
"freq" => 0,
|
||||
"offsets" => []
|
||||
];
|
||||
}
|
||||
// Update the index entry
|
||||
$index[$word]["freq"]++;
|
||||
$index[$word]["offsets"][] = $word_start_pos;
|
||||
}
|
||||
|
||||
var_dump($index);
|
||||
});
|
||||
}
|
||||
|
|
|
@ -50,7 +50,7 @@
|
|||
"author": "Starbeamrainbowlabs",
|
||||
"description": "Adds proper search functionality to Pepperminty Wiki. Note that this module, at the moment, just contains test code while I figure out how best to write a search engine.",
|
||||
"id": "feature-search",
|
||||
"lastupdate": 1445980152,
|
||||
"lastupdate": 1446019292,
|
||||
"optional": false
|
||||
},
|
||||
{
|
||||
|
|
|
@ -18,47 +18,27 @@ register_module([
|
|||
|
||||
$index = [];
|
||||
|
||||
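// Regex from (source link missing): splits on whitespace together with any adjacent punctuation, and on punctuation at the very start or end of the text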
|
||||
$terms = preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))/", $source, -1, PREG_SPLIT_NO_EMPTY);
|
||||
$i = 0;
|
||||
foreach($terms as $term)
|
||||
{
|
||||
$nterm = strtolower($term);
|
||||
if(!isset($index[$nterm]))
|
||||
{
|
||||
$index[$nterm] = [ "freq" => 0, "offsets" => [] ];
|
||||
}
|
||||
|
||||
$index[$nterm]["freq"]++;
|
||||
$index[$nterm]["offsets"][] = $i;
|
||||
|
||||
$i++;
|
||||
}
|
||||
|
||||
var_dump($env->page);
|
||||
var_dump($source);
|
||||
echo("source length: $source_length\n");
|
||||
|
||||
$basepos = 0;
|
||||
$scanpos = $basepos;
|
||||
while($basepos < $source_length)
|
||||
{
|
||||
$word = "";
|
||||
do {
|
||||
// Break if we reach the end of the source text
|
||||
if($scanpos >= $source_length) break;
|
||||
$word .= $source[$scanpos];
|
||||
$scanpos++;
|
||||
} while(strpos($breakable_chars, $source[$scanpos]) === false);
|
||||
|
||||
// Move the base position up to the scan position (plus one to
|
||||
// skip over the breakable character), saving the old base
|
||||
// position for later
|
||||
$word_start_pos = $basepos;
|
||||
$basepos = $scanpos + 1;
|
||||
// Continue if the word is empty
|
||||
if(strlen($word) === 0) continue;
|
||||
// Normalise the word to be lowercase
|
||||
$word = strtolower($word);
|
||||
|
||||
var_dump($word);
|
||||
|
||||
// Initialise the entry in the index if it doesn't exist
|
||||
if(!isset($index[$word]))
|
||||
{
|
||||
$index[$word] = [
|
||||
"freq" => 0,
|
||||
"offsets" => []
|
||||
];
|
||||
}
|
||||
// Update the index entry
|
||||
$index[$word]["freq"]++;
|
||||
$index[$word]["offsets"][] = $word_start_pos;
|
||||
}
|
||||
|
||||
var_dump($index);
|
||||
});
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue