mirror of
https://github.com/sbrl/Pepperminty-Wiki.git
synced 2024-11-25 17:23:00 +00:00
Clear out the id index when rebuilding the inverted index - it sometimes goes all funny otherwise, apparently
This commit is contained in:
parent
2a3beccc72
commit
d83bbb3527
4 changed files with 107 additions and 9 deletions
|
@ -1093,6 +1093,20 @@ class ids
|
|||
file_put_contents($paths->idindex, json_encode($idindex));
|
||||
}
|
||||
|
||||
/**
 * Clears the id index completely. Will break the inverted search index!
 *
 * Replaces the id index file on disk with an empty JSON object and resets
 * the in-memory $idindex global to an empty object.
 */
public static function clear()
{
	global $paths, $idindex;
	// Delete the old id index, if there is one. unlink() raises a warning
	// when the target is missing, so only call it when there is actually
	// a file to remove.
	if(is_file($paths->idindex))
		unlink($paths->idindex);
	// Create the new id index
	file_put_contents($paths->idindex, "{}");
	// Reset the in-memory id index
	$idindex = new stdClass();
}
|
||||
|
||||
/*
|
||||
* @summary Assigns an id to a pagename. Doesn't check to make sure that
|
||||
* pagename doesn't exist in the pageindex.
|
||||
|
@ -2935,15 +2949,21 @@ class search
|
|||
|
||||
header("content-type: text/event-stream");
|
||||
|
||||
// Clear the id index out
|
||||
ids::clear();
|
||||
|
||||
// Reindex each page in turn
|
||||
$invindex = [];
|
||||
foreach($pageindex as $pagename => $pagedetails)
|
||||
{
|
||||
echo("Adding $pagename to the new search index.\n\n");
|
||||
flush();
|
||||
$pagesource = utf8_encode(file_get_contents("$env->storage_prefix$pagename.md"));
|
||||
$index = self::index($pagesource);
|
||||
|
||||
self::merge_into_invindex($invindex, ids::getid($pagename), $index);
|
||||
$pageid = ids::getid($pagename);
|
||||
self::merge_into_invindex($invindex, $pageid, $index);
|
||||
|
||||
echo("Added $pagename (id #$pageid) to the new search index.\n\n");
|
||||
flush();
|
||||
}
|
||||
|
||||
echo("Search index rebuilding complete.\n\n");
|
||||
|
@ -3129,12 +3149,41 @@ class search
|
|||
$pagedata["pagename"] = ids::getpagename($pageid);
|
||||
$pagedata["rank"] = 0;
|
||||
|
||||
$pageOffsets = [];
|
||||
|
||||
// Loop over each search term found on this page
|
||||
foreach($pagedata["nterms"] as $pterm => $entry)
|
||||
{
|
||||
// Add the number of occurrences of this search term to the ranking
|
||||
$pagedata["rank"] += $entry["freq"];
|
||||
|
||||
// todo rank by context here
|
||||
// Add the offsets to a list of all offsets on this page
|
||||
foreach($entry["offsets"] as $offset)
|
||||
$pageOffsets[] = $offset;
|
||||
}
|
||||
/*
|
||||
// Sort the list of offsets
|
||||
$pageOffsets = array_unique($pageOffsets);
|
||||
sort($pageOffsets);
|
||||
var_dump($pageOffsets);
|
||||
|
||||
// Calculate the clump distances via a variable moving window size
|
||||
$pageOffsetsCount = count($pageOffsets);
|
||||
$clumpDistanceWindow = min($count, $pageOffsetsCount); // a.k.a. count($query_terms) - see above
|
||||
$clumpDistances = [];
|
||||
for($i = 0; $i < $pageOffsetsCount - $clumpDistanceWindow; $i++)
|
||||
$clumpDistances[] = $pageOffsets[$i] - $pageOffsets[$i + $clumpDistanceWindow];
|
||||
|
||||
// Sort the new list of clump distances
|
||||
sort($clumpDistances);
|
||||
// Calculate a measure of how clumped the offsets are
|
||||
$tightClumpLimit = floor((count($clumpDistances) - 1) / 0.25);
|
||||
$tightClumpsMeasure = $clumpDistances[$tightClumpLimit] - $clumpDistances[0];
|
||||
$clumpsRange = $clumpDistances[count($clumpDistances) - 1] - $clumpDistances[0];
|
||||
|
||||
$clumpiness = $tightClumpsMeasure / $clumpsRange;
|
||||
echo("{$pagedata["pagename"]} - $clumpiness");
|
||||
*/
|
||||
|
||||
// Consider matches in the title / tags
|
||||
if(isset($pagedata["title-matches"]))
|
||||
|
|
14
core.php
14
core.php
|
@ -772,6 +772,20 @@ class ids
|
|||
file_put_contents($paths->idindex, json_encode($idindex));
|
||||
}
|
||||
|
||||
/**
 * Clears the id index completely. Will break the inverted search index!
 *
 * Replaces the id index file on disk with an empty JSON object and resets
 * the in-memory $idindex global to an empty object.
 */
public static function clear()
{
	global $paths, $idindex;
	// Delete the old id index, if there is one. unlink() raises a warning
	// when the target is missing, so only call it when there is actually
	// a file to remove.
	if(is_file($paths->idindex))
		unlink($paths->idindex);
	// Create the new id index
	file_put_contents($paths->idindex, "{}");
	// Reset the in-memory id index
	$idindex = new stdClass();
}
|
||||
|
||||
/*
|
||||
* @summary Assigns an id to a pagename. Doesn't check to make sure that
|
||||
* pagename doesn't exist in the pageindex.
|
||||
|
|
|
@ -86,7 +86,7 @@
|
|||
"author": "Starbeamrainbowlabs",
|
||||
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
|
||||
"id": "feature-search",
|
||||
"lastupdate": 1490041182,
|
||||
"lastupdate": 1490302006,
|
||||
"optional": false
|
||||
},
|
||||
{
|
||||
|
|
|
@ -404,15 +404,21 @@ class search
|
|||
|
||||
header("content-type: text/event-stream");
|
||||
|
||||
// Clear the id index out
|
||||
ids::clear();
|
||||
|
||||
// Reindex each page in turn
|
||||
$invindex = [];
|
||||
foreach($pageindex as $pagename => $pagedetails)
|
||||
{
|
||||
echo("Adding $pagename to the new search index.\n\n");
|
||||
flush();
|
||||
$pagesource = utf8_encode(file_get_contents("$env->storage_prefix$pagename.md"));
|
||||
$index = self::index($pagesource);
|
||||
|
||||
self::merge_into_invindex($invindex, ids::getid($pagename), $index);
|
||||
$pageid = ids::getid($pagename);
|
||||
self::merge_into_invindex($invindex, $pageid, $index);
|
||||
|
||||
echo("Added $pagename (id #$pageid) to the new search index.\n\n");
|
||||
flush();
|
||||
}
|
||||
|
||||
echo("Search index rebuilding complete.\n\n");
|
||||
|
@ -598,12 +604,41 @@ class search
|
|||
$pagedata["pagename"] = ids::getpagename($pageid);
|
||||
$pagedata["rank"] = 0;
|
||||
|
||||
$pageOffsets = [];
|
||||
|
||||
// Loop over each search term found on this page
|
||||
foreach($pagedata["nterms"] as $pterm => $entry)
|
||||
{
|
||||
// Add the number of occurrences of this search term to the ranking
|
||||
$pagedata["rank"] += $entry["freq"];
|
||||
|
||||
// todo rank by context here
|
||||
// Add the offsets to a list of all offsets on this page
|
||||
foreach($entry["offsets"] as $offset)
|
||||
$pageOffsets[] = $offset;
|
||||
}
|
||||
/*
|
||||
// Sort the list of offsets
|
||||
$pageOffsets = array_unique($pageOffsets);
|
||||
sort($pageOffsets);
|
||||
var_dump($pageOffsets);
|
||||
|
||||
// Calculate the clump distances via a variable moving window size
|
||||
$pageOffsetsCount = count($pageOffsets);
|
||||
$clumpDistanceWindow = min($count, $pageOffsetsCount); // a.k.a. count($query_terms) - see above
|
||||
$clumpDistances = [];
|
||||
for($i = 0; $i < $pageOffsetsCount - $clumpDistanceWindow; $i++)
|
||||
$clumpDistances[] = $pageOffsets[$i] - $pageOffsets[$i + $clumpDistanceWindow];
|
||||
|
||||
// Sort the new list of clump distances
|
||||
sort($clumpDistances);
|
||||
// Calculate a measure of how clumped the offsets are
|
||||
$tightClumpLimit = floor((count($clumpDistances) - 1) / 0.25);
|
||||
$tightClumpsMeasure = $clumpDistances[$tightClumpLimit] - $clumpDistances[0];
|
||||
$clumpsRange = $clumpDistances[count($clumpDistances) - 1] - $clumpDistances[0];
|
||||
|
||||
$clumpiness = $tightClumpsMeasure / $clumpsRange;
|
||||
echo("{$pagedata["pagename"]} - $clumpiness");
|
||||
*/
|
||||
|
||||
// Consider matches in the title / tags
|
||||
if(isset($pagedata["title-matches"]))
|
||||
|
|
Loading…
Reference in a new issue