mirror of
https://github.com/sbrl/Pepperminty-Wiki.git
synced 2024-11-29 06:23:01 +00:00
Clear out the id index when rebuilding the inverted index - it sometimes goes all funny otherwise, apparently
This commit is contained in:
parent
2a3beccc72
commit
d83bbb3527
4 changed files with 107 additions and 9 deletions
|
@ -1093,6 +1093,20 @@ class ids
|
||||||
file_put_contents($paths->idindex, json_encode($idindex));
|
file_put_contents($paths->idindex, json_encode($idindex));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clears the id index completely. Will break the inverted search index!
|
||||||
|
*/
|
||||||
|
public static function clear()
|
||||||
|
{
|
||||||
|
global $paths, $idindex;
|
||||||
|
// Delete the old id index
|
||||||
|
unlink($paths->idindex);
|
||||||
|
// Create the new id index
|
||||||
|
file_put_contents($paths->idindex, "{}");
|
||||||
|
// Reset the in-memory id index
|
||||||
|
$idindex = new stdClass();
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* @summary Assigns an id to a pagename. Doesn't check to make sure that
|
* @summary Assigns an id to a pagename. Doesn't check to make sure that
|
||||||
* pagename doesn't exist in the pageindex.
|
* pagename doesn't exist in the pageindex.
|
||||||
|
@ -2935,15 +2949,21 @@ class search
|
||||||
|
|
||||||
header("content-type: text/event-stream");
|
header("content-type: text/event-stream");
|
||||||
|
|
||||||
|
// Clear the id index out
|
||||||
|
ids::clear();
|
||||||
|
|
||||||
|
// Reindex each page in turn
|
||||||
$invindex = [];
|
$invindex = [];
|
||||||
foreach($pageindex as $pagename => $pagedetails)
|
foreach($pageindex as $pagename => $pagedetails)
|
||||||
{
|
{
|
||||||
echo("Adding $pagename to the new search index.\n\n");
|
|
||||||
flush();
|
|
||||||
$pagesource = utf8_encode(file_get_contents("$env->storage_prefix$pagename.md"));
|
$pagesource = utf8_encode(file_get_contents("$env->storage_prefix$pagename.md"));
|
||||||
$index = self::index($pagesource);
|
$index = self::index($pagesource);
|
||||||
|
|
||||||
self::merge_into_invindex($invindex, ids::getid($pagename), $index);
|
$pageid = ids::getid($pagename);
|
||||||
|
self::merge_into_invindex($invindex, $pageid, $index);
|
||||||
|
|
||||||
|
echo("Added $pagename (id #$pageid) to the new search index.\n\n");
|
||||||
|
flush();
|
||||||
}
|
}
|
||||||
|
|
||||||
echo("Search index rebuilding complete.\n\n");
|
echo("Search index rebuilding complete.\n\n");
|
||||||
|
@ -3129,12 +3149,41 @@ class search
|
||||||
$pagedata["pagename"] = ids::getpagename($pageid);
|
$pagedata["pagename"] = ids::getpagename($pageid);
|
||||||
$pagedata["rank"] = 0;
|
$pagedata["rank"] = 0;
|
||||||
|
|
||||||
|
$pageOffsets = [];
|
||||||
|
|
||||||
|
// Loop over each search term found on this page
|
||||||
foreach($pagedata["nterms"] as $pterm => $entry)
|
foreach($pagedata["nterms"] as $pterm => $entry)
|
||||||
{
|
{
|
||||||
|
// Add the number of occurrences of this search term to the ranking
|
||||||
$pagedata["rank"] += $entry["freq"];
|
$pagedata["rank"] += $entry["freq"];
|
||||||
|
|
||||||
// todo rank by context here
|
// Add the offsets to a list of all offsets on this page
|
||||||
|
foreach($entry["offsets"] as $offset)
|
||||||
|
$pageOffsets[] = $offset;
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
|
// Sort the list of offsets
|
||||||
|
$pageOffsets = array_unique($pageOffsets);
|
||||||
|
sort($pageOffsets);
|
||||||
|
var_dump($pageOffsets);
|
||||||
|
|
||||||
|
// Calculate the clump distances via a variable moving window size
|
||||||
|
$pageOffsetsCount = count($pageOffsets);
|
||||||
|
$clumpDistanceWindow = min($count, $pageOffsetsCount); // a.k.a. count($query_terms) - see above
|
||||||
|
$clumpDistances = [];
|
||||||
|
for($i = 0; $i < $pageOffsetsCount - $clumpDistanceWindow; $i++)
|
||||||
|
$clumpDistances[] = $pageOffsets[$i] - $pageOffsets[$i + $clumpDistanceWindow];
|
||||||
|
|
||||||
|
// Sort the new list of clump distances
|
||||||
|
sort($clumpDistances);
|
||||||
|
// Calculate a measure of how clumped the offsets are
|
||||||
|
$tightClumpLimit = floor((count($clumpDistances) - 1) / 0.25);
|
||||||
|
$tightClumpsMeasure = $clumpDistances[$tightClumpLimit] - $clumpDistances[0];
|
||||||
|
$clumpsRange = $clumpDistances[count($clumpDistances) - 1] - $clumpDistances[0];
|
||||||
|
|
||||||
|
$clumpiness = $tightClumpsMeasure / $clumpsRange;
|
||||||
|
echo("{$pagedata["pagename"]} - $clumpiness");
|
||||||
|
*/
|
||||||
|
|
||||||
// Consider matches in the title / tags
|
// Consider matches in the title / tags
|
||||||
if(isset($pagedata["title-matches"]))
|
if(isset($pagedata["title-matches"]))
|
||||||
|
|
14
core.php
14
core.php
|
@ -772,6 +772,20 @@ class ids
|
||||||
file_put_contents($paths->idindex, json_encode($idindex));
|
file_put_contents($paths->idindex, json_encode($idindex));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clears the id index completely. Will break the inverted search index!
|
||||||
|
*/
|
||||||
|
public static function clear()
|
||||||
|
{
|
||||||
|
global $paths, $idindex;
|
||||||
|
// Delete the old id index
|
||||||
|
unlink($paths->idindex);
|
||||||
|
// Create the new id index
|
||||||
|
file_put_contents($paths->idindex, "{}");
|
||||||
|
// Reset the in-memory id index
|
||||||
|
$idindex = new stdClass();
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* @summary Assigns an id to a pagename. Doesn't check to make sure that
|
* @summary Assigns an id to a pagename. Doesn't check to make sure that
|
||||||
* pagename doesn't exist in the pageindex.
|
* pagename doesn't exist in the pageindex.
|
||||||
|
|
|
@ -86,7 +86,7 @@
|
||||||
"author": "Starbeamrainbowlabs",
|
"author": "Starbeamrainbowlabs",
|
||||||
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
|
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
|
||||||
"id": "feature-search",
|
"id": "feature-search",
|
||||||
"lastupdate": 1490041182,
|
"lastupdate": 1490302006,
|
||||||
"optional": false
|
"optional": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -404,15 +404,21 @@ class search
|
||||||
|
|
||||||
header("content-type: text/event-stream");
|
header("content-type: text/event-stream");
|
||||||
|
|
||||||
|
// Clear the id index out
|
||||||
|
ids::clear();
|
||||||
|
|
||||||
|
// Reindex each page in turn
|
||||||
$invindex = [];
|
$invindex = [];
|
||||||
foreach($pageindex as $pagename => $pagedetails)
|
foreach($pageindex as $pagename => $pagedetails)
|
||||||
{
|
{
|
||||||
echo("Adding $pagename to the new search index.\n\n");
|
|
||||||
flush();
|
|
||||||
$pagesource = utf8_encode(file_get_contents("$env->storage_prefix$pagename.md"));
|
$pagesource = utf8_encode(file_get_contents("$env->storage_prefix$pagename.md"));
|
||||||
$index = self::index($pagesource);
|
$index = self::index($pagesource);
|
||||||
|
|
||||||
self::merge_into_invindex($invindex, ids::getid($pagename), $index);
|
$pageid = ids::getid($pagename);
|
||||||
|
self::merge_into_invindex($invindex, $pageid, $index);
|
||||||
|
|
||||||
|
echo("Added $pagename (id #$pageid) to the new search index.\n\n");
|
||||||
|
flush();
|
||||||
}
|
}
|
||||||
|
|
||||||
echo("Search index rebuilding complete.\n\n");
|
echo("Search index rebuilding complete.\n\n");
|
||||||
|
@ -598,12 +604,41 @@ class search
|
||||||
$pagedata["pagename"] = ids::getpagename($pageid);
|
$pagedata["pagename"] = ids::getpagename($pageid);
|
||||||
$pagedata["rank"] = 0;
|
$pagedata["rank"] = 0;
|
||||||
|
|
||||||
|
$pageOffsets = [];
|
||||||
|
|
||||||
|
// Loop over each search term found on this page
|
||||||
foreach($pagedata["nterms"] as $pterm => $entry)
|
foreach($pagedata["nterms"] as $pterm => $entry)
|
||||||
{
|
{
|
||||||
|
// Add the number of occurrences of this search term to the ranking
|
||||||
$pagedata["rank"] += $entry["freq"];
|
$pagedata["rank"] += $entry["freq"];
|
||||||
|
|
||||||
// todo rank by context here
|
// Add the offsets to a list of all offsets on this page
|
||||||
|
foreach($entry["offsets"] as $offset)
|
||||||
|
$pageOffsets[] = $offset;
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
|
// Sort the list of offsets
|
||||||
|
$pageOffsets = array_unique($pageOffsets);
|
||||||
|
sort($pageOffsets);
|
||||||
|
var_dump($pageOffsets);
|
||||||
|
|
||||||
|
// Calculate the clump distances via a variable moving window size
|
||||||
|
$pageOffsetsCount = count($pageOffsets);
|
||||||
|
$clumpDistanceWindow = min($count, $pageOffsetsCount); // a.k.a. count($query_terms) - see above
|
||||||
|
$clumpDistances = [];
|
||||||
|
for($i = 0; $i < $pageOffsetsCount - $clumpDistanceWindow; $i++)
|
||||||
|
$clumpDistances[] = $pageOffsets[$i] - $pageOffsets[$i + $clumpDistanceWindow];
|
||||||
|
|
||||||
|
// Sort the new list of clump distances
|
||||||
|
sort($clumpDistances);
|
||||||
|
// Calculate a measure of how clumped the offsets are
|
||||||
|
$tightClumpLimit = floor((count($clumpDistances) - 1) / 0.25);
|
||||||
|
$tightClumpsMeasure = $clumpDistances[$tightClumpLimit] - $clumpDistances[0];
|
||||||
|
$clumpsRange = $clumpDistances[count($clumpDistances) - 1] - $clumpDistances[0];
|
||||||
|
|
||||||
|
$clumpiness = $tightClumpsMeasure / $clumpsRange;
|
||||||
|
echo("{$pagedata["pagename"]} - $clumpiness");
|
||||||
|
*/
|
||||||
|
|
||||||
// Consider matches in the title / tags
|
// Consider matches in the title / tags
|
||||||
if(isset($pagedata["title-matches"]))
|
if(isset($pagedata["title-matches"]))
|
||||||
|
|
Loading…
Reference in a new issue