Optimise the search context extractor, but evidently there's more work to be done.

This commit is contained in:
Starbeamrainbowlabs 2018-06-26 00:06:20 +01:00
parent 93494b6729
commit 75b6b6c55f
Signed by: sbrl
GPG Key ID: 1BE5172E637709C2
5 changed files with 77 additions and 94 deletions

View File

@ -21,6 +21,9 @@ This file holds the changelog for Pepperminty Wiki. This is the master list of t
- Added `password_cost`, `password_cost_time`, and `password_cost_time_interval` settings
- `password_cost` is recalculated automatically every week by default (it keeps track of this via the `password_cost_time_lastcheck` 'setting')
- The `css` setting will now keep a value of auto, even when `peppermint.json` is automatically updated by _Pepperminty Wiki_.
- Optimised the search context extractor.
- Tuned the default value for `search_characters_context` down to 75 (this won't be the case for existing wikis, so you'll need to adjust it manually)
- Added new `search_characters_context_total` setting to control the maximum number of characters in a search context
## v0.16
_(No changes since v0.16-beta1)_

View File

@ -172,7 +172,8 @@ $guiConfig = <<<'GUICONFIG'
"max_preview_size": {"type": "number", "description": "The maximum allowed size of generated preview images in pixels.", "default": 2048},
"avatars_show": {"type": "checkbox", "description": "Whether or not to show avatars requires the 'user-preferences' and 'upload' modules, though uploads themselvess can be turned off so long as all avatars have already been uploaded - it's only the 'preview' action that's actually used.", "default": true},
"avatars_size": {"type": "number", "description": "The image size to render avatars at. Does not affect the size they're stored at - only the inline rendered size (e.g. on the recent changes page etc.)", "default": 32},
"search_characters_context": {"type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 200},
"search_characters_context": {"type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 75},
"search_characters_context_total": {"type": "number", "description": "The total number of characters that a search result context should display at most.", "default": 250},
"search_title_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10},
"search_tags_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3},
"dynamic_page_suggestion_count": {"type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 },
@ -396,7 +397,7 @@ if($settings->sessionprefix == "auto")
/////////////////////////////////////////////////////////////////////////////
/** The version of Pepperminty Wiki currently running. */
$version = "v0.17-dev";
$commit = "49b91aa6f999409dbcf6d165f379ebb95fec7dcb";
$commit = "93494b672938d2fb456138e03db5be3803bc51b7";
/// Environment ///
/** Holds information about the current request environment. */
$env = new stdClass();
@ -4433,6 +4434,7 @@ class search
if(isset($invindex[$qterm]))
{
// Loop over each page in the inverted index entry
reset($invindex); // Reset array/object pointer
foreach($invindex[$qterm] as $pageid => $page_entry)
{
// Create an entry in the matching pages array if it doesn't exist
@ -4444,6 +4446,7 @@ class search
// Loop over the pageindex and search the titles / tags
reset($pageindex); // Reset array/object pointer
foreach ($pageindex as $pagename => $pagedata)
{
// Get the current page's id
@ -4565,9 +4568,7 @@ class search
if($all_offsets === false)
continue;
foreach($all_offsets as $offset)
{
$matches[] = [ $nterm, $offset ];
}
}
// Sort the matches by offset
@ -4579,58 +4580,47 @@ class search
$sourceLength = mb_strlen($source);
$contexts = [];
$basepos = 0;
$matches_count = count($matches);
while($basepos < $matches_count)
{
// Store the next match along - all others will be relative to that one
$group = [$matches[$basepos]];
$total_context_length = 0;
for($i = 0; $i < $matches_count; $i++) {
$next_context = [
"from" => max(0, $matches[$i][1] - $settings->search_characters_context),
"to" => min($sourceLength, $matches[$i][1] + count($matches[0]) + $settings->search_characters_context)
];
// Start scanning at the next one along - we always store the first match
$scanpos = $basepos + 1;
$distance = 0;
while(true)
{
// Break out if we reach the end
if($scanpos >= $matches_count) break;
if(end($contexts) !== false && end($contexts)["to"] > $next_context["from"]) {
// This next context overlaps with the previous one
// Extend the last one instead of adding a new one
// Find the distance between the current one and the last one
$distance = $matches[$scanpos][1] - $matches[$scanpos - 1][1];
// The array pointer is pointing at the last element now because we called end() above
// Store it if the distance is below the threshold
if($distance < $settings->search_characters_context)
$group[] = $matches[$scanpos];
else
break;
$scanpos++;
// Update the total context length counter appropriately
$total_context_length += $next_context["to"] - $contexts[key($contexts)]["to"];
$contexts[key($contexts)]["to"] = $next_context["to"];
}
else { // No overlap here! Business as usual.
$contexts[] = $next_context;
// Update the total context length counter as normal
$total_context_length += $next_context["to"] - $next_context["from"];
}
$context_start = $group[0][1] - $settings->search_characters_context;
$context_end = $group[count($group) - 1][1] + $settings->search_characters_context;
if($context_start < 0) $context_start = 0;
if($context_end > $sourceLength) $context_end = $sourceLength;
//echo("Got context. Start: $context_start, End: $context_end\n");
//echo("Group:"); var_dump($group);
$context = substr($source, $context_start, $context_end - $context_start);
// Strip the markdown from the context - it's most likely going to
// be broken anyway.
//$context = self::strip_markup($context);
// Escape special characters to protect against attacks
$context = htmlentities($context);
$contexts[] = $context;
$basepos = $scanpos + 1;
end($contexts);
$last_context = &$contexts[key($contexts)];
if($total_context_length > $settings->search_characters_context_total) {
// We've reached the limit on the number of characters this context should contain. Trim off the context to fit and break out
$last_context["to"] -= $total_context_length - $settings->search_characters_context_total;
break;
}
}
return implode(" ... ", $contexts);
$contexts_text = [];
foreach($contexts as $context) {
$contexts_text[] = substr($source, $context["from"], $context["to"] - $context["from"]);
}
return implode(" ... ", $contexts_text);
}
/**

View File

@ -104,7 +104,7 @@
"author": "Starbeamrainbowlabs",
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
"id": "feature-search",
"lastupdate": 1529964034,
"lastupdate": 1529967930,
"optional": false
},
{

View File

@ -790,6 +790,7 @@ class search
if(isset($invindex[$qterm]))
{
// Loop over each page in the inverted index entry
reset($invindex); // Reset array/object pointer
foreach($invindex[$qterm] as $pageid => $page_entry)
{
// Create an entry in the matching pages array if it doesn't exist
@ -801,6 +802,7 @@ class search
// Loop over the pageindex and search the titles / tags
reset($pageindex); // Reset array/object pointer
foreach ($pageindex as $pagename => $pagedata)
{
// Get the current page's id
@ -922,9 +924,7 @@ class search
if($all_offsets === false)
continue;
foreach($all_offsets as $offset)
{
$matches[] = [ $nterm, $offset ];
}
}
// Sort the matches by offset
@ -936,58 +936,47 @@ class search
$sourceLength = mb_strlen($source);
$contexts = [];
$basepos = 0;
$matches_count = count($matches);
while($basepos < $matches_count)
{
// Store the next match along - all others will be relative to that one
$group = [$matches[$basepos]];
$total_context_length = 0;
for($i = 0; $i < $matches_count; $i++) {
$next_context = [
"from" => max(0, $matches[$i][1] - $settings->search_characters_context),
"to" => min($sourceLength, $matches[$i][1] + count($matches[0]) + $settings->search_characters_context)
];
// Start scanning at the next one along - we always store the first match
$scanpos = $basepos + 1;
$distance = 0;
while(true)
{
// Break out if we reach the end
if($scanpos >= $matches_count) break;
if(end($contexts) !== false && end($contexts)["to"] > $next_context["from"]) {
// This next context overlaps with the previous one
// Extend the last one instead of adding a new one
// Find the distance between the current one and the last one
$distance = $matches[$scanpos][1] - $matches[$scanpos - 1][1];
// The array pointer is pointing at the last element now because we called end() above
// Store it if the distance is below the threshold
if($distance < $settings->search_characters_context)
$group[] = $matches[$scanpos];
else
break;
$scanpos++;
// Update the total context length counter appropriately
$total_context_length += $next_context["to"] - $contexts[key($contexts)]["to"];
$contexts[key($contexts)]["to"] = $next_context["to"];
}
else { // No overlap here! Business as usual.
$contexts[] = $next_context;
// Update the total context length counter as normal
$total_context_length += $next_context["to"] - $next_context["from"];
}
$context_start = $group[0][1] - $settings->search_characters_context;
$context_end = $group[count($group) - 1][1] + $settings->search_characters_context;
if($context_start < 0) $context_start = 0;
if($context_end > $sourceLength) $context_end = $sourceLength;
//echo("Got context. Start: $context_start, End: $context_end\n");
//echo("Group:"); var_dump($group);
$context = substr($source, $context_start, $context_end - $context_start);
// Strip the markdown from the context - it's most likely going to
// be broken anyway.
//$context = self::strip_markup($context);
// Escape special characters to protect against attacks
$context = htmlentities($context);
$contexts[] = $context;
$basepos = $scanpos + 1;
end($contexts);
$last_context = &$contexts[key($contexts)];
if($total_context_length > $settings->search_characters_context_total) {
// We've reached the limit on the number of characters this context should contain. Trim off the context to fit and break out
$last_context["to"] -= $total_context_length - $settings->search_characters_context_total;
break;
}
}
return implode(" ... ", $contexts);
$contexts_text = [];
foreach($contexts as $context) {
$contexts_text[] = substr($source, $context["from"], $context["to"] - $context["from"]);
}
return implode(" ... ", $contexts_text);
}
/**

View File

@ -149,7 +149,8 @@
"max_preview_size": {"type": "number", "description": "The maximum allowed size of generated preview images in pixels.", "default": 2048},
"avatars_show": {"type": "checkbox", "description": "Whether or not to show avatars requires the 'user-preferences' and 'upload' modules, though uploads themselvess can be turned off so long as all avatars have already been uploaded - it's only the 'preview' action that's actually used.", "default": true},
"avatars_size": {"type": "number", "description": "The image size to render avatars at. Does not affect the size they're stored at - only the inline rendered size (e.g. on the recent changes page etc.)", "default": 32},
"search_characters_context": {"type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 200},
"search_characters_context": {"type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 75},
"search_characters_context_total": {"type": "number", "description": "The total number of characters that a search result context should display at most.", "default": 250},
"search_title_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10},
"search_tags_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3},
"dynamic_page_suggestion_count": {"type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 },