mirror of
https://github.com/sbrl/Pepperminty-Wiki.git
synced 2024-12-22 13:45:02 +00:00
Optimise the search context extractor, but evidently there's more work to be done.
This commit is contained in:
parent
93494b6729
commit
75b6b6c55f
5 changed files with 77 additions and 94 deletions
|
@ -21,6 +21,9 @@ This file holds the changelog for Pepperminty Wiki. This is the master list of t
|
|||
- Added `password_cost`, `password_cost_time`, and `password_cost_time_interval` settings
|
||||
- `password_cost` is recalculated automatically every week by default (it keeps track of this via the `password_cost_time_lastcheck` 'setting')
|
||||
- The `css` setting will now keep a value of auto, even when `peppermint.json` is automatically updated by _Pepperminty Wiki_.
|
||||
- Optimised the search context extractor.
|
||||
- Tuned the default value for `search_characters_context` down to 75 (this won't be the case for existing wikis, so you'll need to adjust it manually)
|
||||
- Added new `search_characters_context_total` setting to control the maximum characters in a search context
|
||||
|
||||
## v0.16
|
||||
_(No changes since v0.16-beta1)_
|
||||
|
|
|
@ -172,7 +172,8 @@ $guiConfig = <<<'GUICONFIG'
|
|||
"max_preview_size": {"type": "number", "description": "The maximum allowed size of generated preview images in pixels.", "default": 2048},
|
||||
"avatars_show": {"type": "checkbox", "description": "Whether or not to show avatars requires the 'user-preferences' and 'upload' modules, though uploads themselvess can be turned off so long as all avatars have already been uploaded - it's only the 'preview' action that's actually used.", "default": true},
|
||||
"avatars_size": {"type": "number", "description": "The image size to render avatars at. Does not affect the size they're stored at - only the inline rendered size (e.g. on the recent changes page etc.)", "default": 32},
|
||||
"search_characters_context": {"type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 200},
|
||||
"search_characters_context": {"type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 75},
|
||||
"search_characters_context_total": {"type": "number", "description": "The total number of characters that a search result context should display at most.", "default": 250},
|
||||
"search_title_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10},
|
||||
"search_tags_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3},
|
||||
"dynamic_page_suggestion_count": {"type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 },
|
||||
|
@ -396,7 +397,7 @@ if($settings->sessionprefix == "auto")
|
|||
/////////////////////////////////////////////////////////////////////////////
|
||||
/** The version of Pepperminty Wiki currently running. */
|
||||
$version = "v0.17-dev";
|
||||
$commit = "49b91aa6f999409dbcf6d165f379ebb95fec7dcb";
|
||||
$commit = "93494b672938d2fb456138e03db5be3803bc51b7";
|
||||
/// Environment ///
|
||||
/** Holds information about the current request environment. */
|
||||
$env = new stdClass();
|
||||
|
@ -4433,6 +4434,7 @@ class search
|
|||
if(isset($invindex[$qterm]))
|
||||
{
|
||||
// Loop over each page in the inverted index entry
|
||||
reset($invindex); // Reset array/object pointer
|
||||
foreach($invindex[$qterm] as $pageid => $page_entry)
|
||||
{
|
||||
// Create an entry in the matching pages array if it doesn't exist
|
||||
|
@ -4444,6 +4446,7 @@ class search
|
|||
|
||||
|
||||
// Loop over the pageindex and search the titles / tags
|
||||
reset($pageindex); // Reset array/object pointer
|
||||
foreach ($pageindex as $pagename => $pagedata)
|
||||
{
|
||||
// Get the current page's id
|
||||
|
@ -4565,9 +4568,7 @@ class search
|
|||
if($all_offsets === false)
|
||||
continue;
|
||||
foreach($all_offsets as $offset)
|
||||
{
|
||||
$matches[] = [ $nterm, $offset ];
|
||||
}
|
||||
}
|
||||
|
||||
// Sort the matches by offset
|
||||
|
@ -4579,58 +4580,47 @@ class search
|
|||
$sourceLength = mb_strlen($source);
|
||||
|
||||
$contexts = [];
|
||||
$basepos = 0;
|
||||
|
||||
$matches_count = count($matches);
|
||||
while($basepos < $matches_count)
|
||||
{
|
||||
// Store the next match along - all others will be relative to that one
|
||||
$group = [$matches[$basepos]];
|
||||
$total_context_length = 0;
|
||||
for($i = 0; $i < $matches_count; $i++) {
|
||||
$next_context = [
|
||||
"from" => max(0, $matches[$i][1] - $settings->search_characters_context),
|
||||
"to" => min($sourceLength, $matches[$i][1] + count($matches[0]) + $settings->search_characters_context)
|
||||
];
|
||||
|
||||
// Start scanning at the next one along - we always store the first match
|
||||
$scanpos = $basepos + 1;
|
||||
$distance = 0;
|
||||
|
||||
while(true)
|
||||
{
|
||||
// Break out if we reach the end
|
||||
if($scanpos >= $matches_count) break;
|
||||
if(end($contexts) !== false && end($contexts)["to"] > $next_context["from"]) {
|
||||
// This next context overlaps with the previous one
|
||||
// Extend the last one instead of adding a new one
|
||||
|
||||
// Find the distance between the current one and the last one
|
||||
$distance = $matches[$scanpos][1] - $matches[$scanpos - 1][1];
|
||||
// The array pointer is pointing at the last element now because we called end() above
|
||||
|
||||
// Store it if the distance is below the threshold
|
||||
if($distance < $settings->search_characters_context)
|
||||
$group[] = $matches[$scanpos];
|
||||
else
|
||||
break;
|
||||
|
||||
$scanpos++;
|
||||
// Update the total context length counter appropriately
|
||||
$total_context_length += $next_context["to"] - $contexts[key($contexts)]["to"];
|
||||
$contexts[key($contexts)]["to"] = $next_context["to"];
|
||||
}
|
||||
else { // No overlap here! Business as usual.
|
||||
$contexts[] = $next_context;
|
||||
// Update the total context length counter as normal
|
||||
$total_context_length += $next_context["to"] - $next_context["from"];
|
||||
}
|
||||
|
||||
$context_start = $group[0][1] - $settings->search_characters_context;
|
||||
$context_end = $group[count($group) - 1][1] + $settings->search_characters_context;
|
||||
|
||||
if($context_start < 0) $context_start = 0;
|
||||
if($context_end > $sourceLength) $context_end = $sourceLength;
|
||||
|
||||
//echo("Got context. Start: $context_start, End: $context_end\n");
|
||||
//echo("Group:"); var_dump($group);
|
||||
|
||||
$context = substr($source, $context_start, $context_end - $context_start);
|
||||
|
||||
// Strip the markdown from the context - it's most likely going to
|
||||
// be broken anyway.
|
||||
//$context = self::strip_markup($context);
|
||||
|
||||
// Escape special characters to protect against attacks
|
||||
$context = htmlentities($context);
|
||||
|
||||
$contexts[] = $context;
|
||||
|
||||
$basepos = $scanpos + 1;
|
||||
end($contexts);
|
||||
$last_context = &$contexts[key($contexts)];
|
||||
if($total_context_length > $settings->search_characters_context_total) {
|
||||
// We've reached the limit on the number of characters this context should contain. Trim off the context to fit and break out
|
||||
$last_context["to"] -= $total_context_length - $settings->search_characters_context_total;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return implode(" ... ", $contexts);
|
||||
$contexts_text = [];
|
||||
foreach($contexts as $context) {
|
||||
$contexts_text[] = substr($source, $context["from"], $context["to"] - $context["from"]);
|
||||
}
|
||||
|
||||
return implode(" ... ", $contexts_text);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -104,7 +104,7 @@
|
|||
"author": "Starbeamrainbowlabs",
|
||||
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
|
||||
"id": "feature-search",
|
||||
"lastupdate": 1529964034,
|
||||
"lastupdate": 1529967930,
|
||||
"optional": false
|
||||
},
|
||||
{
|
||||
|
|
|
@ -790,6 +790,7 @@ class search
|
|||
if(isset($invindex[$qterm]))
|
||||
{
|
||||
// Loop over each page in the inverted index entry
|
||||
reset($invindex); // Reset array/object pointer
|
||||
foreach($invindex[$qterm] as $pageid => $page_entry)
|
||||
{
|
||||
// Create an entry in the matching pages array if it doesn't exist
|
||||
|
@ -801,6 +802,7 @@ class search
|
|||
|
||||
|
||||
// Loop over the pageindex and search the titles / tags
|
||||
reset($pageindex); // Reset array/object pointer
|
||||
foreach ($pageindex as $pagename => $pagedata)
|
||||
{
|
||||
// Get the current page's id
|
||||
|
@ -922,9 +924,7 @@ class search
|
|||
if($all_offsets === false)
|
||||
continue;
|
||||
foreach($all_offsets as $offset)
|
||||
{
|
||||
$matches[] = [ $nterm, $offset ];
|
||||
}
|
||||
}
|
||||
|
||||
// Sort the matches by offset
|
||||
|
@ -936,58 +936,47 @@ class search
|
|||
$sourceLength = mb_strlen($source);
|
||||
|
||||
$contexts = [];
|
||||
$basepos = 0;
|
||||
|
||||
$matches_count = count($matches);
|
||||
while($basepos < $matches_count)
|
||||
{
|
||||
// Store the next match along - all others will be relative to that one
|
||||
$group = [$matches[$basepos]];
|
||||
$total_context_length = 0;
|
||||
for($i = 0; $i < $matches_count; $i++) {
|
||||
$next_context = [
|
||||
"from" => max(0, $matches[$i][1] - $settings->search_characters_context),
|
||||
"to" => min($sourceLength, $matches[$i][1] + count($matches[0]) + $settings->search_characters_context)
|
||||
];
|
||||
|
||||
// Start scanning at the next one along - we always store the first match
|
||||
$scanpos = $basepos + 1;
|
||||
$distance = 0;
|
||||
|
||||
while(true)
|
||||
{
|
||||
// Break out if we reach the end
|
||||
if($scanpos >= $matches_count) break;
|
||||
if(end($contexts) !== false && end($contexts)["to"] > $next_context["from"]) {
|
||||
// This next context overlaps with the previous one
|
||||
// Extend the last one instead of adding a new one
|
||||
|
||||
// Find the distance between the current one and the last one
|
||||
$distance = $matches[$scanpos][1] - $matches[$scanpos - 1][1];
|
||||
// The array pointer is pointing at the last element now because we called end() above
|
||||
|
||||
// Store it if the distance is below the threshold
|
||||
if($distance < $settings->search_characters_context)
|
||||
$group[] = $matches[$scanpos];
|
||||
else
|
||||
break;
|
||||
|
||||
$scanpos++;
|
||||
// Update the total context length counter appropriately
|
||||
$total_context_length += $next_context["to"] - $contexts[key($contexts)]["to"];
|
||||
$contexts[key($contexts)]["to"] = $next_context["to"];
|
||||
}
|
||||
else { // No overlap here! Business as usual.
|
||||
$contexts[] = $next_context;
|
||||
// Update the total context length counter as normal
|
||||
$total_context_length += $next_context["to"] - $next_context["from"];
|
||||
}
|
||||
|
||||
$context_start = $group[0][1] - $settings->search_characters_context;
|
||||
$context_end = $group[count($group) - 1][1] + $settings->search_characters_context;
|
||||
|
||||
if($context_start < 0) $context_start = 0;
|
||||
if($context_end > $sourceLength) $context_end = $sourceLength;
|
||||
|
||||
//echo("Got context. Start: $context_start, End: $context_end\n");
|
||||
//echo("Group:"); var_dump($group);
|
||||
|
||||
$context = substr($source, $context_start, $context_end - $context_start);
|
||||
|
||||
// Strip the markdown from the context - it's most likely going to
|
||||
// be broken anyway.
|
||||
//$context = self::strip_markup($context);
|
||||
|
||||
// Escape special characters to protect against attacks
|
||||
$context = htmlentities($context);
|
||||
|
||||
$contexts[] = $context;
|
||||
|
||||
$basepos = $scanpos + 1;
|
||||
end($contexts);
|
||||
$last_context = &$contexts[key($contexts)];
|
||||
if($total_context_length > $settings->search_characters_context_total) {
|
||||
// We've reached the limit on the number of characters this context should contain. Trim off the context to fit and break out
|
||||
$last_context["to"] -= $total_context_length - $settings->search_characters_context_total;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return implode(" ... ", $contexts);
|
||||
$contexts_text = [];
|
||||
foreach($contexts as $context) {
|
||||
$contexts_text[] = substr($source, $context["from"], $context["to"] - $context["from"]);
|
||||
}
|
||||
|
||||
return implode(" ... ", $contexts_text);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -149,7 +149,8 @@
|
|||
"max_preview_size": {"type": "number", "description": "The maximum allowed size of generated preview images in pixels.", "default": 2048},
|
||||
"avatars_show": {"type": "checkbox", "description": "Whether or not to show avatars requires the 'user-preferences' and 'upload' modules, though uploads themselvess can be turned off so long as all avatars have already been uploaded - it's only the 'preview' action that's actually used.", "default": true},
|
||||
"avatars_size": {"type": "number", "description": "The image size to render avatars at. Does not affect the size they're stored at - only the inline rendered size (e.g. on the recent changes page etc.)", "default": 32},
|
||||
"search_characters_context": {"type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 200},
|
||||
"search_characters_context": {"type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 75},
|
||||
"search_characters_context_total": {"type": "number", "description": "The total number of characters that a search result context should display at most.", "default": 250},
|
||||
"search_title_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10},
|
||||
"search_tags_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3},
|
||||
"dynamic_page_suggestion_count": {"type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 },
|
||||
|
|
Loading…
Reference in a new issue