mirror of
https://github.com/sbrl/Pepperminty-Wiki.git
synced 2024-11-22 04:23:01 +00:00
Optimise the search context extractor, but evidently there's more work to be done.
This commit is contained in:
parent
93494b6729
commit
75b6b6c55f
5 changed files with 77 additions and 94 deletions
|
@ -21,6 +21,9 @@ This file holds the changelog for Pepperminty Wiki. This is the master list of t
|
||||||
- Added `password_cost`, `password_cost_time`, and `password_cost_time_interval` settings
|
- Added `password_cost`, `password_cost_time`, and `password_cost_time_interval` settings
|
||||||
- `password_cost` is recalculated automatically every week by default (it keeps track of this via the `password_cost_time_lastcheck` 'setting')
|
- `password_cost` is recalculated automatically every week by default (it keeps track of this via the `password_cost_time_lastcheck` 'setting')
|
||||||
- The `css` setting will now keep a value of auto, even when `peppermint.json` is automatically updated by _Pepperminty Wiki_.
|
- The `css` setting will now keep a value of auto, even when `peppermint.json` is automatically updated by _Pepperminty Wiki_.
|
||||||
|
- Optimised the search context extractor.
|
||||||
|
- Tuned the default value for `search_characters_context` down to 75 (this won't be the case for existing wikis, so you'll need to adjust it manually)
|
||||||
|
- Added new `search_characters_context_total` setting to control the maximum characters in a search context
|
||||||
|
|
||||||
## v0.16
|
## v0.16
|
||||||
_(No changes since v0.16-beta1)_
|
_(No changes since v0.16-beta1)_
|
||||||
|
|
|
@ -172,7 +172,8 @@ $guiConfig = <<<'GUICONFIG'
|
||||||
"max_preview_size": {"type": "number", "description": "The maximum allowed size of generated preview images in pixels.", "default": 2048},
|
"max_preview_size": {"type": "number", "description": "The maximum allowed size of generated preview images in pixels.", "default": 2048},
|
||||||
"avatars_show": {"type": "checkbox", "description": "Whether or not to show avatars requires the 'user-preferences' and 'upload' modules, though uploads themselvess can be turned off so long as all avatars have already been uploaded - it's only the 'preview' action that's actually used.", "default": true},
|
"avatars_show": {"type": "checkbox", "description": "Whether or not to show avatars requires the 'user-preferences' and 'upload' modules, though uploads themselvess can be turned off so long as all avatars have already been uploaded - it's only the 'preview' action that's actually used.", "default": true},
|
||||||
"avatars_size": {"type": "number", "description": "The image size to render avatars at. Does not affect the size they're stored at - only the inline rendered size (e.g. on the recent changes page etc.)", "default": 32},
|
"avatars_size": {"type": "number", "description": "The image size to render avatars at. Does not affect the size they're stored at - only the inline rendered size (e.g. on the recent changes page etc.)", "default": 32},
|
||||||
"search_characters_context": {"type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 200},
|
"search_characters_context": {"type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 75},
|
||||||
|
"search_characters_context_total": {"type": "number", "description": "The total number of characters that a search result context should display at most.", "default": 250},
|
||||||
"search_title_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10},
|
"search_title_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10},
|
||||||
"search_tags_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3},
|
"search_tags_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3},
|
||||||
"dynamic_page_suggestion_count": {"type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 },
|
"dynamic_page_suggestion_count": {"type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 },
|
||||||
|
@ -396,7 +397,7 @@ if($settings->sessionprefix == "auto")
|
||||||
/////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
/** The version of Pepperminty Wiki currently running. */
|
/** The version of Pepperminty Wiki currently running. */
|
||||||
$version = "v0.17-dev";
|
$version = "v0.17-dev";
|
||||||
$commit = "49b91aa6f999409dbcf6d165f379ebb95fec7dcb";
|
$commit = "93494b672938d2fb456138e03db5be3803bc51b7";
|
||||||
/// Environment ///
|
/// Environment ///
|
||||||
/** Holds information about the current request environment. */
|
/** Holds information about the current request environment. */
|
||||||
$env = new stdClass();
|
$env = new stdClass();
|
||||||
|
@ -4433,6 +4434,7 @@ class search
|
||||||
if(isset($invindex[$qterm]))
|
if(isset($invindex[$qterm]))
|
||||||
{
|
{
|
||||||
// Loop over each page in the inverted index entry
|
// Loop over each page in the inverted index entry
|
||||||
|
reset($invindex); // Reset array/object pointer
|
||||||
foreach($invindex[$qterm] as $pageid => $page_entry)
|
foreach($invindex[$qterm] as $pageid => $page_entry)
|
||||||
{
|
{
|
||||||
// Create an entry in the matching pages array if it doesn't exist
|
// Create an entry in the matching pages array if it doesn't exist
|
||||||
|
@ -4444,6 +4446,7 @@ class search
|
||||||
|
|
||||||
|
|
||||||
// Loop over the pageindex and search the titles / tags
|
// Loop over the pageindex and search the titles / tags
|
||||||
|
reset($pageindex); // Reset array/object pointer
|
||||||
foreach ($pageindex as $pagename => $pagedata)
|
foreach ($pageindex as $pagename => $pagedata)
|
||||||
{
|
{
|
||||||
// Get the current page's id
|
// Get the current page's id
|
||||||
|
@ -4565,9 +4568,7 @@ class search
|
||||||
if($all_offsets === false)
|
if($all_offsets === false)
|
||||||
continue;
|
continue;
|
||||||
foreach($all_offsets as $offset)
|
foreach($all_offsets as $offset)
|
||||||
{
|
|
||||||
$matches[] = [ $nterm, $offset ];
|
$matches[] = [ $nterm, $offset ];
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort the matches by offset
|
// Sort the matches by offset
|
||||||
|
@ -4579,58 +4580,47 @@ class search
|
||||||
$sourceLength = mb_strlen($source);
|
$sourceLength = mb_strlen($source);
|
||||||
|
|
||||||
$contexts = [];
|
$contexts = [];
|
||||||
$basepos = 0;
|
|
||||||
$matches_count = count($matches);
|
$matches_count = count($matches);
|
||||||
while($basepos < $matches_count)
|
$total_context_length = 0;
|
||||||
{
|
for($i = 0; $i < $matches_count; $i++) {
|
||||||
// Store the next match along - all others will be relative to that one
|
$next_context = [
|
||||||
$group = [$matches[$basepos]];
|
"from" => max(0, $matches[$i][1] - $settings->search_characters_context),
|
||||||
|
"to" => min($sourceLength, $matches[$i][1] + count($matches[0]) + $settings->search_characters_context)
|
||||||
|
];
|
||||||
|
|
||||||
// Start scanning at the next one along - we always store the first match
|
if(end($contexts) !== false && end($contexts)["to"] > $next_context["from"]) {
|
||||||
$scanpos = $basepos + 1;
|
// This next context overlaps with the previous one
|
||||||
$distance = 0;
|
// Extend the last one instead of adding a new one
|
||||||
|
|
||||||
while(true)
|
|
||||||
{
|
|
||||||
// Break out if we reach the end
|
|
||||||
if($scanpos >= $matches_count) break;
|
|
||||||
|
|
||||||
// Find the distance between the current one and the last one
|
// The array pointer is pointing at the last element now because we called end() above
|
||||||
$distance = $matches[$scanpos][1] - $matches[$scanpos - 1][1];
|
|
||||||
|
|
||||||
// Store it if the distance is below the threshold
|
// Update the total context length counter appropriately
|
||||||
if($distance < $settings->search_characters_context)
|
$total_context_length += $next_context["to"] - $contexts[key($contexts)]["to"];
|
||||||
$group[] = $matches[$scanpos];
|
$contexts[key($contexts)]["to"] = $next_context["to"];
|
||||||
else
|
}
|
||||||
break;
|
else { // No overlap here! Business as usual.
|
||||||
|
$contexts[] = $next_context;
|
||||||
$scanpos++;
|
// Update the total context length counter as normal
|
||||||
|
$total_context_length += $next_context["to"] - $next_context["from"];
|
||||||
}
|
}
|
||||||
|
|
||||||
$context_start = $group[0][1] - $settings->search_characters_context;
|
|
||||||
$context_end = $group[count($group) - 1][1] + $settings->search_characters_context;
|
|
||||||
|
|
||||||
if($context_start < 0) $context_start = 0;
|
end($contexts);
|
||||||
if($context_end > $sourceLength) $context_end = $sourceLength;
|
$last_context = &$contexts[key($contexts)];
|
||||||
|
if($total_context_length > $settings->search_characters_context_total) {
|
||||||
//echo("Got context. Start: $context_start, End: $context_end\n");
|
// We've reached the limit on the number of characters this context should contain. Trim off the context to fit and break out
|
||||||
//echo("Group:"); var_dump($group);
|
$last_context["to"] -= $total_context_length - $settings->search_characters_context_total;
|
||||||
|
break;
|
||||||
$context = substr($source, $context_start, $context_end - $context_start);
|
}
|
||||||
|
|
||||||
// Strip the markdown from the context - it's most likely going to
|
|
||||||
// be broken anyway.
|
|
||||||
//$context = self::strip_markup($context);
|
|
||||||
|
|
||||||
// Escape special characters to protect against attacks
|
|
||||||
$context = htmlentities($context);
|
|
||||||
|
|
||||||
$contexts[] = $context;
|
|
||||||
|
|
||||||
$basepos = $scanpos + 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return implode(" ... ", $contexts);
|
$contexts_text = [];
|
||||||
|
foreach($contexts as $context) {
|
||||||
|
$contexts_text[] = substr($source, $context["from"], $context["to"] - $context["from"]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return implode(" ... ", $contexts_text);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -104,7 +104,7 @@
|
||||||
"author": "Starbeamrainbowlabs",
|
"author": "Starbeamrainbowlabs",
|
||||||
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
|
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
|
||||||
"id": "feature-search",
|
"id": "feature-search",
|
||||||
"lastupdate": 1529964034,
|
"lastupdate": 1529967930,
|
||||||
"optional": false
|
"optional": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -790,6 +790,7 @@ class search
|
||||||
if(isset($invindex[$qterm]))
|
if(isset($invindex[$qterm]))
|
||||||
{
|
{
|
||||||
// Loop over each page in the inverted index entry
|
// Loop over each page in the inverted index entry
|
||||||
|
reset($invindex); // Reset array/object pointer
|
||||||
foreach($invindex[$qterm] as $pageid => $page_entry)
|
foreach($invindex[$qterm] as $pageid => $page_entry)
|
||||||
{
|
{
|
||||||
// Create an entry in the matching pages array if it doesn't exist
|
// Create an entry in the matching pages array if it doesn't exist
|
||||||
|
@ -801,6 +802,7 @@ class search
|
||||||
|
|
||||||
|
|
||||||
// Loop over the pageindex and search the titles / tags
|
// Loop over the pageindex and search the titles / tags
|
||||||
|
reset($pageindex); // Reset array/object pointer
|
||||||
foreach ($pageindex as $pagename => $pagedata)
|
foreach ($pageindex as $pagename => $pagedata)
|
||||||
{
|
{
|
||||||
// Get the current page's id
|
// Get the current page's id
|
||||||
|
@ -922,9 +924,7 @@ class search
|
||||||
if($all_offsets === false)
|
if($all_offsets === false)
|
||||||
continue;
|
continue;
|
||||||
foreach($all_offsets as $offset)
|
foreach($all_offsets as $offset)
|
||||||
{
|
|
||||||
$matches[] = [ $nterm, $offset ];
|
$matches[] = [ $nterm, $offset ];
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort the matches by offset
|
// Sort the matches by offset
|
||||||
|
@ -936,58 +936,47 @@ class search
|
||||||
$sourceLength = mb_strlen($source);
|
$sourceLength = mb_strlen($source);
|
||||||
|
|
||||||
$contexts = [];
|
$contexts = [];
|
||||||
$basepos = 0;
|
|
||||||
$matches_count = count($matches);
|
$matches_count = count($matches);
|
||||||
while($basepos < $matches_count)
|
$total_context_length = 0;
|
||||||
{
|
for($i = 0; $i < $matches_count; $i++) {
|
||||||
// Store the next match along - all others will be relative to that one
|
$next_context = [
|
||||||
$group = [$matches[$basepos]];
|
"from" => max(0, $matches[$i][1] - $settings->search_characters_context),
|
||||||
|
"to" => min($sourceLength, $matches[$i][1] + count($matches[0]) + $settings->search_characters_context)
|
||||||
|
];
|
||||||
|
|
||||||
// Start scanning at the next one along - we always store the first match
|
if(end($contexts) !== false && end($contexts)["to"] > $next_context["from"]) {
|
||||||
$scanpos = $basepos + 1;
|
// This next context overlaps with the previous one
|
||||||
$distance = 0;
|
// Extend the last one instead of adding a new one
|
||||||
|
|
||||||
while(true)
|
|
||||||
{
|
|
||||||
// Break out if we reach the end
|
|
||||||
if($scanpos >= $matches_count) break;
|
|
||||||
|
|
||||||
// Find the distance between the current one and the last one
|
// The array pointer is pointing at the last element now because we called end() above
|
||||||
$distance = $matches[$scanpos][1] - $matches[$scanpos - 1][1];
|
|
||||||
|
|
||||||
// Store it if the distance is below the threshold
|
// Update the total context length counter appropriately
|
||||||
if($distance < $settings->search_characters_context)
|
$total_context_length += $next_context["to"] - $contexts[key($contexts)]["to"];
|
||||||
$group[] = $matches[$scanpos];
|
$contexts[key($contexts)]["to"] = $next_context["to"];
|
||||||
else
|
}
|
||||||
break;
|
else { // No overlap here! Business as usual.
|
||||||
|
$contexts[] = $next_context;
|
||||||
$scanpos++;
|
// Update the total context length counter as normal
|
||||||
|
$total_context_length += $next_context["to"] - $next_context["from"];
|
||||||
}
|
}
|
||||||
|
|
||||||
$context_start = $group[0][1] - $settings->search_characters_context;
|
|
||||||
$context_end = $group[count($group) - 1][1] + $settings->search_characters_context;
|
|
||||||
|
|
||||||
if($context_start < 0) $context_start = 0;
|
end($contexts);
|
||||||
if($context_end > $sourceLength) $context_end = $sourceLength;
|
$last_context = &$contexts[key($contexts)];
|
||||||
|
if($total_context_length > $settings->search_characters_context_total) {
|
||||||
//echo("Got context. Start: $context_start, End: $context_end\n");
|
// We've reached the limit on the number of characters this context should contain. Trim off the context to fit and break out
|
||||||
//echo("Group:"); var_dump($group);
|
$last_context["to"] -= $total_context_length - $settings->search_characters_context_total;
|
||||||
|
break;
|
||||||
$context = substr($source, $context_start, $context_end - $context_start);
|
}
|
||||||
|
|
||||||
// Strip the markdown from the context - it's most likely going to
|
|
||||||
// be broken anyway.
|
|
||||||
//$context = self::strip_markup($context);
|
|
||||||
|
|
||||||
// Escape special characters to protect against attacks
|
|
||||||
$context = htmlentities($context);
|
|
||||||
|
|
||||||
$contexts[] = $context;
|
|
||||||
|
|
||||||
$basepos = $scanpos + 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return implode(" ... ", $contexts);
|
$contexts_text = [];
|
||||||
|
foreach($contexts as $context) {
|
||||||
|
$contexts_text[] = substr($source, $context["from"], $context["to"] - $context["from"]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return implode(" ... ", $contexts_text);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -149,7 +149,8 @@
|
||||||
"max_preview_size": {"type": "number", "description": "The maximum allowed size of generated preview images in pixels.", "default": 2048},
|
"max_preview_size": {"type": "number", "description": "The maximum allowed size of generated preview images in pixels.", "default": 2048},
|
||||||
"avatars_show": {"type": "checkbox", "description": "Whether or not to show avatars requires the 'user-preferences' and 'upload' modules, though uploads themselvess can be turned off so long as all avatars have already been uploaded - it's only the 'preview' action that's actually used.", "default": true},
|
"avatars_show": {"type": "checkbox", "description": "Whether or not to show avatars requires the 'user-preferences' and 'upload' modules, though uploads themselvess can be turned off so long as all avatars have already been uploaded - it's only the 'preview' action that's actually used.", "default": true},
|
||||||
"avatars_size": {"type": "number", "description": "The image size to render avatars at. Does not affect the size they're stored at - only the inline rendered size (e.g. on the recent changes page etc.)", "default": 32},
|
"avatars_size": {"type": "number", "description": "The image size to render avatars at. Does not affect the size they're stored at - only the inline rendered size (e.g. on the recent changes page etc.)", "default": 32},
|
||||||
"search_characters_context": {"type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 200},
|
"search_characters_context": {"type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 75},
|
||||||
|
"search_characters_context_total": {"type": "number", "description": "The total number of characters that a search result context should display at most.", "default": 250},
|
||||||
"search_title_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10},
|
"search_title_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10},
|
||||||
"search_tags_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3},
|
"search_tags_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3},
|
||||||
"dynamic_page_suggestion_count": {"type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 },
|
"dynamic_page_suggestion_count": {"type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 },
|
||||||
|
|
Loading…
Reference in a new issue