Optimise the search context extractor, but evidently there's more work to be done.

2024-11-22 04:23:01 +00:00 · 2018-06-26 00:06:20 +01:00 · 2018-06-26 00:06:20 +01:00 · 75b6b6c55f
commit 75b6b6c55f
parent 93494b6729
5 changed files with 77 additions and 94 deletions
--- a/Changelog.md
+++ b/Changelog.md
@ -21,6 +21,9 @@ This file holds the changelog for Pepperminty Wiki. This is the master list of t
     - Added `password_cost`, `password_cost_time`, and `password_cost_time_interval` settings
     - `password_cost` is recalculated automatically every week by default (it keeps track of this via the `password_cost_time_lastcheck` 'setting')
 - The `css` setting will now keep a value of auto, even when `peppermint.json` is automatically updated by _Pepperminty Wiki_.
+ - Optimised the search context extractor.
+     - Tuned the default value for `search_characters_context` down to 75 (this won't be the case for existing wikis, so you'll need to adjust it manually)
+     - Added new `search_characters_context_total` setting to control the maximum characters in a search context

 ## v0.16
 _(No changes since v0.16-beta1)_
--- a/build/index.php
+++ b/build/index.php
@ -172,7 +172,8 @@ $guiConfig = <<<'GUICONFIG'
 	"max_preview_size": {"type": "number", "description": "The maximum allowed size of generated preview images in pixels.", "default": 2048},
 	"avatars_show": {"type": "checkbox", "description": "Whether or not to show avatars requires the 'user-preferences' and 'upload' modules, though uploads themselvess can be turned off so long as all avatars have already been uploaded - it's only the 'preview' action that's actually used.", "default": true},
 	"avatars_size": {"type": "number", "description": "The image size to render avatars at. Does not affect the size they're stored at - only the inline rendered size (e.g. on the recent changes page etc.)", "default": 32},
-	"search_characters_context": {"type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 200},
+	"search_characters_context": {"type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 75},
+	"search_characters_context_total": {"type": "number", "description": "The total number of characters that a search result context should display at most.", "default": 250},
 	"search_title_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10},
 	"search_tags_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3},
 	"dynamic_page_suggestion_count": {"type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 },
@ -396,7 +397,7 @@ if($settings->sessionprefix == "auto")
 /////////////////////////////////////////////////////////////////////////////
 /** The version of Pepperminty Wiki currently running. */
 $version = "v0.17-dev";
-$commit = "49b91aa6f999409dbcf6d165f379ebb95fec7dcb";
+$commit = "93494b672938d2fb456138e03db5be3803bc51b7";
 /// Environment ///
 /** Holds information about the current request environment. */
 $env = new stdClass();
@ -4433,6 +4434,7 @@ class search
 			if(isset($invindex[$qterm]))
 			{
 				// Loop over each page in the inverted index entry
+				reset($invindex); // Reset array/object pointer
 				foreach($invindex[$qterm] as $pageid => $page_entry)
 				{
 					// Create an entry in the matching pages array if it doesn't exist
@ -4444,6 +4446,7 @@ class search
 			
 			
 			// Loop over the pageindex and search the titles / tags
+			reset($pageindex); // Reset array/object pointer
 			foreach ($pageindex as $pagename => $pagedata)
 			{
 				// Get the current page's id
@ -4565,9 +4568,7 @@ class search
 			if($all_offsets === false)
 				continue;
 			foreach($all_offsets as $offset)
-			{
 				$matches[] = [ $nterm, $offset ];
-			}
 		}
 		
 		// Sort the matches by offset
@ -4579,58 +4580,47 @@ class search
 		$sourceLength = mb_strlen($source);
 		
 		$contexts = [];
-		$basepos = 0;
+		
 		$matches_count = count($matches);
-		while($basepos < $matches_count)
-		{
-			// Store the next match along - all others will be relative to that one
-			$group = [$matches[$basepos]];
+		$total_context_length = 0;
+		for($i = 0; $i < $matches_count; $i++) {
+			$next_context = [
+				"from" => max(0, $matches[$i][1] - $settings->search_characters_context),
+				"to" => min($sourceLength, $matches[$i][1] + count($matches[0]) + $settings->search_characters_context)
+			];
 			
-			// Start scanning at the next one along - we always store the first match
-			$scanpos = $basepos + 1;
-			$distance = 0;
+			if(end($contexts) !== false && end($contexts)["to"] > $next_context["from"]) {
+				// This next context overlaps with the previous one
+				// Extend the last one instead of adding a new one
 				
-			while(true)
-			{
-				// Break out if we reach the end
-				if($scanpos >= $matches_count) break;
+				// The array pointer is pointing at the last element now because we called end() above
 				
-				// Find the distance between the current one and the last one
-				$distance = $matches[$scanpos][1] - $matches[$scanpos - 1][1];
-				
-				// Store it if the distance is below the threshold
-				if($distance < $settings->search_characters_context)
-					$group[] = $matches[$scanpos];
-				else
-					break;
-				
-				$scanpos++;
+				// Update the total context length counter appropriately
+				$total_context_length += $next_context["to"] - $contexts[key($contexts)]["to"];
+				$contexts[key($contexts)]["to"] = $next_context["to"];
+			}
+			else { // No overlap here! Business as usual.
+				$contexts[] = $next_context;
+				// Update the total context length counter as normal
+				$total_context_length += $next_context["to"] - $next_context["from"];
 			}
 			
-			$context_start = $group[0][1] - $settings->search_characters_context;
-			$context_end = $group[count($group) - 1][1] + $settings->search_characters_context;
 			
-			if($context_start < 0) $context_start = 0;
-			if($context_end > $sourceLength) $context_end = $sourceLength;
-			
-			//echo("Got context. Start: $context_start, End: $context_end\n");
-			//echo("Group:"); var_dump($group);
-			
-			$context = substr($source, $context_start, $context_end - $context_start);
-			
-			// Strip the markdown from the context - it's most likely going to
-			// be broken anyway.
-			//$context = self::strip_markup($context);
-			
-			// Escape special characters to protect against attacks
-			$context = htmlentities($context);
-			
-			$contexts[] = $context;
-			
-			$basepos = $scanpos + 1;
+			end($contexts);
+			$last_context = &$contexts[key($contexts)];
+			if($total_context_length > $settings->search_characters_context_total) {
+				// We've reached the limit on the number of characters this context should contain. Trim off the context to fit and break out
+				$last_context["to"] -= $total_context_length - $settings->search_characters_context_total;
+				break;
+			}
 		}
 		
-		return implode(" ... ", $contexts);
+		$contexts_text = [];
+		foreach($contexts as $context) {
+			$contexts_text[] = substr($source, $context["from"], $context["to"] - $context["from"]);
+		}
+		
+		return implode(" ... ", $contexts_text);
 	}
 	
 	/**
--- a/module_index.json
+++ b/module_index.json
@ -104,7 +104,7 @@
        "author": "Starbeamrainbowlabs",
        "description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
        "id": "feature-search",
-        "lastupdate": 1529964034,
+        "lastupdate": 1529967930,
        "optional": false
    },
    {
--- a/modules/feature-search.php
+++ b/modules/feature-search.php
@ -790,6 +790,7 @@ class search
 			if(isset($invindex[$qterm]))
 			{
 				// Loop over each page in the inverted index entry
+				reset($invindex); // Reset array/object pointer
 				foreach($invindex[$qterm] as $pageid => $page_entry)
 				{
 					// Create an entry in the matching pages array if it doesn't exist
@ -801,6 +802,7 @@ class search
 			
 			
 			// Loop over the pageindex and search the titles / tags
+			reset($pageindex); // Reset array/object pointer
 			foreach ($pageindex as $pagename => $pagedata)
 			{
 				// Get the current page's id
@ -922,9 +924,7 @@ class search
 			if($all_offsets === false)
 				continue;
 			foreach($all_offsets as $offset)
-			{
 				$matches[] = [ $nterm, $offset ];
-			}
 		}
 		
 		// Sort the matches by offset
@ -936,58 +936,47 @@ class search
 		$sourceLength = mb_strlen($source);
 		
 		$contexts = [];
-		$basepos = 0;
+		
 		$matches_count = count($matches);
-		while($basepos < $matches_count)
-		{
-			// Store the next match along - all others will be relative to that one
-			$group = [$matches[$basepos]];
+		$total_context_length = 0;
+		for($i = 0; $i < $matches_count; $i++) {
+			$next_context = [
+				"from" => max(0, $matches[$i][1] - $settings->search_characters_context),
+				"to" => min($sourceLength, $matches[$i][1] + count($matches[0]) + $settings->search_characters_context)
+			];
 			
-			// Start scanning at the next one along - we always store the first match
-			$scanpos = $basepos + 1;
-			$distance = 0;
+			if(end($contexts) !== false && end($contexts)["to"] > $next_context["from"]) {
+				// This next context overlaps with the previous one
+				// Extend the last one instead of adding a new one
 				
-			while(true)
-			{
-				// Break out if we reach the end
-				if($scanpos >= $matches_count) break;
+				// The array pointer is pointing at the last element now because we called end() above
 				
-				// Find the distance between the current one and the last one
-				$distance = $matches[$scanpos][1] - $matches[$scanpos - 1][1];
-				
-				// Store it if the distance is below the threshold
-				if($distance < $settings->search_characters_context)
-					$group[] = $matches[$scanpos];
-				else
-					break;
-				
-				$scanpos++;
+				// Update the total context length counter appropriately
+				$total_context_length += $next_context["to"] - $contexts[key($contexts)]["to"];
+				$contexts[key($contexts)]["to"] = $next_context["to"];
+			}
+			else { // No overlap here! Business as usual.
+				$contexts[] = $next_context;
+				// Update the total context length counter as normal
+				$total_context_length += $next_context["to"] - $next_context["from"];
 			}
 			
-			$context_start = $group[0][1] - $settings->search_characters_context;
-			$context_end = $group[count($group) - 1][1] + $settings->search_characters_context;
 			
-			if($context_start < 0) $context_start = 0;
-			if($context_end > $sourceLength) $context_end = $sourceLength;
-			
-			//echo("Got context. Start: $context_start, End: $context_end\n");
-			//echo("Group:"); var_dump($group);
-			
-			$context = substr($source, $context_start, $context_end - $context_start);
-			
-			// Strip the markdown from the context - it's most likely going to
-			// be broken anyway.
-			//$context = self::strip_markup($context);
-			
-			// Escape special characters to protect against attacks
-			$context = htmlentities($context);
-			
-			$contexts[] = $context;
-			
-			$basepos = $scanpos + 1;
+			end($contexts);
+			$last_context = &$contexts[key($contexts)];
+			if($total_context_length > $settings->search_characters_context_total) {
+				// We've reached the limit on the number of characters this context should contain. Trim off the context to fit and break out
+				$last_context["to"] -= $total_context_length - $settings->search_characters_context_total;
+				break;
+			}
 		}
 		
-		return implode(" ... ", $contexts);
+		$contexts_text = [];
+		foreach($contexts as $context) {
+			$contexts_text[] = substr($source, $context["from"], $context["to"] - $context["from"]);
+		}
+		
+		return implode(" ... ", $contexts_text);
 	}
 	
 	/**
--- a/peppermint.guiconfig.json
+++ b/peppermint.guiconfig.json
@ -149,7 +149,8 @@
 	"max_preview_size": {"type": "number", "description": "The maximum allowed size of generated preview images in pixels.", "default": 2048},
 	"avatars_show": {"type": "checkbox", "description": "Whether or not to show avatars requires the 'user-preferences' and 'upload' modules, though uploads themselvess can be turned off so long as all avatars have already been uploaded - it's only the 'preview' action that's actually used.", "default": true},
 	"avatars_size": {"type": "number", "description": "The image size to render avatars at. Does not affect the size they're stored at - only the inline rendered size (e.g. on the recent changes page etc.)", "default": 32},
-	"search_characters_context": {"type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 200},
+	"search_characters_context": {"type": "number", "description": "The number of characters that should be displayed either side of a matching term in the context below each search result.", "default": 75},
+	"search_characters_context_total": {"type": "number", "description": "The total number of characters that a search result context should display at most.", "default": 250},
 	"search_title_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's title.", "default": 10},
 	"search_tags_matches_weighting": {"type": "number", "description": "The weighting to give to search term matches found in a page's tags.", "default": 3},
 	"dynamic_page_suggestion_count": {"type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 },