Seriously optimise the search system via some profiling.

This commit is contained in:
Starbeamrainbowlabs 2018-06-26 14:15:19 +01:00
parent 67648199d7
commit 3d3b6c491a
Signed by: sbrl
GPG Key ID: 1BE5172E637709C2
4 changed files with 50 additions and 42 deletions

View File

@ -397,7 +397,7 @@ if($settings->sessionprefix == "auto")
/////////////////////////////////////////////////////////////////////////////
/** The version of Pepperminty Wiki currently running. */
$version = "v0.17-dev";
$commit = "75b6b6c55fa9710d82b6623971581db7c6c5309b";
$commit = "67648199d7ebd8a1b2ec400af0192dc0bb94b233";
/// Environment ///
/** Holds information about the current request environment. */
$env = new stdClass();
@ -792,7 +792,7 @@ function starts_with($haystack, $needle)
function mb_stripos_all($haystack, $needle) {
$s = 0; $i = 0;
while(is_integer($i)) {
$i = function_exists("mb_stripos") ? mb_stripos($haystack, $needle, $s) : stripos($haystack, $needle, $s);
$i = mb_stripos($haystack, $needle, $s);
if(is_integer($i)) {
$aStrPos[] = $i;
$s = $i + (function_exists("mb_strlen") ? mb_strlen($needle) : strlen($needle));
@ -1268,10 +1268,12 @@ class ids
public static function getid($pagename)
{
global $idindex;
$pagename_norm = Normalizer::normalize($pagename, Normalizer::FORM_C);
foreach ($idindex as $id => $entry)
{
if(Normalizer::normalize($entry, Normalizer::FORM_C) == Normalizer::normalize($pagename, Normalizer::FORM_C))
// We don't need to normalise here because we normalise when assigning ids
if($entry == $pagename_norm)
return $id;
}
@ -3782,12 +3784,16 @@ register_module([
header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms");
header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
$start = microtime(true);
foreach($results as &$result) {
$result["context"] = search::extract_context(
$invindex, ids::getid($result["pagename"]),
$_GET["query"],
file_get_contents($env->storage_prefix . $result["pagename"] . ".md")
);
}
$env->perfdata->context_generation_time = round((microtime(true) - $start)*1000, 3);
header("x-context-generation-time: {$env->perfdata->context_generation_time}ms");
$env->perfdata->search_time = round((microtime(true) - $search_start)*1000, 3);
@ -4208,7 +4214,7 @@ class search
public static function tokenize($source)
{
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
// We don't need to normalise here because the transliterator handles
// this for us. Also, we can't move the literator to a static variable
@ -4237,9 +4243,6 @@ class search
{
global $pageindex, $env, $paths, $settings;
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
if($output) {
header("content-type: text/event-stream");
ob_end_flush();
@ -4415,7 +4418,7 @@ class search
global $settings, $pageindex;
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
$query_terms = self::tokenize($query);
$matching_pages = [];
@ -4436,7 +4439,7 @@ class search
if(isset($invindex[$qterm]))
{
// Loop over each page in the inverted index entry
reset($invindex); // Reset array/object pointer
reset($invindex[$qterm]); // Reset array/object pointer
foreach($invindex[$qterm] as $pageid => $page_entry)
{
// Create an entry in the matching pages array if it doesn't exist
@ -4486,7 +4489,7 @@ class search
}
}
reset($matching_pages);
foreach($matching_pages as $pageid => &$pagedata)
{
$pagedata["pagename"] = ids::getpagename($pageid);
@ -4495,6 +4498,7 @@ class search
$pageOffsets = [];
// Loop over each search term found on this page
reset($pagedata["nterms"]);
foreach($pagedata["nterms"] as $pterm => $entry)
{
// Add the number of occurrences of this search term to the ranking
@ -4554,23 +4558,23 @@ class search
* @param string $source The page source to extract the context from.
* @return string The generated context string.
*/
public static function extract_context($query, $source)
public static function extract_context($invindex, $pageid, $query, $source)
{
global $settings;
$nterms = self::tokenize($query);
$matches = [];
// Loop over each nterm and find it in the source
foreach($nterms as $nterm)
{
if(in_array($nterm, static::$stop_words))
foreach($nterms as $nterm) {
// Skip over words that don't appear in the inverted index (e.g. stop words)
if(!isset($invindex[$nterm]))
continue;
$all_offsets = mb_stripos_all($source, $nterm);
// Skip over adding matches if there aren't any
if($all_offsets === false)
// Skip if the page isn't found in the inverted index for this word
if(!isset($invindex[$nterm][$pageid]))
continue;
foreach($all_offsets as $offset)
$matches[] = [ $nterm, $offset ];
foreach($invindex[$nterm][$pageid]["offsets"] as $next_offset)
$matches[] = [ $nterm, $next_offset ];
}
// Sort the matches by offset

View File

@ -405,7 +405,7 @@ function starts_with($haystack, $needle)
function mb_stripos_all($haystack, $needle) {
$s = 0; $i = 0;
while(is_integer($i)) {
$i = function_exists("mb_stripos") ? mb_stripos($haystack, $needle, $s) : stripos($haystack, $needle, $s);
$i = mb_stripos($haystack, $needle, $s);
if(is_integer($i)) {
$aStrPos[] = $i;
$s = $i + (function_exists("mb_strlen") ? mb_strlen($needle) : strlen($needle));
@ -881,10 +881,12 @@ class ids
public static function getid($pagename)
{
global $idindex;
$pagename_norm = Normalizer::normalize($pagename, Normalizer::FORM_C);
foreach ($idindex as $id => $entry)
{
if(Normalizer::normalize($entry, Normalizer::FORM_C) == Normalizer::normalize($pagename, Normalizer::FORM_C))
// We don't need to normalise here because we normalise when assigning ids
if($entry == $pagename_norm)
return $id;
}

View File

@ -104,7 +104,7 @@
"author": "Starbeamrainbowlabs",
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
"id": "feature-search",
"lastupdate": 1529968213,
"lastupdate": 1530018727,
"optional": false
},
{

View File

@ -138,12 +138,16 @@ register_module([
header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms");
header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
$start = microtime(true);
foreach($results as &$result) {
$result["context"] = search::extract_context(
$invindex, ids::getid($result["pagename"]),
$_GET["query"],
file_get_contents($env->storage_prefix . $result["pagename"] . ".md")
);
}
$env->perfdata->context_generation_time = round((microtime(true) - $start)*1000, 3);
header("x-context-generation-time: {$env->perfdata->context_generation_time}ms");
$env->perfdata->search_time = round((microtime(true) - $search_start)*1000, 3);
@ -564,7 +568,7 @@ class search
public static function tokenize($source)
{
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
// We don't need to normalise here because the transliterator handles
// this for us. Also, we can't move the literator to a static variable
@ -593,9 +597,6 @@ class search
{
global $pageindex, $env, $paths, $settings;
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
if($output) {
header("content-type: text/event-stream");
ob_end_flush();
@ -771,7 +772,7 @@ class search
global $settings, $pageindex;
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
$query_terms = self::tokenize($query);
$matching_pages = [];
@ -792,7 +793,7 @@ class search
if(isset($invindex[$qterm]))
{
// Loop over each page in the inverted index entry
reset($invindex); // Reset array/object pointer
reset($invindex[$qterm]); // Reset array/object pointer
foreach($invindex[$qterm] as $pageid => $page_entry)
{
// Create an entry in the matching pages array if it doesn't exist
@ -842,7 +843,7 @@ class search
}
}
reset($matching_pages);
foreach($matching_pages as $pageid => &$pagedata)
{
$pagedata["pagename"] = ids::getpagename($pageid);
@ -851,6 +852,7 @@ class search
$pageOffsets = [];
// Loop over each search term found on this page
reset($pagedata["nterms"]);
foreach($pagedata["nterms"] as $pterm => $entry)
{
// Add the number of occurrences of this search term to the ranking
@ -910,23 +912,23 @@ class search
* @param string $source The page source to extract the context from.
* @return string The generated context string.
*/
public static function extract_context($query, $source)
public static function extract_context($invindex, $pageid, $query, $source)
{
global $settings;
$nterms = self::tokenize($query);
$matches = [];
// Loop over each nterm and find it in the source
foreach($nterms as $nterm)
{
if(in_array($nterm, static::$stop_words))
foreach($nterms as $nterm) {
// Skip over words that don't appear in the inverted index (e.g. stop words)
if(!isset($invindex[$nterm]))
continue;
$all_offsets = mb_stripos_all($source, $nterm);
// Skip over adding matches if there aren't any
if($all_offsets === false)
// Skip if the page isn't found in the inverted index for this word
if(!isset($invindex[$nterm][$pageid]))
continue;
foreach($all_offsets as $offset)
$matches[] = [ $nterm, $offset ];
foreach($invindex[$nterm][$pageid]["offsets"] as $next_offset)
$matches[] = [ $nterm, $next_offset ];
}
// Sort the matches by offset