Start refactoring query_invindex & rename it to invindex_query

....but of course it's not finished yet. We're doing well, but there are a few thorny issues to go. Mainly: We need to seriously optimise ids::getpagename(), 'cause we'll need it a *lot* when we get to implementing the size, before, and after colon : directives.
2024-11-22 16:33:00 +00:00 · 2019-08-18 21:25:48 +01:00 · 2019-08-18 21:25:48 +01:00 · b93dd3d9cc
commit b93dd3d9cc
parent 0807cce256
1 changed files with 255 additions and 74 deletions
--- a/modules/feature-search.php
+++ b/modules/feature-search.php
@ -136,7 +136,7 @@ register_module([
 			$env->perfdata->invindex_decode_time = round((microtime(true) - $time_start)*1000, 3);
 			$time_start = microtime(true);
-			$results = search::query_invindex($_GET["query"], $invindex);
+			$results = search::invindex_query($_GET["query"], $invindex);
 			$resultCount = count($results);
 			$env->perfdata->invindex_query_time = round((microtime(true) - $time_start)*1000, 3);
@ -306,7 +306,7 @@ register_module([
 			$searchIndex = search::invindex_load($paths->searchindex);
 			$env->perfdata->searchindex_decode_time = (microtime(true) - $env->perfdata->searchindex_decode_start) * 1000;
 			$env->perfdata->searchindex_query_start = microtime(true);
-			$searchResults = search::query_invindex($_GET["query"], $searchIndex);
+			$searchResults = search::invindex_query($_GET["query"], $searchIndex);
 			$env->perfdata->searchindex_query_time = (microtime(true) - $env->perfdata->searchindex_query_start) * 1000;
 			header("content-type: application/json");
@ -882,91 +882,295 @@ class search
 		$this->invindex->set("|termlist|", json_encode($termlist));
 	}
 	/*
 	 * ███████ ████████  █████  ███████
 	 * ██         ██    ██   ██ ██
 	 * ███████    ██    ███████ ███████
 	 *      ██    ██    ██   ██      ██
 	 * ███████    ██    ██   ██ ███████
 	 */
 	/**
 	 * Splits a *transliterated* query string into tokens.
 	 * Actually based on my earlier explode_adv https://starbeamrainbowlabs.com/blog/article.php?article=posts/081-PHP-String-Splitting.html
 	 * @param	string	$query	The queyr string to split.
 	 */
 	private function stas_split($query) {
 		$chars = str_split($query);
 		$terms = [];
 		$next_term = "";
 		$toggle_state = false; // true = now inside, false = now outside
 		foreach($chars as $char)
 		{
 			if($char == '"') {
 				// Invert the toggle block state
 				$toggle_state = !$toggle_state;
 			}
 			// If this char is whitespace *and* we're outside a toggle block, then it's a token
 			if(ctype_space($char) && !$toggle_state) {
 				// If the string is empty, then don't bother
 				if(empty($next_term)) continue;
 				$terms[] = $next_term;
 				$next_term = "";
 			}
 			// If it's not whitespace, or it is whitespace and we're inside a toggle block....
 			else if(!ctype_space($char) || ($toggle_state && ctype_space($char)))
 				$next_term .= $char; // ...then add the char to the next part
 		}
 		if(strlen($next_term) > 0)
 			$terms[] = $next_term;
 		return $terms;
 	}
 	/**
 	 * Parses an array of query tokens into an associative array of search directives.
 	 * Supported syntax derived from these sources:
 		 * https://help.duckduckgo.com/duckduckgo-help-pages/results/syntax/
 		 * https://docs.microsoft.com/en-us/windows/win32/lwef/-search-2x-wds-aqsreference
 	 * @param	string[]	$tokens	The array of query tokens to parse.
 	 */
 	private function stas_parse($tokens) {
 		/* Supported Syntax *
 		 * 
 		 * -term				exclude a term
 		 * +term				double the weighting of a term
 		 * terms !dest terms	redirect entire query (minus the !bang) to interwiki with registered shortcut dest
 		 * prefix:term			apply prefix operator to term
 		 */
 		var_dump($tokens);
 		$result = [
 			"terms" => [],
 			"exclude" => [],
 			"interwiki" => null
 		];
 		// foreach($operators as $op)
 		// 	$result[$op] = [];
 		$count = count($tokens);
 		for($i = count($tokens) - 1; $i >= 0; $i--) {
 			// Look for excludes
 			if($tokens[$i][0] == "-") {
 				$result["exclude"][] = substr($tokens[$i], 1);
 				continue;
 			}
 			// Look for weighted terms
 			if($tokens[$i][0] == "+") {
 				$result["terms"][] = [
 					"term" => substr($tokens[$i], 1),
 					"weight" => 2,
 					"location" => "all"
 				];
 				continue;
 			}
 			// Look for interwiki searches
 			if($tokens[$i][0] == "!" || substr($tokens[$i], -1) == "!") {
 				// You can only go to 1 interwiki destination at once, so we replace any previous finding with this one
 				$result["interwiki"] = trim($tokens[$i], "!");
 			}
 			// Look for colon directives in the form directive:term
 			// Also supports prefix:"quoted term with spaces", quotes stripped automatically
 			/*** Example directives *** (. = implemented, * = not implemented)
 			 . intitle		search only page titles for term
 			 . intags		search only tags for term
 			 . inbody		search page body only for term
 			 * before		search only pages that were last modified before term
 			 * after		search only pages that were last modified after term
 			 * size			search only pages that match the size spec term (e.g. 1k+ -> more than 1k bytes, 2k- -> less than 2k bytes, >5k -> more than 5k bytes, <10k -> less than 10k bytes)
 			 **************************/
 			if(strpos($tokens[$i], ":") !== false) {
 				$parts = explode(":", $tokens[$i], 2);
 				if(!isset($result[$parts[0]]))
 					$result[$parts[0]] = [];
 				switch($parts[0]) {
 					case "intitle":
 						$result["terms"][] = [
 							"term" => $parts[1],
 							"weight" => $settings->search_title_matches_weighting * mb_strlen($parts[1]),
 							"location" => "title"
 						];
 						break;
 					case "intags":
 						$result["terms"][] = [
 							"term" => $parts[1],
 							"weight" => $settings->search_tags_matches_weighting * mb_strlen($parts[1]),
 							"location" => "tags"
 						];
 						break;
 					case "inbody":
 						$result["terms"][] = [
 							"term" => $parts[1],
 							"weight" => 1,
 							"location" => "body"
 						];
 						break;
 					default:
 						$result[$parts[0]][] = trim($parts[1], '"');
 						break;
 				}
 				continue;
 			}
 			// Doesn't appear to be particularly special *shrugs*
 			// Set the weight to -1 if it's a stop word
 			$result["terms"][] = [
 				"term" => $tokens[$i],
 				"weight" => in_array($tokens[$i], self::$stop_words) ? 0 : -1,
 				"location" => "all"
 			];
 		}
 		return $result;
 	}
 	/**
 	 * Searches the given inverted index for the specified search terms.
 	 * @param	string	$query		The search query.
 	 * @param	array	$invindex	The inverted index to search.
 	 * @return	array	An array of matching pages.
 	 */
-	public static function query_invindex($query, &$invindex)
+	public static function invindex_query($query)
 	{
 		global $settings, $pageindex;
 		/** Normalises input characters for searching & indexing */
 		static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
-		$query_terms = self::tokenize($query);
+		$query_stas = $this->stas_parse(
 			$this->stas_split($literator->transliterate($query))
 		);
 		/* Sub-array format:
 		 * [
 		 * 	nterms : [ nterm => frequency, nterm => frequency, .... ],
 		 * 	offsets_body : int[],
 		 * 	matches_title : int,
 		 * 	matches_tags : int
 		 * ]
 		 */
 		$matching_pages = [];
 		$match_template = [
 			"nterms" => [],
 			"offsets_body" => [],
 			"rank_title" => 0,
 			"rank_tags" => 0
 		];
-		
+		// Query the inverted index
-		// Loop over each term in the query and find the matching page entries
+		foreach($query_stas as $term_def) {
-		$count = count($query_terms);
+			if($term_def["weight"] == -1)
-		for($i = 0; $i < $count; $i++)
+				continue; // Skip stop words
 		{
 			$qterm = $query_terms[$i];
-			// Stop words aren't worth the bother - make sure we don't search
+			if(!in_array($term_def["location"], ["all", "inbody"]))
-			// the title or the tags for them
+				continue; // Skip terms we shouldn't search the page body for
 			if(in_array($qterm, self::$stop_words))
 				continue;
-			// Only search the inverted index if it actually exists there
+			if(!$this->$invindex->has($term_def["term"]))
-			if(isset($invindex[$qterm])) {
+				continue; // Skip if it's not in the index
-				// Loop over each page in the inverted index entry
+			
-				reset($invindex[$qterm]); // Reset array/object pointer
+			// For each page that contains this term.....
-				foreach($invindex[$qterm] as $pageid => $page_entry) {
+			$term_pageids = json_decode($this->invindex->get($term_def["term"]));
-					// Create an entry in the matching pages array if it doesn't exist
+			foreach($term_pageids as $pageid) {
-					if(!isset($matching_pages[$pageid]))
+				// Check to see if it contains any words we should exclude
-						$matching_pages[$pageid] = [ "nterms" => [] ];
+				$skip = false;
-					$matching_pages[$pageid]["nterms"][$qterm] = $page_entry;
+				foreach($query_stas["exclude"] as $exlc_term) {
 					if($this->invindex->has("$excl_term|$pageid")) {
 						$skip = true;
 						break;
 					}
 				}
 				if($skip) continue;
 				// Get the list of offsets
 				$page_offsets = json_decode($this->invindex->get("{$term_def["term"]}|$pageid"));
 				if(!isset($matching_pages[$pageid]))
 					$matching_pages[$pageid] = $match_template; // Arrays are assigned by copy in php
 				// Add it to the appropriate $matching_pages entry, not forgetting to apply the weighting
 				$matching_pages[$pageid]["offsets_body"] = array_merge(
 					$matching_pages[$pageid]["offsets_body"],
 					$page_offsets
 				);
 				$matching_pages[$pageid]["nterms"][$term_def["term"]] = count($page_offsets) * $term_def["weight"];
 			}
 		}
 		// Query page titles & tags
 		foreach($terms as $term_def) {
 			// No need to skip stop words here, since we're doing a normal 
 			// sequential search anyway
 			if(!in_array($term_def["location"], ["all", "intitle", "intags"]))
 				continue; // Skip terms we shouldn't search the page body for
 			// Loop over the pageindex and search the titles / tags
 			reset($pageindex); // Reset array/object pointer
-			foreach ($pageindex as $pagename => $pagedata)
+			foreach ($pageindex as $pagename => $pagedata) {
 			{
 				// Setup a variable to hold the current page's id
-				$pageid = false; // Only fill this out if we find a match
+				$pageid = null; // Cache the page id
 				$lit_title = $literator->transliterate($pagename);
 				$lit_tags = $literator->transliterate(implode(" ", $pagedata->tags));
 				// Make sure that the title & tags don't contain a term we should exclude
 				$skip = false;
 				foreach($query_stas["exclude"] as $excl_term) {
 					if(mb_strpos($lit_title, $excl_term) !== false) {
 						$skip = true;
 						// Delete it from the candidate matches (it might be present in the tags / title but not the body)
 						if(isset($matching_pages[$excl_term]))
 							unset($matching_pages[$excl_term]);
 						break;
 					}
 				}
 				if($skip) continue;
 				// Consider matches in the page title
-				// FUTURE: We may be able to optimise this further by using preg_match_all + preg_quote instead of mb_stripos_all. Experimentation / benchmarking is required to figure out which one is faster
+				if(in_array($term_def["location"], ["all", "intitle"])) {
-				$title_matches = mb_stripos_all($literator->transliterate($pagename), $qterm);
+					// FUTURE: We may be able to optimise this further by using preg_match_all + preg_quote instead of mb_stripos_all. Experimentation / benchmarking is required to figure out which one is faster
-				$title_matches_count = $title_matches !== false ? count($title_matches) : 0;
+					$title_matches = mb_stripos_all($lit_title, $term_def["term"]);
-				if($title_matches_count > 0)
+					$title_matches_count = $title_matches !== false ? count($title_matches) : 0;
-				{
+					if($title_matches_count > 0) {
-					$pageid = ids::getid($pagename); // Fill out the page id
+						$pageid = ids::getid($pagename); // Fetch the page id
-					// We found the qterm in the title
+						// We found the qterm in the title
-					if(!isset($matching_pages[$pageid]))
+						if(!isset($matching_pages[$pageid]))
-						$matching_pages[$pageid] = [ "nterms" => [] ];
+						$matching_pages[$pageid] = $match_template; // Assign by copy
-					
+						
-					// Set up a counter for page title matches if it doesn't exist already
+						$matching_pages[$pageid]["rank_title"] += $title_matches_count * $term_def["weight"];
-					if(!isset($matching_pages[$pageid]["title-matches"]))
+					}
 						$matching_pages[$pageid]["title-matches"] = 0;
 					$matching_pages[$pageid]["title-matches"] += $title_matches_count * strlen($qterm);
 				}
 				if(!in_array($term_def["location"], ["all", "intags"]))
 					continue; // If we shouldn't search the tags, no point in continuing
 				// Consider matches in the page's tags
-				$tag_matches = isset($pagedata->tags) ? mb_stripos_all($literator->transliterate(implode(" ", $pagedata->tags)), $qterm) : false;
+				$tag_matches = isset($pagedata->tags) ? mb_stripos_all($lit_tags, $term_def["term"]) : false;
 				$tag_matches_count = $tag_matches !== false ? count($tag_matches) : 0;
-				if($tag_matches_count > 0) // And we found the qterm in the tags
+				if($tag_matches_count > 0) {// And we found the qterm in the tags
-				{
+					if($pageid === null) // Fill out the page id if it hasn't been already
 					if($pageid == false) // Fill out the page id if it hasn't been already
 						$pageid = ids::getid($pagename);
 					if(!isset($matching_pages[$pageid]))
-						$matching_pages[$pageid] = [ "nterms" => [] ];
+						$matching_pages[$pageid] = $match_template; // Assign by copy
-					// Set up a counter for tag match if there isn't one already
+					$matching_pages[$pageid]["rank_tags"] += $tag_matches_count * $term_def["weight"];
 					if(!isset($matching_pages[$pageid]["tag-matches"]))
 						$matching_pages[$pageid]["tag-matches"] = 0;
 					$matching_pages[$pageid]["tag-matches"] += $tag_matches_count * strlen($qterm);
 				}
 			}
 		}
 		// TODO: Implement the rest of STAS here
 		// TODO: We got up to here; finish refactoring invindex_query
 		reset($matching_pages);
 		foreach($matching_pages as $pageid => &$pagedata)
 		{
@ -987,29 +1191,6 @@ class search
 				foreach($entry["offsets"] as $offset)
 					$pageOffsets[] = $offset;
 			}
 			/*
 			// Sort the list of offsets
 			$pageOffsets = array_unique($pageOffsets);
 			sort($pageOffsets);
 			var_dump($pageOffsets);
 			// Calcualate the clump distances via a variable moving window size
 			$pageOffsetsCount = count($pageOffsets);
 			$clumpDistanceWindow = min($count, $pageOffsetsCount); // a.k.a. count($query_terms) - see above
 			$clumpDistances = [];
 			for($i = 0; $i < $pageOffsetsCount - $clumpDistanceWindow; $i++)
 				$clumpDistances[] = $pageOffsets[$i] - $pageOffsets[$i + $clumpDistanceWindow];
 			// Sort the new list of clump distances
 			sort($clumpDistances);
 			// Calcualate a measure of how clumped the offsets are
 			$tightClumpLimit = floor((count($clumpDistances) - 1) / 0.25);
 			$tightClumpsMeasure = $clumpDistances[$tightClumpLimit] - $clumpDistances[0];
 			$clumpsRange = $clumpDistances[count($clumpDistances) - 1] - $clumpDistances[0];
 			$clumpiness = $tightClumpsMeasure / $clumpsRange;
 			echo("{$pagedata["pagename"]} - $clumpiness");
 			*/
 			// Consider matches in the title / tags
 			if(isset($pagedata["title-matches"]))