Refactor stas_split to be more fasterererer

Informal testing shows that it's gone from taking ~18% of the total time to ~4% of the total time :D
2024-11-22 16:33:00 +00:00 · 2019-12-15 17:56:56 +00:00 · 2019-12-15 17:56:56 +00:00 · c80f26962e
commit c80f26962e
parent 843f0f7ee9
3 changed files with 46 additions and 36 deletions
--- a/Changelog.md
+++ b/Changelog.md
@@ -8,6 +8,7 @@ This file holds the changelog for Pepperminty Wiki. This is the master list of t
     - Another search index rebuild is required
 - Optimisation: Don't generate the list of pages for the datalist if it isn't going to be displayed (especially noticeable on wikis with lots of pages)
 - Optimisation: Don't load the statistics index if it's not needed (also esp. noticeable on wikis with lots of pages)
+- Optimisation: Refactor `stas_split()` to be faster (informal testing shows ~18% → 4% total time)
 - [Module Api] Optimisation: Remove `search::transliterate` because it has a huge overhead. Use `search::$literator->transliterate()` instead.
 ## v0.20
--- a/module_index.json
+++ b/module_index.json
@@ -135,7 +135,7 @@
        "version": "0.11",
        "author": "Starbeamrainbowlabs",
        "description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
-        "lastupdate": 1575838820,
+        "lastupdate": 1576432241,
        "optional": false,
        "extra_data": []
    },
--- a/modules/feature-search.php
+++ b/modules/feature-search.php
@@ -360,11 +360,13 @@ register_module([
 					case -1: $style .= "color: grey; text-decoration: wavy line-through;"; $title = "stop word"; break;
 					case 1: $style .= "color: blue;"; $title = "normal word"; break;
 				}
-				switch($term["location"]) {
+				if($term["weight"] !== -1) {
-					case "body": $style = "color: cyan"; $title = "body only"; break;
+					switch($term["location"]) {
-					case "title": $style .= "font-weight: bolder; font-size: 1.2em; color: orange;"; $title = "searching title only"; $token = $token_part; break;
+						case "body": $style = "color: cyan"; $title = "body only"; break;
-					case "tags": $style .= "font-weight: bolder; color: purple;"; $title = "searching tags only"; $token = $token_part; break;
+						case "title": $style .= "font-weight: bolder; font-size: 1.2em; color: orange;"; $title = "searching title only"; $token = $token_part; break;
-					case "all": $title .= ", searching everywhere";
+						case "tags": $style .= "font-weight: bolder; color: purple;"; $title = "searching tags only"; $token = $token_part; break;
+						case "all": $title .= ", searching everywhere";
+					}
 				}
 				$result .= "<span title='$title' style='$style'>$token</span> ";
@@ -1067,36 +1069,29 @@ class search
 	/**
 	 * Splits a query string into tokens. Does not require that the input string be transliterated.
-	 * Actually based on my earlier explode_adv: https://starbeamrainbowlabs.com/blog/article.php?article=posts/081-PHP-String-Splitting.html
+	 * Was based on my earlier explode_adv: https://starbeamrainbowlabs.com/blog/article.php?article=posts/081-PHP-String-Splitting.html
-	 * @param	string	$query	The queyr string to split.
+	 * Now improved to be strtok-based, since it's much faster.
+	 * Example I used when writing this: https://www.php.net/manual/en/function.strtok.php#94463
+	 * @param	string	$query	The query string to split.
 	 */
 	public function stas_split($query) {
-		$chars = str_split(self::$literator->transliterate($query));
+		$query = self::$literator->transliterate($query);
 		$terms = [];
-		$next_term = "";
+		$next_token = strtok($query, " \r\n\t");
-		$toggle_state = false; // true = now inside, false = now outside
+		while(true) {
-		foreach($chars as $char)
-		{
-			if($char == '"') {
-				// Invert the toggle block state
-				$toggle_state = !$toggle_state;
-			}
-			// If this char is whitespace *and* we're outside a toggle block, then it's a token
+			if(strpos($next_token, '"') !== false)
-			if(ctype_space($char) && !$toggle_state) {
+				$next_token .= " " . strtok('"') . '"';
-				// If the string is empty, then don't bother
+			if(strpos($next_token, "'") !== false)
-				if(empty($next_term)) continue;
+				$next_token .= " " . strtok("'") . "'";
-				$terms[] = $next_term;
+
-				$next_term = "";
+			$terms[] = $next_token;
-			}
+			
-			// If it's not whitespace, or it is whitespace and we're inside a toggle block....
+			$next_token = strtok(" \r\n\t");
-			else if(!ctype_space($char) || ($toggle_state && ctype_space($char)))
+			if($next_token === false) break;
-				$next_term .= $char; // ...then add the char to the next part
 		}
-		if(strlen($next_term) > 0)
-			$terms[] = $next_term;
 		return $terms;
 	}
@@ -1132,17 +1127,31 @@ class search
 		for($i = count($tokens) - 1; $i >= 0; $i--) {
 			// Look for excludes
 			if($tokens[$i][0] == "-") {
-				$result["exclude"][] = substr($tokens[$i], 1);
+				if(in_array(substr($tokens[$i], 1), self::$stop_words)) {
+					$result["tokens"][] = [
+						"term" => substr($tokens[$i], 1),
+						"weight" => -1,
+						"location" => "all"
+					];
+				}
+				else
+					$result["exclude"][] = substr($tokens[$i], 1);
 				continue;
 			}
 			// Look for weighted terms
 			if($tokens[$i][0] == "+") {
-				$result["terms"][] = [
+				if(in_array(substr($tokens[$i], 1), self::$stop_words)) {
-					"term" => substr($tokens[$i], 1),
+					$result["tokens"] = [ "term" => substr($tokens[$i], 1), "weight" => -1, "location" => "all" ];
-					"weight" => 2,
+				}
-					"location" => "all"
+				else {
-				];
+					$result["terms"][] = [
+						"term" => substr($tokens[$i], 1),
+						"weight" => 2,
+						"location" => "all"
+					];
+				}
 				continue;
 			}