Search: Transliterate characters so you don't have to remember the diacritics when searching

2025-04-17 14:14:55 +00:00 · 2018-06-25 22:53:53 +01:00 · 2018-06-25 22:53:53 +01:00 · 49b91aa6f9
commit 49b91aa6f9
parent bdf47a2540
4 changed files with 83 additions and 21 deletions
--- a/Changelog.md
+++ b/Changelog.md
@ -7,6 +7,8 @@ This file holds the changelog for Pepperminty Wiki. This is the master list of t
 - [Module API] Added `save_settings()` convenience method

 ### Fixed
+ - Updated the search system to transliterate characters to better support searching pages that are written in other languages.
+     - You'll want to rebuild your search index via the button in the configuration panel, or the `invindex-rebuild` action.
 - [Security] Made the site secret generator cryptographically secure. If you created your wiki before this change, you might want to change your site secret in `peppermint.json` to something more secure with a site like [random.org](https://www.random.org/).
     - The PHP function `openssl_random_pseudo_bytes()` was being used before, but [apparently that's not cryptographically secure](https://paragonie.com/blog/2015/07/how-safely-generate-random-strings-and-integers-in-php).
 - [Module API] Fix `full_url()` logic
--- a/build/index.php
+++ b/build/index.php
@ -396,7 +396,7 @@ if($settings->sessionprefix == "auto")
 /////////////////////////////////////////////////////////////////////////////
 /** The version of Pepperminty Wiki currently running. */
 $version = "v0.17-dev";
-$commit = "b6eda24adaf3607cf3437be0b8419215b52d662b";
+$commit = "bdf47a2540bdbb36bd69869fd2a1b90d033d9966";
 /// Environment ///
 /** Holds information about the current request environment. */
 $env = new stdClass();
@ -3768,9 +3768,18 @@ register_module([
 			
 			$search_start = microtime(true);
 			
+			
+			$time_start = microtime(true);
 			$invindex = search::load_invindex($paths->searchindex);
+			$env->perfdata->invindex_decode_time = round((microtime(true) - $time_start)*1000, 3);
+			
+			$start = microtime(true);
 			$results = search::query_invindex($_GET["query"], $invindex);
 			$resultCount = count($results);
+			$env->perfdata->invindex_query_time = round((microtime(true) - $time_start)*1000, 3);
+			
+			header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms");
+			header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
 			
 			foreach($results as &$result) {
 				$result["context"] = search::extract_context(
@ -4154,6 +4163,7 @@ class search
 	 */
 	public static function index($source)
 	{
+		// We don't need to normalise or transliterate here because self::tokenize() does this for us
 		$source = html_entity_decode($source, ENT_QUOTES);
 		$source_length = mb_strlen($source);
 		
@ -4189,7 +4199,13 @@ class search
 	 */
 	public static function tokenize($source)
 	{
-		$source = Normalizer::normalize(strtolower($source), Normalizer::FORM_C);
+		/** Normalises input characters for searching & indexing */
+		static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
+		
+		// We don't need to normalise here because the transliterator rules 
+		// above already do it (NFD, strip nonspacing marks, lowercase, NFC).
+		// Note that the transliterator is rebuilt on every call: the static 
+		// declaration has no inline initialiser, so it's reassigned each time.
+		$source = $literator->transliterate($source);
 		$source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source);
 		return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, PREG_SPLIT_NO_EMPTY);
 	}
@ -4213,6 +4229,9 @@ class search
 	{
 		global $pageindex, $env, $paths, $settings;
 		
+		/** Normalises input characters for searching & indexing */
+		static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
+		
 		if($output) {
 			header("content-type: text/event-stream");
 			ob_end_flush();
@ -4234,8 +4253,8 @@ class search
 				$i++; $missing_files++;
 				continue;
 			}
-			$pagesource = Normalizer::normalize(file_get_contents($page_filename), Normalizer::FORM_C);
-			$index = self::index($pagesource);
+			// We do not transliterate or normalise here because the indexer will take care of this for us
+			$index = self::index(file_get_contents($page_filename));
 			
 			$pageid = ids::getid($pagename);
 			self::merge_into_invindex($invindex, $pageid, $index);
@ -4387,6 +4406,9 @@ class search
 	{
 		global $settings, $pageindex;
 		
+		/** Normalises input characters for searching & indexing */
+		static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
+		
 		$query_terms = self::tokenize($query);
 		$matching_pages = [];
 		
@ -4422,7 +4444,9 @@ class search
 				// Get the current page's id
 				$pageid = ids::getid($pagename);
 				// Consider matches in the page title
-				if(stripos($pagename, $qterm) !== false)
+				$title_matches = mb_stripos_all($literator->transliterate($pagename), $qterm);
+				$title_matches_count = $title_matches !== false ? count($title_matches) : 0;
+				if($title_matches_count > 0)
 				{
 					// We found the qterm in the title
 					if(!isset($matching_pages[$pageid]))
@ -4432,12 +4456,14 @@ class search
 					if(!isset($matching_pages[$pageid]["title-matches"]))
 						$matching_pages[$pageid]["title-matches"] = 0;
 					
-					$matching_pages[$pageid]["title-matches"] += count(mb_stripos_all($pagename, $qterm)) * strlen($qterm);
+					$matching_pages[$pageid]["title-matches"] += $title_matches_count * strlen($qterm);
 				}
 				
 				// Consider matches in the page's tags
-				if(isset($pagedata->tags) and // If this page has tags
-				   stripos(implode(" ", $pagedata->tags), $qterm) !== false) // And we found the qterm in the tags
+				$tag_matches = isset($pagedata->tags) ? mb_stripos_all($literator->transliterate(implode(" ", $pagedata->tags)), $qterm) : false;
+				$tag_matches_count = $tag_matches !== false ? count($tag_matches) : 0;
+				
+				if($tag_matches_count > 0) // And we found the qterm in the tags
 				{
 					if(!isset($matching_pages[$pageid]))
 						$matching_pages[$pageid] = [ "nterms" => [] ];
@ -4445,7 +4471,7 @@ class search
 					// Set up a counter for tag match if there isn't one already
 					if(!isset($matching_pages[$pageid]["tag-matches"]))
 						$matching_pages[$pageid]["tag-matches"] = 0;
-					$matching_pages[$pageid]["tag-matches"] += count(mb_stripos_all(implode(" ", $pagedata->tags), $qterm)) * strlen($qterm);
+					$matching_pages[$pageid]["tag-matches"] += $tag_matches_count * strlen($qterm);
 				}
 			}
 		}
@ -7047,7 +7073,12 @@ DIFFSCRIPT;
 		 . "</p>");
 	}
 ]);
-
+/**
+ * Generates a unique hash of a page's content for edit conflict detection
+ * purposes.
+ * @param	string	$page_data	The page text to hash.
+ * @return	string				A hash of the given page text.
+ */
 function generate_page_hash($page_data) {
 	return sha1($page_data);
 }
@ -7744,6 +7775,9 @@ register_module([
 	}
 ]);

+/**
+ * Recalculates and updates the password hashing cost.
+ */
 function do_password_hash_code_update() {
 	global $settings, $paths;
 	
--- a/module_index.json
+++ b/module_index.json
@ -104,7 +104,7 @@
        "author": "Starbeamrainbowlabs",
        "description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
        "id": "feature-search",
-        "lastupdate": 1523105081,
+        "lastupdate": 1529963426,
        "optional": false
    },
    {
@ -176,7 +176,7 @@
        "author": "Starbeamrainbowlabs",
        "description": "Allows you to edit pages by adding the edit and save actions. You should probably include this one.",
        "id": "page-edit",
-        "lastupdate": 1526037910,
+        "lastupdate": 1527246338,
        "optional": false
    },
    {
@ -212,7 +212,7 @@
        "author": "Starbeamrainbowlabs",
        "description": "Adds a pair of actions (login and checklogin) that allow users to login. You need this one if you want your users to be able to login.",
        "id": "page-login",
-        "lastupdate": 1526227977,
+        "lastupdate": 1527246396,
        "optional": false
    },
    {
--- a/modules/feature-search.php
+++ b/modules/feature-search.php
@ -125,9 +125,18 @@ register_module([
 			
 			$search_start = microtime(true);
 			
+			
+			$time_start = microtime(true);
 			$invindex = search::load_invindex($paths->searchindex);
+			$env->perfdata->invindex_decode_time = round((microtime(true) - $time_start)*1000, 3);
+			
+			$start = microtime(true);
 			$results = search::query_invindex($_GET["query"], $invindex);
 			$resultCount = count($results);
+			$env->perfdata->invindex_query_time = round((microtime(true) - $time_start)*1000, 3);
+			
+			header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms");
+			header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
 			
 			foreach($results as &$result) {
 				$result["context"] = search::extract_context(
@ -511,6 +520,7 @@ class search
 	 */
 	public static function index($source)
 	{
+		// We don't need to normalise or transliterate here because self::tokenize() does this for us
 		$source = html_entity_decode($source, ENT_QUOTES);
 		$source_length = mb_strlen($source);
 		
@ -546,7 +556,13 @@ class search
 	 */
 	public static function tokenize($source)
 	{
-		$source = Normalizer::normalize(strtolower($source), Normalizer::FORM_C);
+		/** Normalises input characters for searching & indexing */
+		static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
+		
+		// We don't need to normalise here because the transliterator rules 
+		// above already do it (NFD, strip nonspacing marks, lowercase, NFC).
+		// Note that the transliterator is rebuilt on every call: the static 
+		// declaration has no inline initialiser, so it's reassigned each time.
+		$source = $literator->transliterate($source);
 		$source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source);
 		return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, PREG_SPLIT_NO_EMPTY);
 	}
@ -570,6 +586,9 @@ class search
 	{
 		global $pageindex, $env, $paths, $settings;
 		
+		/** Normalises input characters for searching & indexing */
+		static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
+		
 		if($output) {
 			header("content-type: text/event-stream");
 			ob_end_flush();
@ -591,8 +610,8 @@ class search
 				$i++; $missing_files++;
 				continue;
 			}
-			$pagesource = Normalizer::normalize(file_get_contents($page_filename), Normalizer::FORM_C);
-			$index = self::index($pagesource);
+			// We do not transliterate or normalise here because the indexer will take care of this for us
+			$index = self::index(file_get_contents($page_filename));
 			
 			$pageid = ids::getid($pagename);
 			self::merge_into_invindex($invindex, $pageid, $index);
@ -744,6 +763,9 @@ class search
 	{
 		global $settings, $pageindex;
 		
+		/** Normalises input characters for searching & indexing */
+		static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
+		
 		$query_terms = self::tokenize($query);
 		$matching_pages = [];
 		
@ -779,7 +801,9 @@ class search
 				// Get the current page's id
 				$pageid = ids::getid($pagename);
 				// Consider matches in the page title
-				if(stripos($pagename, $qterm) !== false)
+				$title_matches = mb_stripos_all($literator->transliterate($pagename), $qterm);
+				$title_matches_count = $title_matches !== false ? count($title_matches) : 0;
+				if($title_matches_count > 0)
 				{
 					// We found the qterm in the title
 					if(!isset($matching_pages[$pageid]))
@ -789,12 +813,14 @@ class search
 					if(!isset($matching_pages[$pageid]["title-matches"]))
 						$matching_pages[$pageid]["title-matches"] = 0;
 					
-					$matching_pages[$pageid]["title-matches"] += count(mb_stripos_all($pagename, $qterm)) * strlen($qterm);
+					$matching_pages[$pageid]["title-matches"] += $title_matches_count * strlen($qterm);
 				}
 				
 				// Consider matches in the page's tags
-				if(isset($pagedata->tags) and // If this page has tags
-				   stripos(implode(" ", $pagedata->tags), $qterm) !== false) // And we found the qterm in the tags
+				$tag_matches = isset($pagedata->tags) ? mb_stripos_all($literator->transliterate(implode(" ", $pagedata->tags)), $qterm) : false;
+				$tag_matches_count = $tag_matches !== false ? count($tag_matches) : 0;
+				
+				if($tag_matches_count > 0) // And we found the qterm in the tags
 				{
 					if(!isset($matching_pages[$pageid]))
 						$matching_pages[$pageid] = [ "nterms" => [] ];
@ -802,7 +828,7 @@ class search
 					// Set up a counter for tag match if there isn't one already
 					if(!isset($matching_pages[$pageid]["tag-matches"]))
 						$matching_pages[$pageid]["tag-matches"] = 0;
-					$matching_pages[$pageid]["tag-matches"] += count(mb_stripos_all(implode(" ", $pagedata->tags), $qterm)) * strlen($qterm);
+					$matching_pages[$pageid]["tag-matches"] += $tag_matches_count * strlen($qterm);
 				}
 			}
 		}