From cdee30c2861c0ae91573214105caa659da38ac7d Mon Sep 17 00:00:00 2001
From: Starbeamrainbowlabs <sbrl@starbeamrainbowlabs.com>
Date: Sat, 30 Jun 2018 00:08:57 +0100
Subject: [PATCH] Add $capture_offsets option to tokenize().

TODO: Utilise this in the indexer & update the changelog mentioning that
_all_ inverted indexes will need to be rebuilt
---
 build/index.php            | 17 ++++++++++++-----
 module_index.json          |  2 +-
 modules/feature-search.php | 11 ++++++++---
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/build/index.php b/build/index.php
index 319bb53..52cb691 100644
--- a/build/index.php
+++ b/build/index.php
@@ -397,7 +397,7 @@ if($settings->sessionprefix == "auto")
 /////////////////////////////////////////////////////////////////////////////
 /** The version of Pepperminty Wiki currently running. */
 $version = "v0.17-dev";
-$commit = "80f2cc77a8fa474394492f08ea7f4c998d076acc";
+$commit = "8403ffd5c3de9725756bcfc5929ce9239be1379b";
 /// Environment ///
 /** Holds information about the current request environment. */
 $env = new stdClass();
@@ -4194,14 +4194,16 @@ class search
 		{
 			$nterm = $term;
 			
+			
 			// Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
-			if(in_array($nterm, self::$stop_words)) continue;
+			if(in_array($nterm, self::$stop_words)) { $i++; continue; }
 			
 			if(!isset($index[$nterm]))
 			{
 				$index[$nterm] = [ "freq" => 0, "offsets" => [] ];
 			}
 			
+			// FIXME: Here we use the index of the token in the array, when we want the number of characters into the page!
 			$index[$nterm]["freq"]++;
 			$index[$nterm]["offsets"][] = $i;
 			
@@ -4213,20 +4215,25 @@ class search
 	
 	/**
 	 * Converts a source string into a series of raw tokens.
-	 * @param	string	$source	The source string to process.
+	 * @param	string	$source				The source string to process.
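+	 * @param	boolean	$capture_offsets	Whether to capture & return the offsets of the tokens detected. If true, then each token returned will be an array in the form [ token, offset ], where offset is the byte offset (not the character offset) of the token in the transliterated, bracket-stripped copy of $source - not in the original string passed in.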
 	 * @return	array	An array of raw tokens extracted from the specified source string.
 	 */
-	public static function tokenize($source)
+	public static function tokenize($source, $capture_offsets = false)
 	{
 		/** Normalises input characters for searching & indexing */
 		static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
 		
+		$flags = PREG_SPLIT_NO_EMPTY; // Don't return empty items
+		if($capture_offsets)
+			$flags |= PREG_SPLIT_OFFSET_CAPTURE;
+		
 		// We don't need to normalise here because the transliterator handles 
 		// this for us. Also, we can't move the literator to a static variable 
 		// because PHP doesn't like it very much
 		$source = $literator->transliterate($source);
 		$source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source);
-		return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, PREG_SPLIT_NO_EMPTY);
+		return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, $flags);
 	}
 	
 	/**
diff --git a/module_index.json b/module_index.json
index 9a853b9..41bce1c 100755
--- a/module_index.json
+++ b/module_index.json
@@ -104,7 +104,7 @@
         "author": "Starbeamrainbowlabs",
         "description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
         "id": "feature-search",
-        "lastupdate": 1530270319,
+        "lastupdate": 1530313680,
         "optional": false
     },
     {
diff --git a/modules/feature-search.php b/modules/feature-search.php
index 5228760..31a716d 100644
--- a/modules/feature-search.php
+++ b/modules/feature-search.php
@@ -569,20 +569,25 @@ class search
 	
 	/**
 	 * Converts a source string into a series of raw tokens.
-	 * @param	string	$source	The source string to process.
+	 * @param	string	$source				The source string to process.
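+	 * @param	boolean	$capture_offsets	Whether to capture & return the offsets of the tokens detected. If true, then each token returned will be an array in the form [ token, offset ], where offset is the byte offset (not the character offset) of the token in the transliterated, bracket-stripped copy of $source - not in the original string passed in.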
 	 * @return	array	An array of raw tokens extracted from the specified source string.
 	 */
-	public static function tokenize($source)
+	public static function tokenize($source, $capture_offsets = false)
 	{
 		/** Normalises input characters for searching & indexing */
 		static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
 		
+		$flags = PREG_SPLIT_NO_EMPTY; // Don't return empty items
+		if($capture_offsets)
+			$flags |= PREG_SPLIT_OFFSET_CAPTURE;
+		
 		// We don't need to normalise here because the transliterator handles 
 		// this for us. Also, we can't move the literator to a static variable 
 		// because PHP doesn't like it very much
 		$source = $literator->transliterate($source);
 		$source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source);
-		return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, PREG_SPLIT_NO_EMPTY);
+		return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, $flags);
 	}
 	
 	/**