From cdee30c2861c0ae91573214105caa659da38ac7d Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Sat, 30 Jun 2018 00:08:57 +0100 Subject: [PATCH] Add $capture_offsets option to tokenize(). TODO: Utilise this in the indexer & update the changelog mentioning that _all_ inverted indexes will need to be rebuilt --- build/index.php | 17 ++++++++++++----- module_index.json | 2 +- modules/feature-search.php | 11 ++++++++--- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/build/index.php b/build/index.php index 319bb53..52cb691 100644 --- a/build/index.php +++ b/build/index.php @@ -397,7 +397,7 @@ if($settings->sessionprefix == "auto") ///////////////////////////////////////////////////////////////////////////// /** The version of Pepperminty Wiki currently running. */ $version = "v0.17-dev"; -$commit = "80f2cc77a8fa474394492f08ea7f4c998d076acc"; +$commit = "8403ffd5c3de9725756bcfc5929ce9239be1379b"; /// Environment /// /** Holds information about the current request environment. */ $env = new stdClass(); @@ -4194,14 +4194,16 @@ class search { $nterm = $term; + // Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words) - if(in_array($nterm, self::$stop_words)) continue; + if(in_array($nterm, self::$stop_words)) { $i++; continue; } if(!isset($index[$nterm])) { $index[$nterm] = [ "freq" => 0, "offsets" => [] ]; } + // FIXME: Here we use the index of the token in the array, when we want the number of characters into the page! $index[$nterm]["freq"]++; $index[$nterm]["offsets"][] = $i; @@ -4213,20 +4215,25 @@ class search /** * Converts a source string into a series of raw tokens. - * @param string $source The source string to process. + * @param string $source The source string to process. + * @param boolean $capture_offsets Whether to capture & return the character offsets of the tokens detected. If true, then each token returned will be an array in the form [ token, char_offset ]. * @return array An array of raw tokens extracted from the specified source string. */ - public static function tokenize($source) + public static function tokenize($source, $capture_offsets = false) { /** Normalises input characters for searching & indexing */ static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD); + $flags = PREG_SPLIT_NO_EMPTY; // Don't return empty items + if($capture_offsets) + $flags |= PREG_SPLIT_OFFSET_CAPTURE; + // We don't need to normalise here because the transliterator handles // this for us. Also, we can't move the literator to a static variable // because PHP doesn't like it very much $source = $literator->transliterate($source); $source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source); - return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, PREG_SPLIT_NO_EMPTY); + return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, $flags); } /** diff --git a/module_index.json b/module_index.json index 9a853b9..41bce1c 100755 --- a/module_index.json +++ b/module_index.json @@ -104,7 +104,7 @@ "author": "Starbeamrainbowlabs", "description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.", "id": "feature-search", - "lastupdate": 1530270319, + "lastupdate": 1530313680, "optional": false }, { diff --git a/modules/feature-search.php b/modules/feature-search.php index 5228760..31a716d 100644 --- a/modules/feature-search.php +++ b/modules/feature-search.php @@ -569,20 +569,25 @@ class search /** * Converts a source string into a series of raw tokens. - * @param string $source The source string to process. + * @param string $source The source string to process. + * @param boolean $capture_offsets Whether to capture & return the character offsets of the tokens detected. If true, then each token returned will be an array in the form [ token, char_offset ]. * @return array An array of raw tokens extracted from the specified source string. */ - public static function tokenize($source) + public static function tokenize($source, $capture_offsets = false) { /** Normalises input characters for searching & indexing */ static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD); + $flags = PREG_SPLIT_NO_EMPTY; // Don't return empty items + if($capture_offsets) + $flags |= PREG_SPLIT_OFFSET_CAPTURE; + // We don't need to normalise here because the transliterator handles // this for us. Also, we can't move the literator to a static variable // because PHP doesn't like it very much $source = $literator->transliterate($source); $source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source); - return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, PREG_SPLIT_NO_EMPTY); + return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, $flags); } /**