mirror of
https://github.com/sbrl/Pepperminty-Wiki.git
synced 2024-11-22 16:33:00 +00:00
Add $capture_offsets option to tokenize().
TODO: Utilise this in the indexer & update the changelog mentioning that _all_ inverted indexes will need to be rebuilt
This commit is contained in:
parent
8403ffd5c3
commit
cdee30c286
3 changed files with 21 additions and 9 deletions
|
@ -397,7 +397,7 @@ if($settings->sessionprefix == "auto")
|
||||||
/////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
/** The version of Pepperminty Wiki currently running. */
|
/** The version of Pepperminty Wiki currently running. */
|
||||||
$version = "v0.17-dev";
|
$version = "v0.17-dev";
|
||||||
$commit = "80f2cc77a8fa474394492f08ea7f4c998d076acc";
|
$commit = "8403ffd5c3de9725756bcfc5929ce9239be1379b";
|
||||||
/// Environment ///
|
/// Environment ///
|
||||||
/** Holds information about the current request environment. */
|
/** Holds information about the current request environment. */
|
||||||
$env = new stdClass();
|
$env = new stdClass();
|
||||||
|
@ -4194,14 +4194,16 @@ class search
|
||||||
{
|
{
|
||||||
$nterm = $term;
|
$nterm = $term;
|
||||||
|
|
||||||
|
|
||||||
// Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
|
// Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
|
||||||
if(in_array($nterm, self::$stop_words)) continue;
|
if(in_array($nterm, self::$stop_words)) { $i++; continue; }
|
||||||
|
|
||||||
if(!isset($index[$nterm]))
|
if(!isset($index[$nterm]))
|
||||||
{
|
{
|
||||||
$index[$nterm] = [ "freq" => 0, "offsets" => [] ];
|
$index[$nterm] = [ "freq" => 0, "offsets" => [] ];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FIXME: Here we use the index of the token in the array, when we want the number of characters into the page!
|
||||||
$index[$nterm]["freq"]++;
|
$index[$nterm]["freq"]++;
|
||||||
$index[$nterm]["offsets"][] = $i;
|
$index[$nterm]["offsets"][] = $i;
|
||||||
|
|
||||||
|
@ -4214,19 +4216,24 @@ class search
|
||||||
/**
|
/**
|
||||||
* Converts a source string into a series of raw tokens.
|
* Converts a source string into a series of raw tokens.
|
||||||
* @param string $source The source string to process.
|
* @param string $source The source string to process.
|
||||||
|
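 * @param boolean $capture_offsets Whether to capture & return the offsets of the tokens detected. If true, then each token returned will be an array in the form [ token, offset ], where offset is the byte offset (not the character offset) of the token within the transliterated source string, not the original input.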
|
||||||
* @return array An array of raw tokens extracted from the specified source string.
|
* @return array An array of raw tokens extracted from the specified source string.
|
||||||
*/
|
*/
|
||||||
public static function tokenize($source)
|
public static function tokenize($source, $capture_offsets = false)
|
||||||
{
|
{
|
||||||
/** Normalises input characters for searching & indexing */
|
/** Normalises input characters for searching & indexing */
|
||||||
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||||
|
|
||||||
|
$flags = PREG_SPLIT_NO_EMPTY; // Don't return empty items
|
||||||
|
if($capture_offsets)
|
||||||
|
$flags |= PREG_SPLIT_OFFSET_CAPTURE;
|
||||||
|
|
||||||
// We don't need to normalise here because the transliterator handles
|
// We don't need to normalise here because the transliterator handles
|
||||||
// this for us. Also, we can't move the literator to a static variable
|
// this for us. Also, we can't move the literator to a static variable
|
||||||
// because PHP doesn't like it very much
|
// because PHP doesn't like it very much
|
||||||
$source = $literator->transliterate($source);
|
$source = $literator->transliterate($source);
|
||||||
$source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source);
|
$source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source);
|
||||||
return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, PREG_SPLIT_NO_EMPTY);
|
return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, $flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -104,7 +104,7 @@
|
||||||
"author": "Starbeamrainbowlabs",
|
"author": "Starbeamrainbowlabs",
|
||||||
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
|
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
|
||||||
"id": "feature-search",
|
"id": "feature-search",
|
||||||
"lastupdate": 1530270319,
|
"lastupdate": 1530313680,
|
||||||
"optional": false
|
"optional": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -570,19 +570,24 @@ class search
|
||||||
/**
|
/**
|
||||||
* Converts a source string into a series of raw tokens.
|
* Converts a source string into a series of raw tokens.
|
||||||
* @param string $source The source string to process.
|
* @param string $source The source string to process.
|
||||||
|
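 * @param boolean $capture_offsets Whether to capture & return the offsets of the tokens detected. If true, then each token returned will be an array in the form [ token, offset ], where offset is the byte offset (not the character offset) of the token within the transliterated source string, not the original input.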
|
||||||
* @return array An array of raw tokens extracted from the specified source string.
|
* @return array An array of raw tokens extracted from the specified source string.
|
||||||
*/
|
*/
|
||||||
public static function tokenize($source)
|
public static function tokenize($source, $capture_offsets = false)
|
||||||
{
|
{
|
||||||
/** Normalises input characters for searching & indexing */
|
/** Normalises input characters for searching & indexing */
|
||||||
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||||
|
|
||||||
|
$flags = PREG_SPLIT_NO_EMPTY; // Don't return empty items
|
||||||
|
if($capture_offsets)
|
||||||
|
$flags |= PREG_SPLIT_OFFSET_CAPTURE;
|
||||||
|
|
||||||
// We don't need to normalise here because the transliterator handles
|
// We don't need to normalise here because the transliterator handles
|
||||||
// this for us. Also, we can't move the literator to a static variable
|
// this for us. Also, we can't move the literator to a static variable
|
||||||
// because PHP doesn't like it very much
|
// because PHP doesn't like it very much
|
||||||
$source = $literator->transliterate($source);
|
$source = $literator->transliterate($source);
|
||||||
$source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source);
|
$source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source);
|
||||||
return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, PREG_SPLIT_NO_EMPTY);
|
return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, $flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
Loading…
Reference in a new issue