Refactor stas_split to be faster

Informal testing shows that it's gone from taking ~18% of the total time 
to ~4% of the total time :D
This commit is contained in:
Starbeamrainbowlabs 2019-12-15 17:56:56 +00:00
parent 843f0f7ee9
commit c80f26962e
Signed by: sbrl
GPG Key ID: 1BE5172E637709C2
3 changed files with 46 additions and 36 deletions

View File

@ -8,6 +8,7 @@ This file holds the changelog for Pepperminty Wiki. This is the master list of t
- Another search index rebuild is required
- Optimisation: Don't generate the list of pages for the datalist if it isn't going to be displayed (especially noticeable on wikis with lots of pages)
- Optimisation: Don't load the statistics index if it's not needed (also esp. noticeable on wikis with lots of pages)
- Optimisation: Refactor `stas_split()` to be faster (informal testing shows ~18% → 4% total time)
- [Module Api] Optimisation: Remove `search::transliterate` because it has a huge overhead. Use `search::$literator->transliterate()` instead.
## v0.20

View File

@ -135,7 +135,7 @@
"version": "0.11",
"author": "Starbeamrainbowlabs",
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
"lastupdate": 1575838820,
"lastupdate": 1576432241,
"optional": false,
"extra_data": []
},

View File

@ -360,11 +360,13 @@ register_module([
case -1: $style .= "color: grey; text-decoration: wavy line-through;"; $title = "stop word"; break;
case 1: $style .= "color: blue;"; $title = "normal word"; break;
}
switch($term["location"]) {
case "body": $style = "color: cyan"; $title = "body only"; break;
case "title": $style .= "font-weight: bolder; font-size: 1.2em; color: orange;"; $title = "searching title only"; $token = $token_part; break;
case "tags": $style .= "font-weight: bolder; color: purple;"; $title = "searching tags only"; $token = $token_part; break;
case "all": $title .= ", searching everywhere";
if($term["weight"] !== -1) {
switch($term["location"]) {
case "body": $style = "color: cyan"; $title = "body only"; break;
case "title": $style .= "font-weight: bolder; font-size: 1.2em; color: orange;"; $title = "searching title only"; $token = $token_part; break;
case "tags": $style .= "font-weight: bolder; color: purple;"; $title = "searching tags only"; $token = $token_part; break;
case "all": $title .= ", searching everywhere";
}
}
$result .= "<span title='$title' style='$style'>$token</span> ";
@ -1067,36 +1069,29 @@ class search
/**
* Splits a query string into tokens. Does not require that the input string be transliterated.
* Actually based on my earlier explode_adv: https://starbeamrainbowlabs.com/blog/article.php?article=posts/081-PHP-String-Splitting.html
* @param string $query The queyr string to split.
* Was based on my earlier explode_adv: https://starbeamrainbowlabs.com/blog/article.php?article=posts/081-PHP-String-Splitting.html
* Now improved to be strtok-based, since it's much faster.
* Example I used when writing this: https://www.php.net/manual/en/function.strtok.php#94463
* @param string $query The query string to split.
*/
public function stas_split($query) {
$chars = str_split(self::$literator->transliterate($query));
$query = self::$literator->transliterate($query);
$terms = [];
$next_term = "";
$toggle_state = false; // true = now inside, false = now outside
foreach($chars as $char)
{
if($char == '"') {
// Invert the toggle block state
$toggle_state = !$toggle_state;
}
$next_token = strtok($query, " \r\n\t");
while(true) {
// If this char is whitespace *and* we're outside a toggle block, then it's a token
if(ctype_space($char) && !$toggle_state) {
// If the string is empty, then don't bother
if(empty($next_term)) continue;
$terms[] = $next_term;
$next_term = "";
}
// If it's not whitespace, or it is whitespace and we're inside a toggle block....
else if(!ctype_space($char) || ($toggle_state && ctype_space($char)))
$next_term .= $char; // ...then add the char to the next part
if(strpos($next_token, '"') !== false)
$next_token .= " " . strtok('"') . '"';
if(strpos($next_token, "'") !== false)
$next_token .= " " . strtok("'") . "'";
$terms[] = $next_token;
$next_token = strtok(" \r\n\t");
if($next_token === false) break;
}
if(strlen($next_term) > 0)
$terms[] = $next_term;
return $terms;
}
@ -1132,17 +1127,31 @@ class search
for($i = count($tokens) - 1; $i >= 0; $i--) {
// Look for excludes
if($tokens[$i][0] == "-") {
$result["exclude"][] = substr($tokens[$i], 1);
if(in_array(substr($tokens[$i], 1), self::$stop_words)) {
$result["tokens"][] = [
"term" => substr($tokens[$i], 1),
"weight" => -1,
"location" => "all"
];
}
else
$result["exclude"][] = substr($tokens[$i], 1);
continue;
}
// Look for weighted terms
if($tokens[$i][0] == "+") {
$result["terms"][] = [
"term" => substr($tokens[$i], 1),
"weight" => 2,
"location" => "all"
];
if(in_array(substr($tokens[$i], 1), self::$stop_words)) {
$result["tokens"] = [ "term" => substr($tokens[$i], 1), "weight" => -1, "location" => "all" ];
}
else {
$result["terms"][] = [
"term" => substr($tokens[$i], 1),
"weight" => 2,
"location" => "all"
];
}
continue;
}