mirror of
https://github.com/sbrl/Pepperminty-Wiki.git
synced 2024-11-22 16:33:00 +00:00
Refactor stas_split to be much faster
Informal testing shows that it's gone from taking ~18% of the total time to ~4% of the total time :D
This commit is contained in:
parent
843f0f7ee9
commit
c80f26962e
3 changed files with 46 additions and 36 deletions
|
@ -8,6 +8,7 @@ This file holds the changelog for Pepperminty Wiki. This is the master list of t
|
||||||
- Another search index rebuild is required
|
- Another search index rebuild is required
|
||||||
- Optimisation: Don't generate the list of pages for the datalist if it isn't going to be displayed (especially noticeable on wikis with lots of pages)
|
- Optimisation: Don't generate the list of pages for the datalist if it isn't going to be displayed (especially noticeable on wikis with lots of pages)
|
||||||
- Optimisation: Don't load the statistics index if it's not needed (also esp. noticeable on wikis with lots of pages)
|
- Optimisation: Don't load the statistics index if it's not needed (also esp. noticeable on wikis with lots of pages)
|
||||||
|
- Optimisation: Refactor `stas_split()` to be faster (informal testing shows ~18% → 4% total time)
|
||||||
- [Module Api] Optimisation: Remove `search::transliterate` because it has a huge overhead. Use `search::$literator->transliterate()` instead.
|
- [Module Api] Optimisation: Remove `search::transliterate` because it has a huge overhead. Use `search::$literator->transliterate()` instead.
|
||||||
|
|
||||||
## v0.20
|
## v0.20
|
||||||
|
|
|
@ -135,7 +135,7 @@
|
||||||
"version": "0.11",
|
"version": "0.11",
|
||||||
"author": "Starbeamrainbowlabs",
|
"author": "Starbeamrainbowlabs",
|
||||||
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
|
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
|
||||||
"lastupdate": 1575838820,
|
"lastupdate": 1576432241,
|
||||||
"optional": false,
|
"optional": false,
|
||||||
"extra_data": []
|
"extra_data": []
|
||||||
},
|
},
|
||||||
|
|
|
@ -360,11 +360,13 @@ register_module([
|
||||||
case -1: $style .= "color: grey; text-decoration: wavy line-through;"; $title = "stop word"; break;
|
case -1: $style .= "color: grey; text-decoration: wavy line-through;"; $title = "stop word"; break;
|
||||||
case 1: $style .= "color: blue;"; $title = "normal word"; break;
|
case 1: $style .= "color: blue;"; $title = "normal word"; break;
|
||||||
}
|
}
|
||||||
switch($term["location"]) {
|
if($term["weight"] !== -1) {
|
||||||
case "body": $style = "color: cyan"; $title = "body only"; break;
|
switch($term["location"]) {
|
||||||
case "title": $style .= "font-weight: bolder; font-size: 1.2em; color: orange;"; $title = "searching title only"; $token = $token_part; break;
|
case "body": $style = "color: cyan"; $title = "body only"; break;
|
||||||
case "tags": $style .= "font-weight: bolder; color: purple;"; $title = "searching tags only"; $token = $token_part; break;
|
case "title": $style .= "font-weight: bolder; font-size: 1.2em; color: orange;"; $title = "searching title only"; $token = $token_part; break;
|
||||||
case "all": $title .= ", searching everywhere";
|
case "tags": $style .= "font-weight: bolder; color: purple;"; $title = "searching tags only"; $token = $token_part; break;
|
||||||
|
case "all": $title .= ", searching everywhere";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
$result .= "<span title='$title' style='$style'>$token</span> ";
|
$result .= "<span title='$title' style='$style'>$token</span> ";
|
||||||
|
@ -1067,36 +1069,29 @@ class search
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Splits a query string into tokens. Does not require that the input string be transliterated.
|
* Splits a query string into tokens. Does not require that the input string be transliterated.
|
||||||
* Actually based on my earlier explode_adv: https://starbeamrainbowlabs.com/blog/article.php?article=posts/081-PHP-String-Splitting.html
|
* Was based on my earlier explode_adv: https://starbeamrainbowlabs.com/blog/article.php?article=posts/081-PHP-String-Splitting.html
|
||||||
* @param string $query The queyr string to split.
|
* Now improved to be strtok-based, since it's much faster.
|
||||||
|
* Example I used when writing this: https://www.php.net/manual/en/function.strtok.php#94463
|
||||||
|
* @param string $query The query string to split.
|
||||||
*/
|
*/
|
||||||
public function stas_split($query) {
|
public function stas_split($query) {
|
||||||
$chars = str_split(self::$literator->transliterate($query));
|
$query = self::$literator->transliterate($query);
|
||||||
|
|
||||||
$terms = [];
|
$terms = [];
|
||||||
$next_term = "";
|
$next_token = strtok($query, " \r\n\t");
|
||||||
$toggle_state = false; // true = now inside, false = now outside
|
while(true) {
|
||||||
foreach($chars as $char)
|
|
||||||
{
|
|
||||||
if($char == '"') {
|
|
||||||
// Invert the toggle block state
|
|
||||||
$toggle_state = !$toggle_state;
|
|
||||||
}
|
|
||||||
|
|
||||||
// If this char is whitespace *and* we're outside a toggle block, then it's a token
|
if(strpos($next_token, '"') !== false)
|
||||||
if(ctype_space($char) && !$toggle_state) {
|
$next_token .= " " . strtok('"') . '"';
|
||||||
// If the string is empty, then don't bother
|
if(strpos($next_token, "'") !== false)
|
||||||
if(empty($next_term)) continue;
|
$next_token .= " " . strtok("'") . "'";
|
||||||
$terms[] = $next_term;
|
|
||||||
$next_term = "";
|
$terms[] = $next_token;
|
||||||
}
|
|
||||||
// If it's not whitespace, or it is whitespace and we're inside a toggle block....
|
$next_token = strtok(" \r\n\t");
|
||||||
else if(!ctype_space($char) || ($toggle_state && ctype_space($char)))
|
if($next_token === false) break;
|
||||||
$next_term .= $char; // ...then add the char to the next part
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if(strlen($next_term) > 0)
|
|
||||||
$terms[] = $next_term;
|
|
||||||
|
|
||||||
return $terms;
|
return $terms;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1132,17 +1127,31 @@ class search
|
||||||
for($i = count($tokens) - 1; $i >= 0; $i--) {
|
for($i = count($tokens) - 1; $i >= 0; $i--) {
|
||||||
// Look for excludes
|
// Look for excludes
|
||||||
if($tokens[$i][0] == "-") {
|
if($tokens[$i][0] == "-") {
|
||||||
$result["exclude"][] = substr($tokens[$i], 1);
|
if(in_array(substr($tokens[$i], 1), self::$stop_words)) {
|
||||||
|
$result["tokens"][] = [
|
||||||
|
"term" => substr($tokens[$i], 1),
|
||||||
|
"weight" => -1,
|
||||||
|
"location" => "all"
|
||||||
|
];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
$result["exclude"][] = substr($tokens[$i], 1);
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Look for weighted terms
|
// Look for weighted terms
|
||||||
if($tokens[$i][0] == "+") {
|
if($tokens[$i][0] == "+") {
|
||||||
$result["terms"][] = [
|
if(in_array(substr($tokens[$i], 1), self::$stop_words)) {
|
||||||
"term" => substr($tokens[$i], 1),
|
$result["tokens"] = [ "term" => substr($tokens[$i], 1), "weight" => -1, "location" => "all" ];
|
||||||
"weight" => 2,
|
}
|
||||||
"location" => "all"
|
else {
|
||||||
];
|
$result["terms"][] = [
|
||||||
|
"term" => substr($tokens[$i], 1),
|
||||||
|
"weight" => 2,
|
||||||
|
"location" => "all"
|
||||||
|
];
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue