1
0
Fork 0
mirror of https://github.com/sbrl/Pepperminty-Wiki.git synced 2024-11-22 04:23:01 +00:00

Fix a *huge* number of bugs in the new search system, but it's not ready just yet

This commit is contained in:
Starbeamrainbowlabs 2019-08-22 21:38:17 +01:00
parent e08e775d98
commit edf1be5801
Signed by: sbrl
GPG key ID: 1BE5172E637709C2
8 changed files with 237 additions and 126 deletions

View file

@ -41,7 +41,7 @@ $paths = new stdClass();
/** The pageindex. Contains extensive information about all pages currently in this wiki. Individual entries for pages may be extended with arbitrary properties. */ /** The pageindex. Contains extensive information about all pages currently in this wiki. Individual entries for pages may be extended with arbitrary properties. */
$paths->pageindex = "pageindex.json"; $paths->pageindex = "pageindex.json";
/** The inverted index used for searching. Use the `search` class to interact with this - otherwise your brain might explode :P */ /** The inverted index used for searching. Use the `search` class to interact with this - otherwise your brain might explode :P */
$paths->searchindex = "invindex.json"; $paths->searchindex = "invindex.sqlite";
/** The index that maps ids to page names. Use the `ids` class to interact with it :-) */ /** The index that maps ids to page names. Use the `ids` class to interact with it :-) */
$paths->idindex = "idindex.json"; $paths->idindex = "idindex.json";
/** The cache of the most recently calculated statistics. */ /** The cache of the most recently calculated statistics. */

View file

@ -110,6 +110,39 @@ function glob_recursive($pattern, $flags = 0)
return $files; return $files;
} }
/**
 * Resolves a relative path against a given base directory, collapsing any
 * '.', '..' and empty segments along the way. The path is manipulated purely
 * as a string - the filesystem is never consulted, so symlinks are not
 * followed and the target does not need to exist.
 * @apiVersion 0.20.0
 * @source https://stackoverflow.com/a/44312137/1460422
 * @param	string		$path		The relative path to resolve. Paths that already start with the directory separator are treated as absolute and are only normalised.
 * @param	string|null	$basePath	The base directory to resolve against. If null, the current working directory is used. If an empty string, the path is normalised but left relative.
 * @return	string		The resolved path. It is absolute whenever $path or the base it was joined to starts with the directory separator.
 */
function path_resolve(string $path, ?string $basePath = null) : string {
	// Make absolute path
	// NOTE: Only a leading directory separator counts as "absolute" here, so a Windows drive-letter path (C:\...) is treated as relative
	if (substr($path, 0, 1) !== DIRECTORY_SEPARATOR) {
		if ($basePath === null) {
			// Get PWD first to avoid getcwd() resolving symlinks if in symlinked folder
			$path = (getenv('PWD') ?: getcwd()).DIRECTORY_SEPARATOR.$path;
		} elseif (strlen($basePath)) {
			$path = $basePath.DIRECTORY_SEPARATOR.$path;
		}
	}
	
	// Remember whether we're dealing with an absolute path up front, instead
	// of keeping an empty marker component in the list. This way '..' can
	// never climb above (and thereby discard) the root, and an empty segment
	// in a relative path can't accidentally turn it into an absolute one.
	$is_absolute = substr($path, 0, 1) === DIRECTORY_SEPARATOR;
	
	// Resolve '.' and '..'
	$components = [];
	foreach(explode(DIRECTORY_SEPARATOR, $path) as $name) {
		// Empty segments come from leading, trailing or doubled separators
		if ($name === '' || $name === '.')
			continue;
		if ($name === '..') {
			// A no-op if we're already at the top
			array_pop($components);
			continue;
		}
		$components[] = $name;
	}
	
	$resolved = implode(DIRECTORY_SEPARATOR, $components);
	return $is_absolute ? DIRECTORY_SEPARATOR.$resolved : $resolved;
}
/** /**
* Gets the name of the parent page to the specified page. * Gets the name of the parent page to the specified page.
* @apiVersion 0.15.0 * @apiVersion 0.15.0

View file

@ -82,10 +82,10 @@
{ {
"id": "feature-guiconfig", "id": "feature-guiconfig",
"name": "Settings GUI", "name": "Settings GUI",
"version": "0.1.4", "version": "0.1.5",
"author": "Starbeamrainbowlabs", "author": "Starbeamrainbowlabs",
"description": "The module everyone has been waiting for! Adds a web based gui that lets mods change the wiki settings.", "description": "The module everyone has been waiting for! Adds a web based gui that lets mods change the wiki settings.",
"lastupdate": 1557575008, "lastupdate": 1566498857,
"optional": false, "optional": false,
"extra_data": [] "extra_data": []
}, },
@ -132,10 +132,10 @@
{ {
"id": "feature-search", "id": "feature-search",
"name": "Search", "name": "Search",
"version": "0.8", "version": "0.10",
"author": "Starbeamrainbowlabs", "author": "Starbeamrainbowlabs",
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.", "description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
"lastupdate": 1565909052, "lastupdate": 1566506237,
"optional": false, "optional": false,
"extra_data": [] "extra_data": []
}, },
@ -202,20 +202,20 @@
{ {
"id": "page-delete", "id": "page-delete",
"name": "Page deleter", "name": "Page deleter",
"version": "0.10.1", "version": "0.10.2",
"author": "Starbeamrainbowlabs", "author": "Starbeamrainbowlabs",
"description": "Adds an action to allow administrators to delete pages.", "description": "Adds an action to allow administrators to delete pages.",
"lastupdate": 1559400151, "lastupdate": 1566498558,
"optional": false, "optional": false,
"extra_data": [] "extra_data": []
}, },
{ {
"id": "page-edit", "id": "page-edit",
"name": "Page editor", "name": "Page editor",
"version": "0.17.4", "version": "0.17.5",
"author": "Starbeamrainbowlabs", "author": "Starbeamrainbowlabs",
"description": "Allows you to edit pages by adding the edit and save actions. You should probably include this one.", "description": "Allows you to edit pages by adding the edit and save actions. You should probably include this one.",
"lastupdate": 1565287856, "lastupdate": 1566498562,
"optional": false, "optional": false,
"extra_data": { "extra_data": {
"diff.min.js": "https:\/\/cdnjs.cloudflare.com\/ajax\/libs\/jsdiff\/2.2.2\/diff.min.js" "diff.min.js": "https:\/\/cdnjs.cloudflare.com\/ajax\/libs\/jsdiff\/2.2.2\/diff.min.js"
@ -234,10 +234,10 @@
{ {
"id": "page-help", "id": "page-help",
"name": "Help page", "name": "Help page",
"version": "0.9.3", "version": "0.9.4",
"author": "Starbeamrainbowlabs", "author": "Starbeamrainbowlabs",
"description": "Adds a rather useful help page. Access through the 'help' action. This module also exposes help content added to Pepperminty Wiki's inbuilt invisible help section system.", "description": "Adds a rather useful help page. Access through the 'help' action. This module also exposes help content added to Pepperminty Wiki's inbuilt invisible help section system.",
"lastupdate": 1492433537, "lastupdate": 1566498566,
"optional": false, "optional": false,
"extra_data": [] "extra_data": []
}, },
@ -326,8 +326,8 @@
"name": "Parsedown", "name": "Parsedown",
"version": "0.10", "version": "0.10",
"author": "Emanuil Rusev & Starbeamrainbowlabs", "author": "Emanuil Rusev & Starbeamrainbowlabs",
"description": "An upgraded (now default!) parser based on Emanuil Rusev's Parsedown Extra PHP library (https:\/\/github.com\/erusev\/parsedown-extra), which is licensed MIT. Please be careful, as this module adds some weight to your installation, and also *requires* write access to the disk on first load.", "description": "An upgraded (now default!) parser based on Emanuil Rusev's Parsedown Extra PHP library (https:\/\/github.com\/erusev\/parsedown-extra), which is licensed MIT. Please be careful, as this module adds some weight to your installation.",
"lastupdate": 1551564416, "lastupdate": 1566070821,
"optional": false, "optional": false,
"extra_data": { "extra_data": {
"Parsedown.php": "https:\/\/raw.githubusercontent.com\/erusev\/parsedown\/819c68899d593503180ed79ef4be5a4dcd8c5f92\/Parsedown.php", "Parsedown.php": "https:\/\/raw.githubusercontent.com\/erusev\/parsedown\/819c68899d593503180ed79ef4be5a4dcd8c5f92\/Parsedown.php",

View file

@ -1,7 +1,7 @@
<?php <?php
register_module([ register_module([
"name" => "Settings GUI", "name" => "Settings GUI",
"version" => "0.1.4", "version" => "0.1.5",
"author" => "Starbeamrainbowlabs", "author" => "Starbeamrainbowlabs",
"description" => "The module everyone has been waiting for! Adds a web based gui that lets mods change the wiki settings.", "description" => "The module everyone has been waiting for! Adds a web based gui that lets mods change the wiki settings.",
"id" => "feature-guiconfig", "id" => "feature-guiconfig",
@ -65,6 +65,8 @@ window.addEventListener("load", function(event) {
if(message.startsWith("Done! Saving new search index to")) if(message.startsWith("Done! Saving new search index to"))
rebuildActionEvents.close(); rebuildActionEvents.close();
}); });
// Close the connection on error & don't try again
rebuildActionEvents.addEventListener("error", (_event) => rebuildActionEvents.close());
}); });
}); });
SCRIPT; SCRIPT;

View file

@ -1,7 +1,7 @@
<?php <?php
register_module([ register_module([
"name" => "Search", "name" => "Search",
"version" => "0.9", "version" => "0.10",
"author" => "Starbeamrainbowlabs", "author" => "Starbeamrainbowlabs",
"description" => "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.", "description" => "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
"id" => "feature-search", "id" => "feature-search",
@ -146,7 +146,7 @@ register_module([
$start = microtime(true); $start = microtime(true);
foreach($results as &$result) { foreach($results as &$result) {
$result["context"] = search::extract_context( $result["context"] = search::extract_context(
$invindex, $result["pagename"], $result["pagename"],
$_GET["query"], $_GET["query"],
file_get_contents($env->storage_prefix . $result["pagename"] . ".md") file_get_contents($env->storage_prefix . $result["pagename"] . ".md")
); );
@ -386,9 +386,8 @@ register_module([
exit("Error: The type '$type' is not one of the supported output types. Available values: json, opensearch. Default: json"); exit("Error: The type '$type' is not one of the supported output types. Available values: json, opensearch. Default: json");
} }
$literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
$query = $literator->transliterate($_GET["query"]); $query = search::transliterate($_GET["query"]);
// Rank each page name // Rank each page name
@ -397,7 +396,7 @@ register_module([
$results[] = [ $results[] = [
"pagename" => $pageName, "pagename" => $pageName,
// Costs: Insert: 1, Replace: 8, Delete: 6 // Costs: Insert: 1, Replace: 8, Delete: 6
"distance" => levenshtein($query, $literator->transliterate($pageName), 1, 8, 6) "distance" => levenshtein($query, search::transliterate($pageName), 1, 8, 6)
]; ];
} }
@ -490,13 +489,26 @@ class StorageBox {
*/ */
private $db; private $db;
/**
* A cache of values.
* @var object[]
*/
private $cache = [];
/**
* A cache of prepared SQL statements.
* @var \PDOStatement[]
*/
private $query_cache = [];
/** /**
* Initialises a new store connection. * Initialises a new store connection.
* @param string $filename The filename that the store is located in. * @param string $filename The filename that the store is located in.
*/ */
function __construct(string $filename) { function __construct(string $filename) {
$firstrun = !file_exists($filename); $firstrun = !file_exists($filename);
$this->db = new \PDO("sqlite:$filename"); $this->db = new \PDO("sqlite:" . path_resolve($filename, __DIR__)); // HACK: This might not work on some systems, because it depends on the current working directory
$this->db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
if($firstrun) { if($firstrun) {
$this->query("CREATE TABLE store (key TEXT UNIQUE NOT NULL, value TEXT)"); $this->query("CREATE TABLE store (key TEXT UNIQUE NOT NULL, value TEXT)");
} }
@ -508,11 +520,11 @@ class StorageBox {
* @return \PDOStatement The result of the query, as a PDOStatement. * @return \PDOStatement The result of the query, as a PDOStatement.
*/ */
private function query(string $sql, array $variables = []) { private function query(string $sql, array $variables = []) {
// FUTURE: Optionally cache prepared statements? // Add to the query cache if it doesn't exist
$statement = $this->db->prepare($sql); if(!isset($this->query_cache[$sql]))
$statement->execute($variables); $this->query_cache[$sql] = $this->db->prepare($sql);
$this->query_cache[$sql]->execute($variables);
return $statement; // fetchColumn(), fetchAll(), etc. are defined on the statement, not the return value of execute() return $this->query_cache[$sql]; // fetchColumn(), fetchAll(), etc. are defined on the statement, not the return value of execute()
} }
/** /**
@ -521,6 +533,8 @@ class StorageBox {
* @return bool Whether the key exists in the store or not. * @return bool Whether the key exists in the store or not.
*/ */
public function has(string $key) : bool { public function has(string $key) : bool {
if(isset($this->cache[$key]))
return true;
return $this->query( return $this->query(
"SELECT COUNT(key) FROM store WHERE key = :key;", "SELECT COUNT(key) FROM store WHERE key = :key;",
[ "key" => $key ] [ "key" => $key ]
@ -529,29 +543,30 @@ class StorageBox {
/** /**
* Gets a value from the store. * Gets a value from the store.
* @param string $key The key to store the value under. * @param string $key The key value is stored under.
* @return string The value to store. * @return mixed The stored value.
*/ */
public function get(string $key) : string { public function get(string $key) {
return $this->query( // If it's not in the cache, insert it
if(!isset($this->cache[$key])) {
$this->cache[$key] = [ "modified" => false, "value" => json_decode($this->query(
"SELECT value FROM store WHERE key = :key;", "SELECT value FROM store WHERE key = :key;",
[ "key" => $key ] [ "key" => $key ]
)->fetchColumn(); )->fetchColumn()) ];
}
return $this->cache[$key]["value"];
} }
/** /**
* Sets a value in the data store. * Sets a value in the data store.
* Note that this does NOT save changes to disk until you close the connection!
* @param string $key The key to set the value of. * @param string $key The key to set the value of.
* @param string $value The value to store. * @param mixed $value The value to store.
*/ */
public function set(string $key, string $value) : void { public function set(string $key, $value) : void {
$this->query( if(!isset($this->cache[$key])) $this->cache[$key] = [];
"INSERT OR REPLACE INTO store(key, value) VALUES(:key, :value)", $this->cache[$key]["value"] = $value;
[ $this->cache[$key]["modified"] = true;
"key" => $key,
"value" => $value
]
);
} }
/** /**
@ -560,6 +575,10 @@ class StorageBox {
* @return bool Whether it was really deleted or not. Note that if it doesn't exist, then it can't be deleted. * @return bool Whether it was really deleted or not. Note that if it doesn't exist, then it can't be deleted.
*/ */
public function delete(string $key) : bool { public function delete(string $key) : bool {
// Remove it from the cache
if(isset($this->cache[$key]))
unset($this->cache[$key]);
// Remove it from disk
$this->query( $this->query(
"DELETE FROM store WHERE key = :key;", "DELETE FROM store WHERE key = :key;",
[ "key" => $key ] [ "key" => $key ]
@ -570,8 +589,33 @@ class StorageBox {
* Empties the store. * Empties the store.
*/ */
public function clear() : void { public function clear() : void {
// Empty the cache;
$this->cache = [];
// Empty the disk
$this->query("DELETE FROM store;"); $this->query("DELETE FROM store;");
} }
/**
 * Syncs changes to disk and closes the PDO connection.
 * Every cache entry flagged as modified is JSON-encoded and written back
 * to the store inside a single transaction. If any write fails, the
 * transaction is rolled back and the error is rethrown.
 * Calling this on a connection that has already been closed does nothing.
 * @throws	\Throwable	Rethrows whatever a failed write raised, after rolling back.
 */
public function close() : void {
	// Already closed - nothing to sync
	if($this->db === null)
		return;
	
	$this->db->beginTransaction();
	try {
		foreach($this->cache as $key => $value_data) {
			// If it wasn't modified, there's no point in saving it, is there?
			if(!$value_data["modified"])
				continue;
			
			$this->query(
				"INSERT OR REPLACE INTO store(key, value) VALUES(:key, :value)",
				[
					"key" => $key,
					"value" => json_encode($value_data["value"])
				]
			);
		}
		$this->db->commit();
	}
	catch(\Throwable $error) {
		// Don't leave a half-written transaction open
		$this->db->rollBack();
		throw $error;
	}
	
	// PDO only closes the connection once *every* reference to it is gone,
	// and that includes the prepared statements we've been caching.
	$this->query_cache = [];
	$this->db = null;
}
} }
@ -644,6 +688,20 @@ class search
*/ */
private static $invindex = null; private static $invindex = null;
private static $literator = null;
/**
 * Transliterates a string to make it more suitable for entry into the search index.
 * The transliterator is created on first use and then reused for every
 * subsequent call. Its rules convert to Latin, then ASCII, strip
 * non-spacing marks, and lowercase the result.
 * @param	string	$str	The string to transliterate.
 * @return	string	The transliterated string.
 */
public static function transliterate(string $str) : string {
	// Build the transliterator lazily, since it's only needed when searching / indexing
	if(self::$literator === null)
		self::$literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
	
	// Transliterate the argument we were given - NOT the raw query string
	return self::$literator->transliterate($str);
}
/** /**
* Converts a source string into an index of search terms that can be * Converts a source string into an index of search terms that can be
* merged into an inverted index. * merged into an inverted index.
@ -680,8 +738,6 @@ class search
* @return array An array of raw tokens extracted from the specified source string. * @return array An array of raw tokens extracted from the specified source string.
*/ */
public static function tokenize(string $source, bool $capture_offsets = false) : array { public static function tokenize(string $source, bool $capture_offsets = false) : array {
/** Normalises input characters for searching & indexing */
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
$flags = PREG_SPLIT_NO_EMPTY; // Don't return empty items $flags = PREG_SPLIT_NO_EMPTY; // Don't return empty items
if($capture_offsets) if($capture_offsets)
@ -690,7 +746,7 @@ class search
// We don't need to normalise here because the transliterator handles // We don't need to normalise here because the transliterator handles
// this for us. Also, we can't move the literator to a static member // this for us. Also, we can't move the literator to a static member
// variable because PHP doesn't like it very much // variable because PHP doesn't like it very much
$source = $literator->transliterate($source); $source = self::transliterate($source);
$source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source); $source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source);
return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, $flags); return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, $flags);
} }
@ -721,8 +777,10 @@ class search
ids::clear(); ids::clear();
// Clear the existing inverted index out // Clear the existing inverted index out
$this->invindex->clear(); if(self::$invindex == null)
$this->invindex->set("|termlist|", "[]"); self::invindex_load($paths->searchindex);
self::$invindex->clear();
self::$invindex->set("|termlist|", []);
// Reindex each page in turn // Reindex each page in turn
$i = 0; $max = count(get_object_vars($pageindex)); $i = 0; $max = count(get_object_vars($pageindex));
@ -750,6 +808,9 @@ class search
$i++; $i++;
} }
echo("data: Syncing to disk....\n\n");
self::invindex_close();
if($output) { if($output) {
echo("data: Search index rebuilding complete.\n\n"); echo("data: Search index rebuilding complete.\n\n");
echo("data: Couldn't find $missing_files pages on disk. If $settings->sitename couldn't find some pages on disk, then you might need to manually correct $settings->sitename's page index (stored in pageindex.json).\n\n"); echo("data: Couldn't find $missing_files pages on disk. If $settings->sitename couldn't find some pages on disk, then you might need to manually correct $settings->sitename's page index (stored in pageindex.json).\n\n");
@ -787,17 +848,28 @@ class search
} }
/** /**
* Reads in and parses an inverted index. * Loads a connection to an inverted index.
* @param string $invindex_filename The path to the inverted index to load. * @param string $invindex_filename The path to the inverted index to load.
* @todo Remove this function and make everything streamable * @todo Remove this function and make everything streamable
*/ */
public static function invindex_load(string $invindex_filename) { public static function invindex_load(string $invindex_filename) {
global $env; global $env, $paths;
$start_time = microtime(true); $start_time = microtime(true);
$this->invindex = new StorageBox($invindex_filename); self::$invindex = new StorageBox($invindex_filename);
$env->perfdata->searchindex_load_time = round((microtime(true) - $start_time)*1000, 3); $env->perfdata->searchindex_load_time = round((microtime(true) - $start_time)*1000, 3);
} }
/**
 * Closes the currently open inverted index.
 * Delegates to the storage box's close() method and records how long that
 * took (in milliseconds, to 3 decimal places) in
 * $env->perfdata->searchindex_close_time.
 * If no inverted index is currently loaded, this does nothing.
 */
public static function invindex_close() {
	global $env;
	
	// Nothing to close if the index was never loaded
	if(self::$invindex === null)
		return;
	
	$start_time = microtime(true);
	self::$invindex->close();
	$env->perfdata->searchindex_close_time = round((microtime(true) - $start_time)*1000, 3);
}
/** /**
* Merge an index into an inverted index. * Merge an index into an inverted index.
* @param int $pageid The id of the page to assign to the index that's being merged. * @param int $pageid The id of the page to assign to the index that's being merged.
@ -805,47 +877,49 @@ class search
* @param array $removals An array of index entries to remove from the inverted index. Useful for applying changes to an inverted index instead of deleting and remerging an entire page's index. * @param array $removals An array of index entries to remove from the inverted index. Useful for applying changes to an inverted index instead of deleting and remerging an entire page's index.
*/ */
public static function invindex_merge($pageid, &$index, &$removals = []) : void { public static function invindex_merge($pageid, &$index, &$removals = []) : void {
if($this->invindex == null) if(self::$invindex == null)
throw new Exception("Error: Can't merge into an inverted index that isn't loaded."); throw new Exception("Error: Can't merge into an inverted index that isn't loaded.");
$termlist = json_decode($this->invindex->get("|termlist|")); if(!self::$invindex->has("|termlist|"))
self::$invindex->set("|termlist|", []);
$termlist = self::$invindex->get("|termlist|");
// Remove all the subentries that were removed since last time // Remove all the subentries that were removed since last time
foreach($removals as $nterm) { foreach($removals as $nterm) {
// Delete the offsets // Delete the offsets
$this->invindex->delete("$nterm|$pageid"); self::$invindex->delete("$nterm|$pageid");
// Delete the item from the list of pageids containing this term // Delete the item from the list of pageids containing this term
$nterm_pageids = json_decode($this->invindex->get($nterm)); $nterm_pageids = self::$invindex->get($nterm);
array_splice($nterm_pageids, array_search($pageid, $nterm_pageids), 1); array_splice($nterm_pageids, array_search($pageid, $nterm_pageids), 1);
if(empty($nterm_pageids)) { // No need to keep the pageid list if there's nothing in it if(empty($nterm_pageids)) { // No need to keep the pageid list if there's nothing in it
$this->invindex->delete($nterm); self::$invindex->delete($nterm);
// Update the termlist if we're deleting the term completely // Update the termlist if we're deleting the term completely
$termlist_loc = array_search($nterm, $termlist); $termlist_loc = array_search($nterm, $termlist);
if($termlist_loc !== false) array_splice($termlist, $termlist_loc, 1); if($termlist_loc !== false) array_splice($termlist, $termlist_loc, 1);
} }
else else
$this->invindex->set($nterm, json_encode($nterm_pageids)); self::$invindex->set($nterm, $nterm_pageids);
} }
// Merge all the new / changed index entries into the inverted index // Merge all the new / changed index entries into the inverted index
foreach($index as $nterm => $newentry) { foreach($index as $nterm => $newentry) {
if(!$this->invindex->has($nterm)) { if(!self::$invindex->has($nterm)) {
$this->invindex->set($nterm, "[]"); self::$invindex->set($nterm, []);
$termlist[] = $nterm; $termlist[] = $nterm;
} }
// Update the nterm pageid list // Update the nterm pageid list
$nterm_pageids = json_decode($this->invindex->get($nterm)); $nterm_pageids = self::$invindex->get($nterm);
if(array_search($pageid, $nterm_pageids) === false) { if(array_search($pageid, $nterm_pageids) === false) {
$nterm_pageids[] = $pageid; $nterm_pageids[] = $pageid;
$this->invindex->set($nterm, json_encode($nterm_pageids)); self::$invindex->set($nterm, $nterm_pageids);
} }
// Store the offset list // Store the offset list
$this->invindex->set("$nterm|$pageid", json_encode($newentry)); self::$invindex->set("$nterm|$pageid", $newentry);
} }
$this->invindex->set("|termlist|", json_encode($termlist)); self::$invindex->set("|termlist|", $termlist);
} }
/** /**
@ -853,9 +927,9 @@ class search
* @param int $pageid The pageid to remove. * @param int $pageid The pageid to remove.
*/ */
public static function invindex_delete(int $pageid) { public static function invindex_delete(int $pageid) {
$termlist = json_decode($this->invindex->get("|termlist|")); $termlist = self::$invindex->get("|termlist|");
foreach($termlist as $nterm) { foreach($termlist as $nterm) {
$nterm_pageids = json_decode($this->invindex->get("$nterm")); $nterm_pageids = self::$invindex->get("$nterm");
$nterm_loc = array_search($pageid, $nterm_pageids); $nterm_loc = array_search($pageid, $nterm_pageids);
// If this nterm doesn't appear in the list, we're not interested // If this nterm doesn't appear in the list, we're not interested
if($nterm_loc === false) if($nterm_loc === false)
@ -865,18 +939,18 @@ class search
array_splice($nterm_pageids, $nterm_loc, 1); array_splice($nterm_pageids, $nterm_loc, 1);
// Delete the offset list // Delete the offset list
$this->invindex->delete("$nterm|$pageid"); self::$invindex->delete("$nterm|$pageid");
// If this term doesn't appear in any other documents, delete it // If this term doesn't appear in any other documents, delete it
if(count($nterm_pageids) === 0) { if(count($nterm_pageids) === 0) {
$this->invindex->delete($nterm); self::$invindex->delete($nterm);
array_splice($termlist, array_search($nterm, $termlist), 1); array_splice($termlist, array_search($nterm, $termlist), 1);
} }
else // Save the document id list back, since it still contains other pageids else // Save the document id list back, since it still contains other pageids
$this->invindex->set($nterm, json_encode($nterm_pageids)); self::$invindex->set($nterm, $nterm_pageids);
} }
// Save the termlist back to the store // Save the termlist back to the store
$this->invindex->set("|termlist|", json_encode($termlist)); self::$invindex->set("|termlist|", $termlist);
} }
@ -893,8 +967,8 @@ class search
* Actually based on my earlier explode_adv https://starbeamrainbowlabs.com/blog/article.php?article=posts/081-PHP-String-Splitting.html * Actually based on my earlier explode_adv https://starbeamrainbowlabs.com/blog/article.php?article=posts/081-PHP-String-Splitting.html
* @param string $query The query string to split. * @param string $query The query string to split.
*/ */
private function stas_split($query) { public function stas_split($query) {
$chars = str_split($query); $chars = str_split(self::transliterate($query));
$terms = []; $terms = [];
$next_term = ""; $next_term = "";
$toggle_state = false; // true = now inside, false = now outside $toggle_state = false; // true = now inside, false = now outside
@ -931,7 +1005,7 @@ class search
* @param string[] $tokens The array of query tokens to parse. * @param string[] $tokens The array of query tokens to parse.
*/ */
private function stas_parse($tokens) { public function stas_parse($tokens) {
/* Supported Syntax * /* Supported Syntax *
* *
* -term exclude a term * -term exclude a term
@ -939,7 +1013,7 @@ class search
* terms !dest terms redirect entire query (minus the !bang) to interwiki with registered shortcut dest * terms !dest terms redirect entire query (minus the !bang) to interwiki with registered shortcut dest
* prefix:term apply prefix operator to term * prefix:term apply prefix operator to term
*/ */
var_dump($tokens); // var_dump($tokens);
$result = [ $result = [
"terms" => [], "terms" => [],
"exclude" => [], "exclude" => [],
@ -1038,11 +1112,8 @@ class search
{ {
global $settings, $pageindex; global $settings, $pageindex;
/** Normalises input characters for searching & indexing */ $query_stas = self::stas_parse(
static $literator; if($literator == null) $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD); self::stas_split(self::transliterate($query))
$query_stas = $this->stas_parse(
$this->stas_split($literator->transliterate($query))
); );
/* Sub-array format: /* Sub-array format:
@ -1062,23 +1133,23 @@ class search
]; ];
// Query the inverted index // Query the inverted index
foreach($query_stas as $term_def) { foreach($query_stas["terms"] as $term_def) {
if($term_def["weight"] == -1) if($term_def["weight"] == -1)
continue; // Skip stop words continue; // Skip stop words
if(!in_array($term_def["location"], ["all", "inbody"])) if(!in_array($term_def["location"], ["all", "inbody"]))
continue; // Skip terms we shouldn't search the page body for continue; // Skip terms we shouldn't search the page body for
if(!$this->$invindex->has($term_def["term"])) if(!self::$invindex->has($term_def["term"]))
continue; // Skip if it's not in the index continue; // Skip if it's not in the index
// For each page that contains this term..... // For each page that contains this term.....
$term_pageids = json_decode($this->invindex->get($term_def["term"])); $term_pageids = self::$invindex->get($term_def["term"]);
foreach($term_pageids as $pageid) { foreach($term_pageids as $pageid) {
// Check to see if it contains any words we should exclude // Check to see if it contains any words we should exclude
$skip = false; $skip = false;
foreach($query_stas["exclude"] as $exlc_term) { foreach($query_stas["exclude"] as $exlc_term) {
if($this->invindex->has("$excl_term|$pageid")) { if(self::$invindex->has("$excl_term|$pageid")) {
$skip = true; $skip = true;
break; break;
} }
@ -1086,7 +1157,7 @@ class search
if($skip) continue; if($skip) continue;
// Get the list of offsets // Get the list of offsets
$page_offsets = json_decode($this->invindex->get("{$term_def["term"]}|$pageid")); $page_offsets = self::$invindex->get("{$term_def["term"]}|$pageid");
if(!isset($matching_pages[$pageid])) if(!isset($matching_pages[$pageid]))
$matching_pages[$pageid] = $match_template; // Arrays are assigned by copy in php $matching_pages[$pageid] = $match_template; // Arrays are assigned by copy in php
@ -1102,7 +1173,7 @@ class search
} }
// Query page titles & tags // Query page titles & tags
foreach($terms as $term_def) { foreach($query_stas["terms"] as $term_def) {
// No need to skip stop words here, since we're doing a normal // No need to skip stop words here, since we're doing a normal
// sequential search anyway // sequential search anyway
if(!in_array($term_def["location"], ["all", "intitle", "intags"])) if(!in_array($term_def["location"], ["all", "intitle", "intags"]))
@ -1114,8 +1185,8 @@ class search
// Setup a variable to hold the current page's id // Setup a variable to hold the current page's id
$pageid = null; // Cache the page id $pageid = null; // Cache the page id
$lit_title = $literator->transliterate($pagename); $lit_title = self::transliterate($pagename);
$lit_tags = $literator->transliterate(implode(" ", $pagedata->tags)); $lit_tags = isset($pagedata->tags) ? self::transliterate(implode(" ", $pagedata->tags)) : null;
// Make sure that the title & tags don't contain a term we should exclude // Make sure that the title & tags don't contain a term we should exclude
$skip = false; $skip = false;
@ -1145,6 +1216,10 @@ class search
} }
} }
// If this page doesn't have any tags, skip it
if($lit_tags == null)
continue;
if(!in_array($term_def["location"], ["all", "intags"])) if(!in_array($term_def["location"], ["all", "intags"]))
continue; // If we shouldn't search the tags, no point in continuing continue; // If we shouldn't search the tags, no point in continuing
@ -1204,30 +1279,29 @@ class search
/** /**
* Extracts a context string (in HTML) given a search query that could be displayed * Extracts a context string (in HTML) given a search query that could be displayed
* in a list of search results. * in a list of search results.
* @param string $invindex The inverted index to consult.
* @param string $pagename The name of the page that this source belongs to. Used when consulting the inverted index. * @param string $pagename The name of the page that this source belongs to. Used when consulting the inverted index.
* @param string $query The search query to generate the context for. * @param string $query The search query to generate the context for.
* @param string $source The page source to extract the context from. * @param string $source The page source to extract the context from.
* @return string The generated context string. * @return string The generated context string.
*/ */
public static function extract_context($invindex, $pagename, $query, $source) public static function extract_context($pagename, $query, $source)
{ {
global $settings; global $settings;
$pageid = ids::getid($pagename); $pageid = ids::getid($pagename);
$nterms = self::tokenize($query); $nterms = self::stas_parse(self::stas_split($query))["terms"];
// Query the inverted index for offsets
$matches = []; $matches = [];
foreach($nterms as $nterm) { foreach($nterms as $nterm) {
// Skip over words that don't appear in the inverted index (e.g. stop words)
if(!isset($invindex[$nterm]))
continue;
// Skip if the page isn't found in the inverted index for this word // Skip if the page isn't found in the inverted index for this word
if(!isset($invindex[$nterm][$pageid])) if(!self::$invindex->has("{$nterm["term"]}|$pageid"))
continue; continue;
foreach($invindex[$nterm][$pageid]["offsets"] as $next_offset) $nterm_offsets = self::$invindex->get("{$nterm["term"]}|$pageid")->offsets;
$matches[] = [ $nterm, $next_offset ];
foreach($nterm_offsets as $next_offset)
$matches[] = [ $nterm["term"], $next_offset ];
} }
// Sort the matches by offset // Sort the matches by offset
@ -1279,6 +1353,8 @@ class search
$contexts_text[] = substr($source, $context["from"], $context["to"] - $context["from"]); $contexts_text[] = substr($source, $context["from"], $context["to"] - $context["from"]);
} }
// BUG: Make sure that a snippet is centred on the word in question if we have to cut it short
$result = implode("", $contexts_text); $result = implode("", $contexts_text);
end($contexts); // If there's at least one item in the list and we're not at the very end of the page, add an extra ellipsis end($contexts); // If there's at least one item in the list and we're not at the very end of the page, add an extra ellipsis
if(isset($contexts[0]) && $contexts[key($contexts)]["to"] < $sourceLength) $result .= ""; if(isset($contexts[0]) && $contexts[key($contexts)]["to"] < $sourceLength) $result .= "";
@ -1296,15 +1372,15 @@ class search
*/ */
public static function highlight_context($query, $context) public static function highlight_context($query, $context)
{ {
$qterms = self::tokenize($query); $qterms = self::stas_parse(self::stas_split($query))["terms"];
foreach($qterms as $qterm) foreach($qterms as $qterm) {
{ // Stop words are marked by STAS
if(in_array($qterm, static::$stop_words)) if($qterm["weight"] <= 0)
continue; continue;
// From http://stackoverflow.com/a/2483859/1460422 // From http://stackoverflow.com/a/2483859/1460422
$context = preg_replace("/" . preg_replace('/\\//u', "\/", preg_quote($qterm)) . "/iu", "<strong class='search-term-highlight'>$0</strong>", $context); $context = preg_replace("/" . preg_replace('/\\//u', "\/", preg_quote($qterm["term"])) . "/iu", "<strong class='search-term-highlight'>$0</strong>", $context);
} }
return $context; return $context;

View file

@ -1,7 +1,7 @@
<?php <?php
register_module([ register_module([
"name" => "Page deleter", "name" => "Page deleter",
"version" => "0.10.1", "version" => "0.10.2",
"author" => "Starbeamrainbowlabs", "author" => "Starbeamrainbowlabs",
"description" => "Adds an action to allow administrators to delete pages.", "description" => "Adds an action to allow administrators to delete pages.",
"id" => "page-delete", "id" => "page-delete",
@ -86,6 +86,7 @@ register_module([
$pageid = ids::getid($env->page); $pageid = ids::getid($env->page);
search::invindex_load($paths->searchindex); search::invindex_load($paths->searchindex);
search::invindex_delete($pageid); search::invindex_delete($pageid);
search::invindex_close();
} }
// Remove the page's name from the id index // Remove the page's name from the id index

View file

@ -1,7 +1,7 @@
<?php <?php
register_module([ register_module([
"name" => "Page editor", "name" => "Page editor",
"version" => "0.17.4", "version" => "0.17.5",
"author" => "Starbeamrainbowlabs", "author" => "Starbeamrainbowlabs",
"description" => "Allows you to edit pages by adding the edit and save actions. You should probably include this one.", "description" => "Allows you to edit pages by adding the edit and save actions. You should probably include this one.",
"id" => "page-edit", "id" => "page-edit",
@ -448,11 +448,11 @@ DIFFSCRIPT;
// Update the inverted search index // Update the inverted search index
if(module_exists("feature-search")) {
// Construct an index for the old and new page content // Construct an index for the old and new page content
$oldindex = []; $oldindex = [];
$oldpagedata = ""; // We need the old page data in order to pass it to the preprocessor $oldpagedata = ""; // We need the old page data in order to pass it to the preprocessor
if(file_exists("$env->storage_prefix$env->page.md")) if(file_exists("$env->storage_prefix$env->page.md")) {
{
$oldpagedata = file_get_contents("$env->storage_prefix$env->page.md"); $oldpagedata = file_get_contents("$env->storage_prefix$env->page.md");
$oldindex = search::index_generate($oldpagedata); $oldindex = search::index_generate($oldpagedata);
} }
@ -467,7 +467,8 @@ DIFFSCRIPT;
// Merge the changes into the inverted index // Merge the changes into the inverted index
search::invindex_merge(ids::getid($env->page), $additions, $removals); search::invindex_merge(ids::getid($env->page), $additions, $removals);
// Save the inverted index back to disk // Save the inverted index back to disk
search::invindex_close();
}
// -----~~~==~~~----- // -----~~~==~~~-----
if(file_put_contents("$env->storage_prefix$env->page.md", $pagedata) !== false) if(file_put_contents("$env->storage_prefix$env->page.md", $pagedata) !== false)

View file

@ -1,7 +1,7 @@
<?php <?php
register_module([ register_module([
"name" => "Help page", "name" => "Help page",
"version" => "0.9.3", "version" => "0.9.4",
"author" => "Starbeamrainbowlabs", "author" => "Starbeamrainbowlabs",
"description" => "Adds a rather useful help page. Access through the 'help' action. This module also exposes help content added to Pepperminty Wiki's inbuilt invisible help section system.", "description" => "Adds a rather useful help page. Access through the 'help' action. This module also exposes help content added to Pepperminty Wiki's inbuilt invisible help section system.",
"id" => "page-help", "id" => "page-help",
@ -58,10 +58,8 @@ register_module([
$content .= "<ul>\n"; $content .= "<ul>\n";
$content .= "<li>$settings->sitename's root directory is " . (!is_writeable(__DIR__) ? "not " : "") . "writeable.</li>\n"; $content .= "<li>$settings->sitename's root directory is " . (!is_writeable(__DIR__) ? "not " : "") . "writeable.</li>\n";
$content .= "<li>The page index is currently " . human_filesize(filesize($paths->pageindex)) . " in size, and took " . $env->perfdata->pageindex_decode_time . "ms to decode.</li>"; $content .= "<li>The page index is currently " . human_filesize(filesize($paths->pageindex)) . " in size, and took " . $env->perfdata->pageindex_decode_time . "ms to decode.</li>";
if(module_exists("feature-search")) if(module_exists("feature-search")) {
{ $content .= "<li>The search index is currently " . human_filesize(filesize($paths->searchindex)) . " in size.</li>";
search::measure_invindex_load_time($paths->searchindex);
$content .= "<li>The search index is currently " . human_filesize(filesize($paths->searchindex)) . " in size, and took " . $env->perfdata->searchindex_decode_time . "ms to decode.</li>";
} }
$content .= "<li>The id index is currently " . human_filesize(filesize($paths->idindex)) . " in size, and took " . $env->perfdata->idindex_decode_time . "ms to decode.</li>"; $content .= "<li>The id index is currently " . human_filesize(filesize($paths->idindex)) . " in size, and took " . $env->perfdata->idindex_decode_time . "ms to decode.</li>";