1
0
Fork 0
mirror of https://github.com/sbrl/Pepperminty-Wiki.git synced 2024-06-09 11:54:56 +00:00

Start implementing similar pages system, but it's not finished yet

We have the backend suggestion system done, but not the UI.
I can tell that this is going to require lots of tweaking to get just 
right. I suspect it might be a good idea to explore some possible 
tweakable settings we can add to allow people to tweak the engine to 
better suit their individual setups.
This commit is contained in:
Starbeamrainbowlabs 2020-05-22 21:22:07 +01:00
parent 103c24dac3
commit a0f6e89643
Signed by: sbrl
GPG key ID: 1BE5172E637709C2
5 changed files with 180 additions and 3 deletions

View file

@ -2,6 +2,12 @@
This file holds the changelog for Pepperminty Wiki. This is the master list of things that have changed (second only to the commit history!) - though the information for any particular release can also be found in the description of its page for every release made on GitHub too. This file holds the changelog for Pepperminty Wiki. This is the master list of things that have changed (second only to the commit history!) - though the information for any particular release can also be found in the description of its page for every release made on GitHub too.
## v0.22-dev
### Added
- [Module Api] Add new `search::invindex_term_getpageids`, `search::invindex_term_getoffsets`, and `search::index_sort_freq` methods
## v0.21 ## v0.21
### Fixed ### Fixed

View file

@ -0,0 +1,137 @@
<?php
register_module([
	"name" => "Similar Pages",
	"version" => "0.1",
	"author" => "Starbeamrainbowlabs",
	"description" => "Adds a few suggestions of similar pages below the main content and above the comments of a page. Requires the search engine.",
	"id" => "feature-similarpages",
	"depends" => [ "lib-search-engine", "feature-search" ],
	"code" => function() {
		/**
		 * @api {get} ?action=suggest-similar&page={pageName}&format={format} Get a list of pages similar to a given page
		 * @apiName SuggestSimilar
		 * @apiGroup Page
		 * @apiPermission Anonymous
		 *
		 * @apiParam {string} page The page to return similar page suggestions for.
		 * @apiParam {string} format Optional. The output format - one of text (the default), json, or csv.
		 */
		
		/*
		 * Action: suggest-similar
		 * Outputs the pages that similar_suggest() considers similar to the
		 * current page, along with their rank, in the requested format.
		 */
		add_action("suggest-similar", function() {
			global $env;
			
			$format = $_GET["format"] ?? "text";
			
			// TODO: Support history revisions here? $env->page_filename might do this for us - we should check into the behaviour here
			$source = file_get_contents($env->page_filename);
			if($source === false) {
				// Without the page's source there is nothing to base suggestions on
				http_response_code(404);
				header("content-type: text/plain");
				exit("Error: The source of the page {$env->page} couldn't be read.");
			}
			
			$similarpages = similar_suggest($env->page, $source);
			
			switch ($format) {
				case "text":
					header("content-type: text/plain");
					foreach($similarpages as $pagename => $rank)
						echo("$pagename | $rank\n");
					break;
				
				case "csv":
					// Quote fields RFC 4180-style so that page names containing
					// commas, quotes, or newlines don't corrupt the columns.
					// Fields that don't need quoting are emitted unchanged.
					$csv_escape = function(string $field) : string {
						if(strpbrk($field, ",\"\r\n") === false)
							return $field;
						return '"' . str_replace('"', '""', $field) . '"';
					};
					
					header("content-type: text/csv");
					echo("pagename,rank\n");
					foreach($similarpages as $pagename => $rank) {
						// Numeric page names become integer array keys, hence the cast
						echo($csv_escape((string)$pagename) . ",$rank\n");
					}
					break;
				
				case "json":
					header("content-type: application/json");
					echo(json_encode($similarpages));
					// Without this break, execution falls through into the
					// default case and appends a 400 error to the JSON output
					break;
				
				default:
					http_response_code(400);
					header("content-type: text/plain");
					exit("Error: The format $format wasn't recognised.\nAvailable formats for this action: text, json, csv");
			}
		});
	}
]);
/**
 * Given a page name, returns a list of similar pages.
 * 
 * Works by building a frequency index of the page's content, adding the
 * words from the page's title with an artificially high frequency, and then
 * looking up the most frequent terms in the inverted search index. Every
 * other page containing one of those terms is scored by how often the term
 * occurs on it (terms from the title count 5 times as much).
 * 
 * @param	string	$pagename		The name of the page to return suggestions for.
 * @param	string	$content		The content of the given page.
 * @param	bool	$limit_output	Whether to return at most $settings->similarpages_count suggestions (default: true). If false, all scored pages are returned.
 * @return	array	A list of suggested page names in the format pagename => rank, highest rank first.
 */
function similar_suggest(string $pagename, string $content, bool $limit_output = true) : array {
	global $settings;
	
	$content_search = search::$literator->transliterate($content);
	$index = search::index_generate($content_search);
	
	// Inject the words from the title with a very high frequency so that
	// they sort to the front of the index and are always considered
	foreach(search::tokenize($pagename) as $token) {
		if(in_array($token, search::$stop_words, true)) continue;
		$index[$token] = [ "freq" => 10000, "fromtitle" => true ];
	}
	
	search::index_sort_freq($index);
	search::invindex_load();
	
	$our_pageid = ids::getid($pagename);
	$pages = [];
	$max_count = -1;
	
	foreach($index as $term => $data) {
		// Purely numeric terms become integer array keys in PHP
		$term = (string)$term;
		
		// Skip words shorter than 3 characters
		if(strlen($term) < 3) continue;
		
		// Track the highest frequency seen amongst terms from the content.
		// Title terms are excluded, as their frequency is artificial.
		if(!isset($data["fromtitle"]))
			$max_count = max($max_count, $data["freq"]);
		
		// The index is sorted by descending frequency, so once a term is
		// less than 0.2x the max frequency (or occurs only once), all the
		// remaining terms will be too
		if($data["freq"] < $max_count * 0.2 || $data["freq"] <= 1) break;
		
		// Check it's present just in case (todo figure out if it's necessary)
		if(!search::invindex_term_exists($term)) continue;
		
		foreach(search::invindex_term_getpageids($term) as $pageid) {
			// Loose comparison on purpose: the types of the 2 ids aren't guaranteed to match
			if($pageid == $our_pageid) continue;
			
			$amount = search::invindex_term_getoffsets($term, $pageid)->freq;
			if(isset($data["fromtitle"]))
				$amount *= 5;
			
			$pages[$pageid] = ($pages[$pageid] ?? 0) + $amount;
		}
	}
	
	arsort($pages, SORT_NUMERIC);
	
	$result = [];
	foreach($pages as $pageid => $count) {
		// >= rather than >, otherwise 1 suggestion too many is returned
		if($limit_output && count($result) >= $settings->similarpages_count) break;
		$result[ids::getpagename($pageid)] = $count;
	}
	return $result;
}
?>

View file

@ -371,11 +371,20 @@ class search
* @param array $index The index to sort. * @param array $index The index to sort.
*/ */
public static function index_sort(&$index) { public static function index_sort(&$index) {
$sorter = new Collator(""); $sorter = self::$sorter;
uksort($index, function($a, $b) use($sorter) : int { uksort($index, function($a, $b) use($sorter) : int {
return $sorter->compare($a, $b); return $sorter->compare($a, $b);
}); });
} }
/**
 * Sorts an index by frequency in descending order (most frequent first).
 * Keys are preserved, and the index is sorted in-place.
 * @param	array	$index	The index to sort. Every value must be an array with a "freq" key.
 */
public static function index_sort_freq(&$index) {
	uasort($index, function($a, $b) : int {
		// The spaceship operator yields -1, 0, or 1 as uasort() requires.
		// Returning the boolean result of > can never yield -1, which is
		// deprecated as of PHP 8.0 and gives an inconsistent ordering.
		return $b["freq"] <=> $a["freq"];
	});
}
/** /**
* Compares two *regular* indexes to find the differences between them. * Compares two *regular* indexes to find the differences between them.
@ -401,6 +410,8 @@ class search
*/ */
public static function invindex_load() { public static function invindex_load() {
global $env, $paths; global $env, $paths;
// If the inverted index is alreayd loaded, it doesn't need loading again
if(self::$invindex !== null) return;
$start_time = microtime(true); $start_time = microtime(true);
self::$invindex = new StorageBox($paths->searchindex); self::$invindex = new StorageBox($paths->searchindex);
$env->perfdata->searchindex_load_time = round((microtime(true) - $start_time)*1000, 3); $env->perfdata->searchindex_load_time = round((microtime(true) - $start_time)*1000, 3);
@ -414,6 +425,7 @@ class search
$start_time = microtime(true); $start_time = microtime(true);
self::$invindex->close(); self::$invindex->close();
self::$invindex = null;
$env->perfdata->searchindex_close_time = round((microtime(true) - $start_time)*1000, 3); $env->perfdata->searchindex_close_time = round((microtime(true) - $start_time)*1000, 3);
} }
@ -714,10 +726,30 @@ class search
return self::$invindex->has($term); return self::$invindex->has($term);
} }
/**
 * Looks up the ids of the pages that contain a given term.
 * The term is looked up in the inverted index as-is, so it should be
 * transliterated beforehand.
 * @param	string		$term	The (transliterated) search term to look up.
 * @return	string[]	The ids of the pages that contain the given term.
 */
public static function invindex_term_getpageids(string $term) {
	$pageids = self::$invindex->get_arr_simple($term);
	return $pageids;
}
/**
 * Fetches the offsets object for a term on a specific page.
 * The object is stored in the inverted index under the key "term|pageid",
 * and takes the form { freq: 4, offsets: [2,3,4] }.
 * @param	string	$term	The term to look up.
 * @param	int		$pageid	The id of the page to fetch the offsets object for.
 * @return	object	The offsets object, in the form described above.
 */
public static function invindex_term_getoffsets(string $term, int $pageid) {
	$key = "$term|$pageid";
	return self::$invindex->get($key);
}
/** /**
* Searches the given inverted index for the specified search terms. * Searches the given inverted index for the specified search terms.
* Note that this automatically pushes the query string through STAS which * Note that this automatically pushes the query string through STAS which
* can be a fairly expensive operation, so use 2nd argument if you need * can be a fairly expensive operation, so use 2nd argument if you need
* to debug the STAS parsing result if possible. * to debug the STAS parsing result if possible.
* @param string $query The search query. If an array is passed, it is assumed it has already been pre-parsed with search::stas_parse(). * @param string $query The search query. If an array is passed, it is assumed it has already been pre-parsed with search::stas_parse().
* @param &stdClass $query_stas An object to fill with the result of the STAS parsing. * @param &stdClass $query_stas An object to fill with the result of the STAS parsing.

View file

@ -235,6 +235,8 @@
"search_didyoumean_cost_replace": { "type": "number", "description": "The replace cost to use when calculating levenshtein distances. If this value is changed then the did you mean index must be rebuilt.", "default": 1 }, "search_didyoumean_cost_replace": { "type": "number", "description": "The replace cost to use when calculating levenshtein distances. If this value is changed then the did you mean index must be rebuilt.", "default": 1 },
"search_didyoumean_seed_word": { "type": "text", "description": "The seed word for the didyoumean index tree. Has a number of special properties:<ul><li>Can't be added to the index</li><li>Can't be removed from the index</li><li>Is never suggested</li></ul>Since words are transliterated to lowercase ascii before indexing, it's recommended to set this to a word that contains characters that will never be present after transliteration.", "default": ":peppermint:" }, "search_didyoumean_seed_word": { "type": "text", "description": "The seed word for the didyoumean index tree. Has a number of special properties:<ul><li>Can't be added to the index</li><li>Can't be removed from the index</li><li>Is never suggested</li></ul>Since words are transliterated to lowercase ascii before indexing, it's recommended to set this to a word that contains characters that will never be present after transliteration.", "default": ":peppermint:" },
"dynamic_page_suggestion_count": { "type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 }, "dynamic_page_suggestion_count": { "type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 },
"similarpages_enabled": { "type": "checkbox", "description": "Whether similar pages are displayed beneath the content and above the comments on a page", "default": true },
"similarpages_count": { "type": "number", "description": "The number of similar page suggestions to make.", "default": 3 },
"defaultaction": { "type": "text", "description": "The default action. This action will be performed if no other action is specified. It is recommended you set this to \"view\" - that way the user automatically views the default page (see above).", "default": "view" }, "defaultaction": { "type": "text", "description": "The default action. This action will be performed if no other action is specified. It is recommended you set this to \"view\" - that way the user automatically views the default page (see above).", "default": "view" },
"email_debug_dontsend": { "type": "checkbox", "description": "If set to true, emails are logged to the standard error instead of being actually sent.", "default": false }, "email_debug_dontsend": { "type": "checkbox", "description": "If set to true, emails are logged to the standard error instead of being actually sent.", "default": false },
"email_subject_utf8": { "type": "checkbox", "description": "Whether to encode the subject of emails sent to allow them to contain unicode characters. Without this, email subjects will be transliterated to ASCII. If utf-8 email subjects are disabled, page names may not be represented properly.", "default": true }, "email_subject_utf8": { "type": "checkbox", "description": "Whether to encode the subject of emails sent to allow them to contain unicode characters. Without this, email subjects will be transliterated to ASCII. If utf-8 email subjects are disabled, page names may not be represented properly.", "default": true },

View file

@ -1 +1 @@
v0.21 v0.22-dev