mirror of
https://github.com/sbrl/Pepperminty-Wiki.git
synced 2024-11-22 04:23:01 +00:00
Start implementing similar pages system, but it's not finished yet
We have the backend suggestion system done, but not the UI. I can tell that this is going to require lots of tweaking to get just right. I suspect it might be a good idea to explore some possible tweakable settings we can add to allow people to tweak the engine to better suit their individual setups.
This commit is contained in:
parent
103c24dac3
commit
a0f6e89643
5 changed files with 180 additions and 3 deletions
|
@ -2,6 +2,12 @@
|
||||||
This file holds the changelog for Pepperminty Wiki. This is the master list of things that have changed (second only to the commit history!) - though the information for any particular release can also be found in the description of its page for every release made on GitHub too.
|
This file holds the changelog for Pepperminty Wiki. This is the master list of things that have changed (second only to the commit history!) - though the information for any particular release can also be found in the description of its page for every release made on GitHub too.
|
||||||
|
|
||||||
|
|
||||||
|
## v0.22-dev
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- [Module Api] Add new `search::invindex_term_getpageids`, `search::invindex_term_getoffsets`, and `search::index_sort_freq` methods
|
||||||
|
|
||||||
|
|
||||||
## v0.21
|
## v0.21
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|
137
modules/feature-similarpages.php
Normal file
137
modules/feature-similarpages.php
Normal file
|
@ -0,0 +1,137 @@
|
||||||
|
<?php
|
||||||
|
register_module([
	"name" => "Similar Pages",
	"version" => "0.1",
	"author" => "Starbeamrainbowlabs",
	"description" => "Adds a few suggestions of similar pages below the main content and above the comments of a page. Requires the search engine.",
	"id" => "feature-similarpages",
	"depends" => [ "lib-search-engine", "feature-search" ],
	"code" => function() {
		global $settings;
		
		/**
		 * @api {get} ?action=suggest-similar&page={pageName}&format={format} Get a list of pages similar to a given page
		 * @apiName SuggestSimilar
		 * @apiGroup Page
		 * @apiPermission Anonymous
		 *
		 * @apiParam {string} page   The page to return similar page suggestions for.
		 * @apiParam {string} format Optional. The output format: "text" (default), "csv", or "json". Any other value results in a HTTP 400 response.
		 */
		
		/*
		 * Action: suggest-similar
		 * Outputs the suggestions computed by similar_suggest() for the
		 * current page ($env->page) in the requested format.
		 */
		add_action("suggest-similar", function() {
			global $env;
			
			// Default to plain text when no format is specified
			$format = $_GET["format"] ?? "text";
			
			// TODO: Support history revisions here? $env->page_filename might do this for us - we should check into the behaviour here
			$similarpages = similar_suggest(
				$env->page,
				file_get_contents($env->page_filename)
			);
			
			switch ($format) {
				case "text":
					header("content-type: text/plain");
					foreach($similarpages as $pagename => $rank) {
						echo("$pagename | $rank\n");
					}
					break;
				
				case "csv":
					header("content-type: text/csv");
					echo("pagename,rank\n");
					foreach($similarpages as $pagename => $rank)
						echo("$pagename,$rank\n");
					break;
				
				case "json":
					header("content-type: application/json");
					echo(json_encode($similarpages));
					// Without this break, execution would fall through into
					// the default case and turn a valid response into a 400.
					break;
				
				default:
					http_response_code(400);
					header("content-type: text/plain");
					// exit() terminates the request, so no break is needed here
					exit("Error: The format $format wasn't recognised.\nAvailable formats for this action: text, json, csv");
			}
		});
	}
]);
|
||||||
|
|
||||||
|
/**
 * Given a page name and its content, returns a ranked list of similar pages.
 *
 * Works by indexing the page's content, adding the tokens from the page
 * title with an artificially high frequency, and then looking up the most
 * frequent terms in the inverted search index. Every other page that contains
 * one of those terms is scored by how often the term appears in it, with
 * terms that came from the title weighted 5x.
 *
 * @param	string	$pagename		The name of the page to return suggestions for.
 * @param	string	$content		The content of the given page.
 * @param	bool	$limit_output	Whether to limit the number of suggestions returned to $settings->similarpages_count (default: true).
 * @return	array	A list of suggested page names in the format pagename => rank, sorted with the highest rank first.
 */
function similar_suggest(string $pagename, string $content, bool $limit_output = true) : array {
	global $settings;
	
	// Build a term => data index from the page's content
	$content_search = search::$literator->transliterate($content);
	$index = search::index_generate($content_search);
	
	// Add the title tokens with a very high frequency so that they sort to
	// the front of the index. The "fromtitle" flag marks them for the
	// special handling further down.
	$title_tokens = search::tokenize($pagename);
	foreach($title_tokens as $token) {
		if(in_array($token, search::$stop_words, true)) continue;
		$index[$token] = [ "freq" => 10000, "fromtitle" => true ];
	}
	search::index_sort_freq($index);
	search::invindex_load();
	
	$our_pageid = ids::getid($pagename);
	$pages = [];
	$max_count = -1;
	foreach($index as $term => $data) {
		// Skip words shorter than 3 characters
		if(strlen($term) < 3) continue;
		
		// Track the highest frequency seen amongst the content terms only -
		// the title tokens have an artificial frequency that would skew this
		if(!isset($data["fromtitle"]))
			$max_count = max($max_count, $data["freq"]);
		// If this one is less than 0.2x the max frequency count (or it
		// occurs only once), break out. This relies on the index being
		// sorted by frequency in descending order.
		if($data["freq"] < $max_count * 0.2 || $data["freq"] <= 1) break;
		
		// Check if it's present just in case (todo figure out if it's necessary)
		if(!search::invindex_term_exists($term)) continue;
		
		$otherpages = search::invindex_term_getpageids($term);
		foreach($otherpages as $pageid) {
			// Loose comparison is deliberate here: the ids may be numeric
			// strings on one side and integers on the other
			if($pageid == $our_pageid) continue;
			if(!isset($pages[$pageid]))
				$pages[$pageid] = 0;
			
			// A missing offsets entry counts as zero occurrences rather
			// than causing an error
			$amount = search::invindex_term_getoffsets($term, $pageid)->freq ?? 0;
			if(isset($data["fromtitle"]))
				$amount *= 5;
			$pages[$pageid] += $amount;
		}
	}
	
	// Highest-scoring pages first
	arsort($pages, SORT_NUMERIC);
	
	$result = []; $i = 0;
	foreach($pages as $pageid => $count) {
		// >= (not >) so that we return at most similarpages_count items
		if($limit_output && $i >= $settings->similarpages_count) break;
		$result[ids::getpagename($pageid)] = $count;
		$i++;
	}
	return $result;
}
|
||||||
|
|
||||||
|
?>
|
|
@ -371,11 +371,20 @@ class search
|
||||||
* @param array $index The index to sort.
|
* @param array $index The index to sort.
|
||||||
*/
|
*/
|
||||||
public static function index_sort(&$index) {
|
public static function index_sort(&$index) {
|
||||||
$sorter = new Collator("");
|
$sorter = self::$sorter;
|
||||||
uksort($index, function($a, $b) use($sorter) : int {
|
uksort($index, function($a, $b) use($sorter) : int {
|
||||||
return $sorter->compare($a, $b);
|
return $sorter->compare($a, $b);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
/**
 * Sorts an index by frequency, in descending order (most frequent first).
 * The index is sorted in place, and its keys are preserved.
 * @param	array	$index	The index to sort. Every value must be an array with a numeric "freq" key.
 */
public static function index_sort_freq(&$index) {
	uasort($index, function($a, $b) : int {
		// A comparator must return <0, 0, or >0. Returning the boolean
		// result of a > comparison only ever yields 0 or 1, which doesn't
		// give a consistent ordering (and is deprecated in PHP 8).
		return $b["freq"] <=> $a["freq"];
	});
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compares two *regular* indexes to find the differences between them.
|
* Compares two *regular* indexes to find the differences between them.
|
||||||
|
@ -401,6 +410,8 @@ class search
|
||||||
*/
|
*/
|
||||||
public static function invindex_load() {
|
public static function invindex_load() {
|
||||||
global $env, $paths;
|
global $env, $paths;
|
||||||
|
// If the inverted index is already loaded, it doesn't need loading again
|
||||||
|
if(self::$invindex !== null) return;
|
||||||
$start_time = microtime(true);
|
$start_time = microtime(true);
|
||||||
self::$invindex = new StorageBox($paths->searchindex);
|
self::$invindex = new StorageBox($paths->searchindex);
|
||||||
$env->perfdata->searchindex_load_time = round((microtime(true) - $start_time)*1000, 3);
|
$env->perfdata->searchindex_load_time = round((microtime(true) - $start_time)*1000, 3);
|
||||||
|
@ -414,6 +425,7 @@ class search
|
||||||
|
|
||||||
$start_time = microtime(true);
|
$start_time = microtime(true);
|
||||||
self::$invindex->close();
|
self::$invindex->close();
|
||||||
|
self::$invindex = null;
|
||||||
$env->perfdata->searchindex_close_time = round((microtime(true) - $start_time)*1000, 3);
|
$env->perfdata->searchindex_close_time = round((microtime(true) - $start_time)*1000, 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -714,10 +726,30 @@ class search
|
||||||
return self::$invindex->has($term);
|
return self::$invindex->has($term);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Looks up the ids of the pages that contain a given search term.
 * The term is passed to the inverted index as-is, so it should already have
 * been transliterated. The inverted index must be loaded before calling this
 * (see search::invindex_load()).
 * @param	string		$term	The (transliterated) search term to look up.
 * @return	string[]	The ids of the pages that contain the given term.
 */
public static function invindex_term_getpageids(string $term) {
	$page_ids = self::$invindex->get_arr_simple($term);
	return $page_ids;
}
|
||||||
|
|
||||||
|
/**
 * Fetches the offsets object for a term on a specific page.
 * The entry is stored in the inverted index under the key "term|pageid".
 * The returned object takes the form { freq: 4, offsets: [2,3,4] }.
 * The inverted index must be loaded before calling this
 * (see search::invindex_load()).
 * @param	string	$term	The term to look up.
 * @param	int		$pageid	The id of the page to fetch the offsets for.
 * @return	object	The offsets object, in the form described above.
 */
public static function invindex_term_getoffsets(string $term, int $pageid) {
	$entry_key = "$term|$pageid";
	return self::$invindex->get($entry_key);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Searches the given inverted index for the specified search terms.
|
* Searches the given inverted index for the specified search terms.
|
||||||
* Note that this automatically pushes the query string through STAS which
|
* Note that this automatically pushes the query string through STAS which
|
||||||
* can be a fairly expensive operation, so use 2nd argument if you need
|
* can be a fairly expensive operation, so use 2nd argument if you need
|
||||||
* to debug the STAS parsing result if possible.
|
* to debug the STAS parsing result if possible.
|
||||||
* @param string $query The search query. If an array is passed, it is assumed it has already been pre-parsed with search::stas_parse().
|
* @param string $query The search query. If an array is passed, it is assumed it has already been pre-parsed with search::stas_parse().
|
||||||
* @param &stdClass $query_stas An object to fill with the result of the STAS parsing.
|
* @param &stdClass $query_stas An object to fill with the result of the STAS parsing.
|
||||||
|
|
|
@ -235,6 +235,8 @@
|
||||||
"search_didyoumean_cost_replace": { "type": "number", "description": "The replace cost to use when calculating levenshtein distances. If this value is changed then the did you mean index must be rebuilt.", "default": 1 },
|
"search_didyoumean_cost_replace": { "type": "number", "description": "The replace cost to use when calculating levenshtein distances. If this value is changed then the did you mean index must be rebuilt.", "default": 1 },
|
||||||
"search_didyoumean_seed_word": { "type": "text", "description": "The seed word for the didyoumean index tree. Has a number of special properties:<ul><li>Can't be added to the index</li><li>Can't be removed from the index</li><li>Is never suggested</li></ul>Since words are transliterated to lowercase ascii before indexing, it's recommended to set this to a word that contains characters that will never be present after transliteration.", "default": ":peppermint:" },
|
"search_didyoumean_seed_word": { "type": "text", "description": "The seed word for the didyoumean index tree. Has a number of special properties:<ul><li>Can't be added to the index</li><li>Can't be removed from the index</li><li>Is never suggested</li></ul>Since words are transliterated to lowercase ascii before indexing, it's recommended to set this to a word that contains characters that will never be present after transliteration.", "default": ":peppermint:" },
|
||||||
"dynamic_page_suggestion_count": { "type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 },
|
"dynamic_page_suggestion_count": { "type": "number", "description": "The number of dynamic page name suggestions to fetch from the server when typing in the page search box. Note that lowering this number doesn't <em>really</em> improve performance. Set to 0 to disable.", "default": 7 },
|
||||||
|
"similarpages_enabled": { "type": "checkbox", "description": "Whether similar pages are displayed beneath the content and above the comments on a page", "default": true },
|
||||||
|
"similarpages_count": { "type": "number", "description": "The number of similar page suggestions to make.", "default": 3 },
|
||||||
"defaultaction": { "type": "text", "description": "The default action. This action will be performed if no other action is specified. It is recommended you set this to \"view\" - that way the user automatically views the default page (see above).", "default": "view" },
|
"defaultaction": { "type": "text", "description": "The default action. This action will be performed if no other action is specified. It is recommended you set this to \"view\" - that way the user automatically views the default page (see above).", "default": "view" },
|
||||||
"email_debug_dontsend": { "type": "checkbox", "description": "If set to true, emails are logged to the standard error instead of being actually sent.", "default": false },
|
"email_debug_dontsend": { "type": "checkbox", "description": "If set to true, emails are logged to the standard error instead of being actually sent.", "default": false },
|
||||||
"email_subject_utf8": { "type": "checkbox", "description": "Whether to encode the subject of emails sent to allow them to contain unicode characters. Without this, email subjects will be transliterated to ASCII. If utf-8 email subjects are disabled, page names may not be represented properly.", "default": true },
|
"email_subject_utf8": { "type": "checkbox", "description": "Whether to encode the subject of emails sent to allow them to contain unicode characters. Without this, email subjects will be transliterated to ASCII. If utf-8 email subjects are disabled, page names may not be represented properly.", "default": true },
|
||||||
|
|
2
version
2
version
|
@ -1 +1 @@
|
||||||
v0.21
|
v0.22-dev
|
||||||
|
|
Loading…
Reference in a new issue