mirror of
https://github.com/sbrl/Pepperminty-Wiki.git
synced 2024-12-22 13:45:02 +00:00
Starbeamrainbowlabs
a0f6e89643
We have the backend suggestion system done, but not the UI. I can tell that this is going to require lots of tweaking to get just right. I suspect it might be a good idea to explore some possible tweakable settings we can add to allow people to tweak the engine to better suit their individual setups.
137 lines
4.1 KiB
PHP
137 lines
4.1 KiB
PHP
<?php
|
|
register_module([
	"name" => "Similar Pages",
	"version" => "0.1",
	"author" => "Starbeamrainbowlabs",
	"description" => "Adds a few suggestions of similar pages below the main content and above the comments of a page. Requires the search engine.",
	"id" => "feature-similarpages",
	"depends" => [ "lib-search-engine", "feature-search" ],
	"code" => function() {
		global $settings;
		
		/**
		 * @api {get} ?action=suggest-similar&page={pageName}&format={format} Get a list of pages similar to a given page
		 * @apiName SuggestSimilar
		 * @apiGroup Page
		 *
		 * @apiParam {string} page The page to return suggestions for.
		 * @apiParam {string} format Optional. The format to return the suggestions in. Available formats: text (the default), csv, json.
		 *
		 * @apiError UnknownFormat The requested format wasn't recognised. Returned with HTTP status 400.
		 */
		// NOTE(review): this handler performs no permission check of its own -
		// confirm whether access control is applied elsewhere before documenting
		// an @apiPermission level for it.
		add_action("suggest-similar", function() {
			global $env;
			
			// Output format requested by the client; plain text if unspecified
			$format = $_GET["format"] ?? "text";
			
			// TODO: Support history revisions here? $env->page_filename might do this for us - we should check into the behaviour here
			$similarpages = similar_suggest(
				$env->page,
				file_get_contents($env->page_filename)
			);
			
			switch ($format) {
				case "text":
					header("content-type: text/plain");
					foreach($similarpages as $pagename => $rank) {
						echo("$pagename | $rank\n");
					}
					break;
				
				case "csv":
					header("content-type: text/csv");
					// Quote a field only when it has to be (RFC 4180 style), so
					// that page names without special characters are emitted
					// exactly as before.
					$csv_escape = function($field) : string {
						$field = strval($field);
						if(strpbrk($field, ",\"\r\n") === false)
							return $field;
						return '"' . str_replace('"', '""', $field) . '"';
					};
					echo("pagename,rank\n");
					foreach($similarpages as $pagename => $rank) {
						echo($csv_escape($pagename) . ",$rank\n");
					}
					break;
				
				case "json":
					header("content-type: application/json");
					echo(json_encode($similarpages));
					// Without this break, execution fell through into the
					// default case and appended a 400 error to the JSON output.
					break;
				
				default:
					http_response_code(400);
					header("content-type: text/plain");
					exit("Error: The format $format wasn't recognised.\nAvailable formats for this action: text, json, csv");
			}
		});
	}
]);
|
|
|
|
/**
 * Given a page name and its content, returns a list of similar pages.
 * 
 * The content is indexed with the search engine, the tokens of the page name
 * are added as heavily-weighted terms, and every other page that contains one
 * of the most frequent terms is scored by how often it contains that term.
 * Matches on terms that came from the page name count 5 times as much.
 * 
 * @param	string	$pagename		The name of the page to return suggestions for.
 * @param	string	$content		The content of the given page.
 * @param	bool	$limit_output	Whether to cap the number of suggestions returned at $settings->similarpages_count. Defaults to true.
 * @return	array	A list of suggested page names in the format pagename => rank, ordered from highest rank to lowest.
 */
function similar_suggest(string $pagename, string $content, bool $limit_output = true) : array {
	global $settings;
	
	// Build a term index of the page content
	$content_search = search::$literator->transliterate($content);
	$index = search::index_generate($content_search);
	
	// Inject the (non-stop-word) title tokens with an artificially high
	// frequency. This replaces any entry the content produced for that token.
	$title_tokens = search::tokenize($pagename);
	foreach($title_tokens as $token) {
		if(in_array($token, search::$stop_words, true)) continue;
		$index[$token] = [ "freq" => 10000, "fromtitle" => true ];
	}
	// NOTE(review): presumably this orders the index by descending frequency -
	// the early break in the loop below relies on that. Confirm against
	// search::index_sort_freq().
	search::index_sort_freq($index, true);
	search::invindex_load();
	
	$our_pageid = ids::getid($pagename);
	$pages = []; // page id => accumulated score
	$max_count = -1; // Highest frequency seen so far among terms from the content
	foreach($index as $term => $data) {
		// Skip words shorter than 3 bytes. The cast is needed because purely
		// numeric terms become integer array keys.
		if(strlen((string)$term) < 3) continue;
		
		$from_title = isset($data["fromtitle"]);
		
		// Title terms carry an artificial frequency, so they mustn't be
		// allowed to skew the cutoff calculated from the content terms
		if(!$from_title)
			$max_count = max($max_count, $data["freq"]);
		// If this one is less than 0.2x the max frequency count (or it occurs
		// only once), then break out
		if($data["freq"] < $max_count * 0.2 || $data["freq"] <= 1) break;
		
		// Check it's present just in case (todo figure out if it's necessary)
		if(!search::invindex_term_exists($term)) continue;
		
		foreach(search::invindex_term_getpageids($term) as $pageid) {
			// Don't suggest the page we're generating suggestions for.
			// Loose comparison is kept deliberately: the types of the 2 ids
			// aren't visible from here.
			if($pageid == $our_pageid) continue;
			
			$amount = search::invindex_term_getoffsets($term, $pageid)->freq;
			if($from_title)
				$amount *= 5;
			$pages[$pageid] = ($pages[$pageid] ?? 0) + $amount;
		}
	}
	
	// Highest-scoring pages first
	arsort($pages, SORT_NUMERIC);
	
	$result = []; $i = 0;
	foreach($pages as $pageid => $count) {
		// >= rather than >, as otherwise 1 suggestion too many is returned
		if($limit_output && $i >= $settings->similarpages_count) break;
		$result[ids::getpagename($pageid)] = $count;
		$i++;
	}
	return $result;
}
|
|
|
|
?>
|