1
0
Fork 0
mirror of https://github.com/sbrl/Pepperminty-Wiki.git synced 2024-06-08 11:44:00 +00:00
Pepperminty-Wiki/modules/feature-similarpages.php

138 lines
4.1 KiB
PHP
Raw Normal View History

<?php
register_module([
	"name" => "Similar Pages",
	"version" => "0.1",
	"author" => "Starbeamrainbowlabs",
	"description" => "Adds a few suggestions of similar pages below the main content and above the comments of a page. Requires the search engine.",
	"id" => "feature-similarpages",
	"depends" => [ "lib-search-engine", "feature-search" ],
	"code" => function() {
		/**
		 * @api {get} ?action=suggest-similar&page={pageName}&format={format} Get a list of pages similar to a given page
		 * @apiName SuggestSimilar
		 * @apiGroup Page
		 *
		 * @apiParam {string} page   The page to return suggestions for.
		 * @apiParam {string} format Optional. The output format. Available formats: text (default), json, csv.
		 */
		
		/*
		 * Action: suggest-similar
		 * Writes the similar page suggestions for the current page
		 * ($env->page) to the response body in the requested format.
		 */
		add_action("suggest-similar", function() {
			global $env;
			
			$format = $_GET["format"] ?? "text";
			
			// Read the page source up front so that an unreadable / missing
			// file results in a clear error instead of a failed read being
			// handed to similar_suggest().
			// TODO: Support history revisions here? $env->page_filename might do this for us - we should check into the behaviour here
			$page_file = (string)($env->page_filename ?? "");
			$content = is_file($page_file) ? file_get_contents($page_file) : false;
			if($content === false) {
				http_response_code(404);
				header("content-type: text/plain");
				exit("Error: The content of the page {$env->page} couldn't be read.");
			}
			
			$similarpages = similar_suggest($env->page, $content);
			
			switch ($format) {
				case "text":
					header("content-type: text/plain");
					foreach($similarpages as $pagename => $rank) {
						echo("$pagename | $rank\n");
					}
					break;
				
				case "csv":
					header("content-type: text/csv");
					echo("pagename,rank\n");
					foreach($similarpages as $pagename => $rank)
						echo("$pagename,$rank\n");
					break;
				
				case "json":
					header("content-type: application/json");
					echo(json_encode($similarpages));
					// Without this break, execution falls through into the
					// default branch and appends the error message below to
					// an otherwise valid JSON response.
					break;
				
				default:
					http_response_code(400);
					header("content-type: text/plain");
					exit("Error: The format $format wasn't recognised.\nAvailable formats for this action: text, json, csv");
					break;
			}
		});
	}
]);
/**
 * Given a page name, returns a list of similar pages.
 *
 * The most frequent terms in the page's content (plus the non-stop-word
 * tokens of its title) are looked up in the search engine's inverted index.
 * Every other page containing one of those terms is scored by how often the
 * term occurs in it; occurrences of title terms count 5 times as much.
 *
 * @param	string	$pagename		The name of the page to return suggestions for.
 * @param	string	$content		The content of the given page.
 * @param	bool	$limit_output	Whether to return at most $settings->similarpages_count suggestions (default: true). If false, all candidate pages are returned.
 * @return	array	A list of suggested page names in the format pagename => rank, ordered from highest to lowest rank.
 */
function similar_suggest(string $pagename, string $content, bool $limit_output = true) : array {
	global $settings;
	
	// Build a term => [ "freq" => int, ... ] index from the page content
	$content_search = search::$literator->transliterate($content);
	$index = search::index_generate($content_search);
	
	// Inject the title's tokens with a very high artificial frequency. The
	// "fromtitle" flag keeps them out of the $max_count calculation below
	// and triggers the 5x score multiplier.
	$title_tokens = search::tokenize($pagename);
	foreach($title_tokens as $token) {
		if(in_array($token, search::$stop_words)) continue;
		$index[$token] = [ "freq" => 10000, "fromtitle" => true ];
	}
	
	// NOTE(review): presumably this sorts most-frequent-first - the early
	// `break` in the loop below relies on that ordering; confirm against
	// search::index_sort_freq().
	search::index_sort_freq($index, true);
	
	search::invindex_load();
	
	$our_pageid = ids::getid($pagename);
	
	$pages = []; // page id => accumulated rank
	$max_count = -1; // Highest frequency seen so far among content (non-title) terms
	foreach($index as $term => $data) {
		// Skip words shorter than 3 characters
		if(strlen($term) < 3) continue;
		
		if(!isset($data["fromtitle"]))
			$max_count = max($max_count, $data["freq"]);
		
		// If this one is less than 0.2x the max frequency count (or only
		// occurs once), stop considering further terms altogether
		if($data["freq"] < $max_count * 0.2 || $data["freq"] <= 1) break;
		
		// Check if it's present just in case (todo figure out if it's necessary)
		if(!search::invindex_term_exists($term)) continue;
		
		$otherpages = search::invindex_term_getpageids($term);
		foreach($otherpages as $pageid) {
			// Never suggest the page itself
			if($pageid == $our_pageid) continue;
			
			if(!isset($pages[$pageid]))
				$pages[$pageid] = 0;
			
			$amount = search::invindex_term_getoffsets($term, $pageid)->freq;
			if(isset($data["fromtitle"]))
				$amount *= 5;
			$pages[$pageid] += $amount;
		}
	}
	
	// Highest rank first
	arsort($pages, SORT_NUMERIC);
	
	$result = [];
	$taken = 0;
	foreach($pages as $pageid => $count) {
		// >= (not >) so that exactly similarpages_count entries are returned
		if($limit_output && $taken >= $settings->similarpages_count) break;
		$result[ids::getpagename($pageid)] = $count;
		$taken++;
	}
	return $result;
}
?>