2020-05-22 20:22:07 +00:00
|
|
|
<?php
|
2020-09-23 22:22:39 +00:00
|
|
|
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
|
|
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
|
|
|
|
|
2020-05-22 20:22:07 +00:00
|
|
|
register_module([
|
|
|
|
"name" => "Similar Pages",
|
|
|
|
"version" => "0.1",
|
|
|
|
"author" => "Starbeamrainbowlabs",
|
|
|
|
"description" => "Adds a few suggestions of similar pages below the main content and above the comments of a page. Requires the search engine.",
|
|
|
|
"id" => "feature-similarpages",
|
|
|
|
"depends" => [ "lib-search-engine", "feature-search" ],
|
|
|
|
"code" => function() {
|
|
|
|
global $settings;
|
|
|
|
/**
|
2020-05-23 11:56:46 +00:00
|
|
|
* @api {get} ?action=suggest-similar&page={pageName} Get similar page suggestions
|
|
|
|
* @apiName SuggestSimilar
|
2020-05-22 20:22:07 +00:00
|
|
|
* @apiGroup Page
|
|
|
|
* @apiPermission Anonymous
|
|
|
|
*
|
2020-05-23 11:56:46 +00:00
|
|
|
* @apiParam {string} page The page to return suggestions for.
* @apiParam {string} [format=text] The output format. Supported values: text, csv, json.
|
2020-05-22 20:22:07 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
2020-05-23 11:56:46 +00:00
|
|
|
* ███████ ██ ██ ██████ ██████ ███████ ███████ ████████
|
|
|
|
* ██ ██ ██ ██ ██ ██ ██ ██
|
|
|
|
* ███████ ██ ██ ██ ███ ██ ███ █████ ███████ ██
|
|
|
|
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
|
|
|
|
* ███████ ██████ ██████ ██████ ███████ ███████ ██
|
|
|
|
*
|
|
|
|
* ███████ ██ ███ ███ ██ ██ █████ ██████
|
|
|
|
* ██ ██ ████ ████ ██ ██ ██ ██ ██ ██
|
|
|
|
* ███████ ██ ██ ████ ██ ██ ██ ███████ ██████
|
|
|
|
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
|
|
|
|
* ███████ ██ ██ ██ ██ ███████ ██ ██ ██ ██
|
2020-05-22 20:22:07 +00:00
|
|
|
*/
|
|
|
|
// Handler for ?action=suggest-similar: outputs the pages similar to the
// current page ($env->page) as plain text, CSV, or JSON, depending on the
// optional "format" GET parameter (default: text).
add_action("suggest-similar", function() {
	global $env;
	
	// Output format requested by the client; falls back to plain text
	$format = $_GET["format"] ?? "text";
	
	// TODO: Support history revisions here? $env->page_filename might do this for us - we should check into the behaviour here
	// NOTE(review): file_get_contents() returns false if the file can't be read - confirm whether callers guarantee the page exists
	$similarpages = similar_suggest(
		$env->page,
		file_get_contents($env->page_filename)
	);
	
	switch ($format) {
		case "text":
			header("content-type: text/plain");
			foreach($similarpages as $pagename => $rank) {
				echo("$pagename | $rank\n");
			}
			break;
		
		case "csv":
			header("content-type: text/csv");
			echo("pagename,rank\n");
			foreach($similarpages as $pagename => $rank) {
				// Quote fields containing CSV metacharacters (RFC 4180 style),
				// so that a page name with a comma, quote, or newline in it
				// doesn't corrupt the row. Other names are emitted unchanged.
				$field = (string)$pagename;
				if(strpbrk($field, ",\"\r\n") !== false)
					$field = '"' . str_replace('"', '""', $field) . '"';
				echo("$field,$rank\n");
			}
			break;
		
		case "json":
			header("content-type: application/json");
			echo(json_encode($similarpages));
			// This break is essential: without it execution falls through
			// into the default branch, which appends a 400 error message to
			// an otherwise valid JSON response.
			break;
		
		default:
			http_response_code(400);
			header("content-type: text/plain");
			// exit() terminates the request, so no break is needed here
			exit("Error: The format $format wasn't recognised.\nAvailable formats for this action: text, json, csv");
	}
});
|
2020-05-25 20:15:19 +00:00
|
|
|
|
|
|
|
// Page part preprocessor: when a page is being viewed, prepends an <aside>
// listing similar pages to the {extra} page part.
page_renderer::register_part_preprocessor(function(&$parts) {
	global $env;
	
	// Suggestions are only shown on the view action
	if($env->action !== "view") {
		return;
	}
	
	// Time the suggestion computation and record it (in milliseconds) in the performance data
	$timer_start = microtime(true);
	$page_content = file_get_contents($env->page_filename);
	$similar = similar_suggest($env->page, $page_content);
	$env->perfdata->similar_suggest = round((microtime(true) - $timer_start) * 1000, 2);
	
	// One list item per suggestion: name is URL-encoded for the link and HTML-escaped for display
	$items = [];
	foreach($similar as $name => $score) {
		$link = rawurlencode($name);
		$label = htmlentities($name);
		$items[] = "<li data-rank='$score'><a href='?page=" . $link . "'>" . $label . "</a></li>\n";
	}
	
	$aside = "<aside class='similar-page-suggestions'><h2>Similar Pages</h2>\n\t\t<ul class='similar-page-suggestions-list'>\n"
		. implode("", $items)
		. "</ul>\n\t\t<!-- Took {$env->perfdata->similar_suggest}ms to compute similar page suggestions -->\n\t\t</aside>\n";
	
	// Insert the suggestions ahead of whatever is already in the {extra} part
	$parts["{extra}"] = $aside . $parts["{extra}"];
});
|
2020-05-22 20:22:07 +00:00
|
|
|
}
|
|
|
|
]);
|
|
|
|
|
|
|
|
/**
 * Given a page name and its content, returns a list of similar pages.
 * A term index is generated from the (transliterated) content, and the
 * non-stop-word tokens of the page name are added to it with a very high
 * frequency. Each sufficiently frequent term is then looked up in the
 * inverted search index, and every other page containing it is scored by
 * how often the term occurs there (title terms count 5 times as much).
 * @param	string	$pagename		The name of the page to return suggestions for.
 * @param	string	$content		The content of the given page.
 * @param	bool	$limit_output	Whether to return at most $settings->similarpages_count suggestions (default: true).
 * @return	array	A list of suggested page names in the format pagename => rank, highest rank first.
 */
function similar_suggest(string $pagename, string $content, bool $limit_output = true) : array {
	global $settings;
	
	// Build a term index from the page content
	$index = search::index_generate(
		search::$literator->transliterate($content)
	);
	
	// Tokens from the page title are added with an artificially high frequency
	foreach(search::tokenize($pagename) as $title_token) {
		if(!in_array($title_token, search::$stop_words)) {
			$index[$title_token] = [ "freq" => 10000, "fromtitle" => true ];
		}
	}
	
	// NOTE(review): presumably this orders the terms most-frequent first - the early break below relies on it; confirm
	search::index_sort_freq($index, true);
	search::invindex_load();
	
	$self_id = ids::getid($pagename);
	$scores = [];
	$highest_freq = -1; // Highest frequency seen so far amongst the non-title terms
	
	foreach($index as $term => $data) {
		// Skip words shorter than 3 characters
		if(strlen($term) < 3) {
			continue;
		}
		
		$from_title = isset($data["fromtitle"]);
		
		// Title terms don't count towards the maximum, as their frequency is artificial
		if(!$from_title) {
			$highest_freq = max($highest_freq, $data["freq"]);
		}
		
		// Stop as soon as a term occurs only once, or less than 0.2x as often as the maximum
		if($data["freq"] <= 1 || $data["freq"] < $highest_freq * 0.2) {
			break;
		}
		
		// Check it's present just in case (todo figure out if it's necessary)
		if(!search::invindex_term_exists($term)) {
			continue;
		}
		
		foreach(search::invindex_term_getpageids($term) as $candidate_id) {
			// Don't suggest the page itself
			if($candidate_id == $self_id) {
				continue;
			}
			
			$weight = search::invindex_term_getoffsets($term, $candidate_id)->freq;
			if($from_title) {
				$weight *= 5;
			}
			
			$scores[$candidate_id] = ($scores[$candidate_id] ?? 0) + $weight;
		}
	}
	
	// Highest scoring pages first
	arsort($scores, SORT_NUMERIC);
	
	// Translate page ids into page names, optionally capping the number of results
	$suggestions = [];
	$taken = 0;
	foreach($scores as $candidate_id => $score) {
		if($limit_output && $taken >= $settings->similarpages_count) {
			break;
		}
		
		$suggestions[ids::getpagename($candidate_id)] = $score;
		$taken++;
	}
	
	return $suggestions;
}
|
|
|
|
|
|
|
|
?>
|