Pepperminty-Wiki/modules/feature-similarpages.php

<?php
register_module([
	"name" => "Similar Pages",
	"version" => "0.1",
	"author" => "Starbeamrainbowlabs",
	"description" => "Adds a few suggestions of similar pages below the main content and above the comments of a page. Requires the search engine.",
	"id" => "feature-similarpages",
	"depends" => [ "lib-search-engine", "feature-search" ],
	"code" => function() {
		/**
		 * @api {get} ?action=suggest-similar&page={pageName}&format={format} Get a ranked list of pages similar to a given page
		 * @apiName SuggestSimilar
		 * @apiGroup Page
		 * @apiPermission Anonymous
		 * 
		 * @apiParam {string}	page	The page to return suggestions for.
		 * @apiParam {string}	format	Optional. The output format - one of text (the default), csv, or json.
		 */
		// NOTE(review): the page is read from $env->page, which is populated
		// elsewhere - presumably from the "page" GET parameter; confirm.
		
		// Action: suggest-similar
		add_action("suggest-similar", function() {
			global $env;
			
			$format = $_GET["format"] ?? "text";
			
			// TODO: Support history revisions here? $env->page_filename might do this for us - we should check into the behaviour here
			// Bail out cleanly if the page's source can't be read, instead of
			// passing false on to similar_suggest().
			$content = is_readable($env->page_filename)
				? file_get_contents($env->page_filename)
				: false;
			if($content === false) {
				http_response_code(404);
				header("content-type: text/plain");
				exit("Error: The content of the page {$env->page} couldn't be read.");
			}
			
			$similarpages = similar_suggest($env->page, $content);
			
			switch ($format) {
				case "text":
					header("content-type: text/plain");
					foreach($similarpages as $pagename => $rank) {
						echo("$pagename | $rank\n");
					}
					break;
				
				case "csv":
					// Quote fields RFC 4180-style so that page names containing
					// commas, quotes, or newlines don't corrupt the row.
					$csv_field = function(string $value) : string {
						if(strpbrk($value, ",\"\r\n") === false)
							return $value;
						return '"' . str_replace('"', '""', $value) . '"';
					};
					header("content-type: text/csv");
					echo("pagename,rank\n");
					foreach($similarpages as $pagename => $rank) {
						// Numeric page names come back from the array as integer keys
						echo($csv_field((string) $pagename) . ",$rank\n");
					}
					break;
				
				case "json":
					header("content-type: application/json");
					echo(json_encode($similarpages));
					// Without this break, execution fell through to the default
					// case and a 400 error was appended to the JSON response.
					break;
				
				default:
					http_response_code(400);
					header("content-type: text/plain");
					exit("Error: The format $format wasn't recognised.\nAvailable formats for this action: text, json, csv");
			}
		});
	}
]);

/**
 * Given a page name and its content, returns a ranked list of similar pages.
 * 
 * The most frequent terms of the content (plus the tokens of the page's name,
 * which are weighted more heavily) are looked up in the search engine's
 * inverted index. Every other page containing one of those terms scores
 * points based on how often that term occurs in it.
 * 
 * @param	string	$pagename		The name of the page to return suggestions for.
 * @param	string	$content		The content of the given page.
 * @param	bool	$limit_output	Whether to cap the number of results at $settings->similarpages_count. Defaults to true.
 * @return	array	A list of suggested page names in the format pagename => rank, highest rank first.
 */
function similar_suggest(string $pagename, string $content, bool $limit_output = true) : array {
	global $settings;
	
	$content_search = search::$literator->transliterate($content);
	$index = search::index_generate($content_search);
	
	// Tokens from the page name get an artificially huge frequency and a
	// marker flag, replacing any entry the content produced for that token.
	foreach(search::tokenize($pagename) as $token) {
		if(in_array($token, search::$stop_words, true)) continue;
		$index[$token] = [ "freq" => 10000, "fromtitle" => true ];
	}
	// NOTE(review): presumably sorts most-frequent-first - the early break
	// below relies on that ordering; confirm against search::index_sort_freq().
	search::index_sort_freq($index, true);
	search::invindex_load();
	
	$our_pageid = ids::getid($pagename);
	$pages = [];
	$max_count = -1;
	foreach($index as $term => $data) {
		// Skip words shorter than 3 characters
		if(strlen($term) < 3) continue;
		
		$from_title = isset($data["fromtitle"]);
		
		// Track the highest frequency of a real content term. Title tokens
		// are excluded because their frequency is artificial.
		if(!$from_title)
			$max_count = max($max_count, $data["freq"]);
		// Stop once we reach terms that are less than 0.2x as frequent as
		// the most common one, or that occur only once.
		if($data["freq"] < $max_count * 0.2 || $data["freq"] <= 1) break;
		
		// Check it's present just in case (todo figure out if it's necessary)
		if(!search::invindex_term_exists($term)) continue;
		
		foreach(search::invindex_term_getpageids($term) as $pageid) {
			// Never suggest the page itself. Loose comparison on purpose,
			// as the two ids may differ in type (int vs numeric string).
			if($pageid == $our_pageid) continue;
			
			$amount = search::invindex_term_getoffsets($term, $pageid)->freq;
			// Matches on terms from the page's name count 5x
			if($from_title)
				$amount *= 5;
			$pages[$pageid] = ($pages[$pageid] ?? 0) + $amount;
		}
	}
	
	arsort($pages, SORT_NUMERIC);
	
	$result = [];
	$taken = 0;
	foreach($pages as $pageid => $count) {
		// >= rather than >, so we return at most similarpages_count
		// results (the old check let one extra result through).
		if($limit_output && $taken >= $settings->similarpages_count) break;
		$result[ids::getpagename($pageid)] = $count;
		$taken++;
	}
	return $result;
}

?>
Start implementing similar pages system, but it's not finished yet. We have the backend suggestion system done, but not the UI. I can tell that this is going to require lots of tweaking to get just right. I suspect it might be a good idea to explore some possible tweakable settings we can add to allow people to tweak the engine to better suit their individual setups. 2020-05-22 20:22:07 +00:00			`<?php`
			`register_module([`
			`"name" => "Similar Pages",`
			`"version" => "0.1",`
			`"author" => "Starbeamrainbowlabs",`
			`"description" => "Adds a few suggestions of similar pages below the main content and above the comments of a page. Requires the search engine.",`
			`"id" => "feature-similarpages",`
			`"depends" => [ "lib-search-engine", "feature-search" ],`
			`"code" => function() {`
			`global $settings;`
			`/**`
			`* @api {get} ?action=raw&page={pageName} Get the raw source code of a page`
			`* @apiName RawSource`
			`* @apiGroup Page`
			`* @apiPermission Anonymous`
			`*`
			`* @apiParam {string} page The page to return the source of.`
			`*/`

			`/*`
			`* ██████ █████ ██ ██`
			`* ██ ██ ██ ██ ██ ██`
			`* ██████ ███████ ██ █ ██`
			`* ██ ██ ██ ██ ██ ███ ██`
			`* ██ ██ ██ ██ ███ ███`
			`*/`
			`add_action("suggest-similar", function() {`
			`global $pageindex, $env;`

			`$format = $_GET["format"] ?? "text";`


			`// TODO: Supportr history revisions here? $env->page_filename might do this for us - we should check into the behaviour here`
			`$similarpages = similar_suggest(`
			`$env->page,`
			`file_get_contents($env->page_filename)`
			`);`

			`switch ($format) {`
			`case "text":`
			`header("content-type: text/plain");`
			`foreach($similarpages as $pagename => $rank) {`
			`echo("$pagename \| $rank\n");`
			`}`
			`break;`

			`case "csv":`
			`header("content-type: text/csv");`
			`echo("pagename,rank\n");`
			`foreach($similarpages as $pagename => $rank)`
			`echo("$pagename,$rank\n");`
			`break;`

			`case "json":`
			`header("content-type: application/json");`
			`echo(json_encode($similarpages));`

			`default:`
			`http_response_code(400);`
			`header("content-type: text/plain");`
			`exit("Error: The format $format wasn't recognised.\nAvailable formats for this action: text, json, csv");`
			`break;`
			`}`
			`});`
			`}`
			`]);`

			`/**`
			`* Given a page name, returns a list fo similar pages.`
			`* @param string $pagename The name of the page to return suggestions for.`
			`* @param string $content The content of the given page.`
			`* @return array A list of suggested page names in the format pagename => rank.`
			`*/`
			`function similar_suggest(string $pagename, string $content, bool $limit_output = true) : array {`
			`global $settings;`
			`$content_search = search::$literator->transliterate($content);`
			`$index = search::index_generate($content_search);`
			`$title_tokens = search::tokenize($pagename);`
			`foreach($title_tokens as $token) {`
			`if(in_array($token, search::$stop_words)) continue;`
			`$index[$token] = [ "freq" => 10000, "fromtitle" => true ];`
			`}`
			`search::index_sort_freq($index, true);`
			`search::invindex_load();`


			`$our_pageid = ids::getid($pagename);`
			`$pages = [];`
			`$max_count = -1;`
			`$i = 0;`
			`foreach($index as $term => $data) {`
			`error_log("[similar_suggest] checking $term \| {$data["freq"]}");`
			`// Only search the top 20% most common words`
			`// Stop words are skipped automagically`
			`// if($i > $max_count * 0.2) break;`
			`// Skip words shorter than 3 characters`
			`if(strlen($term) < 3) continue;`

			`// if($i > 10) break;`

			`// If this one is less than 0.2x the max frequency count, break out`
			`if(!isset($data["fromtitle"]))`
			`$max_count = max($max_count, $data["freq"]);`
			`if($data["freq"] < $max_count * 0.2 \|\| $data["freq"] <= 1) break;`

			`// Check is it's present just in case (todo figure out if it's necessary)`
			`if(!search::invindex_term_exists($term)) continue;`

			`error_log("ok");`

			`$otherpages = search::invindex_term_getpageids($term);`
			`foreach($otherpages as $pageid) {`
			`if($pageid == $our_pageid) continue;`
			`if(!isset($pages[$pageid]))`
			`$pages[$pageid] = 0;`

			`$amount = search::invindex_term_getoffsets($term, $pageid)->freq;`
			`if(isset($data["fromtitle"]))`
			`$amount *= 5;`
			`$pages[$pageid] += $amount;`
			`}`

			`$i++;`
			`}`

			`arsort($pages, SORT_NUMERIC);`

			`$result = []; $i = 0;`
			`foreach($pages as $pageid => $count) {`
			`if($limit_output && $i > $settings->similarpages_count) break;`
			`$result[ids::getpagename($pageid)] = $count;`
			`$i++;`
			`}`
			`return $result;`
			`}`

			`?>`