1
0
Fork 0
mirror of https://github.com/sbrl/Pepperminty-Wiki.git synced 2024-11-22 04:23:01 +00:00
Pepperminty-Wiki/modules/feature-search.php
Starbeamrainbowlabs d131666ff5
Squash all the error messages, but it's not working as intended.
All that seems to have happened is that searches are taking longer and 
not doing anything different.....
2020-03-15 18:10:23 +00:00

587 lines
26 KiB
PHP

<?php
register_module([
"name" => "Search",
"version" => "0.13",
"author" => "Starbeamrainbowlabs",
"description" => "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
"id" => "feature-search",
// After refactoring, we'll need to specify dependencies like this
"depends" => [ "lib-search-engine" ],
"code" => function() {
global $settings, $paths;
/**
* @api {get} ?action=index&page={pageName} Get an index of words for a given page
* @apiName SearchIndex
* @apiGroup Search
* @apiPermission Anonymous
* @apiDescription For debugging purposes. Be warned - the format could change at any time!
*
* @apiParam {string} page The page to generate a word index page.
*/
/*
* ██ ███ ██ ██████ ███████ ██ ██
* ██ ████ ██ ██ ██ ██ ██ ██
* ██ ██ ██ ██ ██ ██ █████ ███
* ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ████ ██████ ███████ ██ ██
*/
add_action("index", function() {
	global $env;
	
	// Debug output only: always plain text, so nothing below needs HTML escaping.
	header("content-type: text/plain");
	
	// Read the page source. A missing or unreadable file must not be passed
	// on to the indexer as `false`.
	$source_path = "$env->storage_prefix$env->page.md";
	$source = is_file($source_path) ? file_get_contents($source_path) : false;
	if($source === false) {
		http_response_code(404);
		exit("Error: The source of the page '$env->page' could not be read, so no index could be generated.\n");
	}
	
	$index = search::index_generate($source);
	
	echo("Page name: $env->page\n");
	echo("--------------- Source ---------------\n");
	echo($source); echo("\n");
	echo("--------------------------------------\n\n");
	echo("---------------- Index ---------------\n");
	// One line per term: its frequency, followed by the offsets it was found at
	foreach($index as $term => $entry) {
		echo("$term: {$entry["freq"]} matches | " . implode(", ", $entry["offsets"]) . "\n");
	}
	echo("--------------------------------------\n");
});
/**
* @api {get} ?action=invindex-rebuild Rebuild the inverted search index from scratch
* @apiDescription Causes the inverted search index to be completely rebuilt from scratch. Can take a while for large wikis!
* @apiName SearchInvindexRebuild
* @apiGroup Search
* @apiPermission Admin
*
* @apiParam {string} secret Optional. Specify the secret from peppermint.json here in order to rebuild the search index without logging in.
*/
/*
* ██ ███ ██ ██ ██ ██ ███ ██ ██████ ███████ ██ ██
* ██ ████ ██ ██ ██ ██ ████ ██ ██ ██ ██ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ █████ ███ █████
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ████ ████ ██ ██ ████ ██████ ███████ ██ ██
*
* ██████ ███████ ██████ ██ ██ ██ ██ ██████
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██████ █████ ██████ ██ ██ ██ ██ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ███████ ██████ ██████ ██ ███████ ██████
*/
add_action("invindex-rebuild", function() {
	global $env, $settings;
	
	// The secret may be supplied via POST as an alternative to being logged in
	// as an admin. It is compared with hash_equals() so that the comparison
	// time does not leak how much of the secret was guessed correctly.
	$supplied_secret = $_POST["secret"] ?? null;
	$secret_matches = is_string($supplied_secret)
		&& !empty($supplied_secret)
		&& is_string($settings->secret)
		&& hash_equals($settings->secret, $supplied_secret);
	
	if(!$env->is_admin && !$secret_matches) {
		http_response_code(401);
		exit(page_renderer::render_main("Error - Search index regenerator - $settings->sitename", "<p>Error: You aren't allowed to regenerate the search index. Try logging in as an admin, or setting the <code>secret</code> POST parameter to $settings->sitename's secret - which can be found in $settings->sitename's <code>peppermint.json</code> file.</p>"));
	}
	
	search::invindex_rebuild();
});
/**
* @api {get} ?action=idindex-show Show the id index
* @apiDescription Outputs the id index. Useful if you need to verify that it's working as expected. Output is a json object.
* @apiName SearchShowIdIndex
* @apiGroup Search
* @apiPermission Anonymous
*/
add_action("idindex-show", function() {
	global $idindex;
	
	// Serialise the id index first, then send it as the entire response body
	$idindex_json = json_encode($idindex, JSON_PRETTY_PRINT);
	
	header("content-type: application/json; charset=UTF-8");
	exit($idindex_json);
});
/**
* @api {get} ?action=search&query={text}[&format={format}] Search the wiki for a given query string
* @apiName Search
* @apiGroup Search
* @apiPermission Anonymous
*
* @apiParam {string} query The query string to search for.
* @apiParam {string} format Optional. Valid values: html, json. In json mode an object is returned with page names as keys, values as search result information - sorted in ranking order.
*/
/*
* ███████ ███████ █████ ██████ ██████ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██
* ███████ █████ ███████ ██████ ██ ███████
* ██ ██ ██ ██ ██ ██ ██ ██ ██
* ███████ ███████ ██ ██ ██ ██ ██████ ██ ██
*/
add_action("search", function() {
	global $settings, $env, $pageindex, $paths;
	
	// Create the inverted index if it doesn't exist.
	if(!file_exists($paths->searchindex))
		search::invindex_rebuild(false);
	
	// Create the didyoumean index if it doesn't exist.
	if(module_exists("feature-search-didyoumean") && !file_exists($paths->didyoumeanindex))
		search::didyoumean_rebuild(false);
	
	// A missing query and an array-valued one (?query[]=x) are both unusable
	if(!isset($_GET["query"]) || !is_string($_GET["query"]))
		exit(page_renderer::render("No Search Terms - Error - $settings->sitename", "<p>You didn't specify any search terms. Try typing some into the box above.</p>"));
	
	$query = $_GET["query"];
	// The query is user input: this escaped copy is the only form that may be
	// written into HTML.
	$query_html = htmlentities($query, ENT_HTML5 | ENT_QUOTES);
	
	$search_start = microtime(true);
	
	/// Load the inverted index ///
	$time_start = microtime(true);
	search::invindex_load($paths->searchindex);
	$env->perfdata->invindex_decode_time = round((microtime(true) - $time_start)*1000, 3);
	
	/// Run the query ///
	$time_start = microtime(true);
	$results = search::invindex_query($query);
	$resultCount = count($results);
	$env->perfdata->invindex_query_time = round((microtime(true) - $time_start)*1000, 3);
	
	header("x-invindex-load-time: {$env->perfdata->invindex_decode_time}ms");
	header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
	
	/// Generate the context for each result ///
	$time_start = microtime(true);
	foreach($results as &$result) {
		$result["context"] = search::extract_context(
			$result["pagename"],
			$query,
			file_get_contents($env->storage_prefix . $result["pagename"] . ".md")
		);
	}
	// Break the reference. Otherwise the by-value loops over $results below
	// would write through it and overwrite the last result with the
	// second-to-last one.
	unset($result);
	$env->perfdata->context_generation_time = round((microtime(true) - $time_start)*1000, 3);
	header("x-context-generation-time: {$env->perfdata->context_generation_time}ms");
	
	$env->perfdata->search_time = round((microtime(true) - $search_start)*1000, 3);
	header("x-search-time: {$env->perfdata->search_time}ms");
	
	/// JSON output ///
	if(($_GET["format"] ?? "") === "json") {
		header("content-type: application/json");
		$json_results = new stdClass();
		foreach($results as $result) $json_results->{$result["pagename"]} = $result;
		exit(json_encode($json_results));
	}
	
	/// HTML output ///
	// NOTE(review): escaped on the assumption that page_renderer::render()
	// inserts the title into the page verbatim - confirm against page_renderer.
	$title = "$query_html - Search results - $settings->sitename";
	
	$content = "<section>\n";
	$content .= "<h1>Search Results</h1>";
	
	/// Search Box ///
	$content .= "<form method='get' action=''>\n";
	$content .= " <input type='search' id='search-box' name='query' placeholder='Type your query here and then press enter.' value='$query_html' />\n";
	$content .= " <input type='hidden' name='action' value='search' />\n";
	$content .= "</form>";
	
	$content .= "<p>Found $resultCount " . ($resultCount === 1 ? "result" : "results") . " in " . $env->perfdata->search_time . "ms. ";
	
	if(isset($pageindex->$query)) {
		$content .= "There's a page on $settings->sitename called <a href='?page=" . rawurlencode($query) . "'>$query_html</a>.";
	}
	else {
		$content .= "There isn't a page called $query_html on $settings->sitename, but you ";
		if((!$settings->anonedits && !$env->is_logged_in) || !$settings->editing) {
			$content .= "do not have permission to create it.";
			if(!$env->is_logged_in) {
				$content .= " You could try <a href='?action=login&returnto=" . rawurlencode($_SERVER["REQUEST_URI"]) . "'>logging in</a>.";
			}
		}
		else {
			$content .= "can <a href='?action=edit&page=" . rawurlencode($query) . "'>create it</a>.";
		}
	}
	$content .= "<br /><small><em>Pssst! Power users can make use of $settings->sitename's advanced query syntax. Learn about it <a href='?action=help#27-search'>here</a>!</em></small></p>";
	
	/// Matching tags ///
	if(module_exists("page-list")) {
		// TODO: Refactor this to use STAS
		$nterm_patterns = [];
		foreach(search::tokenize($query) as $nterm) {
			$nterm = strtolower(trim($nterm));
			// An empty alternative would make the regex match every tag
			if($nterm === "") continue;
			// Quote the delimiter too, so a "/" in the query can't break the pattern
			$nterm_patterns[] = preg_quote($nterm, "/");
		}
		
		$matching_tags = [];
		if(count($nterm_patterns) > 0) {
			$nterms_regex = implode("|", $nterm_patterns);
			foreach(get_all_tags() as $tag) {
				if(preg_match("/$nterms_regex/i", trim($tag)) > 0)
					$matching_tags[] = $tag;
			}
		}
		
		if(count($matching_tags) > 0) {
			$content .= "<p class='matching-tags-display'><label>Matching tags</label><span class='tags'>";
			foreach($matching_tags as $tag) {
				$content .= "\t<a href='?action=list-tags&tag=" . rawurlencode($tag) ."' class='mini-tag'>" . htmlentities($tag) . "</a> \n";
			}
			$content .= "</span></p>";
		}
	}
	
	/// Results ///
	$i = 0; // todo use $_GET["offset"] and $_GET["result-count"] or something
	foreach($results as $result) {
		$pagename = $result["pagename"];
		$link = "?page=" . rawurlencode($pagename);
		
		// Fall back to the start of the page if no context could be extracted.
		// The page source is only read when that fallback is actually needed.
		$context = $result["context"];
		if(mb_strlen($context) === 0) {
			$pagesource = file_get_contents($env->storage_prefix . $pagename . ".md");
			$context = mb_substr($pagesource, 0, $settings->search_characters_context * 2);
		}
		
		$context = search::highlight_context(
			$query,
			preg_replace('/</u', '&lt;', $context)
		);
		
		$tag_list = "<span class='tags'>";
		foreach($pageindex->{$pagename}->tags ?? [] as $tag)
			$tag_list .= "<a href='?action=list-tags&tag=" . rawurlencode($tag) . "' class='mini-tag'>" . htmlentities($tag) . "</a>";
		$tag_list .= "</span>\n";
		
		// Page names are user-controlled, so escape before display
		$pagename_html = htmlentities($pagename, ENT_HTML5 | ENT_QUOTES);
		// Make redirect pages italics
		if(!empty($pageindex->{$pagename}->redirect))
			$pagename_html = "<em>$pagename_html</em>";
		
		// We add 1 to $i here to convert it from an index to a result
		// number as people expect it to start from 1
		$content .= "<div class='search-result' data-result-number='" . ($i + 1) . "' data-rank='" . $result["rank"] . "'>\n";
		$content .= " <h2><a href='$link'>$pagename_html</a> <span class='search-result-badges'>$tag_list</span></h2>\n";
		$content .= " <p class='search-context'>$context</p>\n";
		$content .= "</div>\n";
		
		$i++;
	}
	$content .= "</section>\n";
	
	header("content-type: text/html; charset=UTF-8");
	exit(page_renderer::render($title, $content));
});
/*
* ██████ ██ ██ ███████ ██████ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ██ ██ █████ ██████ ████ █████
* ██ ▄▄ ██ ██ ██ ██ ██ ██ ██
* ██████ ██████ ███████ ██ ██ ██
* ▀▀
* ███████ ███████ █████ ██████ ██████ ██ ██ ██ ███ ██ ██████ ███████ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ████ ██ ██ ██ ██ ██ ██
* ███████ █████ ███████ ██████ ██ ███████ ██ ██ ██ ██ ██ ██ █████ ███
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ███████ ███████ ██ ██ ██ ██ ██████ ██ ██ ██ ██ ████ ██████ ███████ ██ ██
*/
/**
* @api {get} ?action=query-searchindex&query={text} Inspect the internals of the search results for a query
* @apiName SearchQueryIndex
* @apiGroup Search
* @apiPermission Anonymous
*
* @apiParam {string} query The query string to search for.
*/
add_action("query-searchindex", function() {
	global $env, $paths;
	
	// Without a query there is nothing to inspect
	if(empty($_GET["query"])) {
		http_response_code(400);
		header("content-type: text/plain");
		exit("Error: No query specified. Specify it with the 'query' GET parameter.");
	}
	
	// Time how long loading the search index takes (in milliseconds)
	$env->perfdata->searchindex_decode_start = microtime(true);
	search::invindex_load($paths->searchindex);
	$env->perfdata->searchindex_decode_time = (microtime(true) - $env->perfdata->searchindex_decode_start) * 1000;
	
	// Time how long the query itself takes (in milliseconds)
	$env->perfdata->searchindex_query_start = microtime(true);
	$query_results = search::invindex_query($_GET["query"]);
	$env->perfdata->searchindex_query_time = (microtime(true) - $env->perfdata->searchindex_query_start) * 1000;
	
	header("content-type: application/json");
	
	// Assemble the response. Insertion order determines the property order
	// in the JSON output.
	$response = [
		"time_format" => "ms",
		"decode_time" => $env->perfdata->searchindex_decode_time,
		"query_time" => $env->perfdata->searchindex_query_time
	];
	if(isset($env->perfdata->didyoumean_correction))
		$response["didyoumean_correction_time"] = $env->perfdata->didyoumean_correction;
	$response["total_time"] = $response["decode_time"] + $response["query_time"];
	$response["stas"] = search::stas_parse(search::stas_split($_GET["query"]));
	$response["search_results"] = $query_results;
	
	exit(json_encode((object)$response, JSON_PRETTY_PRINT));
});
/**
* @api {get} ?action=stas-parse&query={text} Debug search queries
* @apiDescription Debug Pepperminty Wiki's understanding of search queries.
* If you want something machine-readable, check out the new stas property on the object returned by query-searchindex.
* @apiName SearchSTASParse
* @apiGroup Search
* @apiPermission Anonymous
*
* @apiParam {string} query The query string to parse.
*/
add_action("stas-parse", function() {
	global $settings;
	
	// A missing query and an array-valued one (?query[]=x) are both unusable
	if(!isset($_GET["query"]) || !is_string($_GET["query"])) {
		http_response_code(400);
		header("x-status: failed");
		header("x-problem: no-query-specified");
		exit(page_renderer::render_main("Error - STAS Query Analysis - $settings->sitename", "<p>No query was present in the <code>query</code> GET parameter.</p>"));
	}
	
	$tokens = search::stas_split($_GET["query"]);
	$stas_query = search::stas_parse($tokens);
	
	// Tokens come straight from the query string, so they are escaped
	// whenever they are written into the page.
	$escape = function($text) {
		return htmlentities((string)$text, ENT_HTML5 | ENT_QUOTES);
	};
	
	$result = "";
	foreach($tokens as $token) {
		// Explicit excludes: only tokens actually written as "-term" qualify.
		// Without the prefix check, "cat" would be reported as excluded
		// whenever "at" is.
		if(substr($token, 0, 1) === "-" && in_array(substr($token, 1), $stas_query["exclude"], true)) {
			$result .= "<span title='explicit exclude' style='color: red; text-decoration: dotted line-through;'>" . $escape(substr($token, 1)) . "</span> ";
			continue;
		}
		
		// Strip the weighting prefix and any "location:" prefix to recover
		// the bare term
		$token_part = $token;
		if(($token_part[0] ?? "") === "+") $token_part = substr($token_part, 1);
		if(strpos($token_part, ":") !== false) $token_part = explode(":", $token_part, 2)[1];
		
		// Find the parsed term that corresponds to this token
		$term = null;
		foreach($stas_query["terms"] as $c_term) {
			// Compare as strings, so numeric-looking terms ("1e3" vs "1000")
			// can't match each other
			if((string)$c_term["term"] === (string)$token_part) {
				$term = $c_term;
				break;
			}
		}
		if($term === null) {
			$result .= "<span title='unknown' style='color: black; text-decoration: wavy underline;'>" . $escape($token) . "</span> ";
			continue;
		}
		
		$title = "?";
		$style = "";
		switch($term["weight"]) {
			case -1: $style .= "color: grey; text-decoration: wavy line-through;"; $title = "stop word"; break;
			case 1: $style .= "color: blue;"; $title = "normal word"; break;
		}
		if($term["weight"] > 1) {
			$style .= "color: darkblue; font-weight: bold;";
			$title = "weighted word";
		}
		if($term["weight"] !== -1) {
			switch($term["location"]) {
				case "body": $style = "color: cyan"; $title = "body only"; break;
				case "title": $style .= "font-weight: bolder; font-size: 1.2em; color: orange;"; $title = "searching title only"; $token = $token_part; break;
				case "tags": $style .= "font-weight: bolder; color: purple;"; $title = "searching tags only"; $token = $token_part; break;
				case "all": $title .= ", searching everywhere";
			}
		}
		$title .= ", weight: {$term["weight"]}";
		
		$result .= "<span title='$title' style='$style'>" . $escape($token) . "</span> ";
	}
	
	exit(page_renderer::render_main("STAS Query Analysis - $settings->sitename", "<p>$settings->sitename understood your query to mean the following:</p>\n<blockquote>$result</blockquote>"));
});
/*
* ██████ ██████ ███████ ███ ██ ███████ ███████ █████ ██████ ██████ ██ ██
* ██ ██ ██ ██ ██ ████ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ██████ █████ ██ ██ ██ ███████ █████ ███████ ██████ ██ ███████
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██████ ██ ███████ ██ ████ ███████ ███████ ██ ██ ██ ██ ██████ ██ ██
*/
/**
* @api {get} ?action=opensearch-description Get the opensearch description file
* @apiName OpenSearchDescription
* @apiGroup Search
* @apiPermission Anonymous
*/
add_action("opensearch-description", function () {
	global $settings;
	
	// Values interpolated into the document must be XML-escaped, or a site
	// name or URL containing "&" or "<" yields an invalid description file.
	// double_encode is off so that values which already contain XML entities
	// aren't escaped a second time.
	$xml_escape = function($text) {
		return htmlspecialchars((string)$text, ENT_XML1 | ENT_QUOTES, "UTF-8", false);
	};
	$sitename_xml = $xml_escape($settings->sitename);
	$favicon_xml = $xml_escape($settings->favicon);
	$site_root_xml = $xml_escape(full_url() . "/index.php");
	
	// ?debug switches to plain text so that the XML can be inspected in a browser
	if(!isset($_GET["debug"]))
		header("content-type: application/opensearchdescription+xml");
	else
		header("content-type: text/plain");
	
	exit('<?xml version="1.0" encoding="UTF-8"?' . '>' . // hack The build system strips it otherwise O.o I should really fix that.
"\n<OpenSearchDescription xmlns=\"http://a9.com/-/spec/opensearch/1.1/\">
<ShortName>Search {$sitename_xml}</ShortName>
<Description>Search {$sitename_xml}, which is powered by Pepperminty Wiki.</Description>
<Tags>{$sitename_xml} Wiki</Tags>
<Image type=\"image/png\">{$favicon_xml}</Image>
<Attribution>Search content available under the license linked to at the bottom of the search results page.</Attribution>
<Developer>Starbeamrainbowlabs (https://github.com/sbrl/Pepperminty-Wiki/graphs/contributors)</Developer>
<InputEncoding>UTF-8</InputEncoding>
<OutputEncoding>UTF-8</OutputEncoding>
<Url type=\"text/html\" method=\"get\" template=\"{$site_root_xml}?action=view&amp;search-redirect=yes&amp;page={searchTerms}&amp;offset={startIndex?}&amp;count={count}\" />
<Url type=\"application/x-suggestions+json\" template=\"{$site_root_xml}?action=suggest-pages&amp;query={searchTerms}&amp;type=opensearch\" />
</OpenSearchDescription>");
});
/**
* @api {get} ?action=suggest-pages&query={text}[&type={type}] Get page name suggestions for a query
* @apiName SearchSuggestPages
* @apiGroup Search
* @apiPermission Anonymous
*
* @apiParam {string} query The search query string to get search suggestions for.
* @apiParam {string} type Optional. The type of result to return. Default value: json. Available values: json, opensearch
*/
add_action("suggest-pages", function() {
	global $settings, $pageindex;
	
	// Suggestions are disabled: answer with an empty list straight away
	if($settings->dynamic_page_suggestion_count === 0) {
		header("content-type: application/json");
		header("content-length: 3");
		exit("[]\n");
	}
	
	// A missing, empty or array-valued query (?query[]=x) can't be ranked against
	if(empty($_GET["query"]) || !is_string($_GET["query"])) {
		http_response_code(400);
		header("content-type: text/plain");
		exit("Error: You didn't specify the 'query' GET parameter.");
	}
	
	$type = $_GET["type"] ?? "json";
	if(!is_string($type) || !in_array($type, ["json", "opensearch"], true)) {
		$type_display = is_string($type) ? $type : "(invalid)";
		http_response_code(406);
		// The rejected value is echoed back, so this must not be served as HTML
		header("content-type: text/plain");
		exit("Error: The type '$type_display' is not one of the supported output types. Available values: json, opensearch. Default: json");
	}
	
	$query = search::$literator->transliterate($_GET["query"]);
	
	// Rank each page name
	$results = [];
	foreach($pageindex as $pageName => $entry) {
		$results[] = [
			"pagename" => $pageName,
			// Costs: Insert: 1, Replace: 8, Delete: 6
			"distance" => levenshtein($query, search::$literator->transliterate($pageName), 1, 8, 6)
		];
	}
	
	// Sort the page names by distance from the original query, breaking
	// ties alphabetically
	usort($results, function($a, $b) {
		return ($a["distance"] <=> $b["distance"]) ?: strcmp($a["pagename"], $b["pagename"]);
	});
	
	// Send the results to the user
	$suggestions = array_slice($results, 0, $settings->dynamic_page_suggestion_count);
	
	if($type === "opensearch") {
		// OpenSearch suggestions format: [ original query, [ completions ] ]
		$opensearch_output = [
			$_GET["query"],
			array_map(function($suggestion) { return $suggestion["pagename"]; }, $suggestions)
		];
		header("content-type: application/x-suggestions+json");
		exit(json_encode($opensearch_output));
	}
	
	// $type === "json"
	header("content-type: application/json");
	exit(json_encode($suggestions));
});
// Only ship the client-side suggestion script when suggestions are enabled
if($settings->dynamic_page_suggestion_count > 0) {
	// On every change to the search box, the script requests
	// ?action=suggest-pages and replaces the options of the #allpages datalist.
	$suggestion_js = '/// Dynamic page suggestion system
// Micro snippet 8 - Promisified GET (fetched 20th Nov 2016)
function get(u){return new Promise(function(r,t,a){a=new XMLHttpRequest();a.onload=function(b,c){b=a.status;c=a.response;if(b>199&&b<300){r(c)}else{t(c)}};a.open("GET",u,true);a.send(null)})}
window.addEventListener("load", function(event) {
var searchBox = document.querySelector("input[type=search]");
searchBox.dataset.lastValue = "";
searchBox.addEventListener("keyup", function(event) {
// Make sure that we don\'t keep sending requests to the server if nothing has changed
if(searchBox.dataset.lastValue == event.target.value)
return;
searchBox.dataset.lastValue = event.target.value;
// Fetch the suggestions from the server
get("?action=suggest-pages&query=" + encodeURIComponent(event.target.value)).then(function(response) {
var suggestions = JSON.parse(response),
dataList = document.getElementById("allpages");
// If the server sent no suggestions, then we shouldn\'t replace the contents of the datalist
if(suggestions.length == 0)
return;
console.info(`Fetched suggestions for ${event.target.value}:`, suggestions.map(s => s.pagename));
// Remove all the existing suggestions
while(dataList.firstChild) {
dataList.removeChild(dataList.firstChild);
}
// Add the new suggestions to the datalist
var optionsFrag = document.createDocumentFragment();
suggestions.forEach(function(suggestion) {
var suggestionElement = document.createElement("option");
suggestionElement.value = suggestion.pagename;
suggestionElement.dataset.distance = suggestion.distance;
optionsFrag.appendChild(suggestionElement);
});
dataList.appendChild(optionsFrag);
});
});
});
';
	page_renderer::add_js_snippet($suggestion_js);
}
// Register the "search" CLI command, but only if the CLI module is present
if(module_exists("feature-cli")) {
	/*
	 * Handler for the "search" command.
	 * $args holds the subcommand followed by its arguments.
	 * Returns 0 on success (including when only the usage text is shown),
	 * or 1 if the subcommand isn't recognised.
	 */
	cli_register("search", "Query and manipulate the search index", function(array $args) : int {
		if(count($args) < 1) {
			echo("search: query and manipulate the search index
Usage:
search {subcommand}
Subcommands:
rebuild Rebuilds the search index
");
			return 0;
		}
		
		switch($args[0]) {
			case "rebuild":
				search::invindex_rebuild();
				break;
			default:
				// Previously a mistyped subcommand silently did nothing
				// and reported success
				echo("search: unknown subcommand '{$args[0]}'. Run 'search' without arguments for usage.\n");
				return 1;
		}
		return 0;
	});
}
// Help text for the search engine, including a table of the advanced query syntax
$search_help_html = "<p>$settings->sitename has an integrated full-text search engine, allowing you to search all of the pages on $settings->sitename and their content. To use it, simply enter your query into the page name box and press enter. If a page isn't found with the exact name of your query terms, a search will be performed instead.</p>
<p>Additionally, advanced users can take advantage of some extra query syntax that $settings->sitename supports, which is inspired by popular search engines:</p>
<table>
<tr><th style='width: 33%;'>Example</th><th style='width: 66%;'>Meaning</th></tr>
<tr><td><code>cat -dog</code></td><td>Search for pages containing \"cat\", but not \"dog\". This syntax does not make sense on it's own - other words must be present for it to take effect.</td>
<tr><td><code>+glass marble</code></td><td>Double the weighting of the word \"glass\".</td>
<tr><td><code>intitle:rocket</code></td><td>Search only page titles for \"rocket\".</td>
<tr><td><code>intags:bill</code></td><td>Search only tags for \"bill\".</td>
<tr><td><code>inbody:satellite</code></td><td>Search only the page body for \"satellite\".</td>
</table>
<p>More query syntax will be added in the future, so keep an eye on <a href='https://github.com/sbrl/Pepperminty-Wiki/releases/'>the latest releases</a> of <em>Pepperminty Wiki</em> to stay up-to-date (<a href='https://github.com/sbrl/Pepperminty-Wiki/releases.atom'>Atom / RSS feed available here</a>).</p>";
add_help_section("27-search", "Searching", $search_help_html);
}
]);
?>