2015-10-27 21:10:05 +00:00
< ? php
register_module ([
" name " => " Search " ,
2017-09-19 16:32:52 +00:00
" version " => " 0.6 " ,
2015-10-27 21:10:05 +00:00
" author " => " Starbeamrainbowlabs " ,
2016-03-12 18:52:26 +00:00
" description " => " Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch. " ,
2015-10-27 21:10:05 +00:00
" id " => " feature-search " ,
" code " => function () {
2016-11-20 13:24:35 +00:00
global $settings ;
2016-06-12 20:15:43 +00:00
/**
* @ api { get } ? action = index & page = { pageName } Get an index of words for a given page
* @ apiName SearchIndex
* @ apiGroup Search
* @ apiPermission Anonymous
*
 * @apiParam {string} page The page to generate a word index for.
*/
2015-12-26 12:55:19 +00:00
/*
* ██ ███ ██ ██████ ███████ ██ ██
* ██ ████ ██ ██ ██ ██ ██ ██
* ██ ██ ██ ██ ██ ██ █████ ███
* ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ████ ██████ ███████ ██ ██
*/
2015-10-27 21:10:05 +00:00
// Debugging action: prints the current page's name, its raw source and the
// word index generated from that source, all as plain text.
add_action(" index ", function() {
    global $settings, $env;
    // Assigned but never read anywhere in this handler.
    $breakable_chars = " \r \n \t ., \\ /! \" £ $ %^&*[]()+`_~# ";

    header(" content-type: text/plain ");

    $page_source = file_get_contents(" $env->storage_prefix $env->page .md ");
    $word_index = search::index($page_source);

    // Dump order matters: page name, then source, then the index.
    foreach ([$env->page, $page_source, $word_index] as $dumped)
        var_dump($dumped);
});
2015-10-28 20:56:10 +00:00
2016-06-12 20:15:43 +00:00
/**
* @ api { get } ? action = invindex - rebuild Rebuild the inverted search index from scratch
* @ apiDescription Causes the inverted search index to be completely rebuilt from scratch . Can take a while for large wikis !
* @ apiName SearchInvindexRebuild
* @ apiGroup Search
2017-07-10 21:10:18 +00:00
* @ apiPermission Admin
*
 * @apiParam {string} secret Optional. Specify the secret from peppermint.json here (it is read from the POST body) in order to rebuild the search index without logging in.
2016-06-12 20:15:43 +00:00
*/
2015-12-26 12:55:19 +00:00
/*
* ██ ███ ██ ██ ██ ██ ███ ██ ██████ ███████ ██ ██
* ██ ████ ██ ██ ██ ██ ████ ██ ██ ██ ██ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ █████ ███ █████
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ████ ████ ██ ██ ████ ██████ ███████ ██ ██
*
* ██████ ███████ ██████ ██ ██ ██ ██ ██████
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██████ █████ ██████ ██ ██ ██ ██ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ███████ ██████ ██████ ██ ███████ ██████
*/
2015-10-28 20:56:10 +00:00
// Handler for the index-rebuild action.
// The rebuild runs for admins, or for any caller that POSTs the site secret.
// Everybody else receives a 401 error page.
add_action(" invindex-rebuild ", function() {
    global $env, $settings;

    $posted_secret = $_POST[" secret "] ?? null;
    // hash_equals() compares in constant time, so response timing does not
    // reveal how much of a guessed secret was correct. Both operands must be
    // strings; anything else is treated as "no match", exactly as the strict
    // comparison it replaces would have done.
    $secret_matches = !empty($posted_secret)
        && is_string($posted_secret)
        && is_string($settings->secret)
        && hash_equals($settings->secret, $posted_secret);

    if (!$env->is_admin && !$secret_matches) {
        http_response_code(401);
        exit(page_renderer::render_main(" Error - Search index regenerator - $settings->sitename ", " <p>Error: You aren't allowed to regenerate the search index. Try logging in as an admin, or setting the <code>secret</code> POST parameter to $settings->sitename 's secret - which can be found in $settings->sitename 's <code>peppermint.json</code> file.</p> "));
    }

    search::rebuild_invindex();
});
2016-08-26 16:47:46 +00:00
/**
2016-08-26 16:55:50 +00:00
* @ api { get } ? action = idindex - show Show the id index
2016-08-26 16:47:46 +00:00
 * @apiDescription Outputs the id index. Useful if you need to verify that it's working as expected.
* @ apiName SearchShowIdIndex
* @ apiGroup Search
* @ apiPermission Anonymous
*/
// Sends the global id index to the client as pretty-printed JSON and stops.
add_action(" idindex-show ", function() {
    global $idindex;

    $idindex_json = json_encode($idindex, JSON_PRETTY_PRINT);

    header(" content-type: application/json; charset=UTF-8 ");
    exit($idindex_json);
});
2016-06-12 20:15:43 +00:00
/**
* @ api { get } ? action = search & query = { text } Search the wiki for a given query string
* @ apiName Search
* @ apiGroup Search
* @ apiPermission Anonymous
*
* @ apiParam { string } query The query string to search for .
*/
2015-12-26 12:55:19 +00:00
/*
* ███████ ███████ █████ ██████ ██████ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██
* ███████ █████ ███████ ██████ ██ ███████
* ██ ██ ██ ██ ██ ██ ██ ██ ██
* ███████ ███████ ██ ██ ██ ██ ██████ ██ ██
*/
2015-10-28 20:56:10 +00:00
// Handler for the search action.
//
// Builds the inverted index on first use, runs the query against it and
// renders an HTML page containing: the search box, a result summary, a hint
// about a page named exactly like the query, matching tags (only when the
// page-list module is present) and one block per result with its tags and a
// highlighted context snippet.
//
// Everything that originates from the request or from page data (the query,
// page names, tags) is HTML-escaped before it is placed into the markup.
add_action(" search ", function() {
    global $settings, $env, $pageindex, $paths;

    // Create the inverted index if it doesn't exist.
    // todo In the future perhaps a CLI for this would be good?
    if (!file_exists($paths->searchindex))
        search::rebuild_invindex(false);

    // A missing or non-string (e.g. array-valued) query cannot be searched for.
    if (!isset($_GET[" query "]) || !is_string($_GET[" query "]))
        exit(page_renderer::render(" No Search Terms - Error - $settings->sitename ", " <p>You didn't specify any search terms. Try typing some into the box above.</p> "));

    $query = $_GET[" query "];
    // Escaped once; every place that echoes the query into markup uses this copy.
    $query_html = htmlentities($query, ENT_HTML5 | ENT_QUOTES);

    $search_start = microtime(true);

    $invindex = search::load_invindex($paths->searchindex);
    $results = search::query_invindex($query, $invindex);
    $resultCount = count($results);

    $env->perfdata->search_time = round((microtime(true) - $search_start) * 1000, 3);

    // NOTE(review): the title still carries the raw query; whether
    // page_renderer::render() escapes titles is not visible from here - confirm.
    $title = $query . " - Search results - $settings->sitename ";

    $content = " <section> \n ";
    $content .= " <h1>Search Results</h1> ";

    /// Search Box ///
    $content .= " <form method='get' action=''> \n ";
    $content .= " <input type='search' id='search-box' name='query' placeholder='Type your query here and then press enter.' value=' " . $query_html . " ' /> \n ";
    $content .= " <input type='hidden' name='action' value='search' /> \n ";
    $content .= " </form> ";

    /// Summary and exact page-name hint ///
    $content .= " <p>Found $resultCount " . ($resultCount === 1 ? " result " : " results ") . " in " . $env->perfdata->search_time . " ms. ";
    if (isset($pageindex->$query)) {
        $content .= " There's a page on $settings->sitename called <a href='?page= " . rawurlencode($query) . " '> " . $query_html . " </a>. ";
    }
    else {
        $content .= " There isn't a page called " . $query_html . " on $settings->sitename , but you ";
        if ((!$settings->anonedits && !$env->is_logged_in) || !$settings->editing) {
            $content .= " do not have permission to create it. ";
            if (!$env->is_logged_in) {
                $content .= " You could try <a href='?action=login&returnto= " . rawurlencode($_SERVER[" REQUEST_URI "]) . " '>logging in</a>. ";
            }
        }
        else {
            $content .= " can <a href='?action=edit&page= " . rawurlencode($query) . " '>create it</a>. ";
        }
    }
    $content .= " </p> ";

    /// Matching tags ///
    if (module_exists(" page-list ")) {
        $nterms = search::tokenize($query);
        $all_tags = get_all_tags();
        $matching_tags = [];
        // A query that yields no terms would otherwise produce a pattern with
        // no terms in it, so tag matching is skipped entirely in that case.
        if (count($nterms) > 0) {
            $nterms_regex = implode(" | ", array_map(function($nterm) {
                // The delimiter is passed so that a "/" inside a term cannot
                // terminate the pattern early.
                return preg_quote(strtolower(trim($nterm)), "/");
            }, $nterms));
            $tag_pattern = " / $nterms_regex /i ";
            foreach ($all_tags as $tag) {
                if (preg_match($tag_pattern, trim($tag)) > 0)
                    $matching_tags[] = $tag;
            }
        }

        if (count($matching_tags) > 0) {
            $content .= " <p>Matching tags: <span class='tags'> ";
            foreach ($matching_tags as $tag) {
                $content .= " \t <a href='?action=list-tags&tag= " . rawurlencode($tag) . " ' class='mini-tag'> " . htmlentities($tag) . " </a> \n ";
            }
            $content .= " </span></p> ";
        }
    }

    /// Results ///
    $i = 0; // todo use $_GET["offset"] and $_GET["result-count"] or something
    foreach ($results as $result) {
        $pagename = $result[" pagename "];
        $link = " ?page= " . rawurlencode($pagename);
        $pagesource = file_get_contents($env->storage_prefix . $pagename . " .md ");

        $context = search::extract_context($query, $pagesource);
        // Fall back to the start of the page. mb_substr() keeps multi-byte
        // characters whole; a sequence cut in half would make htmlentities()
        // below return an empty string.
        if (strlen($context) === 0)
            $context = mb_substr($pagesource, 0, $settings->search_characters_context * 2);
        $context = search::highlight_context($query, htmlentities($context));

        $tag_list = " <span class='tags'> ";
        foreach ($pageindex->{$pagename}->tags ?? [] as $tag)
            $tag_list .= " <a href='?action=list-tags&tag= " . rawurlencode($tag) . " ' class='mini-tag'> " . htmlentities($tag) . " </a> ";
        $tag_list .= " </span> \n ";

        // Escape the name first, then decorate it: redirect pages are shown in italics.
        $pagename_html = htmlentities($pagename, ENT_HTML5 | ENT_QUOTES);
        if (!empty($pageindex->{$pagename}->redirect))
            $pagename_html = " <em> " . $pagename_html . " </em> ";

        // We add 1 to $i here to convert it from an index to a result
        // number as people expect it to start from 1
        $content .= " <div class='search-result' data-result-number=' " . ($i + 1) . " ' data-rank=' " . $result[" rank "] . " '> \n ";
        $content .= " <h2><a href=' $link '> " . $pagename_html . " </a> <span class='search-result-badges'> $tag_list </span></h2> \n ";
        $content .= " <p class='search-context'> $context </p> \n ";
        $content .= " </div> \n ";

        $i++;
    }

    $content .= " </section> \n ";

    header(" content-type: text/html; charset=UTF-8 ");
    exit(page_renderer::render($title, $content));
});
2016-10-01 10:32:38 +00:00
2017-03-23 21:13:20 +00:00
/*
* ██████ ██ ██ ███████ ██████ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ██ ██ █████ ██████ ████ █████
* ██ ▄▄ ██ ██ ██ ██ ██ ██ ██
* ██████ ██████ ███████ ██ ██ ██
* ▀▀
* ███████ ███████ █████ ██████ ██████ ██ ██ ██ ███ ██ ██████ ███████ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ████ ██ ██ ██ ██ ██ ██
* ███████ █████ ███████ ██████ ██ ███████ ██ ██ ██ ██ ██ ██ █████ ███
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ███████ ███████ ██ ██ ██ ██ ██████ ██ ██ ██ ██ ████ ██████ ███████ ██ ██
*/
/**
* @ api { get } ? action = query - searchindex & query = { text } Inspect the internals of the search results for a query
 * @apiName SearchQuerySearchindex
* @ apiGroup Search
* @ apiPermission Anonymous
*
* @ apiParam { string } query The query string to search for .
*/
// Diagnostic action: loads the inverted index, runs the given query against
// it and replies with a JSON document holding the raw results together with
// the time (in milliseconds) spent decoding the index and running the query.
// The individual timings are also recorded on $env->perfdata.
add_action(" query-searchindex ", function() {
    global $env, $paths;

    if (empty($_GET[" query "])) {
        http_response_code(400);
        header(" content-type: text/plain ");
        exit(" Error: No query specified. Specify it with the 'query' GET parameter. ");
    }

    // Same object as $env->perfdata; the alias only shortens the lines below.
    $perf = $env->perfdata;

    // Phase 1: read and decode the inverted index
    $perf->searchindex_decode_start = microtime(true);
    $searchIndex = search::load_invindex($paths->searchindex);
    $perf->searchindex_decode_time = (microtime(true) - $perf->searchindex_decode_start) * 1000;

    // Phase 2: run the query
    $perf->searchindex_query_start = microtime(true);
    $searchResults = search::query_invindex($_GET[" query "], $searchIndex);
    $perf->searchindex_query_time = (microtime(true) - $perf->searchindex_query_start) * 1000;

    header(" content-type: application/json ");

    $report = (object) [
        "time_format" => " ms ",
        "decode_time" => $perf->searchindex_decode_time,
        "query_time" => $perf->searchindex_query_time,
        "total_time" => $perf->searchindex_decode_time + $perf->searchindex_query_time,
        "search_results" => $searchResults
    ];
    exit(json_encode($report, JSON_PRETTY_PRINT));
});
2017-06-28 08:45:13 +00:00
/*
* ██████ ██████ ███████ ███ ██ ███████ ███████ █████ ██████ ██████ ██ ██
* ██ ██ ██ ██ ██ ████ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ██████ █████ ██ ██ ██ ███████ █████ ███████ ██████ ██ ███████
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██████ ██ ███████ ██ ████ ███████ ███████ ██ ██ ██ ██ ██████ ██ ██
*/
/**
* @ api { get } ? action = opensearch - description Get the opensearch description file
* @ apiName OpenSearchDescription
* @ apiGroup Search
* @ apiPermission Anonymous
2016-10-01 10:32:38 +00:00
*/
// Handler for the opensearch-description action: replies with an
// OpenSearch description XML document for this wiki and terminates.
// NOTE(review): $settings->sitename and $settings->favicon are interpolated
// into the XML below without any escaping - confirm that neither can contain
// characters such as & or <, otherwise the document becomes malformed.
add_action ( " opensearch-description " , function () {
global $settings ;
// Base URL that the search and suggestion URL templates below are built on.
$siteRoot = full_url () . " /index.php " ;
// When the `debug` GET parameter is present the document is served as plain
// text instead of with the OpenSearch description content type.
if ( ! isset ( $_GET [ " debug " ]))
header ( " content-type: application/opensearchdescription+xml " );
else
header ( " content-type: text/plain " );
2017-06-06 20:33:11 +00:00
// The XML declaration is assembled from two pieces on purpose; see the
// inline "hack" remark on the next line.
exit ( '<?xml version="1.0" encoding="UTF-8"?' . '>' . // hack The build system strips it otherwise O.o I should really fix that.
" \n <OpenSearchDescription xmlns= \" http://a9.com/-/spec/opensearch/1.1/ \" >
2016-10-01 10:32:38 +00:00
< ShortName > Search $settings -> sitename </ ShortName >
< Description > Search $settings -> sitename , which is powered by Pepperminty Wiki .</ Description >
< Tags > $settings -> sitename Wiki </ Tags >
< Image type = \ " image/png \" > $settings->favicon </Image>
< Attribution > Search content available under the license linked to at the bottom of the search results page .</ Attribution >
< Developer > Starbeamrainbowlabs ( https :// github . com / sbrl / Pepperminty - Wiki / graphs / contributors ) </ Developer >
< InputEncoding > UTF - 8 </ InputEncoding >
< OutputEncoding > UTF - 8 </ OutputEncoding >
2017-07-16 09:26:22 +00:00
< Url type = \ " text/html \" method= \" get \" template= \" $siteRoot ?action=view&search-redirect=yes&page= { searchTerms}&offset= { startIndex?}&count= { count} \" />
2017-06-28 10:21:42 +00:00
< Url type = \ " application/x-suggestions+json \" template= \" $siteRoot ?action=suggest-pages&query= { searchTerms}&type=opensearch \" />
2017-06-06 20:33:11 +00:00
</ OpenSearchDescription > " );
2016-10-01 10:32:38 +00:00
});
2016-11-20 13:24:35 +00:00
2017-06-28 08:45:13 +00:00
/**
2017-06-28 09:44:44 +00:00
 * @api {get} ?action=suggest-pages&query={text}[&type={type}] Get search suggestions for a query
2017-06-28 08:45:13 +00:00
 * @apiName SearchSuggestPages
* @ apiGroup Search
* @ apiPermission Anonymous
*
 * @apiParam {string} query The search query string to get search suggestions for.
2017-06-28 09:44:44 +00:00
* @ apiParam { string } type The type of result to return . Default value : json . Available values : json , opensearch
2017-06-28 08:45:13 +00:00
*/
2016-11-20 13:24:35 +00:00
// Handler for the suggest-pages action.
//
// Ranks every page name in the page index by its weighted Levenshtein
// distance from the query and replies with the closest
// $settings->dynamic_page_suggestion_count names, either as a list of
// pagename/distance pairs (type json, the default) or in the OpenSearch
// suggestions layout (type opensearch).
//
// Replies 400 when no usable query is given and 406 for an unknown type.
// Both error replies are sent as plain text so that the reflected parameter
// value is never interpreted as HTML.
add_action(" suggest-pages ", function() {
    global $settings, $pageindex;

    // Suggestions are switched off: reply with an empty list straight away.
    if ($settings->dynamic_page_suggestion_count === 0) {
        header(" content-type: application/json ");
        header(" content-length: 3 ");
        exit(" [] \n ");
    }

    // Array-valued parameters (?query[]=...) are rejected along with missing ones.
    if (empty($_GET[" query "]) || !is_string($_GET[" query "])) {
        http_response_code(400);
        header(" content-type: text/plain ");
        exit(" Error: You didn't specify the 'query' GET parameter. ");
    }

    $type = $_GET[" type "] ?? " json ";
    if (!is_string($type) || !in_array($type, [" json ", " opensearch "], true)) {
        http_response_code(406);
        header(" content-type: text/plain ");
        // Report the kind of value rather than trying to print a non-string.
        if (!is_string($type))
            $type = gettype($type);
        exit(" Error: The type ' $type ' is not one of the supported output types. Available values: json, opensearch. Default: json ");
    }

    // Rank each page name
    $query_lower = mb_strtolower($_GET[" query "]); // loop-invariant, so computed once
    $results = [];
    foreach ($pageindex as $pageName => $entry) {
        // Costs: Insert: 1, Replace: 8, Delete: 6
        $distance = levenshtein($query_lower, mb_strtolower($pageName), 1, 8, 6);
        // Older PHP versions return -1 for over-long arguments; without this
        // such pages would be ranked as the best match instead of the worst.
        if ($distance < 0)
            $distance = PHP_INT_MAX;

        $results[] = [
            " pagename " => $pageName,
            " distance " => $distance
        ];
    }

    // Sort the page names by distance from the original query, breaking
    // ties by page name
    usort($results, function($a, $b) {
        return $a[" distance "] <=> $b[" distance "]
            ?: strcmp($a[" pagename "], $b[" pagename "]);
    });

    // Send the results to the user
    $suggestions = array_slice($results, 0, $settings->dynamic_page_suggestion_count);
    switch ($type) {
        case " json ":
            header(" content-type: application/json ");
            exit(json_encode($suggestions));

        case " opensearch ":
            $opensearch_output = [
                $_GET[" query "],
                array_map(function($suggestion) { return $suggestion[" pagename "]; }, $suggestions)
            ];
            header(" content-type: application/x-suggestions+json ");
            exit(json_encode($opensearch_output));
    }
});
// Client-side half of the page suggestion system. It is only registered when
// the dynamic_page_suggestion_count setting is greater than zero.
if ( $settings -> dynamic_page_suggestion_count > 0 )
{
// The snippet listens for keyup events on the search input, requests
// suggestions from the suggest-pages action whenever the value has changed,
// and replaces the options of the element with id "allpages" with the page
// names it gets back (an empty reply leaves the existing options untouched).
page_renderer :: AddJSSnippet ( ' /// Dynamic page suggestion system
// Micro snippet 8 - Promisified GET (fetched 20th Nov 2016)
function get ( u ){ return new Promise ( function ( r , t , a ){ a = new XMLHttpRequest (); a . onload = function ( b , c ){ b = a . status ; c = a . response ; if ( b > 199 && b < 300 ){ r ( c )} else { t ( c )}}; a . open ( " GET " , u , true ); a . send ( null )})}
window . addEventListener ( " load " , function ( event ) {
var searchBox = document . querySelector ( " input[type=search] " );
searchBox . dataset . lastValue = " " ;
searchBox . addEventListener ( " keyup " , function ( event ) {
// Make sure that we don\'t keep sending requests to the server if nothing has changed
if ( searchBox . dataset . lastValue == event . target . value )
return ;
searchBox . dataset . lastValue = event . target . value ;
// Fetch the suggestions from the server
get ( " ?action=suggest-pages&query= " + encodeURIComponent ( event . target . value )) . then ( function ( response ) {
var suggestions = JSON . parse ( response ),
dataList = document . getElementById ( " allpages " );
// If the server sent no suggestions, then we shouldn\'t replace the contents of the datalist
if ( suggestions . length == 0 )
return ;
console . info ( `Fetched suggestions for ${event.target.value}:` , suggestions . map ( s => s . pagename ));
// Remove all the existing suggestions
while ( dataList . firstChild ) {
dataList . removeChild ( dataList . firstChild );
}
// Add the new suggestions to the datalist
2016-11-28 13:05:23 +00:00
var optionsFrag = document . createDocumentFragment ();
2016-11-20 13:24:35 +00:00
suggestions . forEach ( function ( suggestion ) {
var suggestionElement = document . createElement ( " option " );
suggestionElement . value = suggestion . pagename ;
suggestionElement . dataset . distance = suggestion . distance ;
2016-11-28 13:05:23 +00:00
optionsFrag . appendChild ( suggestionElement );
2016-11-20 13:24:35 +00:00
});
2016-11-28 13:05:23 +00:00
dataList . appendChild ( optionsFrag );
2016-11-20 13:24:35 +00:00
});
});
});
' );
}
2015-10-27 21:10:05 +00:00
}
]);
2015-10-28 09:14:41 +00:00
class search
{
2016-11-20 13:24:35 +00:00
// Words that we should exclude from the inverted index
2015-10-28 09:14:41 +00:00
/**
 * Stop word list.
 * index() skips these terms when building a page's word index, and
 * query_invindex() skips them when processing a query; both test membership
 * with in_array().
 * NOTE(review): the list contains one repeated entry ("above") and a couple
 * of unusual spellings ("amoungst", "thickv"). They are left untouched here;
 * confirm whether they are intentional before changing them, since doing so
 * alters which words get indexed.
 * @var string[]
 */
public static $stop_words = [
2015-10-28 14:31:27 +00:00
" a " , " about " , " above " , " above " , " across " , " after " , " afterwards " , " again " ,
" against " , " all " , " almost " , " alone " , " along " , " already " , " also " ,
2015-11-08 21:15:08 +00:00
" although " , " always " , " am " , " among " , " amongst " , " amoungst " , " amount " ,
2015-10-28 14:31:27 +00:00
" an " , " and " , " another " , " any " , " anyhow " , " anyone " , " anything " , " anyway " ,
" anywhere " , " are " , " around " , " as " , " at " , " back " , " be " , " became " ,
" because " , " become " , " becomes " , " becoming " , " been " , " before " ,
" beforehand " , " behind " , " being " , " below " , " beside " , " besides " ,
" between " , " beyond " , " bill " , " both " , " bottom " , " but " , " by " , " call " ,
" can " , " cannot " , " cant " , " co " , " con " , " could " , " couldnt " , " cry " , " de " ,
" describe " , " detail " , " do " , " done " , " down " , " due " , " during " , " each " ,
" eg " , " eight " , " either " , " eleven " , " else " , " elsewhere " , " empty " ,
" enough " , " etc " , " even " , " ever " , " every " , " everyone " , " everything " ,
2016-11-20 13:24:35 +00:00
" everywhere " , " except " , " few " , " fill " , " find " ,
" fire " , " first " , " five " , " for " , " former " , " formerly " , " found " ,
2015-10-28 14:31:27 +00:00
" four " , " from " , " front " , " full " , " further " , " get " , " give " , " go " , " had " ,
" has " , " hasnt " , " have " , " he " , " hence " , " her " , " here " , " hereafter " ,
" hereby " , " herein " , " hereupon " , " hers " , " herself " , " him " , " himself " ,
2016-11-20 13:24:35 +00:00
" his " , " how " , " however " , " ie " , " if " , " in " , " inc " , " indeed " ,
2015-10-28 14:31:27 +00:00
" interest " , " into " , " is " , " it " , " its " , " itself " , " keep " , " last " ,
" latter " , " latterly " , " least " , " less " , " ltd " , " made " , " many " , " may " ,
" me " , " meanwhile " , " might " , " mine " , " more " , " moreover " , " most " ,
" mostly " , " move " , " much " , " must " , " my " , " myself " , " name " , " namely " ,
" neither " , " never " , " nevertheless " , " next " , " nine " , " no " , " none " ,
" nor " , " not " , " nothing " , " now " , " nowhere " , " of " , " off " , " often " , " on " ,
" once " , " one " , " only " , " onto " , " or " , " other " , " others " , " otherwise " ,
" our " , " ours " , " ourselves " , " out " , " over " , " own " , " part " , " per " ,
" perhaps " , " please " , " put " , " rather " , " re " , " same " , " see " , " seem " ,
" seemed " , " seeming " , " seems " , " serious " , " several " , " she " , " should " ,
" show " , " side " , " since " , " sincere " , " six " , " sixty " , " so " , " some " ,
" somehow " , " someone " , " something " , " sometime " , " sometimes " ,
" somewhere " , " still " , " such " , " system " , " take " , " ten " , " than " , " that " ,
" the " , " their " , " them " , " themselves " , " then " , " thence " , " there " ,
" thereafter " , " thereby " , " therefore " , " therein " , " thereupon " , " these " ,
" they " , " thickv " , " thin " , " third " , " this " , " those " , " though " , " three " ,
" through " , " throughout " , " thru " , " thus " , " to " , " together " , " too " , " top " ,
" toward " , " towards " , " twelve " , " twenty " , " two " , " un " , " under " , " until " ,
" up " , " upon " , " us " , " very " , " via " , " was " , " we " , " well " , " were " , " what " ,
" whatever " , " when " , " whence " , " whenever " , " where " , " whereafter " ,
" whereas " , " whereby " , " wherein " , " whereupon " , " wherever " , " whether " ,
" which " , " while " , " whither " , " who " , " whoever " , " whole " , " whom " , " whose " ,
" why " , " will " , " with " , " within " , " without " , " would " , " yet " , " you " ,
" your " , " yours " , " yourself " , " yourselves "
2015-10-28 09:14:41 +00:00
];
2015-10-28 20:56:10 +00:00
2015-10-28 09:14:41 +00:00
/**
 * Builds the word index for a single page.
 *
 * HTML entities in the source are decoded first, then the text is tokenized
 * and every token that is not a stop word is counted.
 *
 * Offsets are positions in the sequence of indexed terms: the position
 * counter is not advanced for stop words that get skipped.
 *
 * @param	string	$source	The page source to index.
 * @return	array	A map of term => [ frequency, list of offsets ].
 */
public static function index($source)
{
    $source = html_entity_decode($source, ENT_QUOTES);

    // Flipped once per call so that each stop word test is a key lookup
    // rather than a scan of the whole list.
    $stop_lookup = array_flip(self::$stop_words);

    $index = [];
    $i = 0;
    foreach (self::tokenize($source) as $nterm) {
        // Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
        if (isset($stop_lookup[$nterm]))
            continue;

        if (!isset($index[$nterm]))
            $index[$nterm] = [ " freq " => 0, " offsets " => [] ];

        $index[$nterm][" freq "]++;
        $index[$nterm][" offsets "][] = $i;

        $i++;
    }

    return $index;
}
2015-10-28 20:56:10 +00:00
/**
 * Splits a piece of text into lowercase search terms.
 *
 * Square brackets, curly braces and pipes are turned into spaces first. The
 * text is then split on whitespace (together with any punctuation directly
 * around it), on punctuation at the very start or end, and on pipes.
 * Punctuation inside a word (for example an apostrophe) is kept.
 *
 * @param	string	$source	The text to tokenize. Expected to be UTF-8.
 * @return	string[]	The terms in order of appearance. Empty when there
 * 					are none, or when the text is not valid UTF-8.
 */
public static function tokenize($source)
{
    // Multi-byte aware lowercasing: the byte-wise variant leaves non-ASCII
    // capitals untouched, which makes searches for them case-sensitive.
    $source = mb_strtolower($source, "UTF-8");
    $source = str_replace([ '[', ']', '|', '{', '}' ], " ", $source);

    $terms = preg_split(
        '/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u',
        $source,
        -1,
        PREG_SPLIT_NO_EMPTY
    );

    // preg_split() yields false when the subject can't be processed (the u
    // modifier rejects invalid UTF-8); callers iterate over the result.
    return $terms === false ? [] : $terms;
}
2015-10-29 11:21:04 +00:00
/**
 * Replaces a fixed set of markup character sequences in the given text.
 * @param	string	$source	The text to process.
 * @return	string	The text with every listed sequence replaced.
 */
public static function strip_markup($source)
{
    $markup_sequences = [ " [ ", " ] ", " \" ", " * ", " _ ", " - ", " ` " ];

    return str_replace($markup_sequences, " ", $source);
}
2017-07-10 20:53:52 +00:00
/**
 * Rebuilds the inverted search index from scratch and saves it to
 * $paths->searchindex.
 *
 * The id index is cleared first. Every page in the page index is then read
 * from disk, indexed and merged into a fresh inverted index. Pages whose
 * file cannot be found are counted and skipped.
 *
 * @param	bool	$output	Whether to stream progress messages to the client
 * 						as an event stream. Pass false when calling this
 * 						as part of another request: nothing is sent then.
 */
public static function rebuild_invindex($output = true)
{
    global $pageindex, $env, $paths, $settings;

    if ($output) {
        header(" content-type: text/event-stream ");
        // Only end output buffering when a buffer is actually active;
        // ob_end_flush() raises a notice otherwise.
        if (ob_get_level() > 0)
            ob_end_flush();
    }

    // Clear the id index out
    ids::clear();

    // Reindex each page in turn
    $invindex = [];
    $i = 0;
    $max = count(get_object_vars($pageindex));
    $missing_files = 0;
    foreach ($pageindex as $pagename => $pagedetails) {
        $page_filename = $env->storage_prefix . $pagedetails->filename;

        if (!file_exists($page_filename)) {
            // Kept behind the $output switch like every other message, so a
            // silent rebuild can't leak text into the caller's response. The
            // message is terminated the same way as the others.
            if ($output) {
                echo(" data: [ " . ($i + 1) . " / $max ] Error: Can't find $page_filename \n \n ");
                flush();
            }
            $missing_files++;
            // Skipped pages still count towards progress.
            $i++;
            continue;
        }

        // NOTE(review): utf8_encode() treats its input as ISO-8859-1 - confirm
        // that this is intended for page files already stored as UTF-8.
        $pagesource = utf8_encode(file_get_contents($page_filename));
        $index = self::index($pagesource);

        $pageid = ids::getid($pagename);
        self::merge_into_invindex($invindex, $pageid, $index);

        if ($output) {
            echo(" data: [ " . ($i + 1) . " / $max ] Added $pagename (id # $pageid ) to the new search index. \n \n ");
            flush();
        }

        $i++;
    }

    if ($output) {
        echo(" data: Search index rebuilding complete. \n \n ");
        echo(" data: Couldn't find $missing_files pages on disk. If $settings->sitename couldn't find some pages on disk, then you might need to manually correct $settings->sitename 's page index (stored in pageindex.json). \n \n ");
        echo(" data: Done! Saving new search index to ' $paths->searchindex '. \n \n ");
    }

    self::save_invindex($paths->searchindex, $invindex);
}
2015-10-28 09:14:41 +00:00
/*
 * @summary Reorders an index by its keys, in place, using natural ordering.
 * 			Regular and inverted indexes are both keyed by term, so either
 * 			kind can be passed in.
 *
 * @param	{array}	$index - The index to sort. Modified in place.
 */
public static function sort_index(&$index)
{
    $ordering = SORT_NATURAL;
    ksort($index, $ordering);
}
2015-10-28 14:31:27 +00:00
2015-10-28 09:14:41 +00:00
/*
 * @summary Works out the differences between two *regular* indexes.
 *
 * @param	{array}	$oldindex - The old index.
 * @param	{array}	$newindex - The new index.
 * @param	{array}	$changed - Receives, keyed by nterm, the new entry of
 * 							   every term that was added or whose entry
 * 							   differs from the old one.
 * @param	{array}	$removed - Receives the nterms that are present in the
 * 							   old index but not in the new one.
 */
public static function compare_indexes($oldindex, $newindex, &$changed, &$removed)
{
    // Pass 1: terms that have disappeared
    foreach (array_keys($oldindex) as $nterm) {
        if (isset($newindex[$nterm]))
            continue;

        $removed[] = $nterm;
    }

    // Pass 2: terms that are new, or whose entry is no longer identical
    foreach ($newindex as $nterm => $entry) {
        $is_new_word = !isset($oldindex[$nterm]);

        if ($is_new_word || $entry !== $oldindex[$nterm])
            $changed[$nterm] = $entry;
    }
}
/*
 * @summary Reads an inverted index file from disk and decodes it.
 *
 * @param	{string}	$invindex_filename - The path of the file to read.
 * @returns	{array}	The decoded inverted index, as nested associative arrays.
 */
// Todo remove this function and make everything streamable
public static function load_invindex($invindex_filename)
{
    $raw_json = file_get_contents($invindex_filename);

    return json_decode($raw_json, true);
}
2016-08-20 10:27:26 +00:00
/**
 * Loads the given inverted index purely to time how long that takes. The
 * loaded index itself is thrown away.
 * The elapsed time, in milliseconds rounded to 3 decimal places, is stored
 * in $env->perfdata->searchindex_decode_time.
 * @param	string	$invindex_filename	The path of the inverted index file.
 */
public static function measure_invindex_load_time($invindex_filename)
{
    global $env;

    $started_at = microtime(true);
    self::load_invindex($invindex_filename);
    $elapsed_ms = (microtime(true) - $started_at) * 1000;

    $env->perfdata->searchindex_decode_time = round($elapsed_ms, 3);
}
2015-10-28 14:31:27 +00:00
/*
 * @summary Merges the index of a single page into an inverted index.
 *
 * @param	{array}	$invindex - The inverted index to merge into. Modified
 * 								in place.
 * @param	{int}	$pageid - The id of the page that $index belongs to.
 * @param	{array}	$index - The new / changed entries of the page, keyed
 * 							 by nterm.
 * @param	{array}	$removals - The nterms that no longer occur on the page.
 *
 * Afterwards the pages listed under each merged term are ordered by
 * descending frequency, and the terms themselves by the number of pages
 * they occur on, most widespread first.
 */
public static function merge_into_invindex(&$invindex, $pageid, &$index, &$removals = [])
{
    // Remove all the subentries that were removed since last time
    foreach ($removals as $nterm) {
        unset($invindex[$nterm][$pageid]);

        // Drop terms that no page uses any more, mirroring what
        // delete_entry() does, so empty entries don't pile up in the index.
        if (isset($invindex[$nterm]) && count($invindex[$nterm]) === 0)
            unset($invindex[$nterm]);
    }

    // Merge all the new / changed index entries into the inverted index
    foreach ($index as $nterm => $newentry) {
        // If the nterm isn't in the inverted index, then create a space for it
        if (!isset($invindex[$nterm]))
            $invindex[$nterm] = [];
        $invindex[$nterm][$pageid] = $newentry;

        // Sort the page entries for this word by frequency, highest first
        uasort($invindex[$nterm], function($a, $b) {
            return $b[" freq "] <=> $a[" freq "];
        });
    }

    // Sort the inverted index by rank: terms found on the most pages first
    uasort($invindex, function($a, $b) {
        return count($b) <=> count($a);
    });
}
2015-11-14 17:01:23 +00:00
/**
 * Removes every trace of the given page id from the given inverted index.
 * Terms that are left without any pages are removed as well.
 * @param	inverted_index	&$invindex	The inverted index. Modified in place.
 * @param	number			$pageid		The pageid to remove.
 */
public static function delete_entry(&$invindex, $pageid)
{
    foreach (array_keys($invindex) as $nterm) {
        // The id is removed both as given and in its string form.
        unset($invindex[$nterm][$pageid], $invindex[$nterm][(string) $pageid]);

        if (count($invindex[$nterm]) === 0)
            unset($invindex[$nterm]);
    }
}
2015-10-28 14:31:27 +00:00
/**
 * Encodes the given inverted index as JSON and writes it to disk.
 * @param	string	$filename	The path of the file to write to.
 * @param	array	&$invindex	The inverted index to save.
 */
public static function save_invindex($filename, &$invindex)
{
    $encoded = json_encode($invindex);
    file_put_contents($filename, $encoded);
}
2015-10-28 20:56:10 +00:00
2015-10-29 11:21:04 +00:00
/**
 * Searches an inverted index (plus page titles and tags) for a query and
 * returns the matching pages, best match first.
 *
 * Each result is keyed by page id and contains:
 *  - "nterms": the inverted index entries of the query terms found in the page body
 *  - "title-matches" / "tag-matches": weighted match counts (only present if there were any)
 *  - "pagename": the name of the page, as returned by ids::getpagename()
 *  - "rank": the overall score used for ordering
 *
 * Reads the global $settings (for the title / tag weightings) and the global
 * $pageindex (for page names and tags).
 *
 * @param	string	$query		The search query.
 * @param	array	$invindex	The inverted index to search.
 * @return	array	The matching pages (pageid => result), sorted by descending rank.
 */
public static function query_invindex($query, &$invindex)
{
	global $settings, $pageindex;
	
	$query_terms = self::tokenize($query);
	$matching_pages = [];
	
	// Loop over each term in the query and find the matching page entries
	foreach ($query_terms as $qterm)
	{
		// An empty term can't be searched for (and an empty needle matches
		// every string on PHP 8+)
		if ($qterm === "")
			continue;
		
		// Stop words aren't worth the bother - make sure we don't search
		// the title or the tags for them
		if (in_array($qterm, self::$stop_words, true))
			continue;
		
		// Only search the inverted index if it actually exists there
		if (isset($invindex[$qterm]))
		{
			// Loop over each page in the inverted index entry
			foreach ($invindex[$qterm] as $pageid => $page_entry)
			{
				// Create an entry in the matching pages array if it doesn't exist
				if (!isset($matching_pages[$pageid]))
					$matching_pages[$pageid] = [ "nterms" => [] ];
				$matching_pages[$pageid]["nterms"][$qterm] = $page_entry;
			}
		}
		
		// Longer terms are worth more when they match a title / tag
		$qterm_length = strlen($qterm);
		
		// Loop over the pageindex and search the titles / tags
		foreach ($pageindex as $pagename => $pagedata)
		{
			// Get the current page's id
			$pageid = ids::getid($pagename);
			
			// Consider matches in the page title
			if (stripos($pagename, $qterm) !== false)
			{
				// We found the qterm in the title
				if (!isset($matching_pages[$pageid]))
					$matching_pages[$pageid] = [ "nterms" => [] ];
				// Set up a counter for page title matches if it doesn't exist already
				if (!isset($matching_pages[$pageid]["title-matches"]))
					$matching_pages[$pageid]["title-matches"] = 0;
				
				// mb_stripos_all() returns false rather than an empty array
				// when there is nothing to report
				$title_hits = mb_stripos_all($pagename, $qterm);
				$title_hit_count = is_array($title_hits) ? count($title_hits) : 0;
				$matching_pages[$pageid]["title-matches"] += $title_hit_count * $qterm_length;
			}
			
			// Consider matches in the page's tags
			if (!isset($pagedata->tags))
				continue; // This page doesn't have any tags
			
			$tag_text = implode(" ", $pagedata->tags);
			if (stripos($tag_text, $qterm) !== false)
			{
				// We found the qterm in the tags
				if (!isset($matching_pages[$pageid]))
					$matching_pages[$pageid] = [ "nterms" => [] ];
				// Set up a counter for tag matches if there isn't one already
				if (!isset($matching_pages[$pageid]["tag-matches"]))
					$matching_pages[$pageid]["tag-matches"] = 0;
				
				$tag_hits = mb_stripos_all($tag_text, $qterm);
				$tag_hit_count = is_array($tag_hits) ? count($tag_hits) : 0;
				$matching_pages[$pageid]["tag-matches"] += $tag_hit_count * $qterm_length;
			}
		}
	}
	
	// Work out the rank of each matching page
	foreach ($matching_pages as $pageid => $match)
	{
		$rank = 0;
		
		// Add the number of occurrences of each search term found in the
		// page body, multiplied by the length of the term
		foreach ($match["nterms"] as $pterm => $entry)
			$rank += $entry["freq"] * strlen($pterm);
		
		// TODO: Factor in how tightly clumped together the matches are,
		// based on the "offsets" list of each entry
		
		// Consider matches in the title / tags
		if (isset($match["title-matches"]))
			$rank += $match["title-matches"] * $settings->search_title_matches_weighting;
		if (isset($match["tag-matches"]))
			$rank += $match["tag-matches"] * $settings->search_tags_matches_weighting;
		
		$matching_pages[$pageid]["pagename"] = ids::getpagename($pageid);
		$matching_pages[$pageid]["rank"] = $rank;
		
		// todo remove items if the rank is below a threshold
	}
	
	// Sort the results by rank, highest first
	uasort($matching_pages, function ($a, $b) {
		if ($a["rank"] == $b["rank"]) return 0;
		return ($a["rank"] < $b["rank"]) ? +1 : -1;
	});
	
	return $matching_pages;
}
2015-10-29 11:21:04 +00:00
/**
 * Extracts the snippets of a page's source that surround the places where
 * the terms of a query appear.
 *
 * Matches that are closer together than $settings->search_characters_context
 * are merged into a single snippet. Every snippet is padded by that many
 * characters on both sides, and is escaped with htmlentities().
 *
 * @param	string	$query	The search query.
 * @param	string	$source	The source text of the page to extract the context from.
 * @return	string	The escaped snippets, joined together with " ... ". Empty if nothing matched.
 */
public static function extract_context($query, $source)
{
	global $settings;
	
	$nterms = self::tokenize($query);
	$matches = [];
	
	// Loop over each nterm and find it in the source
	foreach ($nterms as $nterm)
	{
		if ($nterm === "" || in_array($nterm, static::$stop_words, true))
			continue;
		
		$all_offsets = mb_stripos_all($source, $nterm);
		// Skip over adding matches if there aren't any
		if ($all_offsets === false)
			continue;
		
		foreach ($all_offsets as $offset)
			$matches[] = [ $nterm, $offset ];
	}
	
	// Sort the matches by offset
	usort($matches, function ($a, $b) {
		if ($a[1] == $b[1]) return 0;
		return ($a[1] > $b[1]) ? +1 : -1;
	});
	
	// NOTE(review): this assumes that mb_stripos_all() returns character
	// offsets (as mb_stripos() does), so lengths and substrings are taken in
	// characters too. Slicing by byte could cut a multibyte character in
	// half, which makes htmlentities() return an empty string.
	$sourceLength = mb_strlen($source);
	$padding = $settings->search_characters_context;
	
	$contexts = [];
	$basepos = 0;
	$matches_count = count($matches);
	while ($basepos < $matches_count)
	{
		// The group always contains the match at $basepos
		$group_start = $matches[$basepos][1];
		$group_end = $group_start;
		
		// Extend the group for as long as the next match is close enough to
		// the previous one
		$scanpos = $basepos + 1;
		while ($scanpos < $matches_count &&
			$matches[$scanpos][1] - $matches[$scanpos - 1][1] < $padding)
		{
			$group_end = $matches[$scanpos][1];
			$scanpos++;
		}
		
		$context_start = max(0, $group_start - $padding);
		$context_end = min($sourceLength, $group_end + $padding);
		
		$context = mb_substr($source, $context_start, $context_end - $context_start);
		
		// Escape special characters to protect against attacks
		$contexts[] = htmlentities($context);
		
		// $scanpos is the first match that did NOT fit into this group, so
		// the next group has to start there - not one past it
		$basepos = $scanpos;
	}
	
	return implode(" ... ", $contexts);
}
2015-10-31 13:52:50 +00:00
/**
 * Wraps every occurrence of the terms of a query in the given text in a
 * <strong class='search-term-highlight'> element.
 *
 * All the terms are highlighted in a single pass, so that a term can never
 * match inside the markup that was inserted for another term.
 *
 * NOTE(review): $context is presumably already HTML-escaped (e.g. the output
 * of extract_context()) - it is not escaped here. Confirm against the callers.
 *
 * @param	string	$query		The search query.
 * @param	string	$context	The text to highlight the query terms in.
 * @return	string	The text with the query terms highlighted.
 */
public static function highlight_context($query, $context)
{
	$term_patterns = [];
	foreach (self::tokenize($query) as $qterm)
	{
		// An empty term would match in between every single character
		if ($qterm === "" || in_array($qterm, static::$stop_words, true))
			continue;
		
		// Keyed by term so that repeated terms are only added once
		$term_patterns[$qterm] = preg_quote($qterm, "/");
	}
	
	if (count($term_patterns) === 0)
		return $context;
	
	// Try the longest terms first so that a short term can't steal part of a
	// longer one that starts at the same position
	usort($term_patterns, function ($a, $b) {
		return strlen($b) - strlen($a);
	});
	
	$highlighted = preg_replace(
		"/" . implode("|", $term_patterns) . "/i",
		"<strong class='search-term-highlight'>$0</strong>",
		$context
	);
	
	// preg_replace() returns null if something went wrong - fall back to the
	// unhighlighted text in that case
	return $highlighted === null ? $context : $highlighted;
}
2015-10-29 11:21:04 +00:00
}
2015-10-27 21:10:05 +00:00
?>