2015-10-27 21:10:05 +00:00
< ? php
register_module ([
" name " => " Search " ,
2020-03-11 23:51:49 +00:00
" version " => " 0.12 " ,
2015-10-27 21:10:05 +00:00
" author " => " Starbeamrainbowlabs " ,
2016-03-12 18:52:26 +00:00
" description " => " Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch. " ,
2015-10-27 21:10:05 +00:00
" id " => " feature-search " ,
2020-03-11 23:51:49 +00:00
// After refactoring, we'll need to specify dependencies like this
2020-03-11 23:32:10 +00:00
// "depends" => [ "search-engine" ],
2015-10-27 21:10:05 +00:00
" code " => function () {
2019-08-17 19:47:51 +00:00
global $settings , $paths ;
2016-06-12 20:15:43 +00:00
/**
* @ api { get } ? action = index & page = { pageName } Get an index of words for a given page
* @ apiName SearchIndex
* @ apiGroup Search
* @ apiPermission Anonymous
2018-02-14 23:08:28 +00:00
* @ apiDescription For debugging purposes . Be warned - the format could change at any time !
2016-06-12 20:15:43 +00:00
*
 * @apiParam {string} page The page to generate a word index for.
*/
2015-12-26 12:55:19 +00:00
/*
* ██ ███ ██ ██████ ███████ ██ ██
* ██ ████ ██ ██ ██ ██ ██ ██
* ██ ██ ██ ██ ██ ██ █████ ███
* ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ████ ██████ ███████ ██ ██
*/
2015-10-27 21:10:05 +00:00
/*
 * Debugging action: prints the source of the current page ($env->page)
 * followed by the word index that search::index_generate() builds for it,
 * as plain text. The output format is not stable.
 */
add_action("index", function() {
	global $env;
	
	header("content-type: text/plain");
	
	// Check the file exists before reading it, so that a missing page
	// produces a clean 404 rather than a PHP warning and an index of `false`.
	$page_filename = "$env->storage_prefix$env->page.md";
	$source = is_file($page_filename) ? file_get_contents($page_filename) : false;
	if($source === false) {
		http_response_code(404);
		exit("Error: The source of the page '$env->page' could not be read.\n");
	}
	
	$index = search::index_generate($source);
	
	echo("Page name: $env->page\n");
	echo("--------------- Source ---------------\n");
	echo($source); echo("\n");
	echo("--------------------------------------\n\n");
	echo("---------------- Index ---------------\n");
	// One line per term: its frequency, then every offset it was found at
	foreach($index as $term => $entry) {
		echo("$term: {$entry["freq"]} matches | " . implode(", ", $entry["offsets"]) . "\n");
	}
	echo("--------------------------------------\n");
});
2015-10-28 20:56:10 +00:00
2016-06-12 20:15:43 +00:00
/**
* @ api { get } ? action = invindex - rebuild Rebuild the inverted search index from scratch
* @ apiDescription Causes the inverted search index to be completely rebuilt from scratch . Can take a while for large wikis !
* @ apiName SearchInvindexRebuild
* @ apiGroup Search
2017-07-10 21:10:18 +00:00
* @ apiPermission Admin
*
* @ apiParam { string } secret Optional . Specify the secret from peppermint . json here in order to rebuild the search index without logging in .
2016-06-12 20:15:43 +00:00
*/
2015-12-26 12:55:19 +00:00
/*
* ██ ███ ██ ██ ██ ██ ███ ██ ██████ ███████ ██ ██
* ██ ████ ██ ██ ██ ██ ████ ██ ██ ██ ██ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ █████ ███ █████
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ████ ████ ██ ██ ████ ██████ ███████ ██ ██
*
* ██████ ███████ ██████ ██ ██ ██ ██ ██████
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██████ █████ ██████ ██ ██ ██ ██ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ███████ ██████ ██████ ██ ███████ ██████
*/
2015-10-28 20:56:10 +00:00
/*
 * Rebuilds the inverted search index from scratch.
 * Allowed for logged-in admins, or for any request that supplies the site
 * secret in the `secret` POST parameter. Everyone else gets a 401.
 */
add_action("invindex-rebuild", function() {
	global $env, $settings;
	
	// hash_equals() compares in constant time, so the secret can't be guessed
	// byte-by-byte from response timings. It throws on non-string arguments,
	// hence the is_string() guards (e.g. against `secret[]=x`).
	$secret_matches = !empty($_POST["secret"])
		&& is_string($_POST["secret"])
		&& is_string($settings->secret)
		&& hash_equals($settings->secret, $_POST["secret"]);
	
	if(!$env->is_admin && !$secret_matches) {
		http_response_code(401);
		exit(page_renderer::render_main("Error - Search index regenerator - $settings->sitename", "<p>Error: You aren't allowed to regenerate the search index. Try logging in as an admin, or setting the <code>secret</code> POST parameter to $settings->sitename's secret - which can be found in $settings->sitename's <code>peppermint.json</code> file.</p>"));
	}
	
	search::invindex_rebuild();
});
2016-08-26 16:47:46 +00:00
/**
2016-08-26 16:55:50 +00:00
* @ api { get } ? action = idindex - show Show the id index
2018-02-14 23:10:20 +00:00
* @ apiDescription Outputs the id index . Useful if you need to verify that it ' s working as expected . Output is a json object .
2016-08-26 16:47:46 +00:00
* @ apiName SearchShowIdIndex
* @ apiGroup Search
* @ apiPermission Anonymous
*/
// Dumps the global id index as pretty-printed JSON, then stops.
add_action("idindex-show", function() {
	global $idindex;
	
	$encoded_index = json_encode($idindex, JSON_PRETTY_PRINT);
	
	header("content-type: application/json; charset=UTF-8");
	exit($encoded_index);
});
2016-06-12 20:15:43 +00:00
/**
2018-02-14 23:08:28 +00:00
* @ api { get } ? action = search & query = { text }[ & format = { format }] Search the wiki for a given query string
2016-06-12 20:15:43 +00:00
* @ apiName Search
* @ apiGroup Search
* @ apiPermission Anonymous
*
* @ apiParam { string } query The query string to search for .
2018-02-14 23:08:28 +00:00
* @ apiParam { string } format Optional . Valid values : html , json . In json mode an object is returned with page names as keys , values as search result information - sorted in ranking order .
2016-06-12 20:15:43 +00:00
*/
2015-12-26 12:55:19 +00:00
/*
* ███████ ███████ █████ ██████ ██████ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██
* ███████ █████ ███████ ██████ ██ ███████
* ██ ██ ██ ██ ██ ██ ██ ██ ██
* ███████ ███████ ██ ██ ██ ██ ██████ ██ ██
*/
2015-10-28 20:56:10 +00:00
/*
 * Runs a full-text search for $_GET["query"] and renders the results, either
 * as a HTML page (default) or as a JSON object keyed by page name when
 * $_GET["format"] is "json". Timing information is sent in x-* headers.
 */
add_action("search", function() {
	global $settings, $env, $pageindex, $paths;
	
	// Create the inverted index if it doesn't exist.
	// todo In the future perhaps a CLI for this would be good?
	if(!file_exists($paths->searchindex))
		search::invindex_rebuild(false);
	
	// Reject missing and non-string (e.g. `?query[]=x`) queries up front
	if(!isset($_GET["query"]) || !is_string($_GET["query"]))
		exit(page_renderer::render("No Search Terms - Error - $settings->sitename", "<p>You didn't specify any search terms. Try typing some into the box above.</p>"));
	
	$query = $_GET["query"];
	// The query is user input: always use this escaped copy inside HTML
	$query_html = htmlentities($query, ENT_HTML5 | ENT_QUOTES);
	
	$search_start = microtime(true);
	
	$time_start = microtime(true);
	search::invindex_load($paths->searchindex);
	$env->perfdata->invindex_decode_time = round((microtime(true) - $time_start) * 1000, 3);
	
	$time_start = microtime(true);
	$results = search::invindex_query($query);
	$resultCount = count($results);
	$env->perfdata->invindex_query_time = round((microtime(true) - $time_start) * 1000, 3);
	
	header("x-invindex-load-time: {$env->perfdata->invindex_decode_time}ms");
	header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
	
	$start = microtime(true);
	foreach($results as &$result) {
		$result["context"] = search::extract_context(
			$result["pagename"],
			$query,
			file_get_contents($env->storage_prefix . $result["pagename"] . ".md")
		);
	}
	// Break the reference. Without this, the by-value foreach loops below
	// would write every element into the last slot of $results in turn,
	// replacing the last result with a copy of the second-to-last one.
	unset($result);
	$env->perfdata->context_generation_time = round((microtime(true) - $start) * 1000, 3);
	header("x-context-generation-time: {$env->perfdata->context_generation_time}ms");
	
	$env->perfdata->search_time = round((microtime(true) - $search_start) * 1000, 3);
	header("x-search-time: {$env->perfdata->search_time}ms");
	
	if(!empty($_GET["format"]) && $_GET["format"] == "json") {
		header("content-type: application/json");
		$json_results = new stdClass();
		foreach($results as $result)
			$json_results->{$result["pagename"]} = $result;
		exit(json_encode($json_results));
	}
	
	// NOTE(review): the raw query goes into the title; this assumes
	// page_renderer::render() escapes the title itself - confirm.
	$title = $query . " - Search results - $settings->sitename";
	
	$content = "<section>\n";
	$content .= "<h1>Search Results</h1>";
	
	/// Search Box ///
	$content .= "<form method='get' action=''>\n";
	$content .= "	<input type='search' id='search-box' name='query' placeholder='Type your query here and then press enter.' value='$query_html' />\n";
	$content .= "	<input type='hidden' name='action' value='search' />\n";
	$content .= "</form>";
	
	$content .= "<p>Found $resultCount " . ($resultCount === 1 ? "result" : "results") . " in " . $env->perfdata->search_time . "ms. ";
	
	if(isset($pageindex->$query)) {
		$content .= "There's a page on $settings->sitename called <a href='?page=" . rawurlencode($query) . "'>$query_html</a>.";
	}
	else {
		$content .= "There isn't a page called $query_html on $settings->sitename, but you ";
		if((!$settings->anonedits && !$env->is_logged_in) || !$settings->editing) {
			$content .= "do not have permission to create it.";
			if(!$env->is_logged_in) {
				$content .= " You could try <a href='?action=login&returnto=" . rawurlencode($_SERVER["REQUEST_URI"]) . "'>logging in</a>.";
			}
		}
		else {
			$content .= "can <a href='?action=edit&page=" . rawurlencode($query) . "'>create it</a>.";
		}
	}
	$content .= "<br /><small><em>Pssst! Power users can make use of $settings->sitename's advanced query syntax. Learn about it <a href='?action=help#27-search'>here</a>!</em></small></p>";
	
	if(module_exists("page-list")) {
		// TODO: Refactor this to use STAS
		// Quote each term for use between "/" delimiters, and drop empty
		// terms - an empty alternative would make the regex match every tag.
		$nterms = array_filter(array_map(function($nterm) {
			return preg_quote(strtolower(trim($nterm)), "/");
		}, search::tokenize($query)), function($nterm) {
			return $nterm !== "";
		});
		
		$matching_tags = [];
		if(count($nterms) > 0) {
			$nterms_regex = implode("|", $nterms);
			foreach(get_all_tags() as $tag) {
				if(preg_match("/$nterms_regex/i", trim($tag)) > 0)
					$matching_tags[] = $tag;
			}
		}
		
		if(count($matching_tags) > 0) {
			$content .= "<p class='matching-tags-display'><label>Matching tags</label><span class='tags'>";
			foreach($matching_tags as $tag) {
				$content .= "\t<a href='?action=list-tags&tag=" . rawurlencode($tag) . "' class='mini-tag'>" . htmlentities($tag) . "</a> \n";
			}
			$content .= "</span></p>";
		}
	}
	
	$i = 0; // todo use $_GET["offset"] and $_GET["result-count"] or something
	foreach($results as $result) {
		$link = "?page=" . rawurlencode($result["pagename"]);
		
		$context = $result["context"];
		if(mb_strlen($context) === 0) {
			// No context could be extracted: fall back to the start of the
			// page. The source is only read when this fallback is needed.
			$pagesource = file_get_contents($env->storage_prefix . $result["pagename"] . ".md");
			$context = mb_substr($pagesource, 0, $settings->search_characters_context * 2);
		}
		// Neutralise any markup in the page text before the highlighter
		// adds its own HTML around the matched terms
		$context = search::highlight_context(
			$query,
			preg_replace('/</u', '&lt;', $context)
		);
		
		$tag_list = "<span class='tags'>";
		foreach($pageindex->{$result["pagename"]}->tags ?? [] as $tag)
			$tag_list .= "<a href='?action=list-tags&tag=" . rawurlencode($tag) . "' class='mini-tag'>" . htmlentities($tag) . "</a>";
		$tag_list .= "</span>\n";
		
		$pagename_html = htmlentities($result["pagename"], ENT_HTML5 | ENT_QUOTES);
		// Make redirect pages italics
		if(!empty($pageindex->{$result["pagename"]}->redirect))
			$pagename_html = "<em>$pagename_html</em>";
		
		// We add 1 to $i here to convert it from an index to a result
		// number as people expect it to start from 1
		$content .= "<div class='search-result' data-result-number='" . ($i + 1) . "' data-rank='" . $result["rank"] . "'>\n";
		$content .= "	<h2><a href='$link'>$pagename_html</a> <span class='search-result-badges'>$tag_list</span></h2>\n";
		$content .= "	<p class='search-context'>$context</p>\n";
		$content .= "</div>\n";
		
		$i++;
	}
	
	$content .= "</section>\n";
	
	header("content-type: text/html; charset=UTF-8");
	exit(page_renderer::render($title, $content));
});
2016-10-01 10:32:38 +00:00
2017-03-23 21:13:20 +00:00
/*
* ██████ ██ ██ ███████ ██████ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ██ ██ █████ ██████ ████ █████
* ██ ▄▄ ██ ██ ██ ██ ██ ██ ██
* ██████ ██████ ███████ ██ ██ ██
* ▀▀
* ███████ ███████ █████ ██████ ██████ ██ ██ ██ ███ ██ ██████ ███████ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ████ ██ ██ ██ ██ ██ ██
* ███████ █████ ███████ ██████ ██ ███████ ██ ██ ██ ██ ██ ██ █████ ███
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ███████ ███████ ██ ██ ██ ██ ██████ ██ ██ ██ ██ ████ ██████ ███████ ██ ██
*/
/**
* @ api { get } ? action = query - searchindex & query = { text } Inspect the internals of the search results for a query
 * @apiName SearchQuerySearchIndex
* @ apiGroup Search
* @ apiPermission Anonymous
*
* @ apiParam { string } query The query string to search for .
*/
/*
 * Debugging action: runs $_GET["query"] against the search index and returns
 * the raw results, the parsed STAS query and timing data (in ms) as JSON.
 */
add_action("query-searchindex", function() {
	global $env, $paths;
	
	if(empty($_GET["query"])) {
		http_response_code(400);
		header("content-type: text/plain");
		exit("Error: No query specified. Specify it with the 'query' GET parameter.");
	}
	
	$perfdata = $env->perfdata;
	
	// Time how long it takes to load the index...
	$perfdata->searchindex_decode_start = microtime(true);
	search::invindex_load($paths->searchindex);
	$perfdata->searchindex_decode_time = (microtime(true) - $perfdata->searchindex_decode_start) * 1000;
	
	// ...and how long the query itself takes
	$perfdata->searchindex_query_start = microtime(true);
	$searchResults = search::invindex_query($_GET["query"]);
	$perfdata->searchindex_query_time = (microtime(true) - $perfdata->searchindex_query_start) * 1000;
	
	header("content-type: application/json");
	
	$report = (object) [
		"time_format" => "ms",
		"decode_time" => $perfdata->searchindex_decode_time,
		"query_time" => $perfdata->searchindex_query_time,
		"total_time" => $perfdata->searchindex_decode_time + $perfdata->searchindex_query_time,
		"stas" => search::stas_parse(search::stas_split($_GET["query"])),
		"search_results" => $searchResults
	];
	exit(json_encode($report, JSON_PRETTY_PRINT));
});
2019-08-23 00:24:17 +00:00
2019-08-23 00:27:35 +00:00
/**
* @ api { get } ? action = stas - parse & query = { text } Debug search queries
* @ apiDescription Debug Pepperminty Wiki ' s understanding of search queries .
* If you want something machine - readable , check out the new stas property on the object returned by query - searchindex .
* @ apiName SearchSTASParse
* @ apiGroup Search
* @ apiPermission Anonymous
*
* @ apiParam { string } query The query string to parse .
*/
2019-08-23 00:24:17 +00:00
/*
 * Debugging action: shows how the STAS parser understood $_GET["query"].
 * Every token of the query is rendered as a <span> whose colour and title
 * describe how it was classified (excluded, stop word, weighted, etc.).
 */
add_action("stas-parse", function() {
	global $settings;
	
	// Same validation & error as query-searchindex: without it a missing
	// query raises an undefined index notice and is passed on as null.
	if(!isset($_GET["query"]) || !is_string($_GET["query"]) || $_GET["query"] === "") {
		http_response_code(400);
		header("content-type: text/plain");
		exit("Error: No query specified. Specify it with the 'query' GET parameter.");
	}
	
	// Tokens come straight from user input, so they are escaped before
	// being written into the page
	$escape = function($text) {
		return htmlentities((string) $text, ENT_HTML5 | ENT_QUOTES);
	};
	
	$tokens = search::stas_split($_GET["query"]);
	$stas_query = search::stas_parse($tokens);
	
	$result = "";
	foreach($tokens as $token) {
		// Guard: an empty token has no first character to inspect below
		if(!is_string($token) || $token === "")
			continue;
		
		// Excluded terms carry a 1-character prefix, which is dropped for display
		if(in_array(substr($token, 1), $stas_query["exclude"])) {
			$result .= "<span title='explicit exclude' style='color: red; text-decoration: dotted line-through;'>" . $escape(substr($token, 1)) . "</span> ";
			continue;
		}
		
		// Strip the "+" weighting prefix and any "location:" prefix to get
		// the bare term, then look it up in the parsed query
		$term = null;
		$token_part = $token;
		if($token_part[0] == "+") $token_part = substr($token_part, 1);
		if(strpos($token_part, ":") !== false) $token_part = explode(":", $token_part, 2)[1];
		foreach($stas_query["terms"] as $c_term) {
			if($c_term["term"] == $token_part) {
				$term = $c_term;
				break;
			}
		}
		if($term === null) {
			$result .= "<span title='unknown' style='color: black; text-decoration: wavy underline;'>" . $escape($token) . "</span> ";
			continue;
		}
		
		$title = "?";
		$style = "";
		switch($term["weight"]) {
			case -1: $style .= "color: grey; text-decoration: wavy line-through;"; $title = "stop word"; break;
			case 1: $style .= "color: blue;"; $title = "normal word"; break;
		}
		if($term["weight"] > 1) {
			$style .= "color: darkblue; font-weight: bold;";
			$title = "weighted word";
		}
		// Stop words (weight -1) are not searched for, so location styling is skipped
		if($term["weight"] !== -1) {
			switch($term["location"]) {
				case "body": $style = "color: cyan"; $title = "body only"; break;
				case "title": $style .= "font-weight: bolder; font-size: 1.2em; color: orange;"; $title = "searching title only"; $token = $token_part; break;
				case "tags": $style .= "font-weight: bolder; color: purple;"; $title = "searching tags only"; $token = $token_part; break;
				case "all": $title .= ", searching everywhere";
			}
		}
		$title .= ", weight: {$term["weight"]}";
		
		$result .= "<span title='" . $escape($title) . "' style='$style'>" . $escape($token) . "</span> ";
	}
	
	exit(page_renderer::render_main("STAS Query Analysis - $settings->sitename", "<p>$settings->sitename understood your query to mean the following:</p>
	<blockquote>$result</blockquote>"));
});
2017-06-28 08:45:13 +00:00
/*
* ██████ ██████ ███████ ███ ██ ███████ ███████ █████ ██████ ██████ ██ ██
* ██ ██ ██ ██ ██ ████ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ██████ █████ ██ ██ ██ ███████ █████ ███████ ██████ ██ ███████
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██████ ██ ███████ ██ ████ ███████ ███████ ██ ██ ██ ██ ██████ ██ ██
*/
/**
* @ api { get } ? action = opensearch - description Get the opensearch description file
* @ apiName OpenSearchDescription
* @ apiGroup Search
* @ apiPermission Anonymous
2016-10-01 10:32:38 +00:00
*/
/*
 * Serves the OpenSearch description document that lets browsers add this
 * wiki as a search engine. Pass the `debug` GET parameter to receive it as
 * text/plain instead of application/opensearchdescription+xml.
 */
add_action("opensearch-description", function() {
	global $settings;
	
	if(!isset($_GET["debug"]))
		header("content-type: application/opensearchdescription+xml");
	else
		header("content-type: text/plain");
	
	// Everything interpolated into the document is XML-escaped, so that a
	// site name or URL containing &, < or quotes can't produce invalid XML.
	$xml_escape = function($text) {
		return htmlspecialchars((string) $text, ENT_XML1 | ENT_QUOTES, "UTF-8");
	};
	$site_name = $xml_escape($settings->sitename);
	$favicon = $xml_escape($settings->favicon);
	$site_root = $xml_escape(full_url() . "/index.php");
	
	// The "&" separators in the URL templates are written as &amp; because
	// they sit inside XML attribute values.
	exit('<?xml version="1.0" encoding="UTF-8"?' . '>' . // hack The build system strips it otherwise O.o I should really fix that.
"\n<OpenSearchDescription xmlns=\"http://a9.com/-/spec/opensearch/1.1/\">
	<ShortName>Search {$site_name}</ShortName>
	<Description>Search {$site_name}, which is powered by Pepperminty Wiki.</Description>
	<Tags>{$site_name} Wiki</Tags>
	<Image type=\"image/png\">{$favicon}</Image>
	<Attribution>Search content available under the license linked to at the bottom of the search results page.</Attribution>
	<Developer>Starbeamrainbowlabs (https://github.com/sbrl/Pepperminty-Wiki/graphs/contributors)</Developer>
	<InputEncoding>UTF-8</InputEncoding>
	<OutputEncoding>UTF-8</OutputEncoding>
	<Url type=\"text/html\" method=\"get\" template=\"{$site_root}?action=view&amp;search-redirect=yes&amp;page={searchTerms}&amp;offset={startIndex?}&amp;count={count}\" />
	<Url type=\"application/x-suggestions+json\" template=\"{$site_root}?action=suggest-pages&amp;query={searchTerms}&amp;type=opensearch\" />
</OpenSearchDescription>");
});
2016-11-20 13:24:35 +00:00
2017-06-28 08:45:13 +00:00
/**
2018-02-14 23:08:28 +00:00
* @ api { get } ? action = suggest - pages [ & type = { type }] Get page name suggestions for a query
2017-06-28 08:45:13 +00:00
 * @apiName SearchSuggestPages
* @ apiGroup Search
* @ apiPermission Anonymous
*
 * @apiParam {string} query The search query string to get search suggestions for.
2017-06-28 09:44:44 +00:00
* @ apiParam { string } type The type of result to return . Default value : json . Available values : json , opensearch
2017-06-28 08:45:13 +00:00
*/
2016-11-20 13:24:35 +00:00
/*
 * Suggests page names for $_GET["query"]. Every page name in the page index
 * is ranked by weighted Levenshtein distance to the (transliterated) query,
 * and the closest $settings->dynamic_page_suggestion_count names are
 * returned, either as JSON or in the OpenSearch suggestions format.
 */
add_action("suggest-pages", function() {
	global $settings, $pageindex;
	
	// Suggestions are disabled: reply with an empty list
	if($settings->dynamic_page_suggestion_count === 0) {
		header("content-type: application/json");
		header("content-length: 3");
		exit("[]\n");
	}
	
	if(empty($_GET["query"])) {
		http_response_code(400);
		header("content-type: text/plain");
		exit("Error: You didn't specify the 'query' GET parameter.");
	}
	
	$type = $_GET["type"] ?? "json";
	if(!in_array($type, [ "json", "opensearch" ])) {
		http_response_code(406);
		exit("Error: The type '$type' is not one of the supported output types. Available values: json, opensearch. Default: json");
	}
	
	$query = search::$literator->transliterate($_GET["query"]);
	
	// Rank each page name
	$ranked = [];
	foreach($pageindex as $candidate => $entry) {
		// Costs: Insert: 1, Replace: 8, Delete: 6
		$distance = levenshtein($query, search::$literator->transliterate($candidate), 1, 8, 6);
		$ranked[] = [ "pagename" => $candidate, "distance" => $distance ];
	}
	
	// Closest first; page names break ties so the order is stable
	usort($ranked, function($left, $right) {
		if($left["distance"] != $right["distance"])
			return $left["distance"] < $right["distance"] ? -1 : 1;
		return strcmp($left["pagename"], $right["pagename"]);
	});
	
	// Send the results to the user
	$suggestions = array_slice($ranked, 0, $settings->dynamic_page_suggestion_count);
	
	if($type == "json") {
		header("content-type: application/json");
		exit(json_encode($suggestions));
	}
	if($type == "opensearch") {
		$names = [];
		foreach($suggestions as $suggestion)
			$names[] = $suggestion["pagename"];
		
		header("content-type: application/x-suggestions+json");
		exit(json_encode([ $_GET["query"], $names ]));
	}
});
// Registers the client-side half of the dynamic page suggestion system, but
// only when suggestions are enabled (dynamic_page_suggestion_count > 0).
// The snippet listens for keyup events on the first input[type=search] on the
// page, asks the suggest-pages action for suggestions whenever the value has
// changed, and replaces the <option>s of the element with id "allpages" with
// the page names the server sends back.
// NOTE(review): the snippet dereferences searchBox and dataList without a null
// check - presumably both elements exist on every rendered page; confirm.
if ( $settings -> dynamic_page_suggestion_count > 0 )
{
2019-01-27 22:56:51 +00:00
page_renderer :: add_js_snippet ( ' /// Dynamic page suggestion system
2016-11-20 13:24:35 +00:00
// Micro snippet 8 - Promisified GET (fetched 20th Nov 2016)
function get ( u ){ return new Promise ( function ( r , t , a ){ a = new XMLHttpRequest (); a . onload = function ( b , c ){ b = a . status ; c = a . response ; if ( b > 199 && b < 300 ){ r ( c )} else { t ( c )}}; a . open ( " GET " , u , true ); a . send ( null )})}
window . addEventListener ( " load " , function ( event ) {
var searchBox = document . querySelector ( " input[type=search] " );
searchBox . dataset . lastValue = " " ;
searchBox . addEventListener ( " keyup " , function ( event ) {
// Make sure that we don\'t keep sending requests to the server if nothing has changed
if ( searchBox . dataset . lastValue == event . target . value )
return ;
searchBox . dataset . lastValue = event . target . value ;
// Fetch the suggestions from the server
get ( " ?action=suggest-pages&query= " + encodeURIComponent ( event . target . value )) . then ( function ( response ) {
var suggestions = JSON . parse ( response ),
dataList = document . getElementById ( " allpages " );
// If the server sent no suggestions, then we shouldn\'t replace the contents of the datalist
if ( suggestions . length == 0 )
return ;
console . info ( `Fetched suggestions for ${event.target.value}:` , suggestions . map ( s => s . pagename ));
// Remove all the existing suggestions
while ( dataList . firstChild ) {
dataList . removeChild ( dataList . firstChild );
}
// Add the new suggestions to the datalist
2016-11-28 13:05:23 +00:00
var optionsFrag = document . createDocumentFragment ();
2016-11-20 13:24:35 +00:00
suggestions . forEach ( function ( suggestion ) {
var suggestionElement = document . createElement ( " option " );
suggestionElement . value = suggestion . pagename ;
suggestionElement . dataset . distance = suggestion . distance ;
2016-11-28 13:05:23 +00:00
optionsFrag . appendChild ( suggestionElement );
2016-11-20 13:24:35 +00:00
});
2016-11-28 13:05:23 +00:00
dataList . appendChild ( optionsFrag );
2016-11-20 13:24:35 +00:00
});
});
});
' );
}
2019-08-24 18:56:14 +00:00
2020-03-11 23:07:38 +00:00
// Registers the `search` CLI command, but only if the CLI module is loaded.
if(module_exists("feature-cli")) {
	/*
	 * Handles the `search` CLI command.
	 * @param	array	$args	The arguments given after the command name. $args[0] is the subcommand.
	 * @return	int		0 on success (or when only the usage text was shown), 1 for an unknown subcommand.
	 */
	cli_register("search", "Query and manipulate the search index", function(array $args) : int {
		// No subcommand given: show the usage text
		if(count($args) < 1) {
			echo("search: query and manipulate the search index
Usage:
	search {subcommand}

Subcommands:
	rebuild     Rebuilds the search index
");
			return 0;
		}
		
		switch($args[0]) {
			case "rebuild":
				search::invindex_rebuild();
				break;
			default:
				// Report typos instead of silently doing nothing
				echo("search: unknown subcommand '{$args[0]}'. Run 'search' without arguments to see the available subcommands.\n");
				return 1;
		}
		return 0;
	});
}
2019-08-24 18:56:14 +00:00
// Help section describing the search engine and its advanced query syntax.
// The search results page links here via the #27-search anchor.
add_help_section("27-search", "Searching", "<p>$settings->sitename has an integrated full-text search engine, allowing you to search all of the pages on $settings->sitename and their content. To use it, simply enter your query into the page name box and press enter. If a page isn't found with the exact name of your query terms, a search will be performed instead.</p>
	<p>Additionally, advanced users can take advantage of some extra query syntax that $settings->sitename supports, which is inspired by popular search engines:</p>
	<table>
		<tr><th style='width: 33%;'>Example</th><th style='width: 66%;'>Meaning</th></tr>
		<tr><td><code>cat -dog</code></td><td>Search for pages containing \"cat\", but not \"dog\". This syntax does not make sense on its own - other words must be present for it to take effect.</td></tr>
		<tr><td><code>+glass marble</code></td><td>Double the weighting of the word \"glass\".</td></tr>
		<tr><td><code>intitle:rocket</code></td><td>Search only page titles for \"rocket\".</td></tr>
		<tr><td><code>intags:bill</code></td><td>Search only tags for \"bill\".</td></tr>
		<tr><td><code>inbody:satellite</code></td><td>Search only the page body for \"satellite\".</td></tr>
	</table>
	<p>More query syntax will be added in the future, so keep an eye on <a href='https://github.com/sbrl/Pepperminty-Wiki/releases/'>the latest releases</a> of <em>Pepperminty Wiki</em> to stay up-to-date (<a href='https://github.com/sbrl/Pepperminty-Wiki/releases.atom'>Atom / RSS feed available here</a>).</p>");
2015-10-27 21:10:05 +00:00
}
]);
2019-08-18 17:52:29 +00:00
/*
███████ ████████ ██████ ██████ █████ ██████ ███████ ██████ ██████ ██ ██
██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
███████ ██ ██ ██ ██████ ███████ ██ ███ █████ ██████ ██ ██ ███
██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
███████ ██ ██████ ██ ██ ██ ██ ██████ ███████ ██████ ██████ ██ ██
*/
/**
* Represents a key - value data store .
*/
class StorageBox {
2019-12-06 23:40:28 +00:00
const MODE_JSON = 0 ;
const MODE_ARR_SIMPLE = 1 ;
2019-08-18 17:52:29 +00:00
/**
* The SQLite database connection .
* @ var \PDO
*/
private $db ;
2019-08-22 20:38:17 +00:00
/**
* A cache of values .
* @ var object []
*/
private $cache = [];
/**
* A cache of prepared SQL statements .
* @ var \PDOStatement []
*/
private $query_cache = [];
2019-08-18 17:52:29 +00:00
/**
* Initialises a new store connection .
* @ param string $filename The filename that the store is located in .
*/
public function __construct(string $filename) {
	$this->db = new \PDO("sqlite:" . path_resolve($filename, __DIR__)); // HACK: This might not work on some systems, because it depends on the current working directory
	$this->db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
	// Create the table whenever it is missing, rather than only when
	// $filename didn't exist beforehand. That check tested the unresolved
	// path (while the database is opened at the resolved one), and skipped
	// table creation for a file that exists but is still empty.
	$this->db->exec("CREATE TABLE IF NOT EXISTS store (key TEXT UNIQUE NOT NULL, value TEXT)");
}
/**
* Makes a query against the database .
* @ param string $sql The ( potentially parametised ) query to make .
* @ param array $variables Optional . The variables to substitute into the SQL query .
* @ return \PDOStatement The result of the query , as a PDOStatement .
*/
private function query(string $sql, array $variables = []) {
	// Each distinct SQL string is prepared once and then reused
	$statement = $this->query_cache[$sql] ?? null;
	if($statement === null) {
		$statement = $this->db->prepare($sql);
		$this->query_cache[$sql] = $statement;
	}
	
	$statement->execute($variables);
	// Hand back the statement itself: fetchColumn(), fetchAll(), etc. are
	// defined on it, not on the return value of execute()
	return $statement;
}
/**
* Determines if the given key exists in the store or not .
* @ param string $key The key to test .
* @ return bool Whether the key exists in the store or not .
*/
public function has(string $key) : bool {
	// Only ask the database about keys that aren't in the cache
	if(!isset($this->cache[$key])) {
		$row_count = $this->query(
			"SELECT COUNT(key) FROM store WHERE key = :key;",
			[ "key" => $key ]
		)->fetchColumn();
		return $row_count > 0;
	}
	return true;
}
/**
* Gets a value from the store .
2019-08-22 20:38:17 +00:00
* @ param string $key The key value is stored under .
* @ return mixed The stored value .
2019-08-18 17:52:29 +00:00
*/
2019-08-22 20:38:17 +00:00
public function get(string $key) {
	// If it's not in the cache, load it from the database
	if(!isset($this->cache[$key])) {
		$raw = $this->query(
			"SELECT value FROM store WHERE key = :key;",
			[ "key" => $key ]
		)->fetchColumn();
		
		// fetchColumn() returns false when there is no such row. Don't
		// cache that: has() treats every cached key as existing, so a
		// cached miss would make has() return true for a missing key.
		// The trade-off is that repeated misses query the database again.
		if($raw === false)
			return null;
		
		$this->cache[$key] = [
			"modified" => false,
			// A row whose value column is NULL decodes to null
			"value" => $raw === null ? null : json_decode($raw)
		];
	}
	return $this->cache[$key]["value"];
}
2019-12-06 23:40:28 +00:00
/**
 * Fetches a delimiter-separated list from the store as an array.
 * On a cache miss, the stored string is split on $delimiter with explode()
 * and cached as an unmodified entry. On a cache hit the cached value is
 * returned as-is, and $delimiter is not consulted.
 * @param	string	$key		The key the list is stored under.
 * @param	string	$delimiter	Optional. The delimiter to split the stored string on.
 * @return	mixed	The cached value for the key (an array when it was loaded by this method).
 */
public function get_arr_simple(string $key, string $delimiter = " | ") {
	// Fast path: already in the cache
	if(isset($this->cache[$key]))
		return $this->cache[$key][" value "];
	
	$stored_string = $this->query(
		" SELECT value FROM store WHERE key = :key; ",
		[ " key " => $key ]
	)->fetchColumn();
	$this->cache[$key] = [
		" modified " => false,
		" value " => explode($delimiter, $stored_string)
	];
	return $this->cache[$key][" value "];
}
2019-08-18 17:52:29 +00:00
/**
 * Stores a value under a key, to be serialised as JSON.
 * Only the in-memory cache is touched here: the entry is flagged as modified
 * and is written to disk when close() is called.
 * @param	string	$key	The key to set the value of.
 * @param	mixed	$value	The value to store.
 */
public function set(string $key, $value) : void {
	// Start from the existing cache entry (if any) so its other fields are kept
	$entry = $this->cache[$key] ?? [];
	$entry[" value "] = $value;
	$entry[" modified "] = true;
	$entry[" mode "] = self::MODE_JSON;
	$this->cache[$key] = $entry;
}
/**
 * Stores an array under a key, to be serialised as a delimiter-joined string.
 * Only the in-memory cache is touched here: the entry is flagged as modified
 * and is written to disk (via implode() with the given delimiter) when
 * close() is called.
 * @param	string	$key		The key to set the value of.
 * @param	mixed	$value		The list of items to store.
 * @param	string	$delimiter	Optional. The delimiter to join the items with when saving.
 */
public function set_arr_simple(string $key, $value, string $delimiter = " | ") : void {
	// Start from the existing cache entry (if any) so its other fields are kept
	$entry = $this->cache[$key] ?? [];
	$entry[" value "] = $value;
	$entry[" modified "] = true;
	$entry[" delimiter "] = $delimiter;
	$entry[" mode "] = self::MODE_ARR_SIMPLE;
	$this->cache[$key] = $entry;
}
/**
 * Removes an item from both the cache and the database.
 * Unlike set(), the database row is deleted immediately.
 * @param	string	$key	The key of the item to delete.
 * @return	bool	Whether a row was actually deleted from the database. A key that has no row on disk can't be deleted, so false is returned for it.
 */
public function delete(string $key) : bool {
	// Drop the cached copy (unset() is harmless if the key isn't cached)
	unset($this->cache[$key]);
	// Remove it from disk
	// TODO: Queue this action for the transaction later
	$statement = $this->query(
		" DELETE FROM store WHERE key = :key; ",
		[ " key " => $key ]
	);
	return $statement->rowCount() > 0;
}
/**
 * Removes every item from the store.
 * The in-memory cache is discarded first, then all rows are deleted from
 * the store table straight away.
 */
public function clear() : void {
	// Forget everything held in memory (including pending modifications)
	$this->cache = [];
	// ...then wipe the table on disk
	$this->query(" DELETE FROM store; ");
}
2019-08-22 20:38:17 +00:00
/**
 * Syncs changes to disk and closes the PDO connection.
 * Every cache entry flagged as modified is written inside a single
 * transaction. Entries in MODE_ARR_SIMPLE are joined with their delimiter;
 * everything else is serialised with json_encode().
 * If any write fails, the transaction is rolled back and the error is
 * rethrown; the connection is left open in that case.
 * @throws	\Throwable	Whatever was thrown while writing, after rolling back.
 */
public function close() : void {
	$this->db->beginTransaction();
	try {
		foreach($this->cache as $key => $value_data) {
			// If it wasn't modified, there's no point in saving it
			if(!$value_data[" modified "])
				continue;
			
			$serialised = $value_data[" mode "] == self::MODE_ARR_SIMPLE
				? implode($value_data[" delimiter "], $value_data[" value "])
				: json_encode($value_data[" value "]);
			
			$this->query(
				" INSERT OR REPLACE INTO store(key, value) VALUES(:key, :value) ",
				[
					" key " => $key,
					" value " => $serialised
				]
			);
		}
		$this->db->commit();
	}
	catch(\Throwable $error) {
		// Don't leave a half-written transaction open on the connection
		if($this->db->inTransaction())
			$this->db->rollBack();
		throw $error;
	}
	$this->db = null;
}
2019-08-18 17:52:29 +00:00
}
/*
███████ ███████ █████ ██████ ██████ ██ ██
██ ██ ██ ██ ██ ██ ██ ██ ██
███████ █████ ███████ ██████ ██ ███████
██ ██ ██ ██ ██ ██ ██ ██ ██
███████ ███████ ██ ██ ██ ██ ██████ ██ ██
*/
2017-11-20 20:40:59 +00:00
/**
* Holds a collection of methods to manipulate various types of search index.
2019-02-10 23:18:34 +00:00
* @ package search
2017-11-20 20:40:59 +00:00
*/
2015-10-28 09:14:41 +00:00
class search
{
2017-11-20 20:40:59 +00:00
/**
 * Words that we should exclude from the inverted index.
 * Spellings such as "amoungst" and "thickv" are kept exactly as they are in
 * this list; only the duplicated "above" entry has been dropped.
 * @source	http://xpo6.com/list-of-english-stop-words/
 * @var string[]
 */
public static $stop_words = [
	" a ", " about ", " above ", " across ", " after ", " afterwards ", " again ",
	" against ", " all ", " almost ", " alone ", " along ", " already ", " also ",
	" although ", " always ", " am ", " among ", " amongst ", " amoungst ", " amount ",
	" an ", " and ", " another ", " any ", " anyhow ", " anyone ", " anything ", " anyway ",
	" anywhere ", " are ", " around ", " as ", " at ", " back ", " be ", " became ",
	" because ", " become ", " becomes ", " becoming ", " been ", " before ",
	" beforehand ", " behind ", " being ", " below ", " beside ", " besides ",
	" between ", " beyond ", " bill ", " both ", " bottom ", " but ", " by ", " call ",
	" can ", " can't ", " cannot ", " co ", " con ", " could ", " couldnt ", " cry ", " de ",
	" describe ", " detail ", " do ", " done ", " down ", " due ", " during ", " each ",
	" eg ", " eight ", " either ", " eleven ", " else ", " elsewhere ", " empty ",
	" enough ", " etc ", " even ", " ever ", " every ", " everyone ", " everything ",
	" everywhere ", " except ", " few ", " fill ", " find ",
	" fire ", " first ", " five ", " for ", " former ", " formerly ", " found ",
	" four ", " from ", " front ", " full ", " further ", " get ", " give ", " go ", " had ",
	" has ", " hasnt ", " have ", " he ", " hence ", " her ", " here ", " hereafter ",
	" hereby ", " herein ", " hereupon ", " hers ", " herself ", " him ", " himself ",
	" his ", " how ", " however ", " ie ", " if ", " in ", " inc ", " indeed ",
	" interest ", " into ", " is ", " it ", " its ", " it's ", " itself ", " keep ", " last ",
	" latter ", " latterly ", " least ", " less ", " ltd ", " made ", " many ", " may ",
	" me ", " meanwhile ", " might ", " mine ", " more ", " moreover ", " most ",
	" mostly ", " move ", " much ", " must ", " my ", " myself ", " name ", " namely ",
	" neither ", " never ", " nevertheless ", " next ", " nine ", " no ", " none ",
	" nor ", " not ", " nothing ", " now ", " nowhere ", " of ", " off ", " often ", " on ",
	" once ", " one ", " only ", " onto ", " or ", " other ", " others ", " otherwise ",
	" our ", " ours ", " ourselves ", " out ", " over ", " own ", " part ", " per ",
	" perhaps ", " please ", " put ", " rather ", " re ", " same ", " see ", " seem ",
	" seemed ", " seeming ", " seems ", " serious ", " several ", " she ", " should ",
	" show ", " side ", " since ", " sincere ", " six ", " sixty ", " so ", " some ",
	" somehow ", " someone ", " something ", " sometime ", " sometimes ",
	" somewhere ", " still ", " such ", " system ", " take ", " ten ", " than ", " that ",
	" the ", " their ", " them ", " themselves ", " then ", " thence ", " there ",
	" thereafter ", " thereby ", " therefore ", " therein ", " thereupon ", " these ",
	" they ", " thickv ", " thin ", " third ", " this ", " those ", " though ", " three ",
	" through ", " throughout ", " thru ", " thus ", " to ", " together ", " too ", " top ",
	" toward ", " towards ", " twelve ", " twenty ", " two ", " un ", " under ", " until ",
	" up ", " upon ", " us ", " very ", " via ", " was ", " we ", " well ", " were ", " what ",
	" whatever ", " when ", " whence ", " whenever ", " where ", " whereafter ",
	" whereas ", " whereby ", " wherein ", " whereupon ", " wherever ", " whether ",
	" which ", " while ", " whither ", " who ", " whoever ", " whole ", " whom ", " whose ",
	" why ", " will ", " with ", " within ", " without ", " would ", " yet ", " you ",
	" your ", " yours ", " yourself ", " yourselves "
];
2015-10-28 20:56:10 +00:00
2019-08-18 17:52:29 +00:00
/**
 * The StorageBox that contains the inverted index.
 * Starts out as null; invindex_load() assigns a StorageBox to it, and
 * methods such as invindex_rebuild() and invindex_merge() check for null
 * before using it.
 * @var StorageBox
 */
private static $invindex = null ;
2019-08-22 23:51:39 +00:00
/**
 * The transliterator that can be used to transliterate strings.
 * Transliterated strings are more suitable for use with the search index.
 * Note that this is no longer wrapped in a function as of v0.21 for
 * performance reasons.
 * Starts out as null; it is populated by init().
 * @var Transliterator
 */
2019-12-08 21:04:59 +00:00
public static $literator = null ;
2019-08-22 20:38:17 +00:00
/**
 * Sets up the search system by building the shared transliterator and
 * storing it in self::$literator.
 * Do not call this function! It is called automatically.
 */
public static function init() {
	// The transliteration rule chain that self::$literator is built from
	$rules = ':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;';
	self::$literator = Transliterator::createFromRules($rules, Transliterator::FORWARD);
}
2019-12-08 21:04:59 +00:00
2017-11-20 20:40:59 +00:00
/**
 * Converts a source string into an index of search terms that can be
 * merged into an inverted index.
 * HTML entities are decoded first. Transliteration is not done here,
 * because self::tokenize() takes care of it.
 * Terms found in self::$stop_words are left out of the index.
 * @param	string	$source	The source string to index.
 * @return	array	A map of term => [ freq => number of occurrences, offsets => the offsets reported by self::tokenize() ].
 */
public static function index_generate(string $source) : array {
	// We don't need to normalise or transliterate here because self::tokenize() does this for us
	$source = html_entity_decode($source, ENT_QUOTES);
	
	// Flip the stop word list once so each term is a key lookup rather than a linear scan
	$stop_lookup = array_flip(self::$stop_words);
	
	$index = [];
	foreach(self::tokenize($source, true) as $token) {
		// With offset capture enabled, each token is [ term, offset ]
		[ $nterm, $offset ] = $token;
		
		// Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
		if(isset($stop_lookup[$nterm])) continue;
		
		if(!isset($index[$nterm]))
			$index[$nterm] = [ " freq " => 0, " offsets " => [] ];
		
		$index[$nterm][" freq "]++;
		$index[$nterm][" offsets "][] = $offset;
	}
	return $index;
}
2017-11-20 20:40:59 +00:00
/**
 * Splits a source string into a list of raw tokens.
 * The string is transliterated with self::$literator first, so callers
 * don't need to normalise it themselves. A handful of markup characters
 * are then replaced before the string is split.
 * @param	string	$source				The source string to process.
 * @param	bool	$capture_offsets	Whether to capture & return the offsets of the tokens detected. If true, then each token returned will be an array in the form [ token, offset ].
 * @return	array	An array of raw tokens extracted from the specified source string.
 */
public static function tokenize(string $source, bool $capture_offsets = false) : array {
	// Empty items are never returned; offsets are only captured on request
	$split_flags = $capture_offsets
		? PREG_SPLIT_NO_EMPTY | PREG_SPLIT_OFFSET_CAPTURE
		: PREG_SPLIT_NO_EMPTY;
	
	// No separate normalisation step is needed: the transliterator handles it
	$prepared = self::$literator->transliterate($source);
	$prepared = preg_replace('/[\[\]\|\{\}\/]/u', " ", $prepared);
	
	return preg_split(
		" /((^ \ p { P}+)|( \ p { P}* \ s+ \ p { P}*)|( \ p { P}+ $ ))| \ |/u ",
		$prepared,
		-1,
		$split_flags
	);
}
2017-11-20 20:40:59 +00:00
/**
 * Strips (most) markdown markup out of a string.
 * Double quotes, asterisks, underscores, square brackets, backticks and
 * space-surrounded hyphens are replaced.
 * Stripped strings are not suitable for indexing!
 * @param	string	$source	The source string to process.
 * @return	string	The stripped string.
 */
public static function strip_markup(string $source) : string {
	$markup_pattern = '/([\"*_\[\]]| - |`)/u';
	$stripped = preg_replace($markup_pattern, " ", $source);
	return $stripped;
}
2017-11-20 20:40:59 +00:00
/**
 * Rebuilds the master inverted index and clears the page id index.
 * Every page in the page index is re-read from disk, indexed, and merged
 * into a freshly-cleared inverted index, which is then closed (syncing it
 * to disk).
 * Progress is reported as server-sent events when not running on the CLI,
 * and as plain lines when running on the CLI. No progress output at all is
 * produced when $output is false.
 * @param	bool	$output	Whether to send progress information to the user's browser.
 */
public static function invindex_rebuild(bool $output = true) : void {
	global $pageindex, $env, $paths, $settings;
	$env->perfdata->invindex_rebuild = microtime(true);
	
	if($output && !is_cli()) {
		header(" content-type: text/event-stream ");
		// Only flush & disable output buffering if there's actually a buffer active
		if(ob_get_level() > 0)
			ob_end_flush();
	}
	
	// Clear the id index out
	ids::clear();
	
	// Clear the existing inverted index out
	if(self::$invindex == null)
		self::invindex_load($paths->searchindex);
	self::$invindex->clear();
	self::$invindex->set(" |termlist| ", []);
	
	// Reindex each page in turn
	$i = 0;
	$max = count(get_object_vars($pageindex));
	$missing_files = 0;
	foreach($pageindex as $pagename => $pagedetails) {
		$page_filename = $env->storage_prefix . $pagedetails->filename;
		if(!file_exists($page_filename)) {
			// Report the missing file (only if we've been asked for output), then move on
			if($output) {
				if(!is_cli()) echo(" data: ");
				echo(" [ " . ($i + 1) . " / $max ] Error: Can't find $page_filename\n ");
				flush();
			}
			$i++;
			$missing_files++;
			continue;
		}
		// We do not transliterate or normalise here because the indexer will take care of this for us
		$index = self::index_generate(file_get_contents($page_filename));
		
		$pageid = ids::getid($pagename);
		self::invindex_merge($pageid, $index);
		
		if($output) {
			$message = " [ " . ($i + 1) . " / $max ] Added $pagename (id # $pageid ) to the new search index. ";
			if(!is_cli()) $message = " data: $message\n\n ";
			else $message = " $message\r ";
			echo($message);
			flush();
		}
		$i++;
	}
	
	if($output) {
		$msg = " Syncing to disk.... ";
		if(!is_cli()) $msg = " data: $msg\n\n ";
		else $msg = " $msg\r ";
		echo($msg);
	}
	
	// Closing the index is what actually writes the cached changes to disk
	self::invindex_close();
	
	$env->perfdata->invindex_rebuild = round(microtime(true) - $env->perfdata->invindex_rebuild, 4);
	
	if($output && !is_cli()) {
		echo(" data: Search index rebuilding complete in { $env -> perfdata -> invindex_rebuild } s. \n \n ");
		echo(" data: Couldn't find $missing_files pages on disk. If $settings->sitename couldn't find some pages on disk, then you might need to manually correct $settings->sitename 's page index (stored in pageindex.json). \n \n ");
		echo(" data: Done! Saving new search index to ' $paths->searchindex '. \n \n ");
	}
	if($output && is_cli())
		echo(" \n Search index rebuilding complete in { $env -> perfdata -> invindex_rebuild } s. \n ");
	// No need to save, it's an SQLite DB backend
}
2017-11-20 20:40:59 +00:00
/**
 * Sorts an index by its keys, in place, using a Collator for the
 * comparisons.
 * A sorted index allows a binary search to be used instead of a regular
 * sequential search.
 * @param	array	$index	The index to sort. Passed by reference and modified in place.
 */
public static function index_sort(&$index) {
	$collator = new Collator(" ");
	$compare_keys = function($left, $right) use($collator) : int {
		return $collator->compare($left, $right);
	};
	uksort($index, $compare_keys);
}
2015-10-28 14:31:27 +00:00
2017-11-20 20:40:59 +00:00
/**
 * Works out the differences between two *regular* indexes.
 * Results are appended to the two by-reference output arrays; neither of
 * them is emptied first.
 * @param	array	$oldindex	The old index.
 * @param	array	$newindex	The new index.
 * @param	array	$changed	Filled with nterm => new entry for every entry that is new or differs from the old index.
 * @param	array	$removed	Filled with the nterms that are set in the old index but not in the new one.
 */
public static function index_compare($oldindex, $newindex, &$changed, &$removed) {
	// Pass 1: anything in the old index that the new one lacks has been removed
	foreach($oldindex as $old_nterm => $old_entry) {
		if(isset($newindex[$old_nterm]))
			continue;
		$removed[] = $old_nterm;
	}
	// Pass 2: anything that's new, or not identical to its old entry, has changed
	foreach($newindex as $new_nterm => $new_entry) {
		$is_unchanged = isset($oldindex[$new_nterm])
			&& $newindex[$new_nterm] === $oldindex[$new_nterm];
		if($is_unchanged)
			continue;
		$changed[$new_nterm] = $newindex[$new_nterm];
	}
}
2017-11-20 20:40:59 +00:00
/**
 * Opens a connection to an inverted index and keeps it in self::$invindex.
 * The time taken (in milliseconds) is recorded in
 * $env->perfdata->searchindex_load_time.
 * @param	string	$invindex_filename	The path to the inverted index to load.
 * @todo Remove this function and make everything streamable
 */
public static function invindex_load(string $invindex_filename) {
	global $env, $paths;
	$started_at = microtime(true);
	self::$invindex = new StorageBox($invindex_filename);
	$elapsed_ms = (microtime(true) - $started_at) * 1000;
	$env->perfdata->searchindex_load_time = round($elapsed_ms, 3);
}
2019-08-22 20:38:17 +00:00
/**
 * Closes the currently open inverted index.
 * The time taken (in milliseconds) is recorded in
 * $env->perfdata->searchindex_close_time.
 */
public static function invindex_close() {
	global $env;
	$started_at = microtime(true);
	self::$invindex->close();
	$elapsed_ms = (microtime(true) - $started_at) * 1000;
	$env->perfdata->searchindex_close_time = round($elapsed_ms, 3);
}
2017-11-20 20:40:59 +00:00
/**
 * Merges a regular index into the currently loaded inverted index.
 * For each term, the inverted index holds a list of the page ids that
 * contain it (stored under the term itself), plus one entry per page
 * holding that page's index entry for the term. A list of all known
 * terms is kept under the termlist key.
 * @param	int		$pageid		The id of the page to assign to the index that's being merged.
 * @param	array	$index		The regular index to merge.
 * @param	array	$removals	An array of nterms to remove from the inverted index for this page. Useful for applying changes to an inverted index instead of deleting and remerging an entire page's index.
 * @throws	Exception	If no inverted index has been loaded.
 */
public static function invindex_merge($pageid, &$index, &$removals = []) : void {
	if(self::$invindex == null)
		throw new Exception(" Error: Can't merge into an inverted index that isn't loaded. ");
	
	if(!self::$invindex->has(" |termlist| "))
		self::$invindex->set(" |termlist| ", []);
	$termlist = self::$invindex->get(" |termlist| ");
	
	// Remove all the subentries that were removed since last time
	foreach($removals as $nterm) {
		// Delete the offsets
		self::$invindex->delete(" $nterm | $pageid ");
		
		// Delete the item from the list of pageids containing this term
		$nterm_pageids = self::$invindex->get_arr_simple($nterm);
		$pageid_loc = array_search($pageid, $nterm_pageids);
		// array_search() returns false when the page id isn't listed. Passing
		// that to array_splice() would remove element 0 (another page's id),
		// so there is nothing further to update for this term.
		if($pageid_loc === false)
			continue;
		array_splice($nterm_pageids, $pageid_loc, 1);
		
		if(empty($nterm_pageids)) { // No need to keep the pageid list if there's nothing in it
			self::$invindex->delete($nterm);
			// Update the termlist if we're deleting the term completely
			$termlist_loc = array_search($nterm, $termlist);
			if($termlist_loc !== false) array_splice($termlist, $termlist_loc, 1);
		}
		else
			self::$invindex->set_arr_simple($nterm, $nterm_pageids);
	}
	
	// Merge all the new / changed index entries into the inverted index
	foreach($index as $nterm => $newentry) {
		// if(!is_string($nterm)) $nterm = strval($nterm);
		if(!self::$invindex->has($nterm)) {
			// First time we've seen this term: start an empty pageid list & register it
			self::$invindex->set_arr_simple($nterm, []);
			$termlist[] = $nterm;
		}
		
		// Update the nterm pageid list
		$nterm_pageids = self::$invindex->get_arr_simple($nterm);
		if(array_search($pageid, $nterm_pageids) === false) {
			$nterm_pageids[] = $pageid;
			self::$invindex->set_arr_simple($nterm, $nterm_pageids);
		}
		
		// Store the offset list
		self::$invindex->set(" $nterm | $pageid ", $newentry);
	}
	
	self::$invindex->set(" |termlist| ", $termlist);
}
2015-11-14 17:01:23 +00:00
/**
 * Removes every trace of a page id from the currently loaded inverted index.
 * Each term in the termlist is checked; where the page id is listed, its
 * per-page entry is deleted and the term's page id list is updated (or
 * deleted outright, along with its termlist entry, if no other page uses
 * the term).
 * @param	int	$pageid	The pageid to remove.
 */
public static function invindex_delete(int $pageid) {
	$termlist = self::$invindex->get(" |termlist| ");
	foreach($termlist as $nterm) {
		$pageids_for_term = self::$invindex->get_arr_simple($nterm);
		$found_at = array_search($pageid, $pageids_for_term);
		
		// Only terms whose pageid list mentions this page need any work
		if($found_at !== false) {
			// Take the page out of this term's pageid list
			array_splice($pageids_for_term, $found_at, 1);
			// Delete the offset list
			self::$invindex->delete(" $nterm | $pageid ");
			
			if(count($pageids_for_term) !== 0) {
				// Other pages still contain this term, so save the shortened list back
				self::$invindex->set_arr_simple($nterm, $pageids_for_term);
			}
			else {
				// This term doesn't appear in any other documents, so delete it entirely
				self::$invindex->delete($nterm);
				array_splice($termlist, array_search($nterm, $termlist), 1);
			}
		}
	}
	// Save the termlist back to the store
	self::$invindex->set(" |termlist| ", $termlist);
}
2015-10-28 20:56:10 +00:00
2019-08-18 20:25:48 +00:00
/*
* ███████ ████████ █████ ███████
* ██ ██ ██ ██ ██
* ███████ ██ ███████ ███████
* ██ ██ ██ ██ ██
* ███████ ██ ██ ██ ███████
*/
/**
 * Splits a query string into tokens. Does not require that the input string be transliterated.
 * Was based on my earlier explode_adv: https://starbeamrainbowlabs.com/blog/article.php?article=posts/081-PHP-String-Splitting.html
 * Now improved to be strtok-based, since it's much faster.
 * Example I used when writing this: https://www.php.net/manual/en/function.strtok.php#94463
 * Declared static because invindex_query() invokes it as self::stas_split().
 * @param	string	$query	The query string to split.
 * @return	string[]	The tokens found in the query. Empty if the query contains no tokens.
 */
public static function stas_split($query) {
	$query = self::$literator->transliterate($query);
	
	$terms = [];
	$next_token = strtok($query, " \r \n \t ");
	// strtok() returns false once there are no tokens left - including straight
	// away for an empty query, in which case we return an empty list rather
	// than a list containing false.
	while($next_token !== false) {
		// A token containing a quote pulls in everything up to the next matching quote
		if(strpos($next_token, '"') !== false)
			$next_token .= " " . strtok('"') . '"';
		if(strpos($next_token, " ' ") !== false)
			$next_token .= " " . strtok(" ' ") . " ' ";
		
		$terms[] = $next_token;
		$next_token = strtok(" \r \n \t ");
	}
	return $terms;
}
/**
 * Parses an array of query tokens into an associative array of search directives.
 * Supported syntax derived from these sources:
 *  - https://help.duckduckgo.com/duckduckgo-help-pages/results/syntax/
 *  - https://docs.microsoft.com/en-us/windows/win32/lwef/-search-2x-wds-aqsreference
 * Declared static because invindex_query() invokes it as self::stas_parse().
 * Tokens are processed from the last one to the first. Tokens that aren't
 * non-empty strings are ignored.
 * @param	string[]	$tokens	The array of query tokens to parse.
 * @return	array	The parsed directives: a list of term definitions (term, weight, location), a list of excluded terms, the interwiki destination (or null), plus one extra list per prefix directive encountered.
 */
public static function stas_parse($tokens) {
	global $settings;
	
	/* Supported Syntax *
	 * 
	 * -term				exclude a term
	 * +term				double the weighting of a term
	 * terms !dest terms	redirect entire query (minus the !bang) to interwiki with registered shortcut dest
	 * prefix:term			apply prefix operator to term
	 */
	$result = [
		" terms " => [],
		" exclude " => [],
		" interwiki " => null
	];
	
	for($i = count($tokens) - 1; $i >= 0; $i--) {
		$token = $tokens[$i];
		// Indexing into an empty or non-string token below would raise a warning, so skip it
		if(!is_string($token) || $token === '')
			continue;
		
		// Look for excludes
		if($token[0] == " - ") {
			if(in_array(substr($token, 1), self::$stop_words)) {
				// NOTE(review): this key isn't part of the initial $result and isn't
				// read anywhere in the visible code - possibly the terms list was intended. Confirm.
				$result[" tokens "][] = [
					" term " => substr($token, 1),
					" weight " => -1,
					" location " => " all "
				];
			}
			else
				$result[" exclude "][] = substr($token, 1);
			continue;
		}
		
		// Look for weighted terms
		if($token[0] == " + ") {
			if(in_array(substr($token, 1), self::$stop_words)) {
				// Append, the same as the exclude branch above does, rather than
				// overwriting whatever has been collected under this key so far
				$result[" tokens "][] = [
					" term " => substr($token, 1),
					" weight " => -1,
					" location " => " all "
				];
			}
			else {
				$result[" terms "][] = [
					" term " => substr($token, 1),
					" weight " => 2,
					" location " => " all "
				];
			}
			continue;
		}
		
		// Look for interwiki searches
		if($token[0] == " ! " || substr($token, -1) == " ! ") {
			// You can only go to 1 interwiki destination at once, so we replace any previous finding with this one
			$result[" interwiki "] = trim($token, " ! ");
			// Note: there is deliberately no continue here, so the token is also processed by the checks below
		}
		
		// Look for colon directives in the form directive:term
		// Also supports prefix:"quoted term with spaces", quotes stripped automatically
		/*** Example directives *** (. = implemented, * = not implemented)
		 . intitle	search only page titles for term
		 . intags	search only tags for term
		 . inbody	search page body only for term
		 * before	search only pages that were last modified before term
		 * after	search only pages that were last modified after term
		 * size		search only pages that match the size spec term (e.g. 1k+ -> more than 1k bytes, 2k- -> less than 2k bytes, >5k -> more than 5k bytes, <10k -> less than 10k bytes)
		 **************************/
		if(strpos($token, " : ") !== false) {
			$parts = explode(" : ", $token, 2);
			if(!isset($result[$parts[0]]))
				$result[$parts[0]] = [];
			
			switch($parts[0]) {
				case " intitle ": // BUG: What if a normal word is found in a title?
					$result[" terms "][] = [
						" term " => $parts[1],
						" weight " => $settings->search_title_matches_weighting * mb_strlen($parts[1]),
						" location " => " title "
					];
					break;
				case " intags ":
					$result[" terms "][] = [
						" term " => $parts[1],
						" weight " => $settings->search_tags_matches_weighting * mb_strlen($parts[1]),
						" location " => " tags "
					];
					break;
				case " inbody ":
					$result[" terms "][] = [
						" term " => $parts[1],
						" weight " => 1,
						" location " => " body "
					];
					break;
				default:
					// Unrecognised directive: collect its value under the directive's own name
					$result[$parts[0]][] = trim($parts[1], '"');
					break;
			}
			continue;
		}
		
		// Doesn't appear to be particularly special *shrugs*
		// Set the weight to -1 if it's a stop word
		$result[" terms "][] = [
			" term " => $token,
			" weight " => in_array($token, self::$stop_words) ? -1 : 1,
			" location " => " all "
		];
	}
	
	return $result;
}
2017-11-20 20:40:59 +00:00
/**
* Searches the given inverted index for the specified search terms .
* @ param string $query The search query .
* @ return array An array of matching pages .
*/
2019-08-18 20:25:48 +00:00
public static function invindex_query ( $query )
2015-10-28 20:56:10 +00:00
{
2015-11-01 15:05:54 +00:00
global $settings , $pageindex ;
2019-08-22 20:38:17 +00:00
$query_stas = self :: stas_parse (
2019-12-08 21:04:59 +00:00
self :: stas_split ( self :: $literator -> transliterate ( $query ))
2019-08-18 20:25:48 +00:00
);
2015-10-28 20:56:10 +00:00
2019-08-18 20:25:48 +00:00
/* Sub - array format :
* [
* nterms : [ nterm => frequency , nterm => frequency , .... ],
* offsets_body : int [],
* matches_title : int ,
* matches_tags : int
* ]
*/
$matching_pages = [];
$match_template = [
" nterms " => [],
" offsets_body " => [],
" rank_title " => 0 ,
" rank_tags " => 0
];
2015-11-01 15:05:54 +00:00
2019-08-18 20:25:48 +00:00
// Query the inverted index
2019-08-22 20:38:17 +00:00
foreach ( $query_stas [ " terms " ] as $term_def ) {
2019-08-18 20:25:48 +00:00
if ( $term_def [ " weight " ] == - 1 )
continue ; // Skip stop words
if ( ! in_array ( $term_def [ " location " ], [ " all " , " inbody " ]))
continue ; // Skip terms we shouldn't search the page body for
2019-08-22 20:38:17 +00:00
if ( ! self :: $invindex -> has ( $term_def [ " term " ]))
2019-08-18 20:25:48 +00:00
continue ; // Skip if it's not in the index
// For each page that contains this term.....
2019-12-06 23:40:28 +00:00
$term_pageids = self :: $invindex -> get_arr_simple ( $term_def [ " term " ]);
2019-08-18 20:25:48 +00:00
foreach ( $term_pageids as $pageid ) {
// Check to see if it contains any words we should exclude
$skip = false ;
2019-08-22 21:11:09 +00:00
foreach ( $query_stas [ " exclude " ] as $excl_term ) {
2019-08-22 20:38:17 +00:00
if ( self :: $invindex -> has ( " $excl_term | $pageid " )) {
2019-08-18 20:25:48 +00:00
$skip = true ;
break ;
}
2015-11-01 15:05:54 +00:00
}
2019-08-18 20:25:48 +00:00
if ( $skip ) continue ;
// Get the list of offsets
2019-08-22 20:38:17 +00:00
$page_offsets = self :: $invindex -> get ( " { $term_def [ " term " ] } | $pageid " );
2019-08-18 20:25:48 +00:00
if ( ! isset ( $matching_pages [ $pageid ]))
$matching_pages [ $pageid ] = $match_template ; // Arrays are assigned by copy in php
// Add it to the appropriate $matching_pages entry, not forgetting to apply the weighting
$matching_pages [ $pageid ][ " offsets_body " ] = array_merge (
$matching_pages [ $pageid ][ " offsets_body " ],
2019-08-22 21:11:09 +00:00
$page_offsets -> offsets
2019-08-18 20:25:48 +00:00
);
2019-08-22 21:11:09 +00:00
$matching_pages [ $pageid ][ " nterms " ][ $term_def [ " term " ]] = $page_offsets -> freq * $term_def [ " weight " ];
2015-11-01 15:05:54 +00:00
}
2019-08-18 20:25:48 +00:00
}
// Query page titles & tags
2019-08-22 20:38:17 +00:00
foreach ( $query_stas [ " terms " ] as $term_def ) {
2019-08-18 20:25:48 +00:00
// No need to skip stop words here, since we're doing a normal
// sequential search anyway
2019-08-22 21:23:30 +00:00
if ( ! in_array ( $term_def [ " location " ], [ " all " , " title " , " tags " ]))
2019-08-18 20:25:48 +00:00
continue ; // Skip terms we shouldn't search the page body for
2015-10-28 20:56:10 +00:00
2015-11-01 15:05:54 +00:00
// Loop over the pageindex and search the titles / tags
2018-06-25 23:06:20 +00:00
reset ( $pageindex ); // Reset array/object pointer
2019-08-22 23:51:39 +00:00
foreach ( $pageindex as $pagename => $pagedata ) {
2018-06-29 11:08:38 +00:00
// Setup a variable to hold the current page's id
2019-08-18 20:25:48 +00:00
$pageid = null ; // Cache the page id
2019-12-08 21:04:59 +00:00
$lit_title = self :: $literator -> transliterate ( $pagename );
$lit_tags = isset ( $pagedata -> tags ) ? self :: $literator -> transliterate ( implode ( " " , $pagedata -> tags )) : null ;
2019-08-18 20:25:48 +00:00
// Make sure that the title & tags don't contain a term we should exclude
$skip = false ;
foreach ( $query_stas [ " exclude " ] as $excl_term ) {
if ( mb_strpos ( $lit_title , $excl_term ) !== false ) {
$skip = true ;
// Delete it from the candidate matches (it might be present in the tags / title but not the body)
if ( isset ( $matching_pages [ $excl_term ]))
unset ( $matching_pages [ $excl_term ]);
break ;
}
}
if ( $skip ) continue ;
2015-11-01 15:05:54 +00:00
// Consider matches in the page title
2019-08-22 21:23:30 +00:00
if ( in_array ( $term_def [ " location " ], [ " all " , " title " ])) {
2019-08-18 20:25:48 +00:00
// FUTURE: We may be able to optimise this further by using preg_match_all + preg_quote instead of mb_stripos_all. Experimentation / benchmarking is required to figure out which one is faster
$title_matches = mb_stripos_all ( $lit_title , $term_def [ " term " ]);
$title_matches_count = $title_matches !== false ? count ( $title_matches ) : 0 ;
if ( $title_matches_count > 0 ) {
$pageid = ids :: getid ( $pagename ); // Fetch the page id
// We found the qterm in the title
if ( ! isset ( $matching_pages [ $pageid ]))
$matching_pages [ $pageid ] = $match_template ; // Assign by copy
$matching_pages [ $pageid ][ " rank_title " ] += $title_matches_count * $term_def [ " weight " ];
}
2015-11-01 15:05:54 +00:00
}
2019-08-22 20:38:17 +00:00
// If this page doesn't have any tags, skip it
if ( $lit_tags == null )
continue ;
2019-08-22 21:23:30 +00:00
if ( ! in_array ( $term_def [ " location " ], [ " all " , " tags " ]))
2019-08-18 20:25:48 +00:00
continue ; // If we shouldn't search the tags, no point in continuing
2015-11-01 15:05:54 +00:00
// Consider matches in the page's tags
2019-08-18 20:25:48 +00:00
$tag_matches = isset ( $pagedata -> tags ) ? mb_stripos_all ( $lit_tags , $term_def [ " term " ]) : false ;
2018-06-25 21:53:53 +00:00
$tag_matches_count = $tag_matches !== false ? count ( $tag_matches ) : 0 ;
2019-08-18 20:25:48 +00:00
if ( $tag_matches_count > 0 ) { // And we found the qterm in the tags
if ( $pageid === null ) // Fill out the page id if it hasn't been already
2018-06-26 13:28:11 +00:00
$pageid = ids :: getid ( $pagename );
2015-11-01 15:05:54 +00:00
if ( ! isset ( $matching_pages [ $pageid ]))
2019-08-18 20:25:48 +00:00
$matching_pages [ $pageid ] = $match_template ; // Assign by copy
2015-11-01 15:05:54 +00:00
2019-08-18 20:25:48 +00:00
$matching_pages [ $pageid ][ " rank_tags " ] += $tag_matches_count * $term_def [ " weight " ];
2015-11-01 15:05:54 +00:00
}
2015-10-28 20:56:10 +00:00
}
}
2019-08-18 20:25:48 +00:00
// TODO: Implement the rest of STAS here
2018-06-26 13:15:19 +00:00
reset ( $matching_pages );
2019-08-22 16:43:14 +00:00
foreach ( $matching_pages as $pageid => & $pagedata ) {
2015-10-29 11:21:04 +00:00
$pagedata [ " pagename " ] = ids :: getpagename ( $pageid );
2015-10-28 20:56:10 +00:00
$pagedata [ " rank " ] = 0 ;
2017-03-23 20:48:42 +00:00
$pageOffsets = [];
// Loop over each search term found on this page
2018-06-26 13:15:19 +00:00
reset ( $pagedata [ " nterms " ]);
2019-08-22 16:43:14 +00:00
foreach ( $pagedata [ " nterms " ] as $pterm => $frequency ) {
2017-03-23 20:48:42 +00:00
// Add the number of occurrences of this search term to the ranking
2017-10-15 12:42:15 +00:00
// Multiply it by the length of the word
2019-08-22 16:43:14 +00:00
$pagedata [ " rank " ] += $frequency * strlen ( $pterm );
2015-10-28 20:56:10 +00:00
}
2015-11-01 15:05:54 +00:00
// Consider matches in the title / tags
2019-08-22 16:43:14 +00:00
$pagedata [ " rank " ] += $pagedata [ " rank_title " ] + $pagedata [ " rank_tags " ];
// TODO: Consider implementing kernel density estimation here.
// https://en.wikipedia.org/wiki/Kernel_density_estimation
// We want it to have more of an effect the more words that are present in the query. Maybe a logarithmic function would be worth investigating here?
2015-11-01 15:05:54 +00:00
2019-08-22 16:43:14 +00:00
// TODO: Remove items if the computed rank is below a threshold
2015-10-28 20:56:10 +00:00
}
uasort ( $matching_pages , function ( $a , $b ) {
if ( $a [ " rank " ] == $b [ " rank " ]) return 0 ;
return ( $a [ " rank " ] < $b [ " rank " ]) ? + 1 : - 1 ;
});
return $matching_pages ;
}
2015-10-29 11:21:04 +00:00
2017-11-20 20:40:59 +00:00
/**
 * Extracts a context string given a search query that could be displayed
 * in a list of search results.
 * The offsets of every query term are looked up in the inverted index, a
 * window of $settings->search_characters_context characters is taken either
 * side of each match, overlapping windows are merged, and the combined length
 * is capped at $settings->search_characters_context_total characters.
 * @param	string	$pagename	The name of the page that this source belongs to. Used when consulting the inverted index.
 * @param	string	$query		The search query to generate the context for.
 * @param	string	$source		The page source to extract the context from.
 * @return	string	The generated context string. Empty if none of the query's terms are in the inverted index for this page.
 */
public static function extract_context($pagename, $query, $source)
{
	global $settings;
	
	$pageid = ids::getid($pagename);
	$nterms = self::stas_parse(self::stas_split($query))["terms"];
	
	// Query the inverted index for offsets
	$matches = [];
	foreach($nterms as $nterm) {
		$index_key = "{$nterm["term"]}|$pageid";
		// Skip if the page isn't found in the inverted index for this word
		if(!self::$invindex->has($index_key))
			continue;
		
		foreach(self::$invindex->get($index_key)->offsets as $next_offset)
			$matches[] = [ $nterm["term"], $next_offset ];
	}
	
	// Sort the matches by offset, so that overlapping windows are always adjacent
	usort($matches, function($a, $b) {
		if($a[1] == $b[1]) return 0;
		return ($a[1] > $b[1]) ? +1 : -1;
	});
	
	$sourceLength = mb_strlen($source);
	
	// $contexts is a plain list of [ "from" => int, "to" => int ] windows
	$contexts = [];
	$total_context_length = 0;
	foreach($matches as $match) {
		$next_context = [
			"from" => max(0, $match[1] - $settings->search_characters_context),
			"to" => min($sourceLength, $match[1] + mb_strlen($match[0]) + $settings->search_characters_context)
		];
		
		$last = count($contexts) - 1;
		if($last >= 0 && $contexts[$last]["to"] > $next_context["from"]) {
			// This next context overlaps with the previous one
			// Extend the last one instead of adding a new one - but never
			// shrink it if the previous window already reaches further
			if($next_context["to"] > $contexts[$last]["to"]) {
				// Update the total context length counter appropriately
				$total_context_length += $next_context["to"] - $contexts[$last]["to"];
				$contexts[$last]["to"] = $next_context["to"];
			}
		}
		else { // No overlap here! Business as usual.
			$contexts[] = $next_context;
			$last++;
			// Update the total context length counter as normal
			$total_context_length += $next_context["to"] - $next_context["from"];
		}
		
		if($total_context_length > $settings->search_characters_context_total) {
			// We've reached the limit on the number of characters this context should contain. Trim off the context to fit and break out
			$contexts[$last]["to"] -= $total_context_length - $settings->search_characters_context_total;
			break;
		}
	}
	
	$contexts_text = [];
	foreach($contexts as $context) {
		// The window bounds are measured in characters (they are clamped
		// against mb_strlen() above), so slice by characters too. A byte-wise
		// substr() would return the wrong span for multi-byte text, and could
		// cut a UTF-8 sequence in half.
		// NOTE(review): this assumes the offsets stored in the inverted index are character offsets as well - confirm against the indexer
		$contexts_text[] = mb_substr($source, $context["from"], $context["to"] - $context["from"]);
	}
	
	// BUG: Make sure that a snippet is centred on the word in question if we have to cut it short
	
	$result = implode(" … ", $contexts_text);
	$context_count = count($contexts);
	if($context_count > 0) {
		// If we're not at the very end of the page, add an extra ellipsis
		if($contexts[$context_count - 1]["to"] < $sourceLength) $result .= "… ";
		// Prepend an ellipsis if the context doesn't start at the beginning of a page
		if($contexts[0]["from"] > 0) $result = " …$result";
	}
	
	return $result;
}
2015-10-31 13:52:50 +00:00
2017-11-20 20:40:59 +00:00
/**
 * Highlights the keywords of a context string.
 * Every non-stop-word term of the query is wrapped in a
 * <strong class='search-term-highlight'> element, matching case-insensitively.
 * All the terms are applied in a single pass, so the markup inserted for one
 * term is never itself searched for another term (e.g. a query containing
 * "strong" or "class").
 * @param	string	$query		The query to use when highlighting.
 * @param	string	$context	The context string to highlight.
 * @return	string	The highlighted (HTML) string. If there is nothing to highlight or the regular expression fails, the context is returned unchanged.
 */
public static function highlight_context($query, $context)
{
	$qterms = self::stas_parse(self::stas_split($query))["terms"];
	
	// Collect 1 escaped pattern per distinct term
	$patterns = [];
	foreach($qterms as $qterm) {
		// Stop words are marked by STAS
		if($qterm["weight"] == -1)
			continue;
		
		$term = (string)$qterm["term"];
		// An empty pattern would match at every position in the context
		if($term === "")
			continue;
		
		// Passing the delimiter makes preg_quote() escape it for us
		$patterns[$term] = preg_quote($term, "/");
	}
	
	if(count($patterns) == 0)
		return $context;
	
	// Longest first, so that when 1 term is a prefix of another the longer
	// one wins the alternation
	usort($patterns, function($a, $b) {
		return strlen($b) - strlen($a);
	});
	
	$highlighted = preg_replace(
		"/" . implode("|", $patterns) . "/iu",
		"<strong class='search-term-highlight'>$0</strong>",
		$context
	);
	
	// preg_replace() returns null on failure (e.g. invalid UTF-8 with the
	// u modifier) - fall back to the unhighlighted context in that case
	return $highlighted === null ? $context : $highlighted;
}
2015-10-29 11:21:04 +00:00
}
2019-12-08 21:04:59 +00:00
// Run the search class's static init function.
// NOTE(review): presumably this prepares the static state that the search methods read (e.g. self::$literator) - confirm against init()
search :: init ();
2015-10-29 11:21:04 +00:00
2015-10-27 21:10:05 +00:00
?>