2015-10-27 21:10:05 +00:00
< ? php
register_module ([
" name " => " Search " ,
2018-07-01 11:14:06 +00:00
" version " => " 0.7 " ,
2015-10-27 21:10:05 +00:00
" author " => " Starbeamrainbowlabs " ,
2016-03-12 18:52:26 +00:00
" description " => " Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch. " ,
2015-10-27 21:10:05 +00:00
" id " => " feature-search " ,
" code " => function () {
2016-11-20 13:24:35 +00:00
global $settings ;
2016-06-12 20:15:43 +00:00
/**
* @ api { get } ? action = index & page = { pageName } Get an index of words for a given page
* @ apiName SearchIndex
* @ apiGroup Search
* @ apiPermission Anonymous
2018-02-14 23:08:28 +00:00
* @ apiDescription For debugging purposes . Be warned - the format could change at any time !
2016-06-12 20:15:43 +00:00
*
* @ apiParam { string } page The page to generate a word index page .
*/
2015-12-26 12:55:19 +00:00
/*
* ██ ███ ██ ██████ ███████ ██ ██
* ██ ████ ██ ██ ██ ██ ██ ██
* ██ ██ ██ ██ ██ ██ █████ ███
* ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ████ ██████ ███████ ██ ██
*/
2015-10-27 21:10:05 +00:00
add_action ( " index " , function () {
global $settings , $env ;
$breakable_chars = " \r \n \t ., \\ /! \" £ $ %^&*[]()+`_~# " ;
header ( " content-type: text/plain " );
2015-11-08 21:15:08 +00:00
$source = file_get_contents ( " $env->storage_prefix $env->page .md " );
2015-10-27 21:10:05 +00:00
2015-10-28 09:14:41 +00:00
$index = search :: index ( $source );
2015-10-27 21:10:05 +00:00
2018-06-29 11:08:38 +00:00
echo ( " Page name: $env->page\n " );
echo ( " --------------- Source --------------- \n " );
echo ( $source ); echo ( " \n " );
echo ( " -------------------------------------- \n \n " );
echo ( " ---------------- Index --------------- \n " );
foreach ( $index as $term => $entry ) {
echo ( " $term : { $entry [ " freq " ] } matches | " . implode ( " , " , $entry [ " offsets " ]) . " \n " );
}
echo ( " -------------------------------------- \n " );
2015-10-27 21:10:05 +00:00
});
2015-10-28 20:56:10 +00:00
2016-06-12 20:15:43 +00:00
/**
* @ api { get } ? action = invindex - rebuild Rebuild the inverted search index from scratch
* @ apiDescription Causes the inverted search index to be completely rebuilt from scratch . Can take a while for large wikis !
* @ apiName SearchInvindexRebuild
* @ apiGroup Search
2017-07-10 21:10:18 +00:00
* @ apiPermission Admin
*
* @ apiParam { string } secret Optional . Specify the secret from peppermint . json here in order to rebuild the search index without logging in .
2016-06-12 20:15:43 +00:00
*/
2015-12-26 12:55:19 +00:00
/*
* ██ ███ ██ ██ ██ ██ ███ ██ ██████ ███████ ██ ██
* ██ ████ ██ ██ ██ ██ ████ ██ ██ ██ ██ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ █████ ███ █████
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ████ ████ ██ ██ ████ ██████ ███████ ██ ██
*
* ██████ ███████ ██████ ██ ██ ██ ██ ██████
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██████ █████ ██████ ██ ██ ██ ██ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ███████ ██████ ██████ ██ ███████ ██████
*/
2015-10-28 20:56:10 +00:00
add_action ( " invindex-rebuild " , function () {
2017-07-10 21:06:41 +00:00
global $env , $settings ;
if ( $env -> is_admin ||
(
! empty ( $_POST [ " secret " ]) &&
$_POST [ " secret " ] === $settings -> secret
)
)
search :: rebuild_invindex ();
else
{
http_response_code ( 401 );
exit ( page_renderer :: render_main ( " Error - Search index regenerator - $settings->sitename " , " <p>Error: You aren't allowed to regenerate the search index. Try logging in as an admin, or setting the <code>secret</code> POST parameter to $settings->sitename 's secret - which can be found in $settings->sitename 's <code>peppermint.json</code> file.</p> " ));
}
2015-10-28 20:56:10 +00:00
});
2016-08-26 16:47:46 +00:00
/**
2016-08-26 16:55:50 +00:00
* @ api { get } ? action = idindex - show Show the id index
2018-02-14 23:10:20 +00:00
* @ apiDescription Outputs the id index . Useful if you need to verify that it ' s working as expected . Output is a json object .
2016-08-26 16:47:46 +00:00
* @ apiName SearchShowIdIndex
* @ apiGroup Search
* @ apiPermission Anonymous
*/
add_action ( " idindex-show " , function () {
global $idindex ;
header ( " content-type: application/json; charset=UTF-8 " );
exit ( json_encode ( $idindex , JSON_PRETTY_PRINT ));
});
2016-06-12 20:15:43 +00:00
/**
2018-02-14 23:08:28 +00:00
* @ api { get } ? action = search & query = { text }[ & format = { format }] Search the wiki for a given query string
2016-06-12 20:15:43 +00:00
* @ apiName Search
* @ apiGroup Search
* @ apiPermission Anonymous
*
* @ apiParam { string } query The query string to search for .
2018-02-14 23:08:28 +00:00
* @ apiParam { string } format Optional . Valid values : html , json . In json mode an object is returned with page names as keys , values as search result information - sorted in ranking order .
2016-06-12 20:15:43 +00:00
*/
2015-12-26 12:55:19 +00:00
/*
* ███████ ███████ █████ ██████ ██████ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██
* ███████ █████ ███████ ██████ ██ ███████
* ██ ██ ██ ██ ██ ██ ██ ██ ██
* ███████ ███████ ██ ██ ██ ██ ██████ ██ ██
*/
2015-10-28 20:56:10 +00:00
add_action ( " search " , function () {
2015-11-09 07:25:28 +00:00
global $settings , $env , $pageindex , $paths ;
2015-10-28 20:56:10 +00:00
2015-12-05 17:27:01 +00:00
// Create the inverted index if it doesn't exist.
// todo In the future perhaps a CLI for this would be good?
if ( ! file_exists ( $paths -> searchindex ))
2017-07-10 20:53:52 +00:00
search :: rebuild_invindex ( false );
2015-12-05 17:27:01 +00:00
2015-10-28 20:56:10 +00:00
if ( ! isset ( $_GET [ " query " ]))
2015-10-29 11:21:04 +00:00
exit ( page_renderer :: render ( " No Search Terms - Error - $settings->sitename " , " <p>You didn't specify any search terms. Try typing some into the box above.</p> " ));
2015-10-28 20:56:10 +00:00
2015-10-29 11:21:04 +00:00
$search_start = microtime ( true );
2015-10-28 20:56:10 +00:00
2018-06-25 21:53:53 +00:00
$time_start = microtime ( true );
2015-11-08 21:15:08 +00:00
$invindex = search :: load_invindex ( $paths -> searchindex );
2018-06-25 21:53:53 +00:00
$env -> perfdata -> invindex_decode_time = round (( microtime ( true ) - $time_start ) * 1000 , 3 );
$start = microtime ( true );
2015-10-29 11:21:04 +00:00
$results = search :: query_invindex ( $_GET [ " query " ], $invindex );
2016-08-20 10:35:04 +00:00
$resultCount = count ( $results );
2018-06-25 21:53:53 +00:00
$env -> perfdata -> invindex_query_time = round (( microtime ( true ) - $time_start ) * 1000 , 3 );
header ( " x-invindex-decode-time: { $env -> perfdata -> invindex_decode_time } ms " );
header ( " x-invindex-query-time: { $env -> perfdata -> invindex_query_time } ms " );
2018-02-14 23:08:28 +00:00
2018-06-26 13:15:19 +00:00
$start = microtime ( true );
2018-02-14 23:08:28 +00:00
foreach ( $results as & $result ) {
$result [ " context " ] = search :: extract_context (
2018-06-26 13:28:11 +00:00
$invindex , $result [ " pagename " ],
2018-02-14 23:08:28 +00:00
$_GET [ " query " ],
file_get_contents ( $env -> storage_prefix . $result [ " pagename " ] . " .md " )
);
}
2018-06-26 13:15:19 +00:00
$env -> perfdata -> context_generation_time = round (( microtime ( true ) - $start ) * 1000 , 3 );
header ( " x-context-generation-time: { $env -> perfdata -> context_generation_time } ms " );
2018-02-14 23:08:28 +00:00
2018-06-25 23:11:01 +00:00
$env -> perfdata -> search_time = round (( microtime ( true ) - $search_start ) * 1000 , 3 );
header ( " x-search-time: { $env -> perfdata -> search_time } ms " );
2018-02-14 23:08:28 +00:00
if ( ! empty ( $_GET [ " format " ]) && $_GET [ " format " ] == " json " ) {
header ( " content-type: application/json " );
$json_results = new stdClass ();
foreach ( $results as $result ) $json_results -> { $result [ " pagename " ]} = $result ;
exit ( json_encode ( $json_results ));
}
2015-11-08 21:15:08 +00:00
2015-10-29 11:21:04 +00:00
$title = $_GET [ " query " ] . " - Search results - $settings->sitename " ;
$content = " <section> \n " ;
$content .= " <h1>Search Results</h1> " ;
2015-10-31 14:05:00 +00:00
/// Search Box ///
$content .= " <form method='get' action=''> \n " ;
2017-07-29 09:44:55 +00:00
$content .= " <input type='search' id='search-box' name='query' placeholder='Type your query here and then press enter.' value=' " . htmlentities ( $_GET [ " query " ], ENT_HTML5 | ENT_QUOTES ) . " ' /> \n " ;
2015-10-31 14:05:00 +00:00
$content .= " <input type='hidden' name='action' value='search' /> \n " ;
$content .= " </form> " ;
2015-10-29 11:21:04 +00:00
2016-08-20 10:35:04 +00:00
$content .= " <p>Found $resultCount " . ( $resultCount === 1 ? " result " : " results " ) . " in " . $env -> perfdata -> search_time . " ms. " ;
2015-11-01 10:13:35 +00:00
$query = $_GET [ " query " ];
if ( isset ( $pageindex -> $query ))
{
2016-08-20 10:35:04 +00:00
$content .= " There's a page on $settings->sitename called <a href='?page= " . rawurlencode ( $query ) . " '> $query </a>. " ;
2015-11-01 10:13:35 +00:00
}
else
{
2016-08-20 10:35:04 +00:00
$content .= " There isn't a page called $query on $settings->sitename , but you " ;
2016-03-12 19:02:36 +00:00
if (( ! $settings -> anonedits && ! $env -> is_logged_in ) || ! $settings -> editing )
{
$content .= " do not have permission to create it. " ;
if ( ! $env -> is_logged_in )
{
$content .= " You could try <a href='?action=login&returnto= " . rawurlencode ( $_SERVER [ " REQUEST_URI " ]) . " '>logging in</a>. " ;
}
}
else
{
2016-08-20 10:35:04 +00:00
$content .= " can <a href='?action=edit&page= " . rawurlencode ( $query ) . " '>create it</a>. " ;
2016-03-12 19:02:36 +00:00
}
2015-11-01 10:13:35 +00:00
}
2016-08-20 10:35:04 +00:00
$content .= " </p> " ;
2015-11-01 10:13:35 +00:00
2017-09-19 16:32:52 +00:00
if ( module_exists ( " page-list " )) {
$nterms = search :: tokenize ( $query );
$nterms_regex = implode ( " | " , array_map ( function ( $nterm ) {
return preg_quote ( strtolower ( trim ( $nterm )));
}, $nterms ));
$all_tags = get_all_tags ();
$matching_tags = [];
foreach ( $all_tags as $tag ) {
if ( preg_match ( " / $nterms_regex /i " , trim ( $tag )) > 0 )
$matching_tags [] = $tag ;
}
if ( count ( $matching_tags ) > 0 ) {
2018-06-30 10:46:07 +00:00
$content .= " <p class='matching-tags-display'><label>Matching tags</label><span class='tags'> " ;
2017-09-19 16:32:52 +00:00
foreach ( $matching_tags as $tag ) {
$content .= " \t <a href='?action=list-tags&tag= " . rawurlencode ( $tag ) . " ' class='mini-tag'> " . htmlentities ( $tag ) . " </a> \n " ;
}
$content .= " </span></p> " ;
}
}
2015-10-31 14:16:19 +00:00
$i = 0 ; // todo use $_GET["offset"] and $_GET["result-count"] or something
2015-10-29 11:21:04 +00:00
foreach ( $results as $result )
{
$link = " ?page= " . rawurlencode ( $result [ " pagename " ]);
2015-11-08 21:15:08 +00:00
$pagesource = file_get_contents ( $env -> storage_prefix . $result [ " pagename " ] . " .md " );
2016-08-19 12:02:42 +00:00
//echo("Extracting context for result " . $result["pagename"] . ".\n");
2018-02-14 23:08:28 +00:00
$context = $result [ " context " ];
2018-03-18 16:52:55 +00:00
if ( mb_strlen ( $context ) === 0 )
$context = mb_substr ( $pagesource , 0 , $settings -> search_characters_context * 2 );
2016-08-19 12:02:42 +00:00
//echo("'Generated search context for " . $result["pagename"] . ": $context'\n");
2018-03-18 16:52:55 +00:00
$context = search :: highlight_context (
$_GET [ " query " ],
preg_replace ( '/</u' , '<' , $context )
);
2015-11-02 14:42:38 +00:00
/* if ( strlen ( $context ) == 0 )
2015-11-01 15:05:54 +00:00
{
$context = search :: strip_markup ( file_get_contents ( " $env->page .md " , null , null , null , $settings -> search_characters_context * 2 ));
if ( $pageindex -> { $env -> page } -> size > $settings -> search_characters_context * 2 )
$context .= " ... " ;
2015-11-02 14:42:38 +00:00
} */
2015-11-01 15:05:54 +00:00
2017-10-14 21:48:58 +00:00
$tag_list = " <span class='tags'> " ;
foreach ( $pageindex -> { $result [ " pagename " ]} -> tags ? ? [] as $tag ) $tag_list .= " <a href='?action=list-tags&tag= " . rawurlencode ( $tag ) . " ' class='mini-tag'> $tag </a> " ;
$tag_list .= " </span> \n " ;
2017-03-20 20:21:25 +00:00
// Make redirect pages italics
if ( ! empty ( $pageindex -> { $result [ " pagename " ]} -> redirect ))
$result [ " pagename " ] = " <em> { $result [ " pagename " ] } </em> " ;
2015-10-29 11:21:04 +00:00
2015-10-31 14:16:19 +00:00
// We add 1 to $i here to convert it from an index to a result
// number as people expect it to start from 1
$content .= " <div class='search-result' data-result-number=' " . ( $i + 1 ) . " ' data-rank=' " . $result [ " rank " ] . " '> \n " ;
2017-10-14 21:48:58 +00:00
$content .= " <h2><a href=' $link '> " . $result [ " pagename " ] . " </a> <span class='search-result-badges'> $tag_list </span></h2> \n " ;
2016-08-19 12:47:56 +00:00
$content .= " <p class='search-context'> $context </p> \n " ;
2015-10-29 11:21:04 +00:00
$content .= " </div> \n " ;
2015-10-31 14:16:19 +00:00
$i ++ ;
2015-10-29 11:21:04 +00:00
}
$content .= " </section> \n " ;
2016-08-21 20:02:36 +00:00
header ( " content-type: text/html; charset=UTF-8 " );
2015-10-29 11:21:04 +00:00
exit ( page_renderer :: render ( $title , $content ));
//header("content-type: text/plain");
//var_dump($results);
2015-10-28 20:56:10 +00:00
});
2016-10-01 10:32:38 +00:00
2017-03-23 21:13:20 +00:00
/*
* ██████ ██ ██ ███████ ██████ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ██ ██ █████ ██████ ████ █████
* ██ ▄▄ ██ ██ ██ ██ ██ ██ ██
* ██████ ██████ ███████ ██ ██ ██
* ▀▀
* ███████ ███████ █████ ██████ ██████ ██ ██ ██ ███ ██ ██████ ███████ ██ ██
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ████ ██ ██ ██ ██ ██ ██
* ███████ █████ ███████ ██████ ██ ███████ ██ ██ ██ ██ ██ ██ █████ ███
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ███████ ███████ ██ ██ ██ ██ ██████ ██ ██ ██ ██ ████ ██████ ███████ ██ ██
*/
/**
* @ api { get } ? action = query - searchindex & query = { text } Inspect the internals of the search results for a query
* @ apiName Search
* @ apiGroup Search
* @ apiPermission Anonymous
*
* @ apiParam { string } query The query string to search for .
*/
add_action ( " query-searchindex " , function () {
global $env , $paths ;
if ( empty ( $_GET [ " query " ])) {
http_response_code ( 400 );
header ( " content-type: text/plain " );
exit ( " Error: No query specified. Specify it with the 'query' GET parameter. " );
}
$env -> perfdata -> searchindex_decode_start = microtime ( true );
$searchIndex = search :: load_invindex ( $paths -> searchindex );
$env -> perfdata -> searchindex_decode_time = ( microtime ( true ) - $env -> perfdata -> searchindex_decode_start ) * 1000 ;
$env -> perfdata -> searchindex_query_start = microtime ( true );
$searchResults = search :: query_invindex ( $_GET [ " query " ], $searchIndex );
$env -> perfdata -> searchindex_query_time = ( microtime ( true ) - $env -> perfdata -> searchindex_query_start ) * 1000 ;
header ( " content-type: application/json " );
$result = new stdClass ();
$result -> time_format = " ms " ;
$result -> decode_time = $env -> perfdata -> searchindex_decode_time ;
$result -> query_time = $env -> perfdata -> searchindex_query_time ;
$result -> total_time = $result -> decode_time + $result -> query_time ;
$result -> search_results = $searchResults ;
exit ( json_encode ( $result , JSON_PRETTY_PRINT ));
});
2017-06-28 08:45:13 +00:00
/*
* ██████ ██████ ███████ ███ ██ ███████ ███████ █████ ██████ ██████ ██ ██
* ██ ██ ██ ██ ██ ████ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██ ██ ██████ █████ ██ ██ ██ ███████ █████ ███████ ██████ ██ ███████
* ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██
* ██████ ██ ███████ ██ ████ ███████ ███████ ██ ██ ██ ██ ██████ ██ ██
*/
/**
* @ api { get } ? action = opensearch - description Get the opensearch description file
* @ apiName OpenSearchDescription
* @ apiGroup Search
* @ apiPermission Anonymous
2016-10-01 10:32:38 +00:00
*/
add_action ( " opensearch-description " , function () {
global $settings ;
$siteRoot = full_url () . " /index.php " ;
if ( ! isset ( $_GET [ " debug " ]))
header ( " content-type: application/opensearchdescription+xml " );
else
header ( " content-type: text/plain " );
2017-06-06 20:33:11 +00:00
exit ( '<?xml version="1.0" encoding="UTF-8"?' . '>' . // hack The build system strips it otherwise O.o I should really fix that.
" \n <OpenSearchDescription xmlns= \" http://a9.com/-/spec/opensearch/1.1/ \" >
2016-10-01 10:32:38 +00:00
< ShortName > Search $settings -> sitename </ ShortName >
< Description > Search $settings -> sitename , which is powered by Pepperminty Wiki .</ Description >
< Tags > $settings -> sitename Wiki </ Tags >
< Image type = \ " image/png \" > $settings->favicon </Image>
< Attribution > Search content available under the license linked to at the bottom of the search results page .</ Attribution >
< Developer > Starbeamrainbowlabs ( https :// github . com / sbrl / Pepperminty - Wiki / graphs / contributors ) </ Developer >
< InputEncoding > UTF - 8 </ InputEncoding >
< OutputEncoding > UTF - 8 </ OutputEncoding >
2017-07-16 09:26:22 +00:00
< Url type = \ " text/html \" method= \" get \" template= \" $siteRoot ?action=view&search-redirect=yes&page= { searchTerms}&offset= { startIndex?}&count= { count} \" />
2017-06-28 10:21:42 +00:00
< Url type = \ " application/x-suggestions+json \" template= \" $siteRoot ?action=suggest-pages&query= { searchTerms}&type=opensearch \" />
2017-06-06 20:33:11 +00:00
</ OpenSearchDescription > " );
2016-10-01 10:32:38 +00:00
});
2016-11-20 13:24:35 +00:00
2017-06-28 08:45:13 +00:00
/**
2018-02-14 23:08:28 +00:00
* @ api { get } ? action = suggest - pages [ & type = { type }] Get page name suggestions for a query
2017-06-28 08:45:13 +00:00
* @ apiName OpenSearchDescription
* @ apiGroup Search
* @ apiPermission Anonymous
*
* @ apiParam { string } text The search query string to get search suggestions for .
2017-06-28 09:44:44 +00:00
* @ apiParam { string } type The type of result to return . Default value : json . Available values : json , opensearch
2017-06-28 08:45:13 +00:00
*/
2016-11-20 13:24:35 +00:00
add_action ( " suggest-pages " , function () {
global $settings , $pageindex ;
if ( $settings -> dynamic_page_suggestion_count === 0 )
{
header ( " content-type: application/json " );
2017-06-28 10:21:42 +00:00
header ( " content-length: 3 " );
exit ( " [] \n " );
2016-11-20 13:24:35 +00:00
}
if ( empty ( $_GET [ " query " ])) {
http_response_code ( 400 );
header ( " content-type: text/plain " );
exit ( " Error: You didn't specify the 'query' GET parameter. " );
}
2017-06-28 09:44:44 +00:00
$type = $_GET [ " type " ] ? ? " json " ;
if ( ! in_array ( $type , [ " json " , " opensearch " ])) {
http_response_code ( 406 );
exit ( " Error: The type ' $type ' is not one of the supported output types. Available values: json, opensearch. Default: json " );
}
2018-06-25 22:03:00 +00:00
$literator = Transliterator :: createFromRules ( ':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;' , Transliterator :: FORWARD );
$query = $literator -> transliterate ( $_GET [ " query " ]);
2016-11-20 13:24:35 +00:00
// Rank each page name
$results = [];
foreach ( $pageindex as $pageName => $entry ) {
$results [] = [
" pagename " => $pageName ,
// Costs: Insert: 1, Replace: 8, Delete: 6
2018-06-25 22:03:00 +00:00
" distance " => levenshtein ( $query , $literator -> transliterate ( $pageName ), 1 , 8 , 6 )
2016-11-20 13:24:35 +00:00
];
}
2017-06-28 09:44:44 +00:00
// Sort the page names by distance from the original query
2016-11-20 13:24:35 +00:00
usort ( $results , function ( $a , $b ) {
if ( $a [ " distance " ] == $b [ " distance " ])
return strcmp ( $a [ " pagename " ], $b [ " pagename " ]);
return $a [ " distance " ] < $b [ " distance " ] ? - 1 : 1 ;
});
// Send the results to the user
2017-06-28 09:44:44 +00:00
$suggestions = array_slice ( $results , 0 , $settings -> dynamic_page_suggestion_count );
switch ( $type )
{
case " json " :
header ( " content-type: application/json " );
exit ( json_encode ( $suggestions ));
case " opensearch " :
$opensearch_output = [
$_GET [ " query " ],
array_map ( function ( $suggestion ) { return $suggestion [ " pagename " ]; }, $suggestions )
];
header ( " content-type: application/x-suggestions+json " );
exit ( json_encode ( $opensearch_output ));
}
2016-11-20 13:24:35 +00:00
});
if ( $settings -> dynamic_page_suggestion_count > 0 )
{
page_renderer :: AddJSSnippet ( ' /// Dynamic page suggestion system
// Micro snippet 8 - Promisified GET (fetched 20th Nov 2016)
function get ( u ){ return new Promise ( function ( r , t , a ){ a = new XMLHttpRequest (); a . onload = function ( b , c ){ b = a . status ; c = a . response ; if ( b > 199 && b < 300 ){ r ( c )} else { t ( c )}}; a . open ( " GET " , u , true ); a . send ( null )})}
window . addEventListener ( " load " , function ( event ) {
var searchBox = document . querySelector ( " input[type=search] " );
searchBox . dataset . lastValue = " " ;
searchBox . addEventListener ( " keyup " , function ( event ) {
// Make sure that we don\'t keep sending requests to the server if nothing has changed
if ( searchBox . dataset . lastValue == event . target . value )
return ;
searchBox . dataset . lastValue = event . target . value ;
// Fetch the suggestions from the server
get ( " ?action=suggest-pages&query= " + encodeURIComponent ( event . target . value )) . then ( function ( response ) {
var suggestions = JSON . parse ( response ),
dataList = document . getElementById ( " allpages " );
// If the server sent no suggestions, then we shouldn\'t replace the contents of the datalist
if ( suggestions . length == 0 )
return ;
console . info ( `Fetched suggestions for ${event.target.value}:` , suggestions . map ( s => s . pagename ));
// Remove all the existing suggestions
while ( dataList . firstChild ) {
dataList . removeChild ( dataList . firstChild );
}
// Add the new suggestions to the datalist
2016-11-28 13:05:23 +00:00
var optionsFrag = document . createDocumentFragment ();
2016-11-20 13:24:35 +00:00
suggestions . forEach ( function ( suggestion ) {
var suggestionElement = document . createElement ( " option " );
suggestionElement . value = suggestion . pagename ;
suggestionElement . dataset . distance = suggestion . distance ;
2016-11-28 13:05:23 +00:00
optionsFrag . appendChild ( suggestionElement );
2016-11-20 13:24:35 +00:00
});
2016-11-28 13:05:23 +00:00
dataList . appendChild ( optionsFrag );
2016-11-20 13:24:35 +00:00
});
});
});
' );
}
2015-10-27 21:10:05 +00:00
}
]);
2017-11-20 20:40:59 +00:00
/**
* Holds a collection to methods to manipulate various types of search index .
*/
2015-10-28 09:14:41 +00:00
class search
{
2017-11-20 20:40:59 +00:00
/**
* Words that we should exclude from the inverted index
*/
2015-10-28 09:14:41 +00:00
public static $stop_words = [
2015-10-28 14:31:27 +00:00
" a " , " about " , " above " , " above " , " across " , " after " , " afterwards " , " again " ,
" against " , " all " , " almost " , " alone " , " along " , " already " , " also " ,
2015-11-08 21:15:08 +00:00
" although " , " always " , " am " , " among " , " amongst " , " amoungst " , " amount " ,
2015-10-28 14:31:27 +00:00
" an " , " and " , " another " , " any " , " anyhow " , " anyone " , " anything " , " anyway " ,
" anywhere " , " are " , " around " , " as " , " at " , " back " , " be " , " became " ,
" because " , " become " , " becomes " , " becoming " , " been " , " before " ,
" beforehand " , " behind " , " being " , " below " , " beside " , " besides " ,
" between " , " beyond " , " bill " , " both " , " bottom " , " but " , " by " , " call " ,
" can " , " cannot " , " cant " , " co " , " con " , " could " , " couldnt " , " cry " , " de " ,
" describe " , " detail " , " do " , " done " , " down " , " due " , " during " , " each " ,
" eg " , " eight " , " either " , " eleven " , " else " , " elsewhere " , " empty " ,
" enough " , " etc " , " even " , " ever " , " every " , " everyone " , " everything " ,
2016-11-20 13:24:35 +00:00
" everywhere " , " except " , " few " , " fill " , " find " ,
" fire " , " first " , " five " , " for " , " former " , " formerly " , " found " ,
2015-10-28 14:31:27 +00:00
" four " , " from " , " front " , " full " , " further " , " get " , " give " , " go " , " had " ,
" has " , " hasnt " , " have " , " he " , " hence " , " her " , " here " , " hereafter " ,
" hereby " , " herein " , " hereupon " , " hers " , " herself " , " him " , " himself " ,
2016-11-20 13:24:35 +00:00
" his " , " how " , " however " , " ie " , " if " , " in " , " inc " , " indeed " ,
2015-10-28 14:31:27 +00:00
" interest " , " into " , " is " , " it " , " its " , " itself " , " keep " , " last " ,
" latter " , " latterly " , " least " , " less " , " ltd " , " made " , " many " , " may " ,
" me " , " meanwhile " , " might " , " mine " , " more " , " moreover " , " most " ,
" mostly " , " move " , " much " , " must " , " my " , " myself " , " name " , " namely " ,
" neither " , " never " , " nevertheless " , " next " , " nine " , " no " , " none " ,
" nor " , " not " , " nothing " , " now " , " nowhere " , " of " , " off " , " often " , " on " ,
" once " , " one " , " only " , " onto " , " or " , " other " , " others " , " otherwise " ,
" our " , " ours " , " ourselves " , " out " , " over " , " own " , " part " , " per " ,
" perhaps " , " please " , " put " , " rather " , " re " , " same " , " see " , " seem " ,
" seemed " , " seeming " , " seems " , " serious " , " several " , " she " , " should " ,
" show " , " side " , " since " , " sincere " , " six " , " sixty " , " so " , " some " ,
" somehow " , " someone " , " something " , " sometime " , " sometimes " ,
" somewhere " , " still " , " such " , " system " , " take " , " ten " , " than " , " that " ,
" the " , " their " , " them " , " themselves " , " then " , " thence " , " there " ,
" thereafter " , " thereby " , " therefore " , " therein " , " thereupon " , " these " ,
" they " , " thickv " , " thin " , " third " , " this " , " those " , " though " , " three " ,
" through " , " throughout " , " thru " , " thus " , " to " , " together " , " too " , " top " ,
" toward " , " towards " , " twelve " , " twenty " , " two " , " un " , " under " , " until " ,
" up " , " upon " , " us " , " very " , " via " , " was " , " we " , " well " , " were " , " what " ,
" whatever " , " when " , " whence " , " whenever " , " where " , " whereafter " ,
" whereas " , " whereby " , " wherein " , " whereupon " , " wherever " , " whether " ,
" which " , " while " , " whither " , " who " , " whoever " , " whole " , " whom " , " whose " ,
" why " , " will " , " with " , " within " , " without " , " would " , " yet " , " you " ,
" your " , " yours " , " yourself " , " yourselves "
2015-10-28 09:14:41 +00:00
];
2015-10-28 20:56:10 +00:00
2017-11-20 20:40:59 +00:00
/**
* Converts a source string into an index of search terms that can be
* merged into an inverted index .
* @ param string $source The source string to index .
* @ return array An index represents the specified string .
*/
2015-10-28 09:14:41 +00:00
public static function index ( $source )
{
2018-06-25 21:53:53 +00:00
// We don't need to normalise or transliterate here because self::tokenize() does this for us
2015-10-28 09:14:41 +00:00
$source = html_entity_decode ( $source , ENT_QUOTES );
2018-03-18 16:37:25 +00:00
$source_length = mb_strlen ( $source );
2015-10-28 14:31:27 +00:00
2015-10-28 09:14:41 +00:00
$index = [];
2015-10-28 14:31:27 +00:00
2018-06-30 10:19:38 +00:00
$terms = self :: tokenize ( $source , true );
2015-10-28 09:14:41 +00:00
foreach ( $terms as $term )
{
// Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
2018-06-30 10:19:38 +00:00
if ( in_array ( $term [ 0 ], self :: $stop_words )) continue ;
2015-10-28 11:36:07 +00:00
2018-06-30 10:19:38 +00:00
if ( ! isset ( $index [ $term [ 0 ]]))
$index [ $term [ 0 ]] = [ " freq " => 0 , " offsets " => [] ];
2015-10-28 11:36:07 +00:00
2018-06-30 10:19:38 +00:00
$index [ $term [ 0 ]][ " freq " ] ++ ;
$index [ $term [ 0 ]][ " offsets " ][] = $term [ 1 ];
2015-10-28 09:14:41 +00:00
}
return $index ;
}
2017-11-20 20:40:59 +00:00
/**
* Converts a source string into a series of raw tokens .
2018-06-29 23:08:57 +00:00
* @ param string $source The source string to process .
* @ param boolean $capture_offsets Whether to capture & return the character offsets of the tokens detected . If true , then each token returned will be an array in the form [ token , char_offset ] .
2017-11-20 20:40:59 +00:00
* @ return array An array of raw tokens extracted from the specified source string .
*/
2018-06-29 23:08:57 +00:00
public static function tokenize ( $source , $capture_offsets = false )
2015-10-28 20:56:10 +00:00
{
2018-06-25 21:53:53 +00:00
/** Normalises input characters for searching & indexing */
2018-06-26 13:15:19 +00:00
static $literator ; if ( $literator == null ) $literator = Transliterator :: createFromRules ( ':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;' , Transliterator :: FORWARD );
2018-06-25 21:53:53 +00:00
2018-06-29 23:08:57 +00:00
$flags = PREG_SPLIT_NO_EMPTY ; // Don't return empty items
if ( $capture_offsets )
$flags |= PREG_SPLIT_OFFSET_CAPTURE ;
2018-06-25 21:53:53 +00:00
// We don't need to normalise here because the transliterator handles
// this for us. Also, we can't move the literator to a static variable
// because PHP doesn't like it very much
$source = $literator -> transliterate ( $source );
2018-03-18 16:19:13 +00:00
$source = preg_replace ( '/[\[\]\|\{\}\/]/u' , " " , $source );
2018-06-29 23:08:57 +00:00
return preg_split ( " /((^ \ p { P}+)|( \ p { P}* \ s+ \ p { P}*)|( \ p { P}+ $ ))| \ |/u " , $source , - 1 , $flags );
2015-10-28 20:56:10 +00:00
}
2017-11-20 20:40:59 +00:00
/**
* Removes ( most ) markdown markup from the specified string .
* Stripped strings are not suitable for indexing !
* @ param string $source The source string to process .
* @ return string The stripped string .
*/
2015-10-29 11:21:04 +00:00
public static function strip_markup ( $source )
{
2018-03-18 16:19:13 +00:00
return preg_replace ( '/([\"*_\[\]]| - |`)/u' , " " , $source );
2015-10-29 11:21:04 +00:00
}
2017-11-20 20:40:59 +00:00
/**
* Rebuilds the master inverted index and clears the page id index .
* @ param boolean $output Whether to send progress information to the user ' s browser .
*/
2017-07-10 20:53:52 +00:00
public static function rebuild_invindex ( $output = true )
2015-10-28 20:56:10 +00:00
{
2017-07-11 19:21:20 +00:00
global $pageindex , $env , $paths , $settings ;
2015-10-28 20:56:10 +00:00
2017-07-11 19:21:20 +00:00
if ( $output ) {
2017-07-10 20:53:52 +00:00
header ( " content-type: text/event-stream " );
2017-07-11 19:21:20 +00:00
ob_end_flush ();
}
2016-08-26 16:47:46 +00:00
2017-03-23 20:48:42 +00:00
// Clear the id index out
ids :: clear ();
// Reindex each page in turn
2015-10-28 20:56:10 +00:00
$invindex = [];
2017-07-10 20:53:52 +00:00
$i = 0 ; $max = count ( get_object_vars ( $pageindex ));
2017-07-11 19:21:20 +00:00
$missing_files = 0 ;
2015-10-28 20:56:10 +00:00
foreach ( $pageindex as $pagename => $pagedetails )
{
2017-07-11 19:21:20 +00:00
$page_filename = $env -> storage_prefix . $pagedetails -> filename ;
if ( ! file_exists ( $page_filename )) {
2018-03-18 16:19:13 +00:00
echo ( " data: [ " . ( $i + 1 ) . " / $max ] Error: Can't find $page_filename\n " );
2017-07-11 19:21:20 +00:00
flush ();
2018-04-07 12:47:39 +00:00
$i ++ ; $missing_files ++ ;
2017-07-11 19:21:20 +00:00
continue ;
}
2018-06-25 21:53:53 +00:00
// We do not transliterate or normalise here because the indexer will take care of this for us
$index = self :: index ( file_get_contents ( $page_filename ));
2015-10-28 20:56:10 +00:00
2017-03-23 20:48:42 +00:00
$pageid = ids :: getid ( $pagename );
self :: merge_into_invindex ( $invindex , $pageid , $index );
2017-07-10 20:53:52 +00:00
if ( $output ) {
2017-07-11 19:21:20 +00:00
echo ( " data: [ " . ( $i + 1 ) . " / $max ] Added $pagename (id # $pageid ) to the new search index. \n \n " );
2017-07-10 20:53:52 +00:00
flush ();
}
$i ++ ;
2015-10-28 20:56:10 +00:00
}
2016-08-19 11:02:09 +00:00
2017-07-10 20:53:52 +00:00
if ( $output ) {
2017-07-11 19:21:20 +00:00
echo ( " data: Search index rebuilding complete. \n \n " );
echo ( " data: Couldn't find $missing_files pages on disk. If $settings->sitename couldn't find some pages on disk, then you might need to manually correct $settings->sitename 's page index (stored in pageindex.json). \n \n " );
echo ( " data: Done! Saving new search index to ' $paths->searchindex '. \n \n " );
2017-07-10 20:53:52 +00:00
}
2016-08-26 16:47:46 +00:00
2015-11-11 18:42:12 +00:00
self :: save_invindex ( $paths -> searchindex , $invindex );
2015-10-28 20:56:10 +00:00
}
2017-11-20 20:40:59 +00:00
/**
* Sorts an index alphabetically . Will also sort an inverted index .
* This allows us to do a binary search instead of a regular
* sequential search .
* @ param array $index The index to sort .
2015-10-28 09:14:41 +00:00
*/
public static function sort_index ( & $index )
{
ksort ( $index , SORT_NATURAL );
}
2015-10-28 14:31:27 +00:00
2017-11-20 20:40:59 +00:00
/**
* Compares two * regular * indexes to find the differences between them .
* @ param array $oldindex The old index .
* @ param array $newindex The new index .
* @ param array $changed An array to be filled with the nterms of all the changed entries .
* @ param array $removed An array to be filled with the nterms of all the removed entries .
2015-10-28 09:14:41 +00:00
*/
2015-11-01 14:26:13 +00:00
public static function compare_indexes ( $oldindex , $newindex , & $changed , & $removed )
2015-10-28 09:14:41 +00:00
{
2015-11-01 14:26:13 +00:00
foreach ( $oldindex as $nterm => $entry )
2015-10-28 14:31:27 +00:00
{
2015-11-01 14:26:13 +00:00
if ( ! isset ( $newindex [ $nterm ]))
2015-10-28 14:31:27 +00:00
$removed [] = $nterm ;
2015-11-01 14:26:13 +00:00
}
foreach ( $newindex as $nterm => $entry )
{
2018-09-12 20:27:51 +00:00
if ( ! isset ( $oldindex [ $nterm ]) or // If this word is new
2015-11-01 14:26:13 +00:00
$newindex [ $nterm ] !== $oldindex [ $nterm ]) // If this word has changed
$changed [ $nterm ] = $newindex [ $nterm ];
2015-10-28 14:31:27 +00:00
}
2015-10-28 09:14:41 +00:00
}
2017-11-20 20:40:59 +00:00
/**
* Reads in and parses an inverted index .
* @ param string $invindex_filename The path tp the inverted index to parse .
* @ todo Remove this function and make everything streamable
2015-10-28 09:14:41 +00:00
*/
2015-10-29 11:21:04 +00:00
public static function load_invindex ( $invindex_filename ) {
2015-10-28 14:31:27 +00:00
$invindex = json_decode ( file_get_contents ( $invindex_filename ), true );
return $invindex ;
}
2017-11-20 20:40:59 +00:00
/**
* Reads in and parses an inverted index , measuring the time it takes to do so .
* @ param string $invindex_filename The path to the file inverted index to parse .
*/
2016-08-20 10:27:26 +00:00
public static function measure_invindex_load_time ( $invindex_filename ) {
global $env ;
$searchindex_decode_start = microtime ( true );
search :: load_invindex ( $invindex_filename );
$env -> perfdata -> searchindex_decode_time = round (( microtime ( true ) - $searchindex_decode_start ) * 1000 , 3 );
}
2017-11-20 20:40:59 +00:00
/**
* Merge an index into an inverted index .
* @ param array $invindex The inverted index to merge into .
* @ param int $pageid The id of the page to assign to the index that ' s being merged .
* @ param array $index The regular index to merge .
* @ param array $removals An array of index entries to remove from the inverted index . Useful for applying changes to an inverted index instead of deleting and remerging an entire page ' s index .
2015-10-28 14:31:27 +00:00
*/
public static function merge_into_invindex ( & $invindex , $pageid , & $index , & $removals = [])
2015-10-28 09:14:41 +00:00
{
2015-10-28 14:31:27 +00:00
// Remove all the subentries that were removed since last time
foreach ( $removals as $nterm )
{
unset ( $invindex [ $nterm ][ $pageid ]);
}
2015-10-28 09:14:41 +00:00
2015-10-28 14:31:27 +00:00
// Merge all the new / changed index entries into the inverted index
foreach ( $index as $nterm => $newentry )
{
// If the nterm isn't in the inverted index, then create a space for it
if ( ! isset ( $invindex [ $nterm ])) $invindex [ $nterm ] = [];
$invindex [ $nterm ][ $pageid ] = $newentry ;
2015-10-28 20:56:10 +00:00
// Sort the page entries for this word by frequency
uasort ( $invindex [ $nterm ], function ( $a , $b ) {
if ( $a [ " freq " ] == $b [ " freq " ]) return 0 ;
return ( $a [ " freq " ] < $b [ " freq " ]) ? + 1 : - 1 ;
});
2015-10-28 14:31:27 +00:00
}
2015-11-01 14:26:13 +00:00
// Sort the inverted index by rank
uasort ( $invindex , function ( $a , $b ) {
$ac = count ( $a ); $bc = count ( $b );
if ( $ac == $bc ) return 0 ;
return ( $ac < $bc ) ? + 1 : - 1 ;
});
2015-10-28 14:31:27 +00:00
}
2015-11-14 17:01:23 +00:00
/**
* Deletes the given pageid from the given pageindex .
* @ param inverted_index & $invindex The inverted index .
* @ param number $pageid The pageid to remove .
*/
public static function delete_entry ( & $invindex , $pageid )
{
$str_pageid = ( string ) $pageid ;
foreach ( $invindex as $nterm => & $entry )
{
if ( isset ( $entry [ $pageid ]))
unset ( $entry [ $pageid ]);
if ( isset ( $entry [ $str_pageid ]))
unset ( $entry [ $str_pageid ]);
if ( count ( $entry ) === 0 )
unset ( $invindex [ $nterm ]);
}
}
2017-11-20 20:40:59 +00:00
/**
* Saves the given inverted index back to disk .
* @ param string $filename The path to the file to save the inverted index to .
* @ param array $invindex The inverted index to save .
*/
2015-10-28 14:31:27 +00:00
public static function save_invindex ( $filename , & $invindex )
{
file_put_contents ( $filename , json_encode ( $invindex ));
2015-10-28 09:14:41 +00:00
}
2015-10-28 20:56:10 +00:00
2017-11-20 20:40:59 +00:00
/**
* Searches the given inverted index for the specified search terms .
* @ param string $query The search query .
* @ param array $invindex The inverted index to search .
* @ return array An array of matching pages .
*/
2015-10-29 11:21:04 +00:00
public static function query_invindex ( $query , & $invindex )
2015-10-28 20:56:10 +00:00
{
2015-11-01 15:05:54 +00:00
global $settings , $pageindex ;
2018-06-25 21:53:53 +00:00
/** Normalises input characters for searching & indexing */
2018-06-26 13:15:19 +00:00
static $literator ; if ( $literator == null ) $literator = Transliterator :: createFromRules ( ':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;' , Transliterator :: FORWARD );
2018-06-25 21:53:53 +00:00
2015-10-28 20:56:10 +00:00
$query_terms = self :: tokenize ( $query );
$matching_pages = [];
2015-11-01 15:05:54 +00:00
2015-10-28 20:56:10 +00:00
// Loop over each term in the query and find the matching page entries
2015-12-26 13:06:45 +00:00
$count = count ( $query_terms );
for ( $i = 0 ; $i < $count ; $i ++ )
2015-10-28 20:56:10 +00:00
{
$qterm = $query_terms [ $i ];
2017-03-23 21:25:55 +00:00
// Stop words aren't worth the bother - make sure we don't search
// the title or the tags for them
if ( in_array ( $qterm , self :: $stop_words ))
continue ;
2015-11-01 15:05:54 +00:00
// Only search the inverted index if it actually exists there
if ( isset ( $invindex [ $qterm ]))
{
// Loop over each page in the inverted index entry
2018-06-26 13:15:19 +00:00
reset ( $invindex [ $qterm ]); // Reset array/object pointer
2015-11-01 15:05:54 +00:00
foreach ( $invindex [ $qterm ] as $pageid => $page_entry )
{
// Create an entry in the matching pages array if it doesn't exist
if ( ! isset ( $matching_pages [ $pageid ]))
$matching_pages [ $pageid ] = [ " nterms " => [] ];
$matching_pages [ $pageid ][ " nterms " ][ $qterm ] = $page_entry ;
}
}
2015-10-28 20:56:10 +00:00
2015-11-01 15:05:54 +00:00
// Loop over the pageindex and search the titles / tags
2018-06-25 23:06:20 +00:00
reset ( $pageindex ); // Reset array/object pointer
2015-11-01 15:05:54 +00:00
foreach ( $pageindex as $pagename => $pagedata )
2015-10-28 20:56:10 +00:00
{
2018-06-29 11:08:38 +00:00
// Setup a variable to hold the current page's id
2018-06-26 13:28:11 +00:00
$pageid = false ; // Only fill this out if we find a match
2015-11-01 15:05:54 +00:00
// Consider matches in the page title
2018-06-26 13:28:11 +00:00
// FUTURE: We may be able to optimise this further by using preg_match_all + preg_quote instead of mb_stripos_all. Experimentation / benchmarking is required to figure out which one is faster
2018-06-25 21:53:53 +00:00
$title_matches = mb_stripos_all ( $literator -> transliterate ( $pagename ), $qterm );
$title_matches_count = $title_matches !== false ? count ( $title_matches ) : 0 ;
if ( $title_matches_count > 0 )
2015-11-01 15:05:54 +00:00
{
2018-06-26 13:28:11 +00:00
$pageid = ids :: getid ( $pagename ); // Fill out the page id
2015-11-01 15:05:54 +00:00
// We found the qterm in the title
if ( ! isset ( $matching_pages [ $pageid ]))
$matching_pages [ $pageid ] = [ " nterms " => [] ];
// Set up a counter for page title matches if it doesn't exist already
if ( ! isset ( $matching_pages [ $pageid ][ " title-matches " ]))
$matching_pages [ $pageid ][ " title-matches " ] = 0 ;
2018-06-25 21:53:53 +00:00
$matching_pages [ $pageid ][ " title-matches " ] += $title_matches_count * strlen ( $qterm );
2015-11-01 15:05:54 +00:00
}
// Consider matches in the page's tags
2018-06-25 21:53:53 +00:00
$tag_matches = isset ( $pagedata -> tags ) ? mb_stripos_all ( $literator -> transliterate ( implode ( " " , $pagedata -> tags )), $qterm ) : false ;
$tag_matches_count = $tag_matches !== false ? count ( $tag_matches ) : 0 ;
if ( $tag_matches_count > 0 ) // And we found the qterm in the tags
2015-11-01 15:05:54 +00:00
{
2018-06-26 13:28:11 +00:00
if ( $pageid == false ) // Fill out the page id if it hasn't been already
$pageid = ids :: getid ( $pagename );
2015-11-01 15:05:54 +00:00
if ( ! isset ( $matching_pages [ $pageid ]))
$matching_pages [ $pageid ] = [ " nterms " => [] ];
// Set up a counter for tag match if there isn't one already
if ( ! isset ( $matching_pages [ $pageid ][ " tag-matches " ]))
$matching_pages [ $pageid ][ " tag-matches " ] = 0 ;
2018-06-25 21:53:53 +00:00
$matching_pages [ $pageid ][ " tag-matches " ] += $tag_matches_count * strlen ( $qterm );
2015-11-01 15:05:54 +00:00
}
2015-10-28 20:56:10 +00:00
}
}
2018-06-26 13:15:19 +00:00
reset ( $matching_pages );
2015-10-29 11:21:04 +00:00
foreach ( $matching_pages as $pageid => & $pagedata )
2015-10-28 20:56:10 +00:00
{
2015-10-29 11:21:04 +00:00
$pagedata [ " pagename " ] = ids :: getpagename ( $pageid );
2015-10-28 20:56:10 +00:00
$pagedata [ " rank " ] = 0 ;
2017-03-23 20:48:42 +00:00
$pageOffsets = [];
// Loop over each search term found on this page
2018-06-26 13:15:19 +00:00
reset ( $pagedata [ " nterms " ]);
2015-10-29 11:21:04 +00:00
foreach ( $pagedata [ " nterms " ] as $pterm => $entry )
2015-10-28 20:56:10 +00:00
{
2017-03-23 20:48:42 +00:00
// Add the number of occurrences of this search term to the ranking
2017-10-15 12:42:15 +00:00
// Multiply it by the length of the word
$pagedata [ " rank " ] += $entry [ " freq " ] * strlen ( $pterm );
2015-10-28 20:56:10 +00:00
2017-03-23 20:48:42 +00:00
// Add the offsets to a listof all offsets on this page
foreach ( $entry [ " offsets " ] as $offset )
$pageOffsets [] = $offset ;
2015-10-28 20:56:10 +00:00
}
2017-03-23 20:48:42 +00:00
/*
// Sort the list of offsets
$pageOffsets = array_unique ( $pageOffsets );
sort ( $pageOffsets );
var_dump ( $pageOffsets );
// Calcualate the clump distances via a variable moving window size
$pageOffsetsCount = count ( $pageOffsets );
$clumpDistanceWindow = min ( $count , $pageOffsetsCount ); // a.k.a. count($query_terms) - see above
$clumpDistances = [];
for ( $i = 0 ; $i < $pageOffsetsCount - $clumpDistanceWindow ; $i ++ )
$clumpDistances [] = $pageOffsets [ $i ] - $pageOffsets [ $i + $clumpDistanceWindow ];
// Sort the new list of clump distances
sort ( $clumpDistances );
2017-11-20 20:40:59 +00:00
// Calcualate a measure of how clumped the offsets are
2017-03-23 20:48:42 +00:00
$tightClumpLimit = floor (( count ( $clumpDistances ) - 1 ) / 0.25 );
$tightClumpsMeasure = $clumpDistances [ $tightClumpLimit ] - $clumpDistances [ 0 ];
$clumpsRange = $clumpDistances [ count ( $clumpDistances ) - 1 ] - $clumpDistances [ 0 ];
$clumpiness = $tightClumpsMeasure / $clumpsRange ;
echo ( " { $pagedata [ " pagename " ] } - $clumpiness " );
*/
2015-10-28 20:56:10 +00:00
2015-11-01 15:05:54 +00:00
// Consider matches in the title / tags
if ( isset ( $pagedata [ " title-matches " ]))
$pagedata [ " rank " ] += $pagedata [ " title-matches " ] * $settings -> search_title_matches_weighting ;
if ( isset ( $pagedata [ " tag-matches " ]))
$pagedata [ " rank " ] += $pagedata [ " tag-matches " ] * $settings -> search_tags_matches_weighting ;
2015-10-28 20:56:10 +00:00
// todo remove items if the rank is below a threshold
}
uasort ( $matching_pages , function ( $a , $b ) {
if ( $a [ " rank " ] == $b [ " rank " ]) return 0 ;
return ( $a [ " rank " ] < $b [ " rank " ]) ? + 1 : - 1 ;
});
return $matching_pages ;
}
2015-10-29 11:21:04 +00:00
2017-11-20 20:40:59 +00:00
/**
* Extracts a context string ( in HTML ) given a search query that could be displayed
* in a list of search results .
2018-06-26 13:28:11 +00:00
* @ param string $invindex The inverted index to consult .
* @ param string $pagename The name of the paget that this source belongs to . Used when consulting the inverted index .
* @ param string $query The search queary to generate the context for .
* @ param string $source The page source to extract the context from .
* @ return string The generated context string .
2017-11-20 20:40:59 +00:00
*/
2018-06-26 13:28:11 +00:00
public static function extract_context ( $invindex , $pagename , $query , $source )
2015-10-29 11:21:04 +00:00
{
global $settings ;
2018-06-26 13:28:11 +00:00
$pageid = ids :: getid ( $pagename );
2015-10-29 11:21:04 +00:00
$nterms = self :: tokenize ( $query );
$matches = [];
2018-06-26 13:15:19 +00:00
foreach ( $nterms as $nterm ) {
// Skip over words that don't appear in the inverted index (e.g. stop words)
if ( ! isset ( $invindex [ $nterm ]))
2017-01-26 20:55:46 +00:00
continue ;
2018-06-26 13:15:19 +00:00
// Skip if the page isn't found in the inverted index for this word
if ( ! isset ( $invindex [ $nterm ][ $pageid ]))
2015-10-29 11:21:04 +00:00
continue ;
2018-06-26 13:15:19 +00:00
foreach ( $invindex [ $nterm ][ $pageid ][ " offsets " ] as $next_offset )
$matches [] = [ $nterm , $next_offset ];
2015-10-29 11:21:04 +00:00
}
2016-08-19 11:02:09 +00:00
// Sort the matches by offset
2015-10-29 11:21:04 +00:00
usort ( $matches , function ( $a , $b ) {
if ( $a [ 1 ] == $b [ 1 ]) return 0 ;
2016-08-19 11:02:09 +00:00
return ( $a [ 1 ] > $b [ 1 ]) ? + 1 : - 1 ;
2015-10-29 11:21:04 +00:00
});
2018-03-18 16:19:13 +00:00
$sourceLength = mb_strlen ( $source );
2016-08-19 12:02:42 +00:00
2015-10-29 11:21:04 +00:00
$contexts = [];
2018-06-25 23:06:20 +00:00
2015-10-29 11:21:04 +00:00
$matches_count = count ( $matches );
2018-06-25 23:06:20 +00:00
$total_context_length = 0 ;
for ( $i = 0 ; $i < $matches_count ; $i ++ ) {
$next_context = [
" from " => max ( 0 , $matches [ $i ][ 1 ] - $settings -> search_characters_context ),
2018-09-29 12:27:17 +00:00
" to " => min ( $sourceLength , $matches [ $i ][ 1 ] + mb_strlen ( $matches [ $i ][ 0 ]) + $settings -> search_characters_context )
2018-06-25 23:06:20 +00:00
];
if ( end ( $contexts ) !== false && end ( $contexts )[ " to " ] > $next_context [ " from " ]) {
// This next context overlaps with the previous one
// Extend the last one instead of adding a new one
2015-10-29 11:21:04 +00:00
2018-06-25 23:06:20 +00:00
// The array pointer is pointing at the last element now because we called end() above
2015-10-29 11:21:04 +00:00
2018-06-25 23:06:20 +00:00
// Update the total context length counter appropriately
$total_context_length += $next_context [ " to " ] - $contexts [ key ( $contexts )][ " to " ];
$contexts [ key ( $contexts )][ " to " ] = $next_context [ " to " ];
}
else { // No overlap here! Business as usual.
$contexts [] = $next_context ;
// Update the total context length counter as normal
$total_context_length += $next_context [ " to " ] - $next_context [ " from " ];
2015-10-29 11:21:04 +00:00
}
2018-06-25 23:06:20 +00:00
end ( $contexts );
$last_context = & $contexts [ key ( $contexts )];
if ( $total_context_length > $settings -> search_characters_context_total ) {
// We've reached the limit on the number of characters this context should contain. Trim off the context to fit and break out
$last_context [ " to " ] -= $total_context_length - $settings -> search_characters_context_total ;
break ;
}
}
$contexts_text = [];
foreach ( $contexts as $context ) {
$contexts_text [] = substr ( $source , $context [ " from " ], $context [ " to " ] - $context [ " from " ]);
2015-10-29 11:21:04 +00:00
}
2018-06-30 10:46:07 +00:00
$result = implode ( " … " , $contexts_text );
end ( $contexts ); // If there's at least one item in the list and were not at the very end of the page, add an extra ellipsis
if ( isset ( $contexts [ 0 ]) && $contexts [ key ( $contexts )][ " to " ] < $sourceLength ) $result .= " … " ;
2018-09-29 12:32:17 +00:00
// Prepend an ellipsis if the context doesn't start at the beginning of a page
if ( isset ( $contexts [ 0 ]) && $contexts [ 0 ][ " from " ] > 0 ) $result = " … $result " ;
2018-06-30 10:46:07 +00:00
return $result ;
2015-10-29 11:21:04 +00:00
}
2015-10-31 13:52:50 +00:00
2017-11-20 20:40:59 +00:00
/**
* Highlights the keywords of a context string .
* @ param string $query The query to use when highlighting .
* @ param string $context The context string to highlight .
* @ return string The highlighted ( HTML ) string .
*/
2015-10-31 13:52:50 +00:00
public static function highlight_context ( $query , $context )
{
$qterms = self :: tokenize ( $query );
foreach ( $qterms as $qterm )
{
2017-01-26 20:55:46 +00:00
if ( in_array ( $qterm , static :: $stop_words ))
continue ;
2018-03-18 16:52:55 +00:00
2018-09-29 22:40:23 +00:00
// From http://stackoverflow.com/a/2483859/1460422
2018-03-18 16:52:55 +00:00
$context = preg_replace ( " / " . preg_replace ( '/\\//u' , " \ / " , preg_quote ( $qterm )) . " /iu " , " <strong class='search-term-highlight'> $ 0</strong> " , $context );
2015-10-31 13:52:50 +00:00
}
return $context ;
}
2015-10-29 11:21:04 +00:00
}
2015-10-27 21:10:05 +00:00
?>