2015-10-27 21:10:05 +00:00
< ? php
register_module([
	"name" => "Search",
	"version" => "0.2",
	"author" => "Starbeamrainbowlabs",
	"description" => "Adds proper search functionality to Pepperminty Wiki. Note that this module, at the moment, just contains test code while I figure out how best to write a search engine.",
	"id" => "feature-search",
	"code" => function() {
		/*
		 * @summary Debugging action: dumps the current page's name, its raw
		 * 			source and the regular index built from it as plain text.
		 */
		add_action("index", function() {
			global $env;

			header("content-type: text/plain");

			$source = file_get_contents($env->storage_prefix . $env->page . ".md");
			if ($source === false)
				exit("Error: the source of the requested page could not be read.");

			$index = search::index($source);

			var_dump($env->page);
			var_dump($source);
			var_dump($index);
		});

		/*
		 * @summary Rebuilds the inverted search index from scratch.
		 */
		add_action("invindex-rebuild", function() {
			search::rebuild_invindex();
		});

		/*
		 * @summary Runs the query in $_GET["query"] against the inverted index
		 * 			and renders the results page.
		 */
		add_action("search", function() {
			global $settings, $env, $pageindex, $paths;

			// Create the inverted index if it doesn't exist.
			// todo In the future perhaps a CLI for this would be good?
			if (!file_exists($paths->searchindex))
				search::rebuild_invindex();

			// The query must be a non-blank string: "?query[]=x" would hand us
			// an array, which none of the string handling below can cope with.
			$query = isset($_GET["query"]) ? $_GET["query"] : null;
			if (!is_string($query) || trim($query) === "")
				exit(page_renderer::render("No Search Terms - Error - $settings->sitename", "<p>You didn't specify any search terms. Try typing some into the box above.</p>"));

			// Everything that comes from the request or from page content is
			// escaped before it is put into the HTML. ENT_QUOTES is required
			// because the attributes below are delimited by single quotes.
			$escape = function($text) {
				return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8");
			};
			$safe_query = $escape($query);

			$invindex = search::load_invindex($paths->searchindex);
			$results = search::query_invindex($query, $invindex);

			// NOTE(review): assumes page_renderer::render() emits the title
			// verbatim, so the query is escaped here - confirm.
			$title = $safe_query . " - Search results - $settings->sitename";

			$content = "<section>\n";
			$content .= "<h1>Search Results</h1>";

			/// Search Box ///
			$content .= "<form method='get' action=''>\n";
			$content .= "<input type='search' id='search-box' name='query' placeholder='Type your query here and then press enter.' value='" . $safe_query . "' />\n";
			$content .= "<input type='hidden' name='action' value='search' />\n";
			$content .= "</form>";

			/// Exact page name match ///
			if (isset($pageindex->$query))
			{
				$content .= "<p>There's a page on $settings->sitename called <a href='?page=" . rawurlencode($query) . "'>$safe_query</a>.</p>";
			}
			else
			{
				$content .= "<p>There isn't a page called $safe_query on $settings->sitename, but you can <a href='?action=edit&amp;page=" . rawurlencode($query) . "'>create it</a>.</p>";
			}

			/// Results ///
			// todo use $_GET["offset"] and $_GET["result-count"] or something
			// Result numbers start from 1, as that's what people expect.
			$result_number = 1;
			foreach ($results as $result)
			{
				$link = "?page=" . rawurlencode($result["pagename"]);

				// The index may be stale and point at a page that can no longer
				// be read; such a result is still listed, just without context.
				$pagesource = file_get_contents($env->storage_prefix . $result["pagename"] . ".md");
				$context = "";
				if ($pagesource !== false)
				{
					// Escape first, highlight second: the only markup left in
					// the snippet is the <strong> tags added by the highlighter.
					$context = $escape(search::extract_context($query, $pagesource));
					$context = search::highlight_context($query, $context);
				}
				// todo Fall back to the start of the page if no context was found

				$content .= "<div class='search-result' data-result-number='" . $result_number . "' data-rank='" . $escape((string) $result["rank"]) . "'>\n";
				$content .= "<h2><a href='$link'>" . $escape($result["pagename"]) . "</a></h2>\n";
				$content .= "<p>$context</p>\n";
				$content .= "</div>\n";

				$result_number++;
			}

			$content .= "</section>\n";

			exit(page_renderer::render($title, $content));
		});
	}
]);
2015-10-28 09:14:41 +00:00
/*
 * @summary Holds the search engine: tokenising page sources, building regular
 * 			(per page) and inverted (per term) indexes, querying them and
 * 			producing highlighted context snippets for the results page.
 */
class search
{
	// Words that we should exclude from the inverted index.
	public static $stop_words = [
		"a", "about", "above", "above", "across", "after", "afterwards", "again",
		"against", "all", "almost", "alone", "along", "already", "also",
		"although", "always", "am", "among", "amongst", "amoungst", "amount",
		"an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway",
		"anywhere", "are", "around", "as", "at", "back", "be", "became",
		"because", "become", "becomes", "becoming", "been", "before",
		"beforehand", "behind", "being", "below", "beside", "besides",
		"between", "beyond", "bill", "both", "bottom", "but", "by", "call",
		"can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de",
		"describe", "detail", "do", "done", "down", "due", "during", "each",
		"eg", "eight", "either", "eleven", "else", "elsewhere", "empty",
		"enough", "etc", "even", "ever", "every", "everyone", "everything",
		"everywhere", "except", "few", "fifteen", "fify", "fill", "find",
		"fire", "first", "five", "for", "former", "formerly", "forty", "found",
		"four", "from", "front", "full", "further", "get", "give", "go", "had",
		"has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
		"hereby", "herein", "hereupon", "hers", "herself", "him", "himself",
		"his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed",
		"interest", "into", "is", "it", "its", "itself", "keep", "last",
		"latter", "latterly", "least", "less", "ltd", "made", "many", "may",
		"me", "meanwhile", "might", "mine", "more", "moreover", "most",
		"mostly", "move", "much", "must", "my", "myself", "name", "namely",
		"neither", "never", "nevertheless", "next", "nine", "no", "none",
		"nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
		"once", "one", "only", "onto", "or", "other", "others", "otherwise",
		"our", "ours", "ourselves", "out", "over", "own", "part", "per",
		"perhaps", "please", "put", "rather", "re", "same", "see", "seem",
		"seemed", "seeming", "seems", "serious", "several", "she", "should",
		"show", "side", "since", "sincere", "six", "sixty", "so", "some",
		"somehow", "someone", "something", "sometime", "sometimes",
		"somewhere", "still", "such", "system", "take", "ten", "than", "that",
		"the", "their", "them", "themselves", "then", "thence", "there",
		"thereafter", "thereby", "therefore", "therein", "thereupon", "these",
		"they", "thickv", "thin", "third", "this", "those", "though", "three",
		"through", "throughout", "thru", "thus", "to", "together", "too", "top",
		"toward", "towards", "twelve", "twenty", "two", "un", "under", "until",
		"up", "upon", "us", "very", "via", "was", "we", "well", "were", "what",
		"whatever", "when", "whence", "whenever", "where", "whereafter",
		"whereas", "whereby", "wherein", "whereupon", "wherever", "whether",
		"which", "while", "whither", "who", "whoever", "whole", "whom", "whose",
		"why", "will", "with", "within", "without", "would", "yet", "you",
		"your", "yours", "yourself", "yourselves"
	];

	/*
	 * @summary Lowercases a string, using the multibyte aware implementation
	 * 			when the mbstring extension is loaded.
	 */
	private static function lowercase($text)
	{
		return function_exists("mb_strtolower") ? mb_strtolower($text, "UTF-8") : strtolower($text);
	}

	/*
	 * @summary Cuts a piece out of a string, counting in characters when the
	 * 			mbstring extension is loaded and in bytes otherwise.
	 */
	private static function substring($text, $start, $length)
	{
		return function_exists("mb_substr") ? mb_substr($text, $start, $length, "UTF-8") : substr($text, $start, $length);
	}

	/*
	 * @summary Builds a regular index of a page's source.
	 *
	 * @param {string} $source - The source text to index.
	 *
	 * @returns {array} A map of term => [ "freq" => number of occurrences,
	 * 					"offsets" => positions of the occurrences ]. Positions
	 * 					count the indexed terms only: stop words are skipped
	 * 					and do not advance the counter.
	 */
	public static function index($source)
	{
		$source = html_entity_decode($source, ENT_QUOTES, "UTF-8");

		// Flipping the list gives us a constant time lookup per term
		$stop_lookup = array_flip(self::$stop_words);

		$index = [];
		$i = 0;
		foreach (self::tokenize($source) as $nterm)
		{
			// Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
			if (isset($stop_lookup[$nterm])) continue;

			if (!isset($index[$nterm]))
				$index[$nterm] = [ "freq" => 0, "offsets" => [] ];

			$index[$nterm]["freq"]++;
			$index[$nterm]["offsets"][] = $i;

			$i++;
		}
		return $index;
	}

	/*
	 * @summary Splits a string into lowercase terms. Terms are separated by
	 * 			whitespace (along with any punctuation touching it), by
	 * 			punctuation at the very start / end of the string and by pipe
	 * 			characters.
	 *
	 * @param {string} $source - The text to split.
	 *
	 * @returns {array} The list of non-empty terms. Always an array.
	 */
	public static function tokenize($source)
	{
		$source = self::lowercase($source);
		$pattern = '/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/';

		// The u modifier makes \p{P} look at whole characters. Without it
		// single bytes are tested, and some of the trailing bytes of multibyte
		// characters (0xAB, 0xBB, ...) count as punctuation, which cuts
		// characters like "ë" in half.
		$terms = preg_split($pattern . "u", $source, -1, PREG_SPLIT_NO_EMPTY);
		// preg_split() gives up on input that isn't valid UTF-8 - fall back to
		// splitting bytewise rather than losing the text altogether.
		if ($terms === false)
			$terms = preg_split($pattern, $source, -1, PREG_SPLIT_NO_EMPTY);

		return $terms === false ? [] : $terms;
	}

	/*
	 * @summary Removes the most common markdown characters from a string.
	 */
	public static function strip_markup($source)
	{
		return str_replace([ "[", "]", "\"", "*", "_", " - ", "`" ], "", $source);
	}

	/*
	 * @summary Rebuilds the inverted index from the source of every page in
	 * 			the pageindex and saves it to $paths->searchindex.
	 */
	public static function rebuild_invindex()
	{
		global $pageindex, $env, $paths;

		$invindex = [];
		foreach ($pageindex as $pagename => $pagedetails)
		{
			$pagesource = file_get_contents($env->storage_prefix . $pagename . ".md");
			// Don't index pages whose source can't be read
			if ($pagesource === false) continue;

			$index = self::index($pagesource);
			self::merge_into_invindex($invindex, ids::getid($pagename), $index);
		}

		self::save_invindex($paths->searchindex, $invindex);
	}

	/*
	 * @summary Sorts an index alphabetically. Will also sort an inverted index.
	 * 			This allows us to do a binary search instead of a regular
	 * 			sequential search.
	 */
	public static function sort_index(&$index)
	{
		ksort($index, SORT_NATURAL);
	}

	/*
	 * @summary Compares two *regular* indexes to find the differences between them.
	 *
	 * @param {array} $oldindex - The old index.
	 * @param {array} $newindex - The new index.
	 * @param {array} $changed - An array to be filled with the new entries of
	 * 							 all the added / changed nterms, keyed by nterm.
	 * @param {array} $removed - An array to be filled with the nterms of all
	 * 							 the removed entries.
	 */
	public static function compare_indexes($oldindex, $newindex, &$changed, &$removed)
	{
		foreach ($oldindex as $nterm => $entry)
		{
			if (!isset($newindex[$nterm]))
				$removed[] = $nterm;
		}
		foreach ($newindex as $nterm => $entry)
		{
			if (!isset($oldindex[$nterm]) or // If this word is new
				$entry !== $oldindex[$nterm]) // If this word has changed
				$changed[$nterm] = $entry;
		}
	}

	/*
	 * @summary Reads in and parses an inverted index.
	 *
	 * @param {string} $invindex_filename - The file to read the index from.
	 *
	 * @returns {array} The inverted index, or an empty array if the file
	 * 					can't be read or doesn't contain a JSON object / array.
	 */
	// Todo remove this function and make everything streamable
	public static function load_invindex($invindex_filename) {
		if (!is_readable($invindex_filename))
			return [];

		$raw = file_get_contents($invindex_filename);
		if ($raw === false)
			return [];

		$invindex = json_decode($raw, true);
		return is_array($invindex) ? $invindex : [];
	}

	/*
	 * @summary Merge an index into an inverted index.
	 *
	 * @param {array} $invindex - The inverted index to merge into.
	 * @param {number} $pageid - The id of the page that $index belongs to.
	 * @param {array} $index - The new / changed regular index entries.
	 * @param {array} $removals - The nterms that the page no longer contains.
	 */
	public static function merge_into_invindex(&$invindex, $pageid, &$index, &$removals = [])
	{
		// Remove all the subentries that were removed since last time
		foreach ($removals as $nterm)
		{
			unset($invindex[$nterm][$pageid]);
			// Don't leave terms behind that no page contains any more
			if (isset($invindex[$nterm]) && count($invindex[$nterm]) === 0)
				unset($invindex[$nterm]);
		}

		// Merge all the new / changed index entries into the inverted index
		foreach ($index as $nterm => $newentry)
		{
			// If the nterm isn't in the inverted index, then create a space for it
			if (!isset($invindex[$nterm])) $invindex[$nterm] = [];
			$invindex[$nterm][$pageid] = $newentry;

			// Sort the page entries for this word by frequency, highest first
			uasort($invindex[$nterm], function($a, $b) {
				if ($a["freq"] == $b["freq"]) return 0;
				return ($a["freq"] < $b["freq"]) ? +1 : -1;
			});
		}

		// Sort the inverted index by the number of pages each term occurs on,
		// highest first
		uasort($invindex, function($a, $b) {
			$ac = count($a); $bc = count($b);
			if ($ac == $bc) return 0;
			return ($ac < $bc) ? +1 : -1;
		});
	}

	/**
	 * Deletes the given pageid from the given inverted index. Terms that are
	 * left without any pages are removed as well.
	 * @param inverted_index	&$invindex	The inverted index.
	 * @param number			$pageid		The pageid to remove.
	 */
	public static function delete_entry(&$invindex, $pageid)
	{
		// Iterating over a copy of the keys lets us remove terms as we go
		// without holding a reference into the array. PHP stores integer-like
		// string keys as integers, so one unset() covers both 5 and "5".
		foreach (array_keys($invindex) as $nterm)
		{
			unset($invindex[$nterm][$pageid]);
			if (count($invindex[$nterm]) === 0)
				unset($invindex[$nterm]);
		}
	}

	/*
	 * @summary Saves an inverted index to disk as JSON.
	 *
	 * @param {string} $filename - The file to write to.
	 * @param {array} $invindex - The inverted index to save.
	 *
	 * @returns {bool} Whether the index was written.
	 */
	public static function save_invindex($filename, &$invindex)
	{
		$json = json_encode($invindex);
		// Keep the old file rather than overwriting it with nothing
		if ($json === false)
			return false;

		return file_put_contents($filename, $json, LOCK_EX) !== false;
	}

	/*
	 * @summary Searches an inverted index, the page names and the page tags.
	 *
	 * @param {string} $query - The search query.
	 * @param {array} $invindex - The inverted index to search.
	 *
	 * @returns {array} The matching pages keyed by page id, highest rank
	 * 					first. Each one has "pagename", "rank" and "nterms",
	 * 					plus "title-matches" / "tag-matches" where applicable.
	 */
	public static function query_invindex($query, &$invindex)
	{
		global $settings, $pageindex;

		$query_terms = self::tokenize($query);
		$matching_pages = [];

		// Loop over each term in the query and find the matching page entries
		foreach ($query_terms as $qterm)
		{
			// Only search the inverted index if it actually exists there
			if (isset($invindex[$qterm]))
			{
				// Loop over each page in the inverted index entry
				foreach ($invindex[$qterm] as $pageid => $page_entry)
				{
					// Create an entry in the matching pages array if it doesn't exist
					if (!isset($matching_pages[$pageid]))
						$matching_pages[$pageid] = [ "nterms" => [] ];
					$matching_pages[$pageid]["nterms"][$qterm] = $page_entry;
				}
			}

			// Loop over the pageindex and search the titles / tags
			foreach ($pageindex as $pagename => $pagedata)
			{
				// Get the current page's id
				$pageid = ids::getid($pagename);

				// Consider matches in the page title
				if (stripos($pagename, $qterm) !== false)
				{
					// We found the qterm in the title
					if (!isset($matching_pages[$pageid]))
						$matching_pages[$pageid] = [ "nterms" => [] ];
					// Set up a counter for page title matches if it doesn't exist already
					if (!isset($matching_pages[$pageid]["title-matches"]))
						$matching_pages[$pageid]["title-matches"] = 0;

					// The result is checked before counting, as extract_context()
					// has to deal with false coming back from mb_stripos_all()
					$title_offsets = mb_stripos_all($pagename, $qterm);
					$matching_pages[$pageid]["title-matches"] += is_array($title_offsets) ? count($title_offsets) : 0;
				}

				// Consider matches in the page's tags
				if (!isset($pagedata->tags)) continue; // This page doesn't have any tags
				$tagtext = implode(" ", $pagedata->tags);
				if (stripos($tagtext, $qterm) !== false)
				{
					if (!isset($matching_pages[$pageid]))
						$matching_pages[$pageid] = [ "nterms" => [] ];
					// Set up a counter for tag match if there isn't one already
					if (!isset($matching_pages[$pageid]["tag-matches"]))
						$matching_pages[$pageid]["tag-matches"] = 0;

					$tag_offsets = mb_stripos_all($tagtext, $qterm);
					$matching_pages[$pageid]["tag-matches"] += is_array($tag_offsets) ? count($tag_offsets) : 0;
				}
			}
		}

		foreach ($matching_pages as $pageid => &$match)
		{
			$match["pagename"] = ids::getpagename($pageid);
			$match["rank"] = 0;

			foreach ($match["nterms"] as $pterm => $entry)
			{
				$match["rank"] += $entry["freq"];
				// todo rank by context here
			}

			// Consider matches in the title / tags
			if (isset($match["title-matches"]))
				$match["rank"] += $match["title-matches"] * $settings->search_title_matches_weighting;
			if (isset($match["tag-matches"]))
				$match["rank"] += $match["tag-matches"] * $settings->search_tags_matches_weighting;

			// todo remove items if the rank is below a threshold
		}
		// Break the reference so that the last result isn't left aliased
		unset($match);

		// Sort by rank, highest first
		uasort($matching_pages, function($a, $b) {
			if ($a["rank"] == $b["rank"]) return 0;
			return ($a["rank"] < $b["rank"]) ? +1 : -1;
		});

		return $matching_pages;
	}

	/*
	 * @summary Extracts the parts of a source text that surround the terms of
	 * 			a query. Matches that are less than
	 * 			$settings->search_characters_context apart share one snippet.
	 *
	 * @param {string} $query - The search query.
	 * @param {string} $source - The page source to extract the context from.
	 *
	 * @returns {string} The snippets in order of appearance, with the markup
	 * 					 stripped and joined by " ... ". Empty if nothing
	 * 					 matched. The text is NOT html-escaped.
	 */
	public static function extract_context($query, $source)
	{
		global $settings;

		$context_size = $settings->search_characters_context;

		// Find the offsets of every occurrence of every query term
		$offsets = [];
		foreach (self::tokenize($query) as $nterm)
		{
			$term_offsets = mb_stripos_all($source, $nterm);
			// Skip over adding matches if there aren't any
			if (!is_array($term_offsets))
				continue;

			foreach ($term_offsets as $offset)
				$offsets[] = $offset;
		}
		// The grouping below compares each match with the one before it, so
		// the matches have to be in order of appearance.
		sort($offsets);

		$contexts = [];
		$offsets_count = count($offsets);
		$basepos = 0;
		while ($basepos < $offsets_count)
		{
			// The first match of the group - we always store that one
			$group_start = $offsets[$basepos];
			$group_end = $group_start;

			// Take in the following matches for as long as each of them is
			// close enough to the one before it
			$scanpos = $basepos + 1;
			while ($scanpos < $offsets_count &&
				$offsets[$scanpos] - $offsets[$scanpos - 1] < $context_size)
			{
				$group_end = $offsets[$scanpos];
				$scanpos++;
			}

			// A negative start would count from the end of the source
			$context_start = max(0, $group_start - $context_size);
			$context_end = $group_end + $context_size;

			// NOTE(review): assumes that mb_stripos_all() reports character
			// (not byte) offsets, as its name suggests - confirm.
			$context = self::substring($source, $context_start, $context_end - $context_start);

			// Strip the markdown from the context - it's most likely going to
			// be broken anyway.
			$contexts[] = self::strip_markup($context);

			// The match that ended this group starts the next one
			$basepos = $scanpos;
		}

		return implode(" ... ", $contexts);
	}

	/*
	 * @summary Wraps every occurrence of a query term in <strong> tags.
	 *
	 * @param {string} $query - The search query.
	 * @param {string} $context - The text to highlight the terms in. It is
	 * 							  NOT escaped here: html-escape it before
	 * 							  passing it in if it is going into a page.
	 *
	 * @returns {string} The highlighted text.
	 */
	public static function highlight_context($query, $context)
	{
		$qterms = array_unique(self::tokenize($query));
		if (count($qterms) === 0)
			return $context;

		// Longest terms first, so that "foobar" wins over "foo"
		usort($qterms, function($a, $b) {
			return strlen($b) - strlen($a);
		});

		$alternatives = [];
		foreach ($qterms as $qterm)
			$alternatives[] = preg_quote($qterm, "/");

		// All the terms are replaced in a single pass. Doing them one after
		// another would let a later term (e.g. "strong") match inside the
		// tags that were inserted for an earlier one.
		// From http://stackoverflow.com/a/2483859/1460422
		$pattern = "/" . implode("|", $alternatives) . "/i";
		$highlighted = preg_replace($pattern . "u", '<strong>$0</strong>', $context);
		// preg_replace() returns null for text that isn't valid UTF-8
		if ($highlighted === null)
			$highlighted = preg_replace($pattern, '<strong>$0</strong>', $context);

		return $highlighted === null ? $context : $highlighted;
	}
}
2015-10-27 21:10:05 +00:00
?>