Search: Transliterate characters so you don't have to remember the diacritics when searching

This commit is contained in:
Starbeamrainbowlabs 2018-06-25 22:53:53 +01:00
parent bdf47a2540
commit 49b91aa6f9
Signed by: sbrl
GPG Key ID: 1BE5172E637709C2
4 changed files with 83 additions and 21 deletions

View File

@ -7,6 +7,8 @@ This file holds the changelog for Pepperminty Wiki. This is the master list of t
- [Module API] Added `save_settings()` convenience method
### Fixed
- Updated the search system to transliterate characters to better support searching pages that are written in other languages.
- You'll want to rebuild your search index via the button in the configuration panel, or the `invindex-rebuild` action.
- [Security] Made the site secret generator cryptographically secure. If you created your wiki before this change, you might want to change your site secret in `peppermint.json` to something more secure with a site like [random.org](https://www.random.org/).
- The PHP function `openssl_random_pseudo_bytes()` was being used before, but [apparently that's not cryptographically secure](https://paragonie.com/blog/2015/07/how-safely-generate-random-strings-and-integers-in-php).
- [Module API] Fix `full_url()` logic

View File

@ -396,7 +396,7 @@ if($settings->sessionprefix == "auto")
/////////////////////////////////////////////////////////////////////////////
/** The version of Pepperminty Wiki currently running. */
$version = "v0.17-dev";
$commit = "b6eda24adaf3607cf3437be0b8419215b52d662b";
$commit = "bdf47a2540bdbb36bd69869fd2a1b90d033d9966";
/// Environment ///
/** Holds information about the current request environment. */
$env = new stdClass();
@ -3768,9 +3768,18 @@ register_module([
$search_start = microtime(true);
$time_start = microtime(true);
$invindex = search::load_invindex($paths->searchindex);
$env->perfdata->invindex_decode_time = round((microtime(true) - $time_start)*1000, 3);
$start = microtime(true);
$results = search::query_invindex($_GET["query"], $invindex);
$resultCount = count($results);
$env->perfdata->invindex_query_time = round((microtime(true) - $time_start)*1000, 3);
header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms");
header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
foreach($results as &$result) {
$result["context"] = search::extract_context(
@ -4154,6 +4163,7 @@ class search
*/
public static function index($source)
{
// We don't need to normalise or transliterate here because self::tokenize() does this for us
$source = html_entity_decode($source, ENT_QUOTES);
$source_length = mb_strlen($source);
@ -4189,7 +4199,13 @@ class search
*/
public static function tokenize($source)
{
	/**
	 * Normalises input characters for searching & indexing.
	 * Shared across calls: compiling the ICU rule set is comparatively
	 * expensive and this method runs for every page and every query.
	 */
	static $literator = null;
	if($literator === null) {
		// Static initialisers must be constant expressions before PHP 8.3,
		// so the transliterator is built lazily on first use and then reused.
		$literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
	}
	
	// No separate strtolower() / Normalizer pass is needed: the rule set
	// above already lowercases (Lower()) and normalises (NFC) the text.
	// strtolower() is byte-wise and must not be run over UTF-8 input.
	$source = $literator->transliterate($source);
	
	// Treat wiki markup delimiters ([ ] | { } /) as token separators
	$source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source);
	
	// Split on whitespace (swallowing any adjacent punctuation), on leading
	// or trailing punctuation, and on pipes; empty tokens are discarded.
	return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, PREG_SPLIT_NO_EMPTY);
}
@ -4213,6 +4229,9 @@ class search
{
global $pageindex, $env, $paths, $settings;
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
if($output) {
header("content-type: text/event-stream");
ob_end_flush();
@ -4234,8 +4253,8 @@ class search
$i++; $missing_files++;
continue;
}
$pagesource = Normalizer::normalize(file_get_contents($page_filename), Normalizer::FORM_C);
$index = self::index($pagesource);
// We do not transliterate or normalise here because the indexer will take care of this for us
$index = self::index(file_get_contents($page_filename));
$pageid = ids::getid($pagename);
self::merge_into_invindex($invindex, $pageid, $index);
@ -4387,6 +4406,9 @@ class search
{
global $settings, $pageindex;
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
$query_terms = self::tokenize($query);
$matching_pages = [];
@ -4422,7 +4444,9 @@ class search
// Get the current page's id
$pageid = ids::getid($pagename);
// Consider matches in the page title
if(stripos($pagename, $qterm) !== false)
$title_matches = mb_stripos_all($literator->transliterate($pagename), $qterm);
$title_matches_count = $title_matches !== false ? count($title_matches) : 0;
if($title_matches_count > 0)
{
// We found the qterm in the title
if(!isset($matching_pages[$pageid]))
@ -4432,12 +4456,14 @@ class search
if(!isset($matching_pages[$pageid]["title-matches"]))
$matching_pages[$pageid]["title-matches"] = 0;
$matching_pages[$pageid]["title-matches"] += count(mb_stripos_all($pagename, $qterm)) * strlen($qterm);
$matching_pages[$pageid]["title-matches"] += $title_matches_count * strlen($qterm);
}
// Consider matches in the page's tags
if(isset($pagedata->tags) and // If this page has tags
stripos(implode(" ", $pagedata->tags), $qterm) !== false) // And we found the qterm in the tags
$tag_matches = isset($pagedata->tags) ? mb_stripos_all($literator->transliterate(implode(" ", $pagedata->tags)), $qterm) : false;
$tag_matches_count = $tag_matches !== false ? count($tag_matches) : 0;
if($tag_matches_count > 0) // And we found the qterm in the tags
{
if(!isset($matching_pages[$pageid]))
$matching_pages[$pageid] = [ "nterms" => [] ];
@ -4445,7 +4471,7 @@ class search
// Set up a counter for tag match if there isn't one already
if(!isset($matching_pages[$pageid]["tag-matches"]))
$matching_pages[$pageid]["tag-matches"] = 0;
$matching_pages[$pageid]["tag-matches"] += count(mb_stripos_all(implode(" ", $pagedata->tags), $qterm)) * strlen($qterm);
$matching_pages[$pageid]["tag-matches"] += $tag_matches_count * strlen($qterm);
}
}
}
@ -7047,7 +7073,12 @@ DIFFSCRIPT;
. "</p>");
}
]);
/**
 * Computes the fingerprint of a page's text that is used to detect
 * edit conflicts.
 * @param	string	$page_data	The raw page text to fingerprint.
 * @return	string	The SHA-1 digest of the page text, as a hexadecimal string.
 */
function generate_page_hash($page_data) {
	$digest = sha1($page_data);
	return $digest;
}
@ -7744,6 +7775,9 @@ register_module([
}
]);
/**
* Recalculates and updates the password hashing cost.
*/
function do_password_hash_code_update() {
global $settings, $paths;

View File

@ -104,7 +104,7 @@
"author": "Starbeamrainbowlabs",
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
"id": "feature-search",
"lastupdate": 1523105081,
"lastupdate": 1529963426,
"optional": false
},
{
@ -176,7 +176,7 @@
"author": "Starbeamrainbowlabs",
"description": "Allows you to edit pages by adding the edit and save actions. You should probably include this one.",
"id": "page-edit",
"lastupdate": 1526037910,
"lastupdate": 1527246338,
"optional": false
},
{
@ -212,7 +212,7 @@
"author": "Starbeamrainbowlabs",
"description": "Adds a pair of actions (login and checklogin) that allow users to login. You need this one if you want your users to be able to login.",
"id": "page-login",
"lastupdate": 1526227977,
"lastupdate": 1527246396,
"optional": false
},
{

View File

@ -125,9 +125,18 @@ register_module([
$search_start = microtime(true);
$time_start = microtime(true);
$invindex = search::load_invindex($paths->searchindex);
$env->perfdata->invindex_decode_time = round((microtime(true) - $time_start)*1000, 3);
$start = microtime(true);
$results = search::query_invindex($_GET["query"], $invindex);
$resultCount = count($results);
$env->perfdata->invindex_query_time = round((microtime(true) - $time_start)*1000, 3);
header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms");
header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
foreach($results as &$result) {
$result["context"] = search::extract_context(
@ -511,6 +520,7 @@ class search
*/
public static function index($source)
{
// We don't need to normalise or transliterate here because self::tokenize() does this for us
$source = html_entity_decode($source, ENT_QUOTES);
$source_length = mb_strlen($source);
@ -546,7 +556,13 @@ class search
*/
public static function tokenize($source)
{
	/**
	 * Normalises input characters for searching & indexing.
	 * Shared across calls: compiling the ICU rule set is comparatively
	 * expensive and this method runs for every page and every query.
	 */
	static $literator = null;
	if($literator === null) {
		// Static initialisers must be constant expressions before PHP 8.3,
		// so the transliterator is built lazily on first use and then reused.
		$literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
	}
	
	// No separate strtolower() / Normalizer pass is needed: the rule set
	// above already lowercases (Lower()) and normalises (NFC) the text.
	// strtolower() is byte-wise and must not be run over UTF-8 input.
	$source = $literator->transliterate($source);
	
	// Treat wiki markup delimiters ([ ] | { } /) as token separators
	$source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source);
	
	// Split on whitespace (swallowing any adjacent punctuation), on leading
	// or trailing punctuation, and on pipes; empty tokens are discarded.
	return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, PREG_SPLIT_NO_EMPTY);
}
@ -570,6 +586,9 @@ class search
{
global $pageindex, $env, $paths, $settings;
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
if($output) {
header("content-type: text/event-stream");
ob_end_flush();
@ -591,8 +610,8 @@ class search
$i++; $missing_files++;
continue;
}
$pagesource = Normalizer::normalize(file_get_contents($page_filename), Normalizer::FORM_C);
$index = self::index($pagesource);
// We do not transliterate or normalise here because the indexer will take care of this for us
$index = self::index(file_get_contents($page_filename));
$pageid = ids::getid($pagename);
self::merge_into_invindex($invindex, $pageid, $index);
@ -744,6 +763,9 @@ class search
{
global $settings, $pageindex;
/** Normalises input characters for searching & indexing */
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
$query_terms = self::tokenize($query);
$matching_pages = [];
@ -779,7 +801,9 @@ class search
// Get the current page's id
$pageid = ids::getid($pagename);
// Consider matches in the page title
if(stripos($pagename, $qterm) !== false)
$title_matches = mb_stripos_all($literator->transliterate($pagename), $qterm);
$title_matches_count = $title_matches !== false ? count($title_matches) : 0;
if($title_matches_count > 0)
{
// We found the qterm in the title
if(!isset($matching_pages[$pageid]))
@ -789,12 +813,14 @@ class search
if(!isset($matching_pages[$pageid]["title-matches"]))
$matching_pages[$pageid]["title-matches"] = 0;
$matching_pages[$pageid]["title-matches"] += count(mb_stripos_all($pagename, $qterm)) * strlen($qterm);
$matching_pages[$pageid]["title-matches"] += $title_matches_count * strlen($qterm);
}
// Consider matches in the page's tags
if(isset($pagedata->tags) and // If this page has tags
stripos(implode(" ", $pagedata->tags), $qterm) !== false) // And we found the qterm in the tags
$tag_matches = isset($pagedata->tags) ? mb_stripos_all($literator->transliterate(implode(" ", $pagedata->tags)), $qterm) : false;
$tag_matches_count = $tag_matches !== false ? count($tag_matches) : 0;
if($tag_matches_count > 0) // And we found the qterm in the tags
{
if(!isset($matching_pages[$pageid]))
$matching_pages[$pageid] = [ "nterms" => [] ];
@ -802,7 +828,7 @@ class search
// Set up a counter for tag match if there isn't one already
if(!isset($matching_pages[$pageid]["tag-matches"]))
$matching_pages[$pageid]["tag-matches"] = 0;
$matching_pages[$pageid]["tag-matches"] += count(mb_stripos_all(implode(" ", $pagedata->tags), $qterm)) * strlen($qterm);
$matching_pages[$pageid]["tag-matches"] += $tag_matches_count * strlen($qterm);
}
}
}