mirror of
https://github.com/sbrl/Pepperminty-Wiki.git
synced 2024-11-26 05:32:59 +00:00
Search: Transliterate characters so you don't have to remember the diacritics when searching
This commit is contained in:
parent
bdf47a2540
commit
49b91aa6f9
4 changed files with 83 additions and 21 deletions
|
@ -7,6 +7,8 @@ This file holds the changelog for Pepperminty Wiki. This is the master list of t
|
||||||
- [Module API] Added `save_settings()` convenience method
|
- [Module API] Added `save_settings()` convenience method
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
- Updated the search system to transliterate characters to better support searching pages that are written in other languages.
|
||||||
|
- You'll want to rebuild your search index via the button in the configuration panel, or the `invindex-rebuild` action.
|
||||||
- [Security] Made the site secret generator cryptographically secure. If you created your wiki before this change, you might want to change your site secret in `peppermint.json` to something more secure with a site like [random.org](https://www.random.org/).
|
- [Security] Made the site secret generator cryptographically secure. If you created your wiki before this change, you might want to change your site secret in `peppermint.json` to something more secure with a site like [random.org](https://www.random.org/).
|
||||||
- The PHP function `openssl_random_pseudo_bytes()` was being used before, but [apparently that's not cryptographically secure](https://paragonie.com/blog/2015/07/how-safely-generate-random-strings-and-integers-in-php).
|
- The PHP function `openssl_random_pseudo_bytes()` was being used before, but [apparently that's not cryptographically secure](https://paragonie.com/blog/2015/07/how-safely-generate-random-strings-and-integers-in-php).
|
||||||
- [Module API] Fix `full_url()` logic
|
- [Module API] Fix `full_url()` logic
|
||||||
|
|
|
@ -396,7 +396,7 @@ if($settings->sessionprefix == "auto")
|
||||||
/////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
/** The version of Pepperminty Wiki currently running. */
|
/** The version of Pepperminty Wiki currently running. */
|
||||||
$version = "v0.17-dev";
|
$version = "v0.17-dev";
|
||||||
$commit = "b6eda24adaf3607cf3437be0b8419215b52d662b";
|
$commit = "bdf47a2540bdbb36bd69869fd2a1b90d033d9966";
|
||||||
/// Environment ///
|
/// Environment ///
|
||||||
/** Holds information about the current request environment. */
|
/** Holds information about the current request environment. */
|
||||||
$env = new stdClass();
|
$env = new stdClass();
|
||||||
|
@ -3768,9 +3768,18 @@ register_module([
|
||||||
|
|
||||||
$search_start = microtime(true);
|
$search_start = microtime(true);
|
||||||
|
|
||||||
|
|
||||||
|
$time_start = microtime(true);
|
||||||
$invindex = search::load_invindex($paths->searchindex);
|
$invindex = search::load_invindex($paths->searchindex);
|
||||||
|
$env->perfdata->invindex_decode_time = round((microtime(true) - $time_start)*1000, 3);
|
||||||
|
|
||||||
|
$start = microtime(true);
|
||||||
$results = search::query_invindex($_GET["query"], $invindex);
|
$results = search::query_invindex($_GET["query"], $invindex);
|
||||||
$resultCount = count($results);
|
$resultCount = count($results);
|
||||||
|
$env->perfdata->invindex_query_time = round((microtime(true) - $time_start)*1000, 3);
|
||||||
|
|
||||||
|
header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms");
|
||||||
|
header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
|
||||||
|
|
||||||
foreach($results as &$result) {
|
foreach($results as &$result) {
|
||||||
$result["context"] = search::extract_context(
|
$result["context"] = search::extract_context(
|
||||||
|
@ -4154,6 +4163,7 @@ class search
|
||||||
*/
|
*/
|
||||||
public static function index($source)
|
public static function index($source)
|
||||||
{
|
{
|
||||||
|
// We don't need to normalise or transliterate here because self::tokenize() does this for us
|
||||||
$source = html_entity_decode($source, ENT_QUOTES);
|
$source = html_entity_decode($source, ENT_QUOTES);
|
||||||
$source_length = mb_strlen($source);
|
$source_length = mb_strlen($source);
|
||||||
|
|
||||||
|
@ -4189,7 +4199,13 @@ class search
|
||||||
*/
|
*/
|
||||||
public static function tokenize($source)
|
public static function tokenize($source)
|
||||||
{
|
{
|
||||||
$source = Normalizer::normalize(strtolower($source), Normalizer::FORM_C);
|
/** Normalises input characters for searching & indexing */
|
||||||
|
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||||
|
|
||||||
|
// We don't need to normalise here because the transliterator handles
|
||||||
|
// this for us. Also, we can't move the literator to a static variable
|
||||||
|
// because PHP doesn't like it very much
|
||||||
|
$source = $literator->transliterate($source);
|
||||||
$source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source);
|
$source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source);
|
||||||
return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, PREG_SPLIT_NO_EMPTY);
|
return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, PREG_SPLIT_NO_EMPTY);
|
||||||
}
|
}
|
||||||
|
@ -4213,6 +4229,9 @@ class search
|
||||||
{
|
{
|
||||||
global $pageindex, $env, $paths, $settings;
|
global $pageindex, $env, $paths, $settings;
|
||||||
|
|
||||||
|
/** Normalises input characters for searching & indexing */
|
||||||
|
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||||
|
|
||||||
if($output) {
|
if($output) {
|
||||||
header("content-type: text/event-stream");
|
header("content-type: text/event-stream");
|
||||||
ob_end_flush();
|
ob_end_flush();
|
||||||
|
@ -4234,8 +4253,8 @@ class search
|
||||||
$i++; $missing_files++;
|
$i++; $missing_files++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
$pagesource = Normalizer::normalize(file_get_contents($page_filename), Normalizer::FORM_C);
|
// We do not transliterate or normalise here because the indexer will take care of this for us
|
||||||
$index = self::index($pagesource);
|
$index = self::index(file_get_contents($page_filename));
|
||||||
|
|
||||||
$pageid = ids::getid($pagename);
|
$pageid = ids::getid($pagename);
|
||||||
self::merge_into_invindex($invindex, $pageid, $index);
|
self::merge_into_invindex($invindex, $pageid, $index);
|
||||||
|
@ -4387,6 +4406,9 @@ class search
|
||||||
{
|
{
|
||||||
global $settings, $pageindex;
|
global $settings, $pageindex;
|
||||||
|
|
||||||
|
/** Normalises input characters for searching & indexing */
|
||||||
|
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||||
|
|
||||||
$query_terms = self::tokenize($query);
|
$query_terms = self::tokenize($query);
|
||||||
$matching_pages = [];
|
$matching_pages = [];
|
||||||
|
|
||||||
|
@ -4422,7 +4444,9 @@ class search
|
||||||
// Get the current page's id
|
// Get the current page's id
|
||||||
$pageid = ids::getid($pagename);
|
$pageid = ids::getid($pagename);
|
||||||
// Consider matches in the page title
|
// Consider matches in the page title
|
||||||
if(stripos($pagename, $qterm) !== false)
|
$title_matches = mb_stripos_all($literator->transliterate($pagename), $qterm);
|
||||||
|
$title_matches_count = $title_matches !== false ? count($title_matches) : 0;
|
||||||
|
if($title_matches_count > 0)
|
||||||
{
|
{
|
||||||
// We found the qterm in the title
|
// We found the qterm in the title
|
||||||
if(!isset($matching_pages[$pageid]))
|
if(!isset($matching_pages[$pageid]))
|
||||||
|
@ -4432,12 +4456,14 @@ class search
|
||||||
if(!isset($matching_pages[$pageid]["title-matches"]))
|
if(!isset($matching_pages[$pageid]["title-matches"]))
|
||||||
$matching_pages[$pageid]["title-matches"] = 0;
|
$matching_pages[$pageid]["title-matches"] = 0;
|
||||||
|
|
||||||
$matching_pages[$pageid]["title-matches"] += count(mb_stripos_all($pagename, $qterm)) * strlen($qterm);
|
$matching_pages[$pageid]["title-matches"] += $title_matches_count * strlen($qterm);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Consider matches in the page's tags
|
// Consider matches in the page's tags
|
||||||
if(isset($pagedata->tags) and // If this page has tags
|
$tag_matches = isset($pagedata->tags) ? mb_stripos_all($literator->transliterate(implode(" ", $pagedata->tags)), $qterm) : false;
|
||||||
stripos(implode(" ", $pagedata->tags), $qterm) !== false) // And we found the qterm in the tags
|
$tag_matches_count = $tag_matches !== false ? count($tag_matches) : 0;
|
||||||
|
|
||||||
|
if($tag_matches_count > 0) // And we found the qterm in the tags
|
||||||
{
|
{
|
||||||
if(!isset($matching_pages[$pageid]))
|
if(!isset($matching_pages[$pageid]))
|
||||||
$matching_pages[$pageid] = [ "nterms" => [] ];
|
$matching_pages[$pageid] = [ "nterms" => [] ];
|
||||||
|
@ -4445,7 +4471,7 @@ class search
|
||||||
// Set up a counter for tag match if there isn't one already
|
// Set up a counter for tag match if there isn't one already
|
||||||
if(!isset($matching_pages[$pageid]["tag-matches"]))
|
if(!isset($matching_pages[$pageid]["tag-matches"]))
|
||||||
$matching_pages[$pageid]["tag-matches"] = 0;
|
$matching_pages[$pageid]["tag-matches"] = 0;
|
||||||
$matching_pages[$pageid]["tag-matches"] += count(mb_stripos_all(implode(" ", $pagedata->tags), $qterm)) * strlen($qterm);
|
$matching_pages[$pageid]["tag-matches"] += $tag_matches_count * strlen($qterm);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7047,7 +7073,12 @@ DIFFSCRIPT;
|
||||||
. "</p>");
|
. "</p>");
|
||||||
}
|
}
|
||||||
]);
|
]);
|
||||||
|
/**
|
||||||
|
* Generates a unique hash of a page's content for edit conflict detection
|
||||||
|
* purposes.
|
||||||
|
* @param string $page_data The page text to hash.
|
||||||
|
* @return string A hash of the given page text.
|
||||||
|
*/
|
||||||
function generate_page_hash($page_data) {
|
function generate_page_hash($page_data) {
|
||||||
return sha1($page_data);
|
return sha1($page_data);
|
||||||
}
|
}
|
||||||
|
@ -7744,6 +7775,9 @@ register_module([
|
||||||
}
|
}
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Recalculates and updates the password hashing cost.
|
||||||
|
*/
|
||||||
function do_password_hash_code_update() {
|
function do_password_hash_code_update() {
|
||||||
global $settings, $paths;
|
global $settings, $paths;
|
||||||
|
|
||||||
|
|
|
@ -104,7 +104,7 @@
|
||||||
"author": "Starbeamrainbowlabs",
|
"author": "Starbeamrainbowlabs",
|
||||||
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
|
"description": "Adds proper search functionality to Pepperminty Wiki using an inverted index to provide a full text search engine. If pages don't show up, then you might have hit a stop word. If not, try requesting the `invindex-rebuild` action to rebuild the inverted index from scratch.",
|
||||||
"id": "feature-search",
|
"id": "feature-search",
|
||||||
"lastupdate": 1523105081,
|
"lastupdate": 1529963426,
|
||||||
"optional": false
|
"optional": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -176,7 +176,7 @@
|
||||||
"author": "Starbeamrainbowlabs",
|
"author": "Starbeamrainbowlabs",
|
||||||
"description": "Allows you to edit pages by adding the edit and save actions. You should probably include this one.",
|
"description": "Allows you to edit pages by adding the edit and save actions. You should probably include this one.",
|
||||||
"id": "page-edit",
|
"id": "page-edit",
|
||||||
"lastupdate": 1526037910,
|
"lastupdate": 1527246338,
|
||||||
"optional": false
|
"optional": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -212,7 +212,7 @@
|
||||||
"author": "Starbeamrainbowlabs",
|
"author": "Starbeamrainbowlabs",
|
||||||
"description": "Adds a pair of actions (login and checklogin) that allow users to login. You need this one if you want your users to be able to login.",
|
"description": "Adds a pair of actions (login and checklogin) that allow users to login. You need this one if you want your users to be able to login.",
|
||||||
"id": "page-login",
|
"id": "page-login",
|
||||||
"lastupdate": 1526227977,
|
"lastupdate": 1527246396,
|
||||||
"optional": false
|
"optional": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -125,9 +125,18 @@ register_module([
|
||||||
|
|
||||||
$search_start = microtime(true);
|
$search_start = microtime(true);
|
||||||
|
|
||||||
|
|
||||||
|
$time_start = microtime(true);
|
||||||
$invindex = search::load_invindex($paths->searchindex);
|
$invindex = search::load_invindex($paths->searchindex);
|
||||||
|
$env->perfdata->invindex_decode_time = round((microtime(true) - $time_start)*1000, 3);
|
||||||
|
|
||||||
|
$start = microtime(true);
|
||||||
$results = search::query_invindex($_GET["query"], $invindex);
|
$results = search::query_invindex($_GET["query"], $invindex);
|
||||||
$resultCount = count($results);
|
$resultCount = count($results);
|
||||||
|
$env->perfdata->invindex_query_time = round((microtime(true) - $time_start)*1000, 3);
|
||||||
|
|
||||||
|
header("x-invindex-decode-time: {$env->perfdata->invindex_decode_time}ms");
|
||||||
|
header("x-invindex-query-time: {$env->perfdata->invindex_query_time}ms");
|
||||||
|
|
||||||
foreach($results as &$result) {
|
foreach($results as &$result) {
|
||||||
$result["context"] = search::extract_context(
|
$result["context"] = search::extract_context(
|
||||||
|
@ -511,6 +520,7 @@ class search
|
||||||
*/
|
*/
|
||||||
public static function index($source)
|
public static function index($source)
|
||||||
{
|
{
|
||||||
|
// We don't need to normalise or transliterate here because self::tokenize() does this for us
|
||||||
$source = html_entity_decode($source, ENT_QUOTES);
|
$source = html_entity_decode($source, ENT_QUOTES);
|
||||||
$source_length = mb_strlen($source);
|
$source_length = mb_strlen($source);
|
||||||
|
|
||||||
|
@ -546,7 +556,13 @@ class search
|
||||||
*/
|
*/
|
||||||
public static function tokenize($source)
|
public static function tokenize($source)
|
||||||
{
|
{
|
||||||
$source = Normalizer::normalize(strtolower($source), Normalizer::FORM_C);
|
/** Normalises input characters for searching & indexing */
|
||||||
|
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||||
|
|
||||||
|
// We don't need to normalise here because the transliterator handles
|
||||||
|
// this for us. Also, we can't move the literator to a static variable
|
||||||
|
// because PHP doesn't like it very much
|
||||||
|
$source = $literator->transliterate($source);
|
||||||
$source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source);
|
$source = preg_replace('/[\[\]\|\{\}\/]/u', " ", $source);
|
||||||
return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, PREG_SPLIT_NO_EMPTY);
|
return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/u", $source, -1, PREG_SPLIT_NO_EMPTY);
|
||||||
}
|
}
|
||||||
|
@ -570,6 +586,9 @@ class search
|
||||||
{
|
{
|
||||||
global $pageindex, $env, $paths, $settings;
|
global $pageindex, $env, $paths, $settings;
|
||||||
|
|
||||||
|
/** Normalises input characters for searching & indexing */
|
||||||
|
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||||
|
|
||||||
if($output) {
|
if($output) {
|
||||||
header("content-type: text/event-stream");
|
header("content-type: text/event-stream");
|
||||||
ob_end_flush();
|
ob_end_flush();
|
||||||
|
@ -591,8 +610,8 @@ class search
|
||||||
$i++; $missing_files++;
|
$i++; $missing_files++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
$pagesource = Normalizer::normalize(file_get_contents($page_filename), Normalizer::FORM_C);
|
// We do not transliterate or normalise here because the indexer will take care of this for us
|
||||||
$index = self::index($pagesource);
|
$index = self::index(file_get_contents($page_filename));
|
||||||
|
|
||||||
$pageid = ids::getid($pagename);
|
$pageid = ids::getid($pagename);
|
||||||
self::merge_into_invindex($invindex, $pageid, $index);
|
self::merge_into_invindex($invindex, $pageid, $index);
|
||||||
|
@ -744,6 +763,9 @@ class search
|
||||||
{
|
{
|
||||||
global $settings, $pageindex;
|
global $settings, $pageindex;
|
||||||
|
|
||||||
|
/** Normalises input characters for searching & indexing */
|
||||||
|
static $literator; $literator = Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', Transliterator::FORWARD);
|
||||||
|
|
||||||
$query_terms = self::tokenize($query);
|
$query_terms = self::tokenize($query);
|
||||||
$matching_pages = [];
|
$matching_pages = [];
|
||||||
|
|
||||||
|
@ -779,7 +801,9 @@ class search
|
||||||
// Get the current page's id
|
// Get the current page's id
|
||||||
$pageid = ids::getid($pagename);
|
$pageid = ids::getid($pagename);
|
||||||
// Consider matches in the page title
|
// Consider matches in the page title
|
||||||
if(stripos($pagename, $qterm) !== false)
|
$title_matches = mb_stripos_all($literator->transliterate($pagename), $qterm);
|
||||||
|
$title_matches_count = $title_matches !== false ? count($title_matches) : 0;
|
||||||
|
if($title_matches_count > 0)
|
||||||
{
|
{
|
||||||
// We found the qterm in the title
|
// We found the qterm in the title
|
||||||
if(!isset($matching_pages[$pageid]))
|
if(!isset($matching_pages[$pageid]))
|
||||||
|
@ -789,12 +813,14 @@ class search
|
||||||
if(!isset($matching_pages[$pageid]["title-matches"]))
|
if(!isset($matching_pages[$pageid]["title-matches"]))
|
||||||
$matching_pages[$pageid]["title-matches"] = 0;
|
$matching_pages[$pageid]["title-matches"] = 0;
|
||||||
|
|
||||||
$matching_pages[$pageid]["title-matches"] += count(mb_stripos_all($pagename, $qterm)) * strlen($qterm);
|
$matching_pages[$pageid]["title-matches"] += $title_matches_count * strlen($qterm);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Consider matches in the page's tags
|
// Consider matches in the page's tags
|
||||||
if(isset($pagedata->tags) and // If this page has tags
|
$tag_matches = isset($pagedata->tags) ? mb_stripos_all($literator->transliterate(implode(" ", $pagedata->tags)), $qterm) : false;
|
||||||
stripos(implode(" ", $pagedata->tags), $qterm) !== false) // And we found the qterm in the tags
|
$tag_matches_count = $tag_matches !== false ? count($tag_matches) : 0;
|
||||||
|
|
||||||
|
if($tag_matches_count > 0) // And we found the qterm in the tags
|
||||||
{
|
{
|
||||||
if(!isset($matching_pages[$pageid]))
|
if(!isset($matching_pages[$pageid]))
|
||||||
$matching_pages[$pageid] = [ "nterms" => [] ];
|
$matching_pages[$pageid] = [ "nterms" => [] ];
|
||||||
|
@ -802,7 +828,7 @@ class search
|
||||||
// Set up a counter for tag match if there isn't one already
|
// Set up a counter for tag match if there isn't one already
|
||||||
if(!isset($matching_pages[$pageid]["tag-matches"]))
|
if(!isset($matching_pages[$pageid]["tag-matches"]))
|
||||||
$matching_pages[$pageid]["tag-matches"] = 0;
|
$matching_pages[$pageid]["tag-matches"] = 0;
|
||||||
$matching_pages[$pageid]["tag-matches"] += count(mb_stripos_all(implode(" ", $pagedata->tags), $qterm)) * strlen($qterm);
|
$matching_pages[$pageid]["tag-matches"] += $tag_matches_count * strlen($qterm);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue