mirror of
https://github.com/sbrl/Pepperminty-Wiki.git
synced 2024-11-25 17:23:00 +00:00
begin writing the search engine itself O.o
This commit is contained in:
parent
2307ceaa9d
commit
e016c5f9a7
3 changed files with 157 additions and 7 deletions
|
@ -1336,6 +1336,19 @@ register_module([
|
||||||
|
|
||||||
var_dump($index);
|
var_dump($index);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Action "invindex-rebuild": regenerates the whole inverted search index
// by delegating to search::rebuild_invindex().
add_action(
	"invindex-rebuild",
	function() {
		search::rebuild_invindex();
	}
);
|
||||||
|
|
||||||
|
// Action "search": entry point for search requests.
// At present it only validates that a query was supplied; the actual
// search / result rendering has not been written yet.
add_action("search", function() {
	global $settings;
	
	// Reject requests with a missing, non-string (e.g. ?query[]=x) or blank
	// query and show an error page instead.
	if(!isset($_GET["query"]) || !is_string($_GET["query"]) || trim($_GET["query"]) === "")
	{
		// Note: this must be $settings->sitename. Writing $settings->$sitename
		// inside a double-quoted string interpolates the $settings object
		// itself followed by an undefined $sitename variable.
		exit(page_renderer::render("No Search Terms - Error - $settings->sitename", "<p>You didn't specify any search terms. Try typing some into the box above.</p>"));
	}
});
|
||||||
}
|
}
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
@ -1386,7 +1399,7 @@ class search
|
||||||
"why", "will", "with", "within", "without", "would", "yet", "you",
|
"why", "will", "with", "within", "without", "would", "yet", "you",
|
||||||
"your", "yours", "yourself", "yourselves"
|
"your", "yours", "yourself", "yourselves"
|
||||||
];
|
];
|
||||||
|
|
||||||
public static function index($source)
|
public static function index($source)
|
||||||
{
|
{
|
||||||
$source = html_entity_decode($source, ENT_QUOTES);
|
$source = html_entity_decode($source, ENT_QUOTES);
|
||||||
|
@ -1395,11 +1408,11 @@ class search
|
||||||
$index = [];
|
$index = [];
|
||||||
|
|
||||||
// Regex from
|
// Regex from
|
||||||
$terms = preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))/", $source, -1, PREG_SPLIT_NO_EMPTY);
|
$terms = self::tokenize($source);
|
||||||
$i = 0;
|
$i = 0;
|
||||||
foreach($terms as $term)
|
foreach($terms as $term)
|
||||||
{
|
{
|
||||||
$nterm = strtolower($term);
|
$nterm = $term;
|
||||||
|
|
||||||
// Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
|
// Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
|
||||||
if(in_array($nterm, self::$stop_words)) continue;
|
if(in_array($nterm, self::$stop_words)) continue;
|
||||||
|
@ -1418,6 +1431,28 @@ class search
|
||||||
return $index;
|
return $index;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * @summary Splits a string into lowercase search terms.
 *
 * @param	string	$source	The text to split up.
 *
 * @returns	array	A list of lowercased terms. Whitespace, the punctuation
 * 					that surrounds it, leading / trailing punctuation and
 * 					pipe characters all act as separators. Empty terms are
 * 					dropped.
 */
public static function tokenize($source)
{
	// Lowercase in a multibyte-aware way when mbstring is available;
	// strtolower() works on single bytes only.
	$source = function_exists("mb_strtolower")
		? mb_strtolower($source, "UTF-8")
		: strtolower($source);
	
	$pattern = '((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|';
	
	// The u modifier makes \p{P} test whole UTF-8 characters. Without it
	// individual bytes are tested, and continuation bytes such as 0xA1
	// count as punctuation, chopping multibyte letters in half.
	$terms = preg_split("/$pattern/u", $source, -1, PREG_SPLIT_NO_EMPTY);
	
	// preg_split() returns false in UTF-8 mode if the input isn't valid
	// UTF-8 - fall back to splitting byte-wise in that case.
	if($terms === false)
		$terms = preg_split("/$pattern/", $source, -1, PREG_SPLIT_NO_EMPTY);
	
	return ($terms === false) ? [] : $terms;
}
|
||||||
|
|
||||||
|
/*
 * @summary Rebuilds the inverted index from scratch.
 *
 * Reads "<pagename>.md" for every key of the global $pageindex, indexes
 * its contents, merges the result into a fresh inverted index under the
 * page's id and finally saves the lot to "invindex.json".
 */
public static function rebuild_invindex()
{
	global $pageindex;
	
	$invindex = [];
	foreach($pageindex as $pagename => $pagedetails)
	{
		$pagesource = file_get_contents("$pagename.md");
		
		// file_get_contents() returns false (and raises a warning) if the
		// page's file can't be read. Skip such pages rather than indexing
		// the boolean false as though it were page content.
		if($pagesource === false)
			continue;
		
		$index = self::index($pagesource);
		
		self::merge_into_invindex($invindex, ids::getid($pagename), $index);
	}
	
	self::save_invindex("invindex.json", $invindex);
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* @summary Sorts an index alphabetically. Will also sort an inverted index.
|
* @summary Sorts an index alphabetically. Will also sort an inverted index.
|
||||||
* This allows us to do a binary search instead of a regular
|
* This allows us to do a binary search instead of a regular
|
||||||
|
@ -1475,6 +1510,12 @@ class search
|
||||||
// If the nterm isn't in the inverted index, then create a space for it
|
// If the nterm isn't in the inverted index, then create a space for it
|
||||||
if(!isset($invindex[$nterm])) $invindex[$nterm] = [];
|
if(!isset($invindex[$nterm])) $invindex[$nterm] = [];
|
||||||
$invindex[$nterm][$pageid] = $newentry;
|
$invindex[$nterm][$pageid] = $newentry;
|
||||||
|
|
||||||
|
// Sort the page entries for this word by frequency
|
||||||
|
uasort($invindex[$nterm], function($a, $b) {
|
||||||
|
if($a["freq"] == $b["freq"]) return 0;
|
||||||
|
return ($a["freq"] < $b["freq"]) ? +1 : -1;
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1482,6 +1523,27 @@ class search
|
||||||
{
|
{
|
||||||
file_put_contents($filename, json_encode($invindex));
|
file_put_contents($filename, json_encode($invindex));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * @summary Searches an inverted index for the pages matching a query.
 *
 * @param	string	$query		The search query.
 * @param	array	$invindex	The inverted index to search, in the form
 * 								term => [ pageid => entry ], where every
 * 								entry has a "freq" value.
 *
 * @returns	array	The matching pages, keyed by page id and ordered by
 * 					descending rank. Each value maps the matched query
 * 					terms to their index entries and additionally holds
 * 					a "rank" key (the sum of the matched terms' "freq"
 * 					values).
 */
public static function search_invindex($query, &$invindex)
{
	$query_terms = self::tokenize($query);
	$matching_pages = [];
	
	// Nothing to look up if the query couldn't be split into terms
	if(!is_array($query_terms))
		return $matching_pages;
	
	// Loop over each term in the query and find the matching page entries
	foreach($query_terms as $qterm)
	{
		// Skip over this term if it isn't in the inverted index
		if(!isset($invindex[$qterm]))
			continue;
		
		// Loop over each page that contains this term
		foreach($invindex[$qterm] as $pageid => $page_entry)
		{
			// Create an entry in the matching pages array if it doesn't exist
			if(!isset($matching_pages[$pageid]))
				$matching_pages[$pageid] = [];
			$matching_pages[$pageid][$qterm] = $page_entry;
		}
	}
	
	// Work out each page's rank. The rank is summed up *before* it is
	// stored under the "rank" key so that the loop only ever sees real
	// term entries.
	foreach($matching_pages as $pageid => $pagedata)
	{
		$rank = 0;
		foreach($pagedata as $pterm => $entry)
		{
			$rank += $entry["freq"];
			
			// todo rank by context here
		}
		$matching_pages[$pageid]["rank"] = $rank;
		
		// todo remove items if the rank is below a threshold
	}
	
	// Highest ranking pages first
	uasort($matching_pages, function($a, $b) {
		if($a["rank"] == $b["rank"]) return 0;
		return ($a["rank"] < $b["rank"]) ? +1 : -1;
	});
	
	return $matching_pages;
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -50,7 +50,7 @@
|
||||||
"author": "Starbeamrainbowlabs",
|
"author": "Starbeamrainbowlabs",
|
||||||
"description": "Adds proper search functionality to Pepperminty Wiki. Note that this module, at the moment, just contains test code while I figure out how best to write a search engine.",
|
"description": "Adds proper search functionality to Pepperminty Wiki. Note that this module, at the moment, just contains test code while I figure out how best to write a search engine.",
|
||||||
"id": "feature-search",
|
"id": "feature-search",
|
||||||
"lastupdate": 1446042661,
|
"lastupdate": 1446059639,
|
||||||
"optional": false
|
"optional": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -22,6 +22,19 @@ register_module([
|
||||||
|
|
||||||
var_dump($index);
|
var_dump($index);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Action "invindex-rebuild": regenerates the whole inverted search index
// by delegating to search::rebuild_invindex().
add_action(
	"invindex-rebuild",
	function() {
		search::rebuild_invindex();
	}
);
|
||||||
|
|
||||||
|
// Action "search": entry point for search requests.
// At present it only validates that a query was supplied; the actual
// search / result rendering has not been written yet.
add_action("search", function() {
	global $settings;
	
	// Reject requests with a missing, non-string (e.g. ?query[]=x) or blank
	// query and show an error page instead.
	if(!isset($_GET["query"]) || !is_string($_GET["query"]) || trim($_GET["query"]) === "")
	{
		// Note: this must be $settings->sitename. Writing $settings->$sitename
		// inside a double-quoted string interpolates the $settings object
		// itself followed by an undefined $sitename variable.
		exit(page_renderer::render("No Search Terms - Error - $settings->sitename", "<p>You didn't specify any search terms. Try typing some into the box above.</p>"));
	}
});
|
||||||
}
|
}
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
@ -72,7 +85,7 @@ class search
|
||||||
"why", "will", "with", "within", "without", "would", "yet", "you",
|
"why", "will", "with", "within", "without", "would", "yet", "you",
|
||||||
"your", "yours", "yourself", "yourselves"
|
"your", "yours", "yourself", "yourselves"
|
||||||
];
|
];
|
||||||
|
|
||||||
public static function index($source)
|
public static function index($source)
|
||||||
{
|
{
|
||||||
$source = html_entity_decode($source, ENT_QUOTES);
|
$source = html_entity_decode($source, ENT_QUOTES);
|
||||||
|
@ -81,11 +94,11 @@ class search
|
||||||
$index = [];
|
$index = [];
|
||||||
|
|
||||||
// Regex from
|
// Regex from
|
||||||
$terms = preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))/", $source, -1, PREG_SPLIT_NO_EMPTY);
|
$terms = self::tokenize($source);
|
||||||
$i = 0;
|
$i = 0;
|
||||||
foreach($terms as $term)
|
foreach($terms as $term)
|
||||||
{
|
{
|
||||||
$nterm = strtolower($term);
|
$nterm = $term;
|
||||||
|
|
||||||
// Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
|
// Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
|
||||||
if(in_array($nterm, self::$stop_words)) continue;
|
if(in_array($nterm, self::$stop_words)) continue;
|
||||||
|
@ -104,6 +117,28 @@ class search
|
||||||
return $index;
|
return $index;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * @summary Splits a string into lowercase search terms.
 *
 * @param	string	$source	The text to split up.
 *
 * @returns	array	A list of lowercased terms. Whitespace, the punctuation
 * 					that surrounds it, leading / trailing punctuation and
 * 					pipe characters all act as separators. Empty terms are
 * 					dropped.
 */
public static function tokenize($source)
{
	// Lowercase in a multibyte-aware way when mbstring is available;
	// strtolower() works on single bytes only.
	$source = function_exists("mb_strtolower")
		? mb_strtolower($source, "UTF-8")
		: strtolower($source);
	
	$pattern = '((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|';
	
	// The u modifier makes \p{P} test whole UTF-8 characters. Without it
	// individual bytes are tested, and continuation bytes such as 0xA1
	// count as punctuation, chopping multibyte letters in half.
	$terms = preg_split("/$pattern/u", $source, -1, PREG_SPLIT_NO_EMPTY);
	
	// preg_split() returns false in UTF-8 mode if the input isn't valid
	// UTF-8 - fall back to splitting byte-wise in that case.
	if($terms === false)
		$terms = preg_split("/$pattern/", $source, -1, PREG_SPLIT_NO_EMPTY);
	
	return ($terms === false) ? [] : $terms;
}
|
||||||
|
|
||||||
|
/*
 * @summary Rebuilds the inverted index from scratch.
 *
 * Reads "<pagename>.md" for every key of the global $pageindex, indexes
 * its contents, merges the result into a fresh inverted index under the
 * page's id and finally saves the lot to "invindex.json".
 */
public static function rebuild_invindex()
{
	global $pageindex;
	
	$invindex = [];
	foreach($pageindex as $pagename => $pagedetails)
	{
		$pagesource = file_get_contents("$pagename.md");
		
		// file_get_contents() returns false (and raises a warning) if the
		// page's file can't be read. Skip such pages rather than indexing
		// the boolean false as though it were page content.
		if($pagesource === false)
			continue;
		
		$index = self::index($pagesource);
		
		self::merge_into_invindex($invindex, ids::getid($pagename), $index);
	}
	
	self::save_invindex("invindex.json", $invindex);
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* @summary Sorts an index alphabetically. Will also sort an inverted index.
|
* @summary Sorts an index alphabetically. Will also sort an inverted index.
|
||||||
* This allows us to do a binary search instead of a regular
|
* This allows us to do a binary search instead of a regular
|
||||||
|
@ -161,6 +196,12 @@ class search
|
||||||
// If the nterm isn't in the inverted index, then create a space for it
|
// If the nterm isn't in the inverted index, then create a space for it
|
||||||
if(!isset($invindex[$nterm])) $invindex[$nterm] = [];
|
if(!isset($invindex[$nterm])) $invindex[$nterm] = [];
|
||||||
$invindex[$nterm][$pageid] = $newentry;
|
$invindex[$nterm][$pageid] = $newentry;
|
||||||
|
|
||||||
|
// Sort the page entries for this word by frequency
|
||||||
|
uasort($invindex[$nterm], function($a, $b) {
|
||||||
|
if($a["freq"] == $b["freq"]) return 0;
|
||||||
|
return ($a["freq"] < $b["freq"]) ? +1 : -1;
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -168,6 +209,53 @@ class search
|
||||||
{
|
{
|
||||||
file_put_contents($filename, json_encode($invindex));
|
file_put_contents($filename, json_encode($invindex));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * @summary Searches an inverted index for the pages matching a query.
 *
 * @param	string	$query		The search query.
 * @param	array	$invindex	The inverted index to search, in the form
 * 								term => [ pageid => entry ], where every
 * 								entry has a "freq" value.
 *
 * @returns	array	The matching pages, keyed by page id and ordered by
 * 					descending rank. Each value maps the matched query
 * 					terms to their index entries and additionally holds
 * 					a "rank" key (the sum of the matched terms' "freq"
 * 					values).
 */
public static function search_invindex($query, &$invindex)
{
	$query_terms = self::tokenize($query);
	$matching_pages = [];
	
	// Nothing to look up if the query couldn't be split into terms
	if(!is_array($query_terms))
		return $matching_pages;
	
	// Loop over each term in the query and find the matching page entries
	foreach($query_terms as $qterm)
	{
		// Skip over this term if it isn't in the inverted index
		if(!isset($invindex[$qterm]))
			continue;
		
		// Loop over each page that contains this term
		foreach($invindex[$qterm] as $pageid => $page_entry)
		{
			// Create an entry in the matching pages array if it doesn't exist
			if(!isset($matching_pages[$pageid]))
				$matching_pages[$pageid] = [];
			$matching_pages[$pageid][$qterm] = $page_entry;
		}
	}
	
	// Work out each page's rank. The rank is summed up *before* it is
	// stored under the "rank" key: seeding "rank" with 0 first would make
	// the inner loop visit it too and index into an integer. Writing back
	// via the page id also avoids leaving a dangling by-reference loop
	// variable behind.
	foreach($matching_pages as $pageid => $pagedata)
	{
		$rank = 0;
		foreach($pagedata as $pterm => $entry)
		{
			$rank += $entry["freq"];
			
			// todo rank by context here
		}
		$matching_pages[$pageid]["rank"] = $rank;
		
		// todo remove items if the rank is below a threshold
	}
	
	// Highest ranking pages first
	uasort($matching_pages, function($a, $b) {
		if($a["rank"] == $b["rank"]) return 0;
		return ($a["rank"] < $b["rank"]) ? +1 : -1;
	});
	
	return $matching_pages;
}
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
?>
|
||||||
|
|
Loading…
Reference in a new issue