begin writing the search engine itself O.o

This commit is contained in:
Starbeamrainbowlabs 2015-10-28 20:56:10 +00:00
parent 2307ceaa9d
commit e016c5f9a7
3 changed files with 157 additions and 7 deletions

View File

@ -1336,6 +1336,19 @@ register_module([
var_dump($index);
});
add_action("invindex-rebuild", function() {
search::rebuild_invindex();
});
add_action("search", function() {
global $settings;
if(!isset($_GET["query"]))
exit(page_renderer::render("No Search Terms - Error - $settings->$sitename", "<p>You didn't specify any search terms. Try typing some into the box above.</p>"));
});
}
]);
@ -1386,7 +1399,7 @@ class search
"why", "will", "with", "within", "without", "would", "yet", "you",
"your", "yours", "yourself", "yourselves"
];
public static function index($source)
{
$source = html_entity_decode($source, ENT_QUOTES);
@ -1395,11 +1408,11 @@ class search
$index = [];
// Regex from
$terms = preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))/", $source, -1, PREG_SPLIT_NO_EMPTY);
$terms = self::tokenize($source);
$i = 0;
foreach($terms as $term)
{
$nterm = strtolower($term);
$nterm = $term;
// Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
if(in_array($nterm, self::$stop_words)) continue;
@ -1418,6 +1431,28 @@ class search
return $index;
}
public static function tokenize($source)
{
$source = strtolower($source);
return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/", $source, -1, PREG_SPLIT_NO_EMPTY);
}
public static function rebuild_invindex()
{
global $pageindex;
$invindex = [];
foreach($pageindex as $pagename => $pagedetails)
{
$pagesource = file_get_contents("$pagename.md");
$index = self::index($pagesource);
self::merge_into_invindex($invindex, ids::getid($pagename), $index);
}
self::save_invindex("invindex.json", $invindex);
}
/*
* @summary Sorts an index alphabetically. Will also sort an inverted index.
* This allows us to do a binary search instead of a regular
@ -1475,6 +1510,12 @@ class search
// If the nterm isn't in the inverted index, then create a space for it
if(!isset($invindex[$nterm])) $invindex[$nterm] = [];
$invindex[$nterm][$pageid] = $newentry;
// Sort the page entries for this word by frequency
uasort($invindex[$nterm], function($a, $b) {
if($a["freq"] == $b["freq"]) return 0;
return ($a["freq"] < $b["freq"]) ? +1 : -1;
});
}
}
@ -1482,6 +1523,27 @@ class search
{
file_put_contents($filename, json_encode($invindex));
}
public static function search_invindex($query, &$invindex)
{
$query_terms = self::tokenize($query);
$matching_pages = [];
for($i = 0; $i < count($query_terms); $i++)
{
$qterm = $query_terms[$i];
// Skip over this term if it isn't in the inverted index
if(!isset($invindex[$qterm]))
continue;
// Loop over each page
foreach($invindex[$qterm] as $page_entry)
{
}
}
}
}

View File

@ -50,7 +50,7 @@
"author": "Starbeamrainbowlabs",
"description": "Adds proper search functionality to Pepperminty Wiki. Note that this module, at the moment, just contains test code while I figure out how best to write a search engine.",
"id": "feature-search",
"lastupdate": 1446042661,
"lastupdate": 1446059639,
"optional": false
},
{

View File

@ -22,6 +22,19 @@ register_module([
var_dump($index);
});
add_action("invindex-rebuild", function() {
search::rebuild_invindex();
});
add_action("search", function() {
global $settings;
if(!isset($_GET["query"]))
exit(page_renderer::render("No Search Terms - Error - $settings->$sitename", "<p>You didn't specify any search terms. Try typing some into the box above.</p>"));
});
}
]);
@ -72,7 +85,7 @@ class search
"why", "will", "with", "within", "without", "would", "yet", "you",
"your", "yours", "yourself", "yourselves"
];
public static function index($source)
{
$source = html_entity_decode($source, ENT_QUOTES);
@ -81,11 +94,11 @@ class search
$index = [];
// Regex from
$terms = preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))/", $source, -1, PREG_SPLIT_NO_EMPTY);
$terms = self::tokenize($source);
$i = 0;
foreach($terms as $term)
{
$nterm = strtolower($term);
$nterm = $term;
// Skip over stop words (see https://en.wikipedia.org/wiki/Stop_words)
if(in_array($nterm, self::$stop_words)) continue;
@ -104,6 +117,28 @@ class search
return $index;
}
public static function tokenize($source)
{
$source = strtolower($source);
return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/", $source, -1, PREG_SPLIT_NO_EMPTY);
}
public static function rebuild_invindex()
{
global $pageindex;
$invindex = [];
foreach($pageindex as $pagename => $pagedetails)
{
$pagesource = file_get_contents("$pagename.md");
$index = self::index($pagesource);
self::merge_into_invindex($invindex, ids::getid($pagename), $index);
}
self::save_invindex("invindex.json", $invindex);
}
/*
* @summary Sorts an index alphabetically. Will also sort an inverted index.
* This allows us to do a binary search instead of a regular
@ -161,6 +196,12 @@ class search
// If the nterm isn't in the inverted index, then create a space for it
if(!isset($invindex[$nterm])) $invindex[$nterm] = [];
$invindex[$nterm][$pageid] = $newentry;
// Sort the page entries for this word by frequency
uasort($invindex[$nterm], function($a, $b) {
if($a["freq"] == $b["freq"]) return 0;
return ($a["freq"] < $b["freq"]) ? +1 : -1;
});
}
}
@ -168,6 +209,53 @@ class search
{
file_put_contents($filename, json_encode($invindex));
}
public static function search_invindex($query, &$invindex)
{
$query_terms = self::tokenize($query);
$matching_pages = [];
// Loop over each term in the query and find the matching page entries
for($i = 0; $i < count($query_terms); $i++)
{
$qterm = $query_terms[$i];
// Skip over this term if it isn't in the inverted index
if(!isset($invindex[$qterm]))
continue;
// Loop over each page
foreach($invindex[$qterm] as $pageid => $page_entry)
{
// Create an entry in the matching pages array if it doesn't exist
if(!isset($matching_pages[$pageid]))
$matching_pages[$pageid] = [];
$matching_pages[$pageid][$qterm] = $page_entry;
}
}
foreach($matching_pages as &$pagedata)
{
$pagedata["rank"] = 0;
foreach($pagedata as $pterm => $entry)
{
$pagedata["rank"] += $entry["freq"];
// todo rank by context here
}
// todo remove items if the rank is below a threshold
}
// todo sort by rank here
uasort($matching_pages, function($a, $b) {
if($a["rank"] == $b["rank"]) return 0;
return ($a["rank"] < $b["rank"]) ? +1 : -1;
});
return $matching_pages;
}
}
?>