Continue working on the indexer. Why is it converting to HTML entities...?

2025-04-17 14:14:55 +00:00 · 2015-10-28 08:03:56 +00:00 · 2015-10-28 08:03:56 +00:00 · 725452a172
commit 725452a172
parent db7bf0f7ec
3 changed files with 35 additions and 75 deletions
--- a/build/index.php
+++ b/build/index.php
@ -1268,47 +1268,27 @@ register_module([
 			
 			$index = [];
 			
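+			// Regex from (source reference missing): splits on whitespace runs plus any adjacent punctuation, and on leading/trailing punctuation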
+			$terms = preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))/", $source, -1, PREG_SPLIT_NO_EMPTY);
+			$i = 0;
+			foreach($terms as $term)
+			{
+				$nterm = strtolower($term);
+				if(!isset($index[$nterm]))
+				{
+					$index[$nterm] = [ "freq" => 0, "offsets" => [] ];
+				}
+				
+				$index[$nterm]["freq"]++;
+				$index[$nterm]["offsets"][] = $i;
+				
+				$i++;
+			}
+			
 			var_dump($env->page);
 			var_dump($source);
 			echo("source length: $source_length\n");
 			
-			$basepos = 0;
-			$scanpos = $basepos;
-			while($basepos < $source_length)
-			{
-				$word = "";
-				do {
-					// Break if we reach the end of the source text
-					if($scanpos >= $source_length) break;
-					$word .= $source[$scanpos];
-					$scanpos++;
-				} while(strpos($breakable_chars, $source[$scanpos]) === false);
-				
-				// Move the base position up to the scan position (plus one to
-				// skip over the breakable character), saving the old base
-				// position for later
-				$word_start_pos = $basepos;
-				$basepos = $scanpos + 1;
-				// Continue if the word is empty
-				if(strlen($word) === 0) continue;
-				// Normalise the word to be lowercase
-				$word = strtolower($word);
-				
-				var_dump($word);
-				
-				// Initialise the entry in the index if it doesn't exist
-				if(!isset($index[$word]))
-				{
-					$index[$word] = [
-						"freq" => 0,
-						"offsets" => []
-					];
-				}
-				// Update the index entry
-				$index[$word]["freq"]++;
-				$index[$word]["offsets"][] = $word_start_pos;
-			}
-			
 			var_dump($index);
 		});
 	}
--- a/module_index.json
+++ b/module_index.json
@ -50,7 +50,7 @@
        "author": "Starbeamrainbowlabs",
        "description": "Adds proper search functionality to Pepperminty Wiki. Note that this module, at the moment, just contains test code while I figure out how best to write a search engine.",
        "id": "feature-search",
-        "lastupdate": 1445980152,
+        "lastupdate": 1446019292,
        "optional": false
    },
    {
--- a/modules/feature-search.php
+++ b/modules/feature-search.php
@ -18,47 +18,27 @@ register_module([
 			
 			$index = [];
 			
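+			// Regex from (source reference missing): splits on whitespace runs plus any adjacent punctuation, and on leading/trailing punctuation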
+			$terms = preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))/", $source, -1, PREG_SPLIT_NO_EMPTY);
+			$i = 0;
+			foreach($terms as $term)
+			{
+				$nterm = strtolower($term);
+				if(!isset($index[$nterm]))
+				{
+					$index[$nterm] = [ "freq" => 0, "offsets" => [] ];
+				}
+				
+				$index[$nterm]["freq"]++;
+				$index[$nterm]["offsets"][] = $i;
+				
+				$i++;
+			}
+			
 			var_dump($env->page);
 			var_dump($source);
 			echo("source length: $source_length\n");
 			
-			$basepos = 0;
-			$scanpos = $basepos;
-			while($basepos < $source_length)
-			{
-				$word = "";
-				do {
-					// Break if we reach the end of the source text
-					if($scanpos >= $source_length) break;
-					$word .= $source[$scanpos];
-					$scanpos++;
-				} while(strpos($breakable_chars, $source[$scanpos]) === false);
-				
-				// Move the base position up to the scan position (plus one to
-				// skip over the breakable character), saving the old base
-				// position for later
-				$word_start_pos = $basepos;
-				$basepos = $scanpos + 1;
-				// Continue if the word is empty
-				if(strlen($word) === 0) continue;
-				// Normalise the word to be lowercase
-				$word = strtolower($word);
-				
-				var_dump($word);
-				
-				// Initialise the entry in the index if it doesn't exist
-				if(!isset($index[$word]))
-				{
-					$index[$word] = [
-						"freq" => 0,
-						"offsets" => []
-					];
-				}
-				// Update the index entry
-				$index[$word]["freq"]++;
-				$index[$word]["offsets"][] = $word_start_pos;
-			}
-			
 			var_dump($index);
 		});
 	}