Begin interface work. Why is the <strong> being inserted into another <strong>???

2024-11-22 16:33:00 +00:00 · 2015-10-29 11:21:04 +00:00 · 2015-10-29 11:21:04 +00:00 · 4157a9fb6c
commit 4157a9fb6c
parent e016c5f9a7
5 changed files with 374 additions and 16 deletions
--- a/build/index.php
+++ b/build/index.php
@ -206,6 +206,16 @@ $settings->mime_extension_mappings_location = "/etc/mime.types";
 $settings->min_preview_size = 1;
 $settings->max_preview_size = 2048;
 // The maximum distance apart that terms may be in the context display below
 // search results. This is purely aesthetic - it doesn't affect the search
 // algorithm.
 $settings->search_max_distance_context_display = 100;
 // The number of characters that should be displayed either side of a matching
 // term in the context below each search result.
 $settings->search_characters_context = 200;
 // A string of css to include. Will be included in the <head> of every page
 // inside a <style> tag. This may also be a url - urls will be referenced via a
 // <link rel='stylesheet' /> tag.
@ -1345,9 +1355,40 @@ register_module([
 			global $settings;
 			if(!isset($_GET["query"]))
-				exit(page_renderer::render("No Search Terms - Error - $settings->$sitename", "<p>You didn't specify any search terms. Try typing some into the box above.</p>"));
+				exit(page_renderer::render("No Search Terms - Error - $settings->sitename", "<p>You didn't specify any search terms. Try typing some into the box above.</p>"));
 			$search_start = microtime(true);
 			$invindex = search::load_invindex("invindex.json");
 			$results = search::query_invindex($_GET["query"], $invindex);
 			$search_end = microtime(true) - $search_start;
 			$title = $_GET["query"] . " - Search results - $settings->sitename";
 			$content = "<section>\n";
 			$content .= "<h1>Search Results</h1>";
 			// todo add a search box here
 			foreach($results as $result)
 			{
 				$link = "?page=" . rawurlencode($result["pagename"]);
 				$pagesource = file_get_contents($result["pagename"] . ".md");
 				$context = search::extract_context($_GET["query"], $pagesource);
 				$content .= "<div>\n";
 				$content .= "	<h2><a href='$link'>" . $result["pagename"] . "</a></h2>\n";
 				$content .= "	<p>$context</p>\n";
 				$content .= "</div>\n";
 			}
 			$content .= "</section>\n";
 			exit(page_renderer::render($title, $content));
 			//header("content-type: text/plain");
 			//var_dump($results);
 		});
 	}
 ]);
@ -1437,6 +1478,11 @@ class search
 		return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/", $source, -1, PREG_SPLIT_NO_EMPTY);
 	}
 	/**
 	 * Removes a fixed set of markdown-related character sequences from a
 	 * string so that it reads as plain text.
 	 * @param	string	$source	The text to strip.
 	 * @return	string	The text with every listed sequence removed.
 	 */
 	public static function strip_markup($source)
 	{
 		// Each of these sequences is deleted wherever it occurs
 		$markup_sequences = [ "[", "]", "\"", "*", "_", " - ", "`" ];
 		$stripped = str_replace($markup_sequences, "", $source);
 		return $stripped;
 	}
 	public static function rebuild_invindex()
 	{
 		global $pageindex;
@ -1488,7 +1534,7 @@ class search
 	 * @summary Reads in and parses an inverted index.
 	 */
 	// Todo remove this function and make everything streamable
-	public static function parse_invindex($invindex_filename) {
+	public static function load_invindex($invindex_filename) {
 		$invindex = json_decode(file_get_contents($invindex_filename), true);
 		return $invindex;
 	}
@ -1524,11 +1570,12 @@ class search
 		file_put_contents($filename, json_encode($invindex));
 	}
-	public static function search_invindex($query, &$invindex)
+	public static function query_invindex($query, &$invindex)
 	{
 		$query_terms = self::tokenize($query);
 		$matching_pages = [];
 		// Loop over each term in the query and find the matching page entries
 		for($i = 0; $i < count($query_terms); $i++)
 		{
 			$qterm = $query_terms[$i];
@ -1538,12 +1585,157 @@ class search
 				continue;
 			// Loop over each page
-			foreach($invindex[$qterm] as $page_entry)
+			foreach($invindex[$qterm] as $pageid => $page_entry)
 			{
-				
+				// Create an entry in the matching pages array if it doesn't exist
 				if(!isset($matching_pages[$pageid]))
 					$matching_pages[$pageid] = [ "nterms" => [] ];
 				$matching_pages[$pageid]["nterms"][$qterm] = $page_entry;
 			}
 		}
 		foreach($matching_pages as $pageid => &$pagedata)
 		{
 			$pagedata["pagename"] = ids::getpagename($pageid);
 			$pagedata["rank"] = 0;
 			foreach($pagedata["nterms"] as $pterm => $entry)
 			{
 				$pagedata["rank"] += $entry["freq"];
 				// todo rank by context here
 			}
 			// todo remove items if the rank is below a threshold
 		}
 		// todo sort by rank here
 		uasort($matching_pages, function($a, $b) {
 			if($a["rank"] == $b["rank"]) return 0;
 			return ($a["rank"] < $b["rank"]) ? +1 : -1;
 		});
 		return $matching_pages;
 	}
 	/**
 	 * Builds an HTML snippet showing the text around each occurrence of the
 	 * query's terms in a page's source, with every occurrence wrapped in
 	 * <strong> tags.
 	 * Occurrences that are fewer than $settings->search_characters_context
 	 * characters apart share a snippet; separate snippets are joined with
 	 * " ... ". All text taken from the source is passed through
 	 * strip_markup() and then HTML-escaped.
 	 * @param	string	$query	The search query whose terms should be located.
 	 * @param	string	$source	The raw source text of the page.
 	 * @return	string	The highlighted snippet(s), or an empty string if none of the terms occur in the source.
 	 */
 	public static function extract_context($query, $source)
 	{
 		global $settings;
 		$radius = $settings->search_characters_context;
 		// Find every occurrence of every query term. mb_stripos_all() reports
 		// offsets in characters, so everything below uses the mb_* functions.
 		$matches = [];
 		foreach(self::tokenize($query) as $nterm)
 		{
 			$all_offsets = mb_stripos_all($source, $nterm);
 			// Skip over adding matches if there aren't any
 			if($all_offsets === false)
 				continue;
 			foreach($all_offsets as $offset)
 				$matches[] = [ $nterm, $offset ];
 		}
 		// Sort by ascending offset so that neighbouring matches are adjacent
 		// and the distances calculated below are never negative.
 		usort($matches, function($a, $b) {
 			if($a[1] == $b[1]) return 0;
 			return ($a[1] < $b[1]) ? -1 : +1;
 		});
 		// Strips the markdown from a piece of the source (it's most likely
 		// going to be broken anyway) and makes it safe to embed in HTML.
 		$clean = function($text) {
 			return htmlspecialchars(self::strip_markup($text), ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8");
 		};
 		$contexts = [];
 		$matches_count = count($matches);
 		$basepos = 0;
 		while($basepos < $matches_count)
 		{
 			// Grow the group for as long as each match is within range of the
 			// one before it. The match at $basepos is always included.
 			$scanpos = $basepos + 1;
 			while($scanpos < $matches_count &&
 				$matches[$scanpos][1] - $matches[$scanpos - 1][1] < $radius)
 				$scanpos++;
 			$group = array_slice($matches, $basepos, $scanpos - $basepos);
 			// Clamp the start - a negative offset would count from the end
 			// of the source instead of the beginning.
 			$context_start = max(0, $group[0][1] - $radius);
 			$context_end = $group[count($group) - 1][1] + $radius;
 			// Assemble the snippet from left to right straight out of the
 			// source, so that the tags inserted never shift a later offset.
 			$context = "";
 			$cursor = $context_start;
 			foreach($group as $match)
 			{
 				$start = $match[1];
 				$end = $start + mb_strlen($match[0]);
 				// Entirely covered by an earlier highlight - don't nest tags
 				if($end <= $cursor)
 					continue;
 				// Partially covered - highlight only the part not yet emitted
 				if($start < $cursor)
 					$start = $cursor;
 				$context .= $clean(mb_substr($source, $cursor, $start - $cursor));
 				$context .= "<strong>" . $clean(mb_substr($source, $start, $end - $start)) . "</strong>";
 				$cursor = $end;
 			}
 			// Trailing text after the last highlight
 			if($cursor < $context_end)
 				$context .= $clean(mb_substr($source, $cursor, $context_end - $cursor));
 			$contexts[] = $context;
 			// $scanpos is the first match that didn't fit into this group, so
 			// it becomes the base of the next one.
 			$basepos = $scanpos;
 		}
 		return implode(" ... ", $contexts);
 	}
 }
 /**
  * Finds all occurrences of a needle in a haystack (case-insensitive,
  * multibyte-aware).
  * From http://www.pontikis.net/tip/?id=16
  * Based on http://www.php.net/manual/en/function.strpos.php#87061
  *
  * Occurrences do not overlap: the search resumes just past the end of each
  * one that is found.
  *
  * @param string $haystack The text to search in.
  * @param string $needle The text to search for.
  * @return array|false The character offset of every occurrence in ascending
  *                     order, or false if there are none (an empty needle
  *                     never matches).
  */
 function mb_stripos_all($haystack, $needle) {
   $needle_length = mb_strlen($needle);
   // An empty needle would never advance the search position, so bail out
   // instead of looping forever.
   if($needle_length === 0)
     return false;
   $haystack_length = mb_strlen($haystack);
   $offsets = [];
   $search_from = 0;
   // Never search from beyond the end of the haystack - mb_stripos() rejects
   // such offsets.
   while($search_from < $haystack_length) {
     $found = mb_stripos($haystack, $needle, $search_from);
     if($found === false)
       break;
     $offsets[] = $found;
     $search_from = $found + $needle_length;
   }
   return empty($offsets) ? false : $offsets;
 }
@ -2821,7 +3013,7 @@ class Slimdown {
 	public static $rules = array (
 		'/\r\n/' => "\n",											// new line normalisation
 		'/^(#+)(.*)/' => 'self::header',								// headers
-		'/(\*)(.*?)\1/' => '<strong>\2</strong>',					// bold
+		'/(\*+)(.*?)\1/' => '<strong>\2</strong>',					// bold
 		'/(_)(.*?)\1/' => '<em>\2</em>',							// emphasis
 		'/!\[(.*)\]\(([^\s]+)\s(\d+.+)\s(left|right)\)/' => '<img src="\2" alt="\1" style="max-width: \3; float: \4;" />',		// images with size
--- a/module_index.json
+++ b/module_index.json
@ -50,7 +50,7 @@
        "author": "Starbeamrainbowlabs",
        "description": "Adds proper search functionality to Pepperminty Wiki. Note that this module, at the moment, just contains test code while I figure out how best to write a search engine.",
        "id": "feature-search",
-        "lastupdate": 1446059639,
+        "lastupdate": 1446117613,
        "optional": false
    },
    {
@ -167,7 +167,7 @@
        "author": "Johnny Broadway & Starbeamrainbowlabs",
        "description": "The default parser for Pepperminty Wiki. Based on Johnny Broadway's Slimdown (with more than a few modifications). This parser's features are documented in the help page.",
        "id": "parser-default",
-        "lastupdate": 1445170746,
+        "lastupdate": 1446116543,
        "optional": false
    },
    {
--- a/modules/feature-search.php
+++ b/modules/feature-search.php
@ -31,9 +31,40 @@ register_module([
 			global $settings;
 			if(!isset($_GET["query"]))
-				exit(page_renderer::render("No Search Terms - Error - $settings->$sitename", "<p>You didn't specify any search terms. Try typing some into the box above.</p>"));
+				exit(page_renderer::render("No Search Terms - Error - $settings->sitename", "<p>You didn't specify any search terms. Try typing some into the box above.</p>"));
 			$search_start = microtime(true);
 			$invindex = search::load_invindex("invindex.json");
 			$results = search::query_invindex($_GET["query"], $invindex);
 			$search_end = microtime(true) - $search_start;
 			$title = $_GET["query"] . " - Search results - $settings->sitename";
 			$content = "<section>\n";
 			$content .= "<h1>Search Results</h1>";
 			// todo add a search box here
 			foreach($results as $result)
 			{
 				$link = "?page=" . rawurlencode($result["pagename"]);
 				$pagesource = file_get_contents($result["pagename"] . ".md");
 				$context = search::extract_context($_GET["query"], $pagesource);
 				$content .= "<div>\n";
 				$content .= "	<h2><a href='$link'>" . $result["pagename"] . "</a></h2>\n";
 				$content .= "	<p>$context</p>\n";
 				$content .= "</div>\n";
 			}
 			$content .= "</section>\n";
 			exit(page_renderer::render($title, $content));
 			//header("content-type: text/plain");
 			//var_dump($results);
 		});
 	}
 ]);
@ -123,6 +154,11 @@ class search
 		return preg_split("/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|/", $source, -1, PREG_SPLIT_NO_EMPTY);
 	}
 	/**
 	 * Removes a fixed set of markdown-related character sequences from a
 	 * string so that it reads as plain text.
 	 * @param	string	$source	The text to strip.
 	 * @return	string	The text with every listed sequence removed.
 	 */
 	public static function strip_markup($source)
 	{
 		// Each of these sequences is deleted wherever it occurs
 		$markup_sequences = [ "[", "]", "\"", "*", "_", " - ", "`" ];
 		$stripped = str_replace($markup_sequences, "", $source);
 		return $stripped;
 	}
 	public static function rebuild_invindex()
 	{
 		global $pageindex;
@ -174,7 +210,7 @@ class search
 	 * @summary Reads in and parses an inverted index.
 	 */
 	// Todo remove this function and make everything streamable
-	public static function parse_invindex($invindex_filename) {
+	public static function load_invindex($invindex_filename) {
 		$invindex = json_decode(file_get_contents($invindex_filename), true);
 		return $invindex;
 	}
@ -210,7 +246,7 @@ class search
 		file_put_contents($filename, json_encode($invindex));
 	}
-	public static function search_invindex($query, &$invindex)
+	public static function query_invindex($query, &$invindex)
 	{
 		$query_terms = self::tokenize($query);
 		$matching_pages = [];
@ -229,16 +265,17 @@ class search
 			{
 				// Create an entry in the matching pages array if it doesn't exist
 				if(!isset($matching_pages[$pageid]))
-					$matching_pages[$pageid] = [];
+					$matching_pages[$pageid] = [ "nterms" => [] ];
-				$matching_pages[$pageid][$qterm] = $page_entry;
+				$matching_pages[$pageid]["nterms"][$qterm] = $page_entry;
 			}
 		}
-		foreach($matching_pages as &$pagedata)
+		foreach($matching_pages as $pageid => &$pagedata)
 		{
 			$pagedata["pagename"] = ids::getpagename($pageid);
 			$pagedata["rank"] = 0;
-			foreach($pagedata as $pterm => $entry)
+			foreach($pagedata["nterms"] as $pterm => $entry)
 			{
 				$pagedata["rank"] += $entry["freq"];
@ -256,6 +293,125 @@ class search
 		return $matching_pages;
 	}
 	/**
 	 * Builds an HTML snippet showing the text around each occurrence of the
 	 * query's terms in a page's source, with every occurrence wrapped in
 	 * <strong> tags.
 	 * Occurrences that are fewer than $settings->search_characters_context
 	 * characters apart share a snippet; separate snippets are joined with
 	 * " ... ". All text taken from the source is passed through
 	 * strip_markup() and then HTML-escaped.
 	 * @param	string	$query	The search query whose terms should be located.
 	 * @param	string	$source	The raw source text of the page.
 	 * @return	string	The highlighted snippet(s), or an empty string if none of the terms occur in the source.
 	 */
 	public static function extract_context($query, $source)
 	{
 		global $settings;
 		$radius = $settings->search_characters_context;
 		// Find every occurrence of every query term. mb_stripos_all() reports
 		// offsets in characters, so everything below uses the mb_* functions.
 		$matches = [];
 		foreach(self::tokenize($query) as $nterm)
 		{
 			$all_offsets = mb_stripos_all($source, $nterm);
 			// Skip over adding matches if there aren't any
 			if($all_offsets === false)
 				continue;
 			foreach($all_offsets as $offset)
 				$matches[] = [ $nterm, $offset ];
 		}
 		// Sort by ascending offset so that neighbouring matches are adjacent
 		// and the distances calculated below are never negative.
 		usort($matches, function($a, $b) {
 			if($a[1] == $b[1]) return 0;
 			return ($a[1] < $b[1]) ? -1 : +1;
 		});
 		// Strips the markdown from a piece of the source (it's most likely
 		// going to be broken anyway) and makes it safe to embed in HTML.
 		$clean = function($text) {
 			return htmlspecialchars(self::strip_markup($text), ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8");
 		};
 		$contexts = [];
 		$matches_count = count($matches);
 		$basepos = 0;
 		while($basepos < $matches_count)
 		{
 			// Grow the group for as long as each match is within range of the
 			// one before it. The match at $basepos is always included.
 			$scanpos = $basepos + 1;
 			while($scanpos < $matches_count &&
 				$matches[$scanpos][1] - $matches[$scanpos - 1][1] < $radius)
 				$scanpos++;
 			$group = array_slice($matches, $basepos, $scanpos - $basepos);
 			// Clamp the start - a negative offset would count from the end
 			// of the source instead of the beginning.
 			$context_start = max(0, $group[0][1] - $radius);
 			$context_end = $group[count($group) - 1][1] + $radius;
 			// Assemble the snippet from left to right straight out of the
 			// source, so that the tags inserted never shift a later offset.
 			$context = "";
 			$cursor = $context_start;
 			foreach($group as $match)
 			{
 				$start = $match[1];
 				$end = $start + mb_strlen($match[0]);
 				// Entirely covered by an earlier highlight - don't nest tags
 				if($end <= $cursor)
 					continue;
 				// Partially covered - highlight only the part not yet emitted
 				if($start < $cursor)
 					$start = $cursor;
 				$context .= $clean(mb_substr($source, $cursor, $start - $cursor));
 				$context .= "<strong>" . $clean(mb_substr($source, $start, $end - $start)) . "</strong>";
 				$cursor = $end;
 			}
 			// Trailing text after the last highlight
 			if($cursor < $context_end)
 				$context .= $clean(mb_substr($source, $cursor, $context_end - $cursor));
 			$contexts[] = $context;
 			// $scanpos is the first match that didn't fit into this group, so
 			// it becomes the base of the next one.
 			$basepos = $scanpos;
 		}
 		return implode(" ... ", $contexts);
 	}
 }
 /**
  * Finds all occurrences of a needle in a haystack (case-insensitive,
  * multibyte-aware).
  * From http://www.pontikis.net/tip/?id=16
  * Based on http://www.php.net/manual/en/function.strpos.php#87061
  *
  * Occurrences do not overlap: the search resumes just past the end of each
  * one that is found.
  *
  * @param string $haystack The text to search in.
  * @param string $needle The text to search for.
  * @return array|false The character offset of every occurrence in ascending
  *                     order, or false if there are none (an empty needle
  *                     never matches).
  */
 function mb_stripos_all($haystack, $needle) {
   $needle_length = mb_strlen($needle);
   // An empty needle would never advance the search position, so bail out
   // instead of looping forever.
   if($needle_length === 0)
     return false;
   $haystack_length = mb_strlen($haystack);
   $offsets = [];
   $search_from = 0;
   // Never search from beyond the end of the haystack - mb_stripos() rejects
   // such offsets.
   while($search_from < $haystack_length) {
     $found = mb_stripos($haystack, $needle, $search_from);
     if($found === false)
       break;
     $offsets[] = $found;
     $search_from = $found + $needle_length;
   }
   return empty($offsets) ? false : $offsets;
 }
 ?>
--- a/modules/parser-default.php
+++ b/modules/parser-default.php
@ -51,7 +51,7 @@ class Slimdown {
 	public static $rules = array (
 		'/\r\n/' => "\n",											// new line normalisation
 		'/^(#+)(.*)/' => 'self::header',								// headers
-		'/(\*)(.*?)\1/' => '<strong>\2</strong>',					// bold
+		'/(\*+)(.*?)\1/' => '<strong>\2</strong>',					// bold
 		'/(_)(.*?)\1/' => '<em>\2</em>',							// emphasis
 		'/!\[(.*)\]\(([^\s]+)\s(\d+.+)\s(left|right)\)/' => '<img src="\2" alt="\1" style="max-width: \3; float: \4;" />',		// images with size
--- a/settings.fragment.php
+++ b/settings.fragment.php
@ -203,6 +203,16 @@ $settings->mime_extension_mappings_location = "/etc/mime.types";
 $settings->min_preview_size = 1;
 $settings->max_preview_size = 2048;
 // The maximum distance apart that terms may be in the context display below
 // search results. This is purely aesthetic - it doesn't affect the search
 // algorithm.
 $settings->search_max_distance_context_display = 100;
 // The number of characters that should be displayed either side of a matching
 // term in the context below each search result.
 $settings->search_characters_context = 200;
 // A string of css to include. Will be included in the <head> of every page
 // inside a <style> tag. This may also be a url - urls will be referenced via a
 // <link rel='stylesheet' /> tag.