Bugfix textual context generation

2018-09-22 17:12:46 +01:00 · 2018-09-22 17:12:46 +01:00 · 2aba3a9d86
commit 2aba3a9d86
parent b41f7f524a
7 changed files with 239 additions and 57 deletions
--- a/SearchBox-CLI/Program.cs
+++ b/SearchBox-CLI/Program.cs
@ -2,13 +2,13 @@
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
+using System.Text.RegularExpressions;
+using System.Threading.Tasks;
+
 using Newtonsoft.Json;

-using LibSearchBox;
-using System.Text.RegularExpressions;
-using Newtonsoft.Json.Serialization;
-using System.Threading.Tasks;
 using SBRL.Utilities;
+using LibSearchBox;

 namespace SearchBoxCLI
 {
@ -18,28 +18,34 @@ namespace SearchBoxCLI
 		Index,
 		Add,
 		Remove,
-		Update
+		Update,
+		GenerateContext
 	}

 	enum OutputModes
 	{
 		Json,
-		Text
+		Text,
+		Html
 	}

 	class MainClass {
 		private static List<string> Extras = new List<string>();

 		private static OperatingModes Mode = OperatingModes.Query;
+		private static OutputModes OutputMode = OutputModes.Text;
 		private static bool Batch = false;
+
 		private static string Name = string.Empty;
 		private static IEnumerable<string> Tags;
+
 		private static string SearchIndexFilepath = string.Empty;
 		private static TextReader Source = Console.In;
 		private static TextReader SourceOld = null, SourceNew = null;
+
+		private static string Query = string.Empty;
 		private static int ResultsLimit = -1;
 		private static int ResultsOffset = 0;
-		private static OutputModes OutputMode = OutputModes.Text;

 		public static int Main(string[] args)
 		{
@ -92,6 +98,10 @@ namespace SearchBoxCLI
 						ResultsOffset = int.Parse(args[++i]);
 						break;

+					case "query":
+						Query = args[++i];
+						break;
+
 					case "format":
 						OutputMode = (OutputModes)Enum.Parse(typeof(OutputModes), args[++i], true);
 						break;
@ -105,7 +115,7 @@ namespace SearchBoxCLI
 				}
 			}
 			if (Extras.Count < 1) return HandleHelp();
-			string modeText = Extras.First(); Extras.RemoveAt(0);
+			string modeText = Extras.First().Replace("context", "generatecontext"); Extras.RemoveAt(0);
 			Mode = (OperatingModes)Enum.Parse(typeof(OperatingModes), modeText, true);

 			switch (Mode) {
@ -113,9 +123,11 @@ namespace SearchBoxCLI
 				case OperatingModes.Add: return HandleAdd();
 				case OperatingModes.Remove: return HandleRemove();
 				case OperatingModes.Query: return HandleQuery();
+				case OperatingModes.GenerateContext: return HandleContextGeneration();
+				default:
+					Console.Error.WriteLine($"Error: Don't know how to handle mode {Mode}.");
+					return 128;
 			}
-
-			return 128;
 		}

 		private static int HandleHelp()
@ -126,26 +138,27 @@ namespace SearchBoxCLI
 			Console.WriteLine();
 			Console.WriteLine("Usage:");
 			Console.WriteLine("    ./SearchBox.exe {mode} [options]");
-			Console.WriteLine("    ./SearchBox.exe query \"{query string}\" [options]");
 			Console.WriteLine();
 			Console.WriteLine("Modes:");
 			Console.WriteLine("    query     Query a pre-existing inverted search index");
+			Console.WriteLine("    context   Generate a context string similar to a search result on the internet");
 			Console.WriteLine("    index     Generate a raw index of the source document.");
 			Console.WriteLine("    add       Add a named document to a search index.");
 			Console.WriteLine("    remove    Remove a named document from a search index.");
 			Console.WriteLine("    update    Update a named document in a search index.");
 			Console.WriteLine();
 			Console.WriteLine("Options:");
-			Console.WriteLine("    --format        Sets the format of the output. Possible values: text (default), json {query,index}");
-			Console.WriteLine("    --source, -s    Specifies the path to the source document {index, add}");
-			Console.WriteLine("    --old-source    Specifies the path to the old version of the source document to update {update}");
-			Console.WriteLine("    --new-source    Specifies the path to the new version of the source document to update {update}");
-			Console.WriteLine("    --name, -n      Sets the name of the source document {add, remove}");
-			Console.WriteLine("    --index         Specifies the location of the search index to use {add, remove, update}");
-			Console.WriteLine("    --tags          Sets the tags to associate with the document. {add, update}");
 			Console.WriteLine("    --batch         Enters a mode where the operations to process are specified via the source (by default stdin; change with --source as usual) - one per line in the format \"{filename}|{name}|{tags}\" {add}");
+			Console.WriteLine("    --format        Sets the format of the output. Possible values: text (default), json, html (context generation only) {query, index, context}");
+			Console.WriteLine("    --index         Specifies the location of the search index to use {add, remove, update}");
+			Console.WriteLine("    --name, -n      Sets the name of the source document {add, remove, title}");
+			Console.WriteLine("    --new-source    Specifies the path to the new version of the source document to update {update}");
 			Console.WriteLine("    --limit         Limits the number of results returned, -1 = no limit {query}");
 			Console.WriteLine("    --offset        Skips the specified number of results from the beginning of the results list {query}");
+			Console.WriteLine("    --old-source    Specifies the path to the old version of the source document to update {update}");
+			Console.WriteLine("    --query         Specifies the query string {query, context}");
+			Console.WriteLine("    --source, -s    Specifies the path to the source document {index, add,context}");
+			Console.WriteLine("    --tags          Sets the tags to associate with the document. {add, update}");
 			Console.WriteLine();
 			Console.WriteLine("Examples:");
 			Console.WriteLine("    cat books/complex_knots.txt | ./SearchBox.exe add --name \"Complex Knots: How to do and undo them\"");
@ -155,13 +168,11 @@ namespace SearchBoxCLI

 		private static int HandleAdd()
 		{
-			if (Name == string.Empty && !Batch)
-			{
+			if (Name == string.Empty && !Batch) {
 				Console.Error.WriteLine("Error: The document name must be specified when reading from stdin!");
 				return 1;
 			}
-			if (SearchIndexFilepath == string.Empty)
-			{
+			if (SearchIndexFilepath == string.Empty) {
 				Console.Error.WriteLine("Error: No search index file path specified.");
 				return 1;
 			}
@ -209,7 +220,7 @@ namespace SearchBoxCLI

 		private static int HandleRemove()
 		{
-			if (Name == string.Empty) {
+			if (string.IsNullOrEmpty(Name)) {
 				Console.Error.WriteLine("Error: The document name must be specified when removing a document!");
 				return 1;
 			}
@ -232,16 +243,20 @@ namespace SearchBoxCLI

 		private static int HandleQuery()
 		{
-			if (Extras.Count < 1) {
+			if (string.IsNullOrEmpty(Query)) {
 				Console.Error.WriteLine("Error: No query specified!");
 				return 1;
 			}
+			if (SearchIndexFilepath == string.Empty) {
+				Console.Error.WriteLine("Error: No search index file path specified.");
+				return 1;
+			}

 			SearchBox searchBox = JsonConvert.DeserializeObject<SearchBox>(
 				File.ReadAllText(SearchIndexFilepath)
 			);

-			IEnumerable<SearchResult> resultsRaw = searchBox.Query(Extras[0], new QuerySettings()).Skip(ResultsOffset);
+			IEnumerable<SearchResult> resultsRaw = searchBox.Query(Query, new QuerySettings()).Skip(ResultsOffset);
 			List<SearchResult> results = new List<SearchResult>(
 				ResultsLimit > 0 ? resultsRaw.Take(ResultsLimit) : resultsRaw
 			);
@ -262,6 +277,43 @@ namespace SearchBoxCLI
 			return 0;
 		}

+		/// <summary>
+		/// Handles the "context" operating mode: loads the search index, reads the
+		/// document text from the configured source, and writes the generated
+		/// context string for the current query to the standard output.
+		/// </summary>
+		/// <returns>
+		/// 0 on success; 1 if the document name, query or index path is missing,
+		/// or if JSON output was requested.
+		/// </returns>
+		private static int HandleContextGeneration()
+		{
+			// Work out the first thing that's wrong with the options, if anything
+			string problem = null;
+			if (string.IsNullOrEmpty(Name))
+				problem = "Error: No document name specified.";
+			else if (string.IsNullOrEmpty(Query))
+				problem = "Error: No query specified.";
+			else if (SearchIndexFilepath == string.Empty)
+				problem = "Error: No search index file path specified.";
+
+			if (problem != null) {
+				Console.Error.WriteLine(problem);
+				return 1;
+			}
+
+			string indexJson = File.ReadAllText(SearchIndexFilepath);
+			SearchBox searchBox = JsonConvert.DeserializeObject<SearchBox>(indexJson);
+
+			// Only text and HTML can be produced by the context generator
+			if (OutputMode == OutputModes.Json) {
+				Console.Error.WriteLine("Error: JSON output for context generation is not supported.");
+				return 1;
+			}
+			ContextSettings generationSettings = new ContextSettings {
+				Html = OutputMode == OutputModes.Html
+			};
+
+			string documentText = Source.ReadToEnd();
+			string context = searchBox.GenerateContext(Name, documentText, Query, generationSettings);
+			Console.WriteLine(context);
+
+			return 0;
+		}
+
 		private static int HandleIndex()
 		{
 			Index index = new Index(Source.ReadToEnd());
--- a/SearchBox/Index.cs
+++ b/SearchBox/Index.cs
@ -25,7 +25,7 @@ namespace LibSearchBox
 			
 			// Tokenize the input and file it in our index
 			Tokenizer tokenizer = new Tokenizer(inSource);
-			foreach (Tuple<int, string> token in tokenizer) {
+			foreach ((int, string) token in tokenizer) {
 				if (stopwordTester.IsStopword(token.Item2)) continue;
 				insert(token.Item2, token.Item1);
 			}
--- a/SearchBox/QuerySettings.cs
+++ b/SearchBox/QuerySettings.cs
@ -1,13 +0,0 @@
-using System;
-namespace LibSearchBox
-{
-	public class QuerySettings
-	{
-		public int WeightTitleMatch = 100;
-		public int WeightTagMatch = 10;
-
-		public QuerySettings()
-		{
-		}
-	}
-}
--- a/SearchBox/SearchBox.cs
+++ b/SearchBox/SearchBox.cs
@ -2,11 +2,45 @@
 using System.Collections.Concurrent;
 using System.Collections.Generic;
 using System.Linq;
+using System.Net;
 using System.Threading.Tasks;
 using Newtonsoft.Json;

 namespace LibSearchBox
 {
+	/// <summary>
+	/// Tunable weightings that are applied when ranking query results.
+	/// </summary>
+	public class QuerySettings
+	{
+		/// <summary>
+		/// The amount added to a result's rank for each query token that is
+		/// found in the document's searchable title.
+		/// </summary>
+		public int WeightTitleMatch = 100;
+		/// <summary>
+		/// The weighting for tag matches.
+		/// NOTE(review): presumably added to the rank per query token matching a tag - confirm against the ranking code.
+		/// </summary>
+		public int WeightTagMatch = 10;
+	}
+
+	/// <summary>
+	/// Options that control how a context string is generated.
+	/// </summary>
+	public class ContextSettings
+	{
+		/// <summary>
+		/// How many characters of the source text to include before and after
+		/// each matched term.
+		/// </summary>
+		public int ContextCharacters = 75;
+
+		/// <summary>
+		/// The length, in characters, past which no further snippets are added
+		/// to the generated context.
+		/// </summary>
+		public int MaxLength = 250;
+
+		/// <summary>
+		/// The text placed between adjacent snippets when they are joined
+		/// together.
+		/// </summary>
+		public string Separator = " … ";
+
+		/// <summary>
+		/// If true, the context is generated as HTML; otherwise plain text is
+		/// generated. Defaults to false.
+		/// </summary>
+		public bool Html;
+	}
+
 	public class SearchBoxException : Exception { public SearchBoxException(string message) : base(message) {  } }

 	[JsonObject(MemberSerialization.OptIn)]
@ -80,7 +114,7 @@ namespace LibSearchBox
 			ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages = new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();

 			Tokenizer tokenizer = new Tokenizer(query);
-			foreach(Tuple<int, string> token in tokenizer.IterateTokens())
+			foreach((int, string) token in tokenizer.IterateTokens())
 			{
 				ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token.Item2);

@ -95,13 +129,17 @@ namespace LibSearchBox
 			}

 			ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
-			Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) => {
+			Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) =>
+			{
 				int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?
-				if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo)) {
+				if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo))
+				{
 					if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
 				}
-				else {
-					foreach (Tuple<int, string> token in tokenizer.IterateTokens()) {
+				else
+				{
+					foreach ((int, string) token in tokenizer.IterateTokens())
+					{
 						if (metaInfo.SearchableTitle.Contains(token.Item2))
 							rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched

@ -111,13 +149,7 @@ namespace LibSearchBox
 					}
 				}

-				List<SearchOffset> offsets = new List<SearchOffset>();
-				foreach (Tuple<int, string> token in tokenizer.IterateTokens()) {
-					ConcurrentDictionary<int, List<int>> tokenQuery = index.Query(token.Item2);
-					if (!tokenQuery.ContainsKey(pageDef.Key)) continue; // Don't bother if this page doesn't contain this token
-					offsets.AddRange(tokenQuery[pageDef.Key].Select((int offset) => new SearchOffset(token.Item2, offset)));
-				}
-				offsets.Sort((SearchOffset x, SearchOffset y) => x.Offset - y.Offset);
+				List<SearchOffset> offsets = getPageOffsets(pageDef.Key, tokenizer);

 				resultsRaw.Add(new SearchResult(
 					idMap.GetPageName(pageDef.Key),
@ -131,6 +163,114 @@ namespace LibSearchBox
 			return results;
 		}

+		/// <summary>
+		/// Collects every position at which one of the tokenizer's tokens occurs
+		/// in the specified page.
+		/// </summary>
+		/// <param name="pageId">The id of the page to fetch the offsets for.</param>
+		/// <param name="tokenizer">The tokenizer that yields the tokens to look up.</param>
+		/// <returns>The matching offsets, sorted by ascending offset.</returns>
+		private List<SearchOffset> getPageOffsets(int pageId, Tokenizer tokenizer)
+		{
+			List<SearchOffset> offsets = new List<SearchOffset>();
+			foreach ((int, string) token in tokenizer.IterateTokens())
+			{
+				ConcurrentDictionary<int, List<int>> tokenQuery = index.Query(token.Item2);
+				// A single atomic lookup: don't bother if this page doesn't contain this token
+				if (!tokenQuery.TryGetValue(pageId, out List<int> pageOffsets)) continue;
+				offsets.AddRange(pageOffsets.Select((int offset) => new SearchOffset(token.Item2, offset)));
+			}
+			// CompareTo instead of subtraction, as subtracting can overflow and flip the sign
+			offsets.Sort((SearchOffset x, SearchOffset y) => x.Offset.CompareTo(y.Offset));
+			return offsets;
+		}
+
+		/// <summary>
+		/// Builds a short "search result" style excerpt of a document, made up of
+		/// the text surrounding each place a query term occurs in it.
+		/// </summary>
+		/// <param name="pageName">The name of the document; it is resolved to a page id via the id map.</param>
+		/// <param name="source">The full text of the document to take the snippets from.</param>
+		/// <param name="query">The query string whose terms should be located.</param>
+		/// <param name="settings">Controls the snippet size, overall length, separator and HTML output. If null, the defaults are used.</param>
+		/// <returns>
+		/// The snippets joined by the separator (with a leading / trailing
+		/// separator if the first / last snippet doesn't touch the start / end of
+		/// the source), or an empty string if no query term was found.
+		/// </returns>
+		/// <exception cref="ArgumentNullException">Thrown if source is null.</exception>
+		public string GenerateContext(string pageName, string source, string query, ContextSettings settings)
+		{
+			if (source == null) throw new ArgumentNullException(nameof(source));
+			if (settings == null) settings = new ContextSettings();
+
+			int pageId = idMap.GetId(pageName);
+
+			Tokenizer tokenizer = new Tokenizer(query);
+			List<SearchOffset> offsets = getPageOffsets(pageId, tokenizer);
+
+			int currentLength = 0;
+			List<(int Offset, int Length)> tokenLocations = new List<(int Offset, int Length)>();
+			List<(int From, int To)> snippets = new List<(int From, int To)>();
+			foreach (SearchOffset match in offsets)
+			{
+				// Don't go over the maximum length. This is only checked before a
+				// snippet is added, so the result can overshoot MaxLength a bit.
+				if (currentLength > settings.MaxLength)
+					break;
+
+				// The index may not have been built from exactly the text we've
+				// been handed, so ignore any offsets that don't fit inside it.
+				if (match.Offset < 0 || match.Offset >= source.Length)
+					continue;
+				int tokenLength = Math.Min(match.Term.Length, source.Length - match.Offset);
+
+				// Generate the next snippet
+				(int From, int To) nextSnippet = (
+					Math.Max(0, match.Offset - settings.ContextCharacters),
+					Math.Min(source.Length, match.Offset + tokenLength + settings.ContextCharacters)
+				);
+				tokenLocations.Add((match.Offset, tokenLength));
+
+				// If the next snippet overlaps with the previous one, then combine the 2
+				if (snippets.Count > 0 && snippets[snippets.Count - 1].To > nextSnippet.From) {
+					(int From, int To) lastSnippet = snippets[snippets.Count - 1];
+					// A later but shorter term can end before the previous snippet does - never shrink it
+					int mergedTo = Math.Max(lastSnippet.To, nextSnippet.To);
+					currentLength += mergedTo - lastSnippet.To;
+					snippets[snippets.Count - 1] = (lastSnippet.From, mergedTo);
+					continue;
+				}
+
+				// No overlap! Add it to the list
+				snippets.Add(nextSnippet);
+				currentLength += nextSnippet.To - nextSnippet.From;
+			}
+
+			// Nothing matched, so there's nothing to show (and no first / last snippet to inspect below)
+			if (snippets.Count == 0)
+				return string.Empty;
+
+			List<string> snippetsText = new List<string>();
+			foreach ((int From, int To) snippet in snippets)
+			{
+				string text = source.Substring(snippet.From, snippet.To - snippet.From);
+				if (string.IsNullOrWhiteSpace(text))
+					continue;
+				snippetsText.Add(settings.Html ? highlightSnippet(source, snippet, tokenLocations) : text);
+			}
+
+			// Add the separator at the beginning and end if we aren't at the bounds of the source document
+			if (snippets[0].From > 0)
+				snippetsText.Insert(0, "");
+			if (snippets[snippets.Count - 1].To < source.Length)
+				snippetsText.Add("");
+
+			return string.Join(settings.Separator, snippetsText);
+		}
+
+		/// <summary>
+		/// HTML-encodes a single snippet of the source text, wrapping each
+		/// matched token that lies inside it in a &lt;span class='token'&gt; element.
+		/// </summary>
+		/// <param name="source">The full source text.</param>
+		/// <param name="snippet">The bounds of the snippet in the source text (To is exclusive).</param>
+		/// <param name="tokenLocations">The locations of all the matched tokens, sorted by ascending offset.</param>
+		/// <returns>The snippet as HTML.</returns>
+		private static string highlightSnippet(string source, (int From, int To) snippet, List<(int Offset, int Length)> tokenLocations)
+		{
+			List<string> parts = new List<string>();
+			// The absolute index in the source text of the first character we haven't emitted yet
+			int cursor = snippet.From;
+			foreach ((int Offset, int Length) token in tokenLocations)
+			{
+				int tokenEnd = token.Offset + token.Length;
+				// Skip tokens that belong to other snippets, and any that overlap one we've already emitted
+				if (token.Offset < cursor || tokenEnd > snippet.To)
+					continue;
+
+				// The bit before the token
+				parts.Add(WebUtility.HtmlEncode(source.Substring(cursor, token.Offset - cursor)));
+				// The token itself
+				parts.Add($"<span class='token'>{WebUtility.HtmlEncode(source.Substring(token.Offset, token.Length))}</span>");
+				cursor = tokenEnd;
+			}
+			// Whatever is left over after the last token
+			parts.Add(WebUtility.HtmlEncode(source.Substring(cursor, snippet.To - cursor)));
+
+			return string.Concat(parts);
+		}
+
+
 		#endregion
 	}
 }
--- a/SearchBox/SearchBox.csproj
+++ b/SearchBox/SearchBox.csproj
@ -36,6 +36,10 @@
    <Reference Include="Newtonsoft.Json">
      <HintPath>..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll</HintPath>
    </Reference>
+    <Reference Include="System.ValueTuple">
+      <HintPath>..\packages\System.ValueTuple.4.5.0\lib\net47\System.ValueTuple.dll</HintPath>
+    </Reference>
+    <Reference Include="mscorlib" />
  </ItemGroup>
  <ItemGroup>
    <Compile Include="Properties\AssemblyInfo.cs" />
@ -50,7 +54,6 @@
    <Compile Include="Utilities\BiDictionary.cs" />
    <Compile Include="DocumentMeta.cs" />
    <Compile Include="SearchResult.cs" />
-    <Compile Include="QuerySettings.cs" />
  </ItemGroup>
  <ItemGroup>
    <Folder Include="EmbeddedFiles\" />
--- a/SearchBox/Tokenizer.cs
+++ b/SearchBox/Tokenizer.cs
@ -16,7 +16,7 @@ namespace LibSearchBox
 		HidePunctuation = 4,
 		DecodeHtmlEntities = 8
 	}
-	public class Tokenizer : IEnumerable<Tuple<int, string>>
+	public class Tokenizer : IEnumerable<(int, string)>
 	{
 		private static Regex splitter = new Regex(
 			@"((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|",
@ -40,7 +40,7 @@ namespace LibSearchBox
 			source = inSource;
 		}

-		public IEnumerable<Tuple<int, string>> IterateTokens()
+		public IEnumerable<(int, string)> IterateTokens()
 		{
 			int index = 0;
 			string[] parts = splitter.Split(source);
@ -50,12 +50,11 @@ namespace LibSearchBox

 				if(Verbose) Console.WriteLine("#{0} @ {1}: '{2}'", i, index, parts[i]);

-				// FUTURE: We should swap this out for System.ValueTuple, as it's easier on the garbage collector.
-				yield return new Tuple<int, string>(index, parts[i]);
+				yield return (index, parts[i]);
 			}
 		}

-		public IEnumerator<Tuple<int, string>> GetEnumerator() {
+		public IEnumerator<(int, string)> GetEnumerator() {
 			return IterateTokens().GetEnumerator();
 		}
 		IEnumerator IEnumerable.GetEnumerator()
--- a/SearchBox/packages.config
+++ b/SearchBox/packages.config
@ -1,4 +1,5 @@
 <?xml version="1.0" encoding="utf-8"?>
 <packages>
+  <package id="System.ValueTuple" version="4.5.0" targetFramework="net47" />
  <package id="UnidecodeSharpFork" version="1.0.0" targetFramework="net47" />
 </packages>