Bugfix textual context generation

2018-09-22 17:12:46 +01:00 · 2018-09-22 17:12:46 +01:00 · 2aba3a9d86
commit 2aba3a9d86
parent b41f7f524a
7 changed files with 239 additions and 57 deletions
--- a/SearchBox-CLI/Program.cs
+++ b/SearchBox-CLI/Program.cs
@ -2,13 +2,13 @@
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
 using System.Text.RegularExpressions;
 using System.Threading.Tasks;
 using Newtonsoft.Json;
 using LibSearchBox;
 using System.Text.RegularExpressions;
 using Newtonsoft.Json.Serialization;
 using System.Threading.Tasks;
 using SBRL.Utilities;
 using LibSearchBox;
 namespace SearchBoxCLI
 {
@ -18,28 +18,34 @@ namespace SearchBoxCLI
 		Index,
 		Add,
 		Remove,
-		Update
+		Update,
 		GenerateContext
 	}
 	enum OutputModes
 	{
 		Json,
-		Text
+		Text,
 		Html
 	}
 	class MainClass {
 		private static List<string> Extras = new List<string>();
 		private static OperatingModes Mode = OperatingModes.Query;
 		private static OutputModes OutputMode = OutputModes.Text;
 		private static bool Batch = false;
 		private static string Name = string.Empty;
 		private static IEnumerable<string> Tags;
 		private static string SearchIndexFilepath = string.Empty;
 		private static TextReader Source = Console.In;
 		private static TextReader SourceOld = null, SourceNew = null;
 		private static string Query = string.Empty;
 		private static int ResultsLimit = -1;
 		private static int ResultsOffset = 0;
 		private static OutputModes OutputMode = OutputModes.Text;
 		public static int Main(string[] args)
 		{
@ -92,6 +98,10 @@ namespace SearchBoxCLI
 						ResultsOffset = int.Parse(args[++i]);
 						break;
 					case "query":
 						Query = args[++i];
 						break;
 					case "format":
 						OutputMode = (OutputModes)Enum.Parse(typeof(OutputModes), args[++i], true);
 						break;
@ -105,7 +115,7 @@ namespace SearchBoxCLI
 				}
 			}
 			if (Extras.Count < 1) return HandleHelp();
-			string modeText = Extras.First(); Extras.RemoveAt(0);
+			string modeText = Extras.First().Replace("context", "generatecontext"); Extras.RemoveAt(0);
 			Mode = (OperatingModes)Enum.Parse(typeof(OperatingModes), modeText, true);
 			switch (Mode) {
@ -113,9 +123,11 @@ namespace SearchBoxCLI
 				case OperatingModes.Add: return HandleAdd();
 				case OperatingModes.Remove: return HandleRemove();
 				case OperatingModes.Query: return HandleQuery();
 				case OperatingModes.GenerateContext: return HandleContextGeneration();
 				default:
 					Console.Error.WriteLine($"Error: Don't know how to handle mode {Mode}.");
 					return 128;
 			}
 			return 128;
 		}
 		private static int HandleHelp()
@ -126,26 +138,27 @@ namespace SearchBoxCLI
 			Console.WriteLine();
 			Console.WriteLine("Usage:");
 			Console.WriteLine("    ./SearchBox.exe {mode} [options]");
 			Console.WriteLine("    ./SearchBox.exe query \"{query string}\" [options]");
 			Console.WriteLine();
 			Console.WriteLine("Modes:");
 			Console.WriteLine("    query     Query a pre-existing inverted search index");
 			Console.WriteLine("    context   Generate a context string similar to a search result on the internet");
 			Console.WriteLine("    index     Generate a raw index of the source document.");
 			Console.WriteLine("    add       Add a named document to a search index.");
 			Console.WriteLine("    remove    Remove a named document from a search index.");
 			Console.WriteLine("    update    Update a named document in a search index.");
 			Console.WriteLine();
 			Console.WriteLine("Options:");
 			Console.WriteLine("    --format        Sets the format of the output. Possible values: text (default), json {query,index}");
 			Console.WriteLine("    --source, -s    Specifies the path to the source document {index, add}");
 			Console.WriteLine("    --old-source    Specifies the path to the old version of the source document to update {update}");
 			Console.WriteLine("    --new-source    Specifies the path to the new version of the source document to update {update}");
 			Console.WriteLine("    --name, -n      Sets the name of the source document {add, remove}");
 			Console.WriteLine("    --index         Specifies the location of the search index to use {add, remove, update}");
 			Console.WriteLine("    --tags          Sets the tags to associate with the document. {add, update}");
 			Console.WriteLine("    --batch         Enters a mode where the operations to process are specified via the source (by default stdin; change with --source as usual) - one per line in the format \"{filename}|{name}|{tags}\" {add}");
 			Console.WriteLine("    --format        Sets the format of the output. Possible values: text (default), json, html (context generation only) {query, index, context}");
 			Console.WriteLine("    --index         Specifies the location of the search index to use {add, remove, update}");
 			Console.WriteLine("    --name, -n      Sets the name of the source document {add, remove, title}");
 			Console.WriteLine("    --new-source    Specifies the path to the new version of the source document to update {update}");
 			Console.WriteLine("    --limit         Limits the number of results returned, -1 = no limit {query}");
 			Console.WriteLine("    --offset        Skips the specified number of results from the beginning of the results list {query}");
 			Console.WriteLine("    --old-source    Specifies the path to the old version of the source document to update {update}");
 			Console.WriteLine("    --query         Specifies the query string {query, context}");
 			Console.WriteLine("    --source, -s    Specifies the path to the source document {index, add,context}");
 			Console.WriteLine("    --tags          Sets the tags to associate with the document. {add, update}");
 			Console.WriteLine();
 			Console.WriteLine("Examples:");
 			Console.WriteLine("    cat books/complex_knots.txt | ./SearchBox.exe add --name \"Complex Knots: How to do and undo them\"");
@ -155,13 +168,11 @@ namespace SearchBoxCLI
 		private static int HandleAdd()
 		{
-			if (Name == string.Empty && !Batch)
+			if (Name == string.Empty && !Batch) {
 			{
 				Console.Error.WriteLine("Error: The document name must be specified when reading from stdin!");
 				return 1;
 			}
-			if (SearchIndexFilepath == string.Empty)
+			if (SearchIndexFilepath == string.Empty) {
 			{
 				Console.Error.WriteLine("Error: No search index file path specified.");
 				return 1;
 			}
@ -209,7 +220,7 @@ namespace SearchBoxCLI
 		private static int HandleRemove()
 		{
-			if (Name == string.Empty) {
+			if (string.IsNullOrEmpty(Name)) {
 				Console.Error.WriteLine("Error: The document name must be specified when removing a document!");
 				return 1;
 			}
@ -232,16 +243,20 @@ namespace SearchBoxCLI
 		private static int HandleQuery()
 		{
-			if (Extras.Count < 1) {
+			if (string.IsNullOrEmpty(Query)) {
 				Console.Error.WriteLine("Error: No query specified!");
 				return 1;
 			}
 			if (SearchIndexFilepath == string.Empty) {
 				Console.Error.WriteLine("Error: No search index file path specified.");
 				return 1;
 			}
 			SearchBox searchBox = JsonConvert.DeserializeObject<SearchBox>(
 				File.ReadAllText(SearchIndexFilepath)
 			);
-			IEnumerable<SearchResult> resultsRaw = searchBox.Query(Extras[0], new QuerySettings()).Skip(ResultsOffset);
+			IEnumerable<SearchResult> resultsRaw = searchBox.Query(Query, new QuerySettings()).Skip(ResultsOffset);
 			List<SearchResult> results = new List<SearchResult>(
 				ResultsLimit > 0 ? resultsRaw.Take(ResultsLimit) : resultsRaw
 			);
@ -262,6 +277,43 @@ namespace SearchBoxCLI
 			return 0;
 		}
 		/// <summary>
 		/// Handles the "context" operating mode: prints a context string for
 		/// the document called <c>Name</c>, built from the text read from
 		/// <c>Source</c> and the terms in <c>Query</c>.
 		/// </summary>
 		/// <returns>0 on success, or 1 if an argument is missing, the output format is unsupported, or the index can't be loaded.</returns>
 		private static int HandleContextGeneration()
 		{
 			if (string.IsNullOrEmpty(Name)) {
 				Console.Error.WriteLine("Error: No document name specified.");
 				return 1;
 			}
 			if (string.IsNullOrEmpty(Query)) {
 				Console.Error.WriteLine("Error: No query specified.");
 				return 1;
 			}
 			if (string.IsNullOrEmpty(SearchIndexFilepath)) {
 				Console.Error.WriteLine("Error: No search index file path specified.");
 				return 1;
 			}
 			// Validate the requested output format *before* touching the disk, so that
 			// an unsupported format doesn't cost us a full index load first.
 			ContextSettings generationSettings = new ContextSettings();
 			switch (OutputMode) {
 				case OutputModes.Json:
 					Console.Error.WriteLine("Error: JSON output for context generation is not supported.");
 					return 1;
 				case OutputModes.Html:
 					generationSettings.Html = true;
 					break;
 				case OutputModes.Text:
 					generationSettings.Html = false;
 					break;
 			}
 			SearchBox searchBox = JsonConvert.DeserializeObject<SearchBox>(
 				File.ReadAllText(SearchIndexFilepath)
 			);
 			// DeserializeObject hands back null for an empty document - report that
 			// rather than crashing with a NullReferenceException below.
 			if (searchBox == null) {
 				Console.Error.WriteLine("Error: The search index file is empty or invalid.");
 				return 1;
 			}
 			Console.WriteLine(searchBox.GenerateContext(Name, Source.ReadToEnd(), Query, generationSettings));
 			return 0;
 		}
 		private static int HandleIndex()
 		{
 			Index index = new Index(Source.ReadToEnd());
--- a/SearchBox/Index.cs
+++ b/SearchBox/Index.cs
@ -25,7 +25,7 @@ namespace LibSearchBox
 			// Tokenize the input and file it in our index
 			Tokenizer tokenizer = new Tokenizer(inSource);
-			foreach (Tuple<int, string> token in tokenizer) {
+			foreach ((int, string) token in tokenizer) {
 				if (stopwordTester.IsStopword(token.Item2)) continue;
 				insert(token.Item2, token.Item1);
 			}
--- a/SearchBox/QuerySettings.cs
+++ b/SearchBox/QuerySettings.cs
@ -1,13 +0,0 @@
 using System;
 namespace LibSearchBox
 {
 	public class QuerySettings
 	{
 		public int WeightTitleMatch = 100;
 		public int WeightTagMatch = 10;
 		public QuerySettings()
 		{
 		}
 	}
 }
--- a/SearchBox/SearchBox.cs
+++ b/SearchBox/SearchBox.cs
@ -2,11 +2,45 @@
 using System.Collections.Concurrent;
 using System.Collections.Generic;
 using System.Linq;
 using System.Net;
 using System.Threading.Tasks;
 using Newtonsoft.Json;
 namespace LibSearchBox
 {
 	/// <summary>
 	/// Holds the weightings that are applied when ranking the results of a query.
 	/// </summary>
 	public class QuerySettings
 	{
 		/// <summary>
 		/// The amount added to a page's rank for each query token that is
 		/// found in its searchable title.
 		/// </summary>
 		public int WeightTitleMatch = 100;
 		/// <summary>
 		/// The weighting for tag matches.
 		/// NOTE(review): presumably added per query token found in a page's tags - the code that reads this isn't visible here, so confirm.
 		/// </summary>
 		public int WeightTagMatch = 10;
 	}
 	/// <summary>
 	/// Options that control how a context string is generated.
 	/// </summary>
 	public class ContextSettings
 	{
 		/// <summary>
 		/// How many characters of the source text to include before and
 		/// after each matching term.
 		/// </summary>
 		public int ContextCharacters = 75;
 		/// <summary>
 		/// The length limit for the generated context string. Snippets stop
 		/// being added once the running total exceeds this value.
 		/// </summary>
 		public int MaxLength = 250;
 		/// <summary>
 		/// The string placed between adjacent snippets in the generated context.
 		/// </summary>
 		public string Separator = " … ";
 		/// <summary>
 		/// If true, the generated context is output as HTML. Defaults to false.
 		/// </summary>
 		public bool Html;
 	}
 	/// <summary>
 	/// The exception type declared by this library. Carries a message only.
 	/// </summary>
 	public class SearchBoxException : Exception
 	{
 		/// <param name="message">The message describing what went wrong.</param>
 		public SearchBoxException(string message)
 			: base(message)
 		{
 		}
 	}
 	[JsonObject(MemberSerialization.OptIn)]
@ -80,7 +114,7 @@ namespace LibSearchBox
 			ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages = new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();
 			Tokenizer tokenizer = new Tokenizer(query);
-			foreach(Tuple<int, string> token in tokenizer.IterateTokens())
+			foreach((int, string) token in tokenizer.IterateTokens())
 			{
 				ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token.Item2);
@ -95,13 +129,17 @@ namespace LibSearchBox
 			}
 			ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
-			Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) => {
+			Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) =>
 			{
 				int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?
-				if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo)) {
+				if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo))
 				{
 					if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
 				}
-				else {
+				else
-					foreach (Tuple<int, string> token in tokenizer.IterateTokens()) {
+				{
 					foreach ((int, string) token in tokenizer.IterateTokens())
 					{
 						if (metaInfo.SearchableTitle.Contains(token.Item2))
 							rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched
@ -111,13 +149,7 @@ namespace LibSearchBox
 					}
 				}
-				List<SearchOffset> offsets = new List<SearchOffset>();
+				List<SearchOffset> offsets = getPageOffsets(pageDef.Key, tokenizer);
 				foreach (Tuple<int, string> token in tokenizer.IterateTokens()) {
 					ConcurrentDictionary<int, List<int>> tokenQuery = index.Query(token.Item2);
 					if (!tokenQuery.ContainsKey(pageDef.Key)) continue; // Don't bother if this page doesn't contain this token
 					offsets.AddRange(tokenQuery[pageDef.Key].Select((int offset) => new SearchOffset(token.Item2, offset)));
 				}
 				offsets.Sort((SearchOffset x, SearchOffset y) => x.Offset - y.Offset);
 				resultsRaw.Add(new SearchResult(
 					idMap.GetPageName(pageDef.Key),
@ -131,6 +163,114 @@ namespace LibSearchBox
 			return results;
 		}
 		/// <summary>
 		/// Collects every location at which any of the tokenizer's tokens
 		/// occurs in the specified page, according to the index.
 		/// </summary>
 		/// <param name="pageId">The id of the page to fetch the offsets for.</param>
 		/// <param name="tokenizer">The tokenizer whose tokens should be looked up.</param>
 		/// <returns>The matching offsets, sorted in ascending order of offset. Empty if none of the tokens occur in the page.</returns>
 		private List<SearchOffset> getPageOffsets(int pageId, Tokenizer tokenizer)
 		{
 			List<SearchOffset> offsets = new List<SearchOffset>();
 			foreach ((int, string) token in tokenizer.IterateTokens())
 			{
 				ConcurrentDictionary<int, List<int>> tokenQuery = index.Query(token.Item2);
 				// A single TryGetValue does the job of ContainsKey + the indexer with one lookup.
 				// Don't bother if this page doesn't contain this token.
 				if (!tokenQuery.TryGetValue(pageId, out List<int> pageOffsets)) continue;
 				offsets.AddRange(pageOffsets.Select((int offset) => new SearchOffset(token.Item2, offset)));
 			}
 			// CompareTo rather than subtraction: same ordering, but can't overflow
 			offsets.Sort((SearchOffset x, SearchOffset y) => x.Offset.CompareTo(y.Offset));
 			return offsets;
 		}
 		/// <summary>
 		/// Generates a context string for a page: a series of snippets of
 		/// the source text surrounding the places where the query's terms occur.
 		/// </summary>
 		/// <param name="pageName">The name of the page to generate the context for.</param>
 		/// <param name="source">The source text of the page, from which the snippets are cut.</param>
 		/// <param name="query">The query whose terms should be located.</param>
 		/// <param name="settings">The settings controlling snippet size, total length, the separator and HTML output.</param>
 		/// <returns>The snippets joined by the separator, or an empty string if none of the query's terms could be located in the source.</returns>
 		public string GenerateContext(string pageName, string source, string query, ContextSettings settings)
 		{
 			int pageId = idMap.GetId(pageName);
 			Tokenizer tokenizer = new Tokenizer(query);
 			List<SearchOffset> offsets = getPageOffsets(pageId, tokenizer);
 			int currentLength = 0;
 			List<(int, int)> tokenLocations = new List<(int, int)>(); // offset, length
 			List<(int, int)> snippets = new List<(int, int)>(); // from, to
 			for (int i = 0; i < offsets.Count; i++)
 			{
 				// Don't go over the maximum length
 				// FUTURE: Would it be faster to keep track of this as we go? It's probably not worth it though, as we're not going to generate *that* many at once - we'll have to see.
 				if (currentLength > settings.MaxLength)
 					break;
 				int tokenStart = offsets[i].Offset;
 				int tokenEnd = tokenStart + offsets[i].Term.Length;
 				// If the index and the source text disagree, an offset may point outside
 				// the source. Skip those instead of letting Substring throw later on.
 				if (tokenStart < 0 || tokenEnd > source.Length)
 					continue;
 				// Generate the next snippet
 				(int, int) nextSnippet = (
 					Math.Max(0, tokenStart - settings.ContextCharacters),
 					Math.Min(source.Length, tokenEnd + settings.ContextCharacters)
 				);
 				tokenLocations.Add((tokenStart, tokenEnd - tokenStart));
 				// If the next snippet overlaps with the previous one, then combine the 2
 				if (snippets.Count > 0 && snippets[snippets.Count - 1].Item2 > nextSnippet.Item1) {
 					// NOTE: This *might* exceed the MaxLength a bit
 					(int, int) lastSnippet = snippets[snippets.Count - 1];
 					// Never shrink the previous snippet: a shorter term at (nearly) the same
 					// offset can end before the snippet we already have does.
 					int mergedEnd = Math.Max(lastSnippet.Item2, nextSnippet.Item2);
 					currentLength += mergedEnd - lastSnippet.Item2;
 					snippets[snippets.Count - 1] = (lastSnippet.Item1, mergedEnd);
 					continue;
 				}
 				// No overlap! Add it to the list
 				snippets.Add(nextSnippet);
 				currentLength += nextSnippet.Item2 - nextSnippet.Item1;
 			}
 			// Nothing matched, so there's no context to generate
 			if (snippets.Count == 0)
 				return string.Empty;
 			List<string> snippetsText = new List<string>();
 			foreach ((int, int) snippet in snippets)
 			{
 				string rawText = source.Substring(snippet.Item1, snippet.Item2 - snippet.Item1);
 				if (string.IsNullOrWhiteSpace(rawText))
 					continue;
 				snippetsText.Add(settings.Html ? renderSnippetHtml(source, snippet, tokenLocations) : rawText);
 			}
 			// Every snippet was blank - don't return a string consisting of separators only
 			if (snippetsText.Count == 0)
 				return string.Empty;
 			// Add the separator at the beginning and end if we aren't at the bounds of the source document
 			if (snippets[0].Item1 > 0)
 				snippetsText.Insert(0, "");
 			if (snippets[snippets.Count - 1].Item2 < source.Length)
 				snippetsText.Add("");
 			return string.Join(
 				settings.Separator,
 				snippetsText
 			);
 		}
 		/// <summary>
 		/// Renders a single snippet as HTML, wrapping each token inside it in
 		/// a span with the class 'token' and HTML-encoding all the text.
 		/// </summary>
 		/// <param name="source">The full source text.</param>
 		/// <param name="snippet">The (from, to) bounds of the snippet within the source text.</param>
 		/// <param name="tokenLocations">The (offset, length) of every token, in ascending order of offset. Offsets are relative to the start of the source text, not the snippet.</param>
 		/// <returns>The snippet as a string of HTML.</returns>
 		private static string renderSnippetHtml(string source, (int, int) snippet, List<(int, int)> tokenLocations)
 		{
 			List<string> parts = new List<string>();
 			// Where we've got up to, as an absolute position in the source text
 			int cursor = snippet.Item1;
 			foreach ((int, int) tokenDef in tokenLocations)
 			{
 				int tokenEnd = tokenDef.Item1 + tokenDef.Item2;
 				// Skip tokens that belong to another snippet, and tokens that overlap
 				// one we've already highlighted (e.g. a term repeated in the query)
 				if (tokenDef.Item1 < cursor || tokenEnd > snippet.Item2)
 					continue;
 				// The bit before the token
 				parts.Add(WebUtility.HtmlEncode(source.Substring(cursor, tokenDef.Item1 - cursor)));
 				// The token itself
 				parts.Add($"<span class='token'>{WebUtility.HtmlEncode(source.Substring(tokenDef.Item1, tokenDef.Item2))}</span>");
 				// Update our marker as to where we've got up to
 				cursor = tokenEnd;
 			}
 			// The bit after the last token
 			parts.Add(WebUtility.HtmlEncode(source.Substring(cursor, snippet.Item2 - cursor)));
 			return string.Concat(parts);
 		}
 		#endregion
 	}
 }
--- a/SearchBox/SearchBox.csproj
+++ b/SearchBox/SearchBox.csproj
@ -36,6 +36,10 @@
    <Reference Include="Newtonsoft.Json">
      <HintPath>..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll</HintPath>
    </Reference>
    <Reference Include="System.ValueTuple">
      <HintPath>..\packages\System.ValueTuple.4.5.0\lib\net47\System.ValueTuple.dll</HintPath>
    </Reference>
    <Reference Include="mscorlib" />
  </ItemGroup>
  <ItemGroup>
    <Compile Include="Properties\AssemblyInfo.cs" />
@ -50,7 +54,6 @@
    <Compile Include="Utilities\BiDictionary.cs" />
    <Compile Include="DocumentMeta.cs" />
    <Compile Include="SearchResult.cs" />
    <Compile Include="QuerySettings.cs" />
  </ItemGroup>
  <ItemGroup>
    <Folder Include="EmbeddedFiles\" />
--- a/SearchBox/Tokenizer.cs
+++ b/SearchBox/Tokenizer.cs
@ -16,7 +16,7 @@ namespace LibSearchBox
 		HidePunctuation = 4,
 		DecodeHtmlEntities = 8
 	}
-	public class Tokenizer : IEnumerable<Tuple<int, string>>
+	public class Tokenizer : IEnumerable<(int, string)>
 	{
 		private static Regex splitter = new Regex(
 			@"((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|",
@ -40,7 +40,7 @@ namespace LibSearchBox
 			source = inSource;
 		}
-		public IEnumerable<Tuple<int, string>> IterateTokens()
+		public IEnumerable<(int, string)> IterateTokens()
 		{
 			int index = 0;
 			string[] parts = splitter.Split(source);
@ -50,12 +50,11 @@ namespace LibSearchBox
 				if(Verbose) Console.WriteLine("#{0} @ {1}: '{2}'", i, index, parts[i]);
-				// FUTURE: We should swap this out for System.ValueTuple, as it's easier on the garbage collector.
+				yield return (index, parts[i]);
 				yield return new Tuple<int, string>(index, parts[i]);
 			}
 		}
-		public IEnumerator<Tuple<int, string>> GetEnumerator() {
+		public IEnumerator<(int, string)> GetEnumerator() {
 			return IterateTokens().GetEnumerator();
 		}
 		IEnumerator IEnumerable.GetEnumerator()
--- a/SearchBox/packages.config
+++ b/SearchBox/packages.config
@ -1,4 +1,5 @@
 <?xml version="1.0" encoding="utf-8"?>
 <packages>
  <package id="System.ValueTuple" version="4.5.0" targetFramework="net47" />
  <package id="UnidecodeSharpFork" version="1.0.0" targetFramework="net47" />
 </packages>