Bugfix textual context generation

This commit is contained in:
Starbeamrainbowlabs 2018-09-22 17:12:46 +01:00
parent b41f7f524a
commit 2aba3a9d86
Signed by: sbrl
GPG key ID: 1BE5172E637709C2
7 changed files with 239 additions and 57 deletions

View file

@ -2,13 +2,13 @@
using System.Collections.Generic; using System.Collections.Generic;
using System.IO; using System.IO;
using System.Linq; using System.Linq;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using Newtonsoft.Json; using Newtonsoft.Json;
using LibSearchBox;
using System.Text.RegularExpressions;
using Newtonsoft.Json.Serialization;
using System.Threading.Tasks;
using SBRL.Utilities; using SBRL.Utilities;
using LibSearchBox;
namespace SearchBoxCLI namespace SearchBoxCLI
{ {
@ -18,28 +18,34 @@ namespace SearchBoxCLI
Index, Index,
Add, Add,
Remove, Remove,
Update Update,
GenerateContext
} }
enum OutputModes enum OutputModes
{ {
Json, Json,
Text Text,
Html
} }
class MainClass { class MainClass {
private static List<string> Extras = new List<string>(); private static List<string> Extras = new List<string>();
private static OperatingModes Mode = OperatingModes.Query; private static OperatingModes Mode = OperatingModes.Query;
private static OutputModes OutputMode = OutputModes.Text;
private static bool Batch = false; private static bool Batch = false;
private static string Name = string.Empty; private static string Name = string.Empty;
private static IEnumerable<string> Tags; private static IEnumerable<string> Tags;
private static string SearchIndexFilepath = string.Empty; private static string SearchIndexFilepath = string.Empty;
private static TextReader Source = Console.In; private static TextReader Source = Console.In;
private static TextReader SourceOld = null, SourceNew = null; private static TextReader SourceOld = null, SourceNew = null;
private static string Query = string.Empty;
private static int ResultsLimit = -1; private static int ResultsLimit = -1;
private static int ResultsOffset = 0; private static int ResultsOffset = 0;
private static OutputModes OutputMode = OutputModes.Text;
public static int Main(string[] args) public static int Main(string[] args)
{ {
@ -92,6 +98,10 @@ namespace SearchBoxCLI
ResultsOffset = int.Parse(args[++i]); ResultsOffset = int.Parse(args[++i]);
break; break;
case "query":
Query = args[++i];
break;
case "format": case "format":
OutputMode = (OutputModes)Enum.Parse(typeof(OutputModes), args[++i], true); OutputMode = (OutputModes)Enum.Parse(typeof(OutputModes), args[++i], true);
break; break;
@ -105,7 +115,7 @@ namespace SearchBoxCLI
} }
} }
if (Extras.Count < 1) return HandleHelp(); if (Extras.Count < 1) return HandleHelp();
string modeText = Extras.First(); Extras.RemoveAt(0); string modeText = Extras.First().Replace("context", "generatecontext"); Extras.RemoveAt(0);
Mode = (OperatingModes)Enum.Parse(typeof(OperatingModes), modeText, true); Mode = (OperatingModes)Enum.Parse(typeof(OperatingModes), modeText, true);
switch (Mode) { switch (Mode) {
@ -113,9 +123,11 @@ namespace SearchBoxCLI
case OperatingModes.Add: return HandleAdd(); case OperatingModes.Add: return HandleAdd();
case OperatingModes.Remove: return HandleRemove(); case OperatingModes.Remove: return HandleRemove();
case OperatingModes.Query: return HandleQuery(); case OperatingModes.Query: return HandleQuery();
case OperatingModes.GenerateContext: return HandleContextGeneration();
default:
Console.Error.WriteLine($"Error: Don't know how to handle mode {Mode}.");
return 128;
} }
return 128;
} }
private static int HandleHelp() private static int HandleHelp()
@ -126,26 +138,27 @@ namespace SearchBoxCLI
Console.WriteLine(); Console.WriteLine();
Console.WriteLine("Usage:"); Console.WriteLine("Usage:");
Console.WriteLine(" ./SearchBox.exe {mode} [options]"); Console.WriteLine(" ./SearchBox.exe {mode} [options]");
Console.WriteLine(" ./SearchBox.exe query \"{query string}\" [options]");
Console.WriteLine(); Console.WriteLine();
Console.WriteLine("Modes:"); Console.WriteLine("Modes:");
Console.WriteLine(" query Query a pre-existing inverted search index"); Console.WriteLine(" query Query a pre-existing inverted search index");
Console.WriteLine(" context Generate a context string similar to a search result on the internet");
Console.WriteLine(" index Generate a raw index of the source document."); Console.WriteLine(" index Generate a raw index of the source document.");
Console.WriteLine(" add Add a named document to a search index."); Console.WriteLine(" add Add a named document to a search index.");
Console.WriteLine(" remove Remove a named document from a search index."); Console.WriteLine(" remove Remove a named document from a search index.");
Console.WriteLine(" update Update a named document in a search index."); Console.WriteLine(" update Update a named document in a search index.");
Console.WriteLine(); Console.WriteLine();
Console.WriteLine("Options:"); Console.WriteLine("Options:");
Console.WriteLine(" --format Sets the format of the output. Possible values: text (default), json {query,index}");
Console.WriteLine(" --source, -s Specifies the path to the source document {index, add}");
Console.WriteLine(" --old-source Specifies the path to the old version of the source document to update {update}");
Console.WriteLine(" --new-source Specifies the path to the new version of the source document to update {update}");
Console.WriteLine(" --name, -n Sets the name of the source document {add, remove}");
Console.WriteLine(" --index Specifies the location of the search index to use {add, remove, update}");
Console.WriteLine(" --tags Sets the tags to associate with the document. {add, update}");
Console.WriteLine(" --batch Enters a mode where the operations to process are specified via the source (by default stdin; change with --source as usual) - one per line in the format \"{filename}|{name}|{tags}\" {add}"); Console.WriteLine(" --batch Enters a mode where the operations to process are specified via the source (by default stdin; change with --source as usual) - one per line in the format \"{filename}|{name}|{tags}\" {add}");
Console.WriteLine(" --format Sets the format of the output. Possible values: text (default), json, html (context generation only) {query, index, context}");
Console.WriteLine(" --index Specifies the location of the search index to use {add, remove, update}");
Console.WriteLine(" --name, -n Sets the name of the source document {add, remove, title}");
Console.WriteLine(" --new-source Specifies the path to the new version of the source document to update {update}");
Console.WriteLine(" --limit Limits the number of results returned, -1 = no limit {query}"); Console.WriteLine(" --limit Limits the number of results returned, -1 = no limit {query}");
Console.WriteLine(" --offset Skips the specified number of results from the beginning of the results list {query}"); Console.WriteLine(" --offset Skips the specified number of results from the beginning of the results list {query}");
Console.WriteLine(" --old-source Specifies the path to the old version of the source document to update {update}");
Console.WriteLine(" --query Specifies the query string {query, context}");
Console.WriteLine(" --source, -s Specifies the path to the source document {index, add,context}");
Console.WriteLine(" --tags Sets the tags to associate with the document. {add, update}");
Console.WriteLine(); Console.WriteLine();
Console.WriteLine("Examples:"); Console.WriteLine("Examples:");
Console.WriteLine(" cat books/complex_knots.txt | ./SearchBox.exe add --name \"Complex Knots: How to do and undo them\""); Console.WriteLine(" cat books/complex_knots.txt | ./SearchBox.exe add --name \"Complex Knots: How to do and undo them\"");
@ -155,13 +168,11 @@ namespace SearchBoxCLI
private static int HandleAdd() private static int HandleAdd()
{ {
if (Name == string.Empty && !Batch) if (Name == string.Empty && !Batch) {
{
Console.Error.WriteLine("Error: The document name must be specified when reading from stdin!"); Console.Error.WriteLine("Error: The document name must be specified when reading from stdin!");
return 1; return 1;
} }
if (SearchIndexFilepath == string.Empty) if (SearchIndexFilepath == string.Empty) {
{
Console.Error.WriteLine("Error: No search index file path specified."); Console.Error.WriteLine("Error: No search index file path specified.");
return 1; return 1;
} }
@ -209,7 +220,7 @@ namespace SearchBoxCLI
private static int HandleRemove() private static int HandleRemove()
{ {
if (Name == string.Empty) { if (string.IsNullOrEmpty(Name)) {
Console.Error.WriteLine("Error: The document name must be specified when removing a document!"); Console.Error.WriteLine("Error: The document name must be specified when removing a document!");
return 1; return 1;
} }
@ -232,16 +243,20 @@ namespace SearchBoxCLI
private static int HandleQuery() private static int HandleQuery()
{ {
if (Extras.Count < 1) { if (string.IsNullOrEmpty(Query)) {
Console.Error.WriteLine("Error: No query specified!"); Console.Error.WriteLine("Error: No query specified!");
return 1; return 1;
} }
if (SearchIndexFilepath == string.Empty) {
Console.Error.WriteLine("Error: No search index file path specified.");
return 1;
}
SearchBox searchBox = JsonConvert.DeserializeObject<SearchBox>( SearchBox searchBox = JsonConvert.DeserializeObject<SearchBox>(
File.ReadAllText(SearchIndexFilepath) File.ReadAllText(SearchIndexFilepath)
); );
IEnumerable<SearchResult> resultsRaw = searchBox.Query(Extras[0], new QuerySettings()).Skip(ResultsOffset); IEnumerable<SearchResult> resultsRaw = searchBox.Query(Query, new QuerySettings()).Skip(ResultsOffset);
List<SearchResult> results = new List<SearchResult>( List<SearchResult> results = new List<SearchResult>(
ResultsLimit > 0 ? resultsRaw.Take(ResultsLimit) : resultsRaw ResultsLimit > 0 ? resultsRaw.Take(ResultsLimit) : resultsRaw
); );
@ -262,6 +277,43 @@ namespace SearchBoxCLI
return 0; return 0;
} }
/// <summary>
/// Handles the context generation mode: loads the search index, then prints
/// a context string for the named document and query to standard output.
/// The document text is read from <c>Source</c>.
/// </summary>
/// <returns>0 on success, or 1 if a required argument is missing or the
/// requested output format is not supported.</returns>
private static int HandleContextGeneration()
{
	if (string.IsNullOrEmpty(Name)) {
		Console.Error.WriteLine("Error: No document name specified.");
		return 1;
	}
	if (string.IsNullOrEmpty(Query)) {
		Console.Error.WriteLine("Error: No query specified.");
		return 1;
	}
	if (string.IsNullOrEmpty(SearchIndexFilepath)) {
		Console.Error.WriteLine("Error: No search index file path specified.");
		return 1;
	}

	// Validate the output format *before* touching the disk, so that an
	// unsupported format fails fast instead of after loading the whole index.
	ContextSettings generationSettings = new ContextSettings();
	switch (OutputMode) {
		case OutputModes.Html:
			generationSettings.Html = true;
			break;
		case OutputModes.Text:
			generationSettings.Html = false;
			break;
		case OutputModes.Json:
			Console.Error.WriteLine("Error: JSON output for context generation is not supported.");
			return 1;
		default:
			// Enum.Parse accepts numeric strings, so OutputMode may hold a
			// value that isn't a named member - don't silently treat it as text.
			Console.Error.WriteLine($"Error: {OutputMode} output for context generation is not supported.");
			return 1;
	}

	SearchBox searchBox = JsonConvert.DeserializeObject<SearchBox>(
		File.ReadAllText(SearchIndexFilepath)
	);

	Console.WriteLine(searchBox.GenerateContext(Name, Source.ReadToEnd(), Query, generationSettings));

	return 0;
}
private static int HandleIndex() private static int HandleIndex()
{ {
Index index = new Index(Source.ReadToEnd()); Index index = new Index(Source.ReadToEnd());

View file

@ -25,7 +25,7 @@ namespace LibSearchBox
// Tokenize the input and file it in our index // Tokenize the input and file it in our index
Tokenizer tokenizer = new Tokenizer(inSource); Tokenizer tokenizer = new Tokenizer(inSource);
foreach (Tuple<int, string> token in tokenizer) { foreach ((int, string) token in tokenizer) {
if (stopwordTester.IsStopword(token.Item2)) continue; if (stopwordTester.IsStopword(token.Item2)) continue;
insert(token.Item2, token.Item1); insert(token.Item2, token.Item1);
} }

View file

@ -1,13 +0,0 @@
using System;
namespace LibSearchBox
{
public class QuerySettings
{
public int WeightTitleMatch = 100;
public int WeightTagMatch = 10;
public QuerySettings()
{
}
}
}

View file

@ -2,11 +2,45 @@
using System.Collections.Concurrent; using System.Collections.Concurrent;
using System.Collections.Generic; using System.Collections.Generic;
using System.Linq; using System.Linq;
using System.Net;
using System.Threading.Tasks; using System.Threading.Tasks;
using Newtonsoft.Json; using Newtonsoft.Json;
namespace LibSearchBox namespace LibSearchBox
{ {
/// <summary>
/// Tunable weights that influence how query results are ranked.
/// </summary>
public class QuerySettings
{
	/// <summary>
	/// The amount added to a result's rank for each query token that is
	/// found in the document's title.
	/// </summary>
	public int WeightTitleMatch = 100;
	/// <summary>
	/// The weighting for tag matches.
	/// NOTE(review): presumably added per query token found in the
	/// document's tags - confirm against the ranking code.
	/// </summary>
	public int WeightTagMatch = 10;
}
/// <summary>
/// Options that control how a context string is generated for a document.
/// </summary>
public class ContextSettings
{
	/// <summary>
	/// How many characters of surrounding text to include before and after
	/// each matching term.
	/// </summary>
	public int ContextCharacters = 75;

	/// <summary>
	/// The length, in characters, beyond which no further snippets are
	/// added to the generated context.
	/// </summary>
	public int MaxLength = 250;

	/// <summary>
	/// The string placed between adjacent snippets when they are joined
	/// together.
	/// </summary>
	public string Separator = " … ";

	/// <summary>
	/// When true, the generated context is emitted as HTML; when false
	/// (the default) it is emitted as plain text.
	/// </summary>
	public bool Html;
}
public class SearchBoxException : Exception { public SearchBoxException(string message) : base(message) { } } public class SearchBoxException : Exception { public SearchBoxException(string message) : base(message) { } }
[JsonObject(MemberSerialization.OptIn)] [JsonObject(MemberSerialization.OptIn)]
@ -80,7 +114,7 @@ namespace LibSearchBox
ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages = new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>(); ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages = new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();
Tokenizer tokenizer = new Tokenizer(query); Tokenizer tokenizer = new Tokenizer(query);
foreach(Tuple<int, string> token in tokenizer.IterateTokens()) foreach((int, string) token in tokenizer.IterateTokens())
{ {
ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token.Item2); ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token.Item2);
@ -95,13 +129,17 @@ namespace LibSearchBox
} }
ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>(); ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) => { Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) =>
{
int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this? int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?
if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo)) { if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo))
{
if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}"); if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
} }
else { else
foreach (Tuple<int, string> token in tokenizer.IterateTokens()) { {
foreach ((int, string) token in tokenizer.IterateTokens())
{
if (metaInfo.SearchableTitle.Contains(token.Item2)) if (metaInfo.SearchableTitle.Contains(token.Item2))
rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched
@ -111,13 +149,7 @@ namespace LibSearchBox
} }
} }
List<SearchOffset> offsets = new List<SearchOffset>(); List<SearchOffset> offsets = getPageOffsets(pageDef.Key, tokenizer);
foreach (Tuple<int, string> token in tokenizer.IterateTokens()) {
ConcurrentDictionary<int, List<int>> tokenQuery = index.Query(token.Item2);
if (!tokenQuery.ContainsKey(pageDef.Key)) continue; // Don't bother if this page doesn't contain this token
offsets.AddRange(tokenQuery[pageDef.Key].Select((int offset) => new SearchOffset(token.Item2, offset)));
}
offsets.Sort((SearchOffset x, SearchOffset y) => x.Offset - y.Offset);
resultsRaw.Add(new SearchResult( resultsRaw.Add(new SearchResult(
idMap.GetPageName(pageDef.Key), idMap.GetPageName(pageDef.Key),
@ -131,6 +163,114 @@ namespace LibSearchBox
return results; return results;
} }
/// <summary>
/// Collects every position at which any of the tokenizer's tokens occurs in
/// the given page, according to the index.
/// </summary>
/// <param name="pageId">The id of the page to fetch offsets for.</param>
/// <param name="tokenizer">The tokenizer that yields the terms to look up.</param>
/// <returns>The matching offsets, sorted in ascending order of offset.</returns>
private List<SearchOffset> getPageOffsets(int pageId, Tokenizer tokenizer)
{
	List<SearchOffset> offsets = new List<SearchOffset>();
	foreach ((int, string) token in tokenizer.IterateTokens())
	{
		ConcurrentDictionary<int, List<int>> tokenQuery = index.Query(token.Item2);
		// A single TryGetValue avoids both the double lookup and the window
		// between ContainsKey and the indexer in which the key could vanish.
		if (!tokenQuery.TryGetValue(pageId, out List<int> pageOffsets))
			continue; // Don't bother if this page doesn't contain this token
		offsets.AddRange(pageOffsets.Select((int offset) => new SearchOffset(token.Item2, offset)));
	}
	// CompareTo rather than subtraction, so the comparison can never overflow
	offsets.Sort((SearchOffset x, SearchOffset y) => x.Offset.CompareTo(y.Offset));
	return offsets;
}
/// <summary>
/// Generates a context string for a document: a series of snippets of the
/// source text surrounding each place a query term occurs, joined together
/// with <see cref="ContextSettings.Separator" />.
/// </summary>
/// <remarks>
/// The offsets stored in the index are assumed to be character offsets into
/// <paramref name="source" /> - i.e. the source passed here should be the
/// same text that was indexed. Offsets that fall outside the source are ignored.
/// </remarks>
/// <param name="pageName">The name of the page, as known to the id map.</param>
/// <param name="source">The text of the document to extract snippets from.</param>
/// <param name="query">The query string whose terms should be located.</param>
/// <param name="settings">The settings that control snippet size, total length, separator and HTML output.</param>
/// <returns>The generated context, or an empty string if none of the query terms occur in the page.</returns>
/// <exception cref="ArgumentNullException">If source or settings is null.</exception>
public string GenerateContext(string pageName, string source, string query, ContextSettings settings)
{
	if (source == null) throw new ArgumentNullException(nameof(source));
	if (settings == null) throw new ArgumentNullException(nameof(settings));

	int pageId = idMap.GetId(pageName);
	Tokenizer tokenizer = new Tokenizer(query);
	List<SearchOffset> offsets = getPageOffsets(pageId, tokenizer);

	int currentLength = 0;
	List<(int, int)> tokenLocations = new List<(int, int)>(); // offset, length
	List<(int, int)> snippets = new List<(int, int)>(); // from (inclusive), to (exclusive)

	foreach (SearchOffset offset in offsets)
	{
		// Don't go over the maximum length
		// NOTE: Merging overlapping snippets below *might* exceed MaxLength a bit
		if (currentLength > settings.MaxLength)
			break;

		// An offset outside the source can't point at anything we can show
		if (offset.Offset < 0 || offset.Offset >= source.Length)
			continue;

		// Generate the next snippet
		(int, int) nextSnippet = (
			Math.Max(0, offset.Offset - settings.ContextCharacters),
			Math.Min(source.Length, offset.Offset + offset.Term.Length + settings.ContextCharacters)
		);
		// Clamp the token length so that it never runs off the end of the source
		tokenLocations.Add((
			offset.Offset,
			Math.Min(source.Length, offset.Offset + offset.Term.Length) - offset.Offset
		));

		// If the next snippet overlaps with the previous one, then combine the 2
		if (snippets.Count > 0 && snippets[snippets.Count - 1].Item2 > nextSnippet.Item1)
		{
			(int, int) lastSnippet = snippets[snippets.Count - 1];
			// Only ever *extend* the previous snippet: a shorter term that
			// starts later can end earlier than the snippet we already have.
			int mergedEnd = Math.Max(lastSnippet.Item2, nextSnippet.Item2);
			currentLength += mergedEnd - lastSnippet.Item2;
			snippets[snippets.Count - 1] = (lastSnippet.Item1, mergedEnd);
			continue;
		}

		// No overlap! Add it to the list
		snippets.Add(nextSnippet);
		currentLength += nextSnippet.Item2 - nextSnippet.Item1;
	}

	// No matches means no context (and no separators either)
	if (snippets.Count == 0)
		return string.Empty;

	List<string> snippetsText = new List<string>();
	foreach ((int, int) snippet in snippets)
	{
		string raw = source.Substring(snippet.Item1, snippet.Item2 - snippet.Item1);
		if (string.IsNullOrWhiteSpace(raw))
			continue; // Whitespace-only snippets aren't worth showing
		snippetsText.Add(settings.Html ? highlightTokens(source, snippet, tokenLocations) : raw);
	}

	// Add the separator at the beginning and end if we aren't at the bounds of the source document
	if (snippets[0].Item1 > 0)
		snippetsText.Insert(0, "");
	if (snippets[snippets.Count - 1].Item2 < source.Length)
		snippetsText.Add("");

	return string.Join(
		settings.Separator,
		snippetsText
	);
}

/// <summary>
/// Renders a single snippet as HTML, wrapping every token that lies inside
/// it in a <c>span</c> with the class <c>token</c>. All text is HTML-encoded.
/// </summary>
/// <param name="source">The full source text.</param>
/// <param name="snippet">The (from, to) bounds of the snippet in the source; to is exclusive.</param>
/// <param name="tokenLocations">The (offset, length) of every located token, in ascending order of offset.</param>
/// <returns>The HTML for the snippet.</returns>
private static string highlightTokens(string source, (int, int) snippet, List<(int, int)> tokenLocations)
{
	System.Text.StringBuilder html = new System.Text.StringBuilder();
	// Absolute position in the source up to which we've emitted text so far
	int cursor = snippet.Item1;
	foreach ((int, int) tokenDef in tokenLocations)
	{
		int tokenStart = tokenDef.Item1;
		int tokenEnd = Math.Min(snippet.Item2, tokenStart + tokenDef.Item2);
		// Skip tokens that belong to another snippet, that overlap text we've
		// already emitted (e.g. a repeated query term), or that are empty
		if (tokenStart < cursor || tokenStart >= snippet.Item2 || tokenEnd <= tokenStart)
			continue;

		// The bit before the token
		html.Append(WebUtility.HtmlEncode(source.Substring(cursor, tokenStart - cursor)));
		// The token itself
		html.Append("<span class='token'>");
		html.Append(WebUtility.HtmlEncode(source.Substring(tokenStart, tokenEnd - tokenStart)));
		html.Append("</span>");
		// Update our marker as to where we've got up to
		cursor = tokenEnd;
	}
	// The bit after the last token
	html.Append(WebUtility.HtmlEncode(source.Substring(cursor, snippet.Item2 - cursor)));
	return html.ToString();
}
#endregion #endregion
} }
} }

View file

@ -36,6 +36,10 @@
<Reference Include="Newtonsoft.Json"> <Reference Include="Newtonsoft.Json">
<HintPath>..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll</HintPath> <HintPath>..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll</HintPath>
</Reference> </Reference>
<Reference Include="System.ValueTuple">
<HintPath>..\packages\System.ValueTuple.4.5.0\lib\net47\System.ValueTuple.dll</HintPath>
</Reference>
<Reference Include="mscorlib" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<Compile Include="Properties\AssemblyInfo.cs" /> <Compile Include="Properties\AssemblyInfo.cs" />
@ -50,7 +54,6 @@
<Compile Include="Utilities\BiDictionary.cs" /> <Compile Include="Utilities\BiDictionary.cs" />
<Compile Include="DocumentMeta.cs" /> <Compile Include="DocumentMeta.cs" />
<Compile Include="SearchResult.cs" /> <Compile Include="SearchResult.cs" />
<Compile Include="QuerySettings.cs" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<Folder Include="EmbeddedFiles\" /> <Folder Include="EmbeddedFiles\" />

View file

@ -16,7 +16,7 @@ namespace LibSearchBox
HidePunctuation = 4, HidePunctuation = 4,
DecodeHtmlEntities = 8 DecodeHtmlEntities = 8
} }
public class Tokenizer : IEnumerable<Tuple<int, string>> public class Tokenizer : IEnumerable<(int, string)>
{ {
private static Regex splitter = new Regex( private static Regex splitter = new Regex(
@"((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|", @"((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|",
@ -40,7 +40,7 @@ namespace LibSearchBox
source = inSource; source = inSource;
} }
public IEnumerable<Tuple<int, string>> IterateTokens() public IEnumerable<(int, string)> IterateTokens()
{ {
int index = 0; int index = 0;
string[] parts = splitter.Split(source); string[] parts = splitter.Split(source);
@ -50,12 +50,11 @@ namespace LibSearchBox
if(Verbose) Console.WriteLine("#{0} @ {1}: '{2}'", i, index, parts[i]); if(Verbose) Console.WriteLine("#{0} @ {1}: '{2}'", i, index, parts[i]);
// FUTURE: We should swap this out for System.ValueTuple, as it's easier on the garbage collector. yield return (index, parts[i]);
yield return new Tuple<int, string>(index, parts[i]);
} }
} }
public IEnumerator<Tuple<int, string>> GetEnumerator() { public IEnumerator<(int, string)> GetEnumerator() {
return IterateTokens().GetEnumerator(); return IterateTokens().GetEnumerator();
} }
IEnumerator IEnumerable.GetEnumerator() IEnumerator IEnumerable.GetEnumerator()

View file

@ -1,4 +1,5 @@
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<packages> <packages>
<package id="System.ValueTuple" version="4.5.0" targetFramework="net47" />
<package id="UnidecodeSharpFork" version="1.0.0" targetFramework="net47" /> <package id="UnidecodeSharpFork" version="1.0.0" targetFramework="net47" />
</packages> </packages>