Bugfix textual context generation
This commit is contained in:
parent
b41f7f524a
commit
2aba3a9d86
7 changed files with 239 additions and 57 deletions
|
@ -2,13 +2,13 @@
|
||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
using System.IO;
|
using System.IO;
|
||||||
using System.Linq;
|
using System.Linq;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
|
||||||
using Newtonsoft.Json;
|
using Newtonsoft.Json;
|
||||||
|
|
||||||
using LibSearchBox;
|
|
||||||
using System.Text.RegularExpressions;
|
|
||||||
using Newtonsoft.Json.Serialization;
|
|
||||||
using System.Threading.Tasks;
|
|
||||||
using SBRL.Utilities;
|
using SBRL.Utilities;
|
||||||
|
using LibSearchBox;
|
||||||
|
|
||||||
namespace SearchBoxCLI
|
namespace SearchBoxCLI
|
||||||
{
|
{
|
||||||
|
@ -18,28 +18,34 @@ namespace SearchBoxCLI
|
||||||
Index,
|
Index,
|
||||||
Add,
|
Add,
|
||||||
Remove,
|
Remove,
|
||||||
Update
|
Update,
|
||||||
|
GenerateContext
|
||||||
}
|
}
|
||||||
|
|
||||||
enum OutputModes
|
enum OutputModes
|
||||||
{
|
{
|
||||||
Json,
|
Json,
|
||||||
Text
|
Text,
|
||||||
|
Html
|
||||||
}
|
}
|
||||||
|
|
||||||
class MainClass {
|
class MainClass {
|
||||||
private static List<string> Extras = new List<string>();
|
private static List<string> Extras = new List<string>();
|
||||||
|
|
||||||
private static OperatingModes Mode = OperatingModes.Query;
|
private static OperatingModes Mode = OperatingModes.Query;
|
||||||
|
private static OutputModes OutputMode = OutputModes.Text;
|
||||||
private static bool Batch = false;
|
private static bool Batch = false;
|
||||||
|
|
||||||
private static string Name = string.Empty;
|
private static string Name = string.Empty;
|
||||||
private static IEnumerable<string> Tags;
|
private static IEnumerable<string> Tags;
|
||||||
|
|
||||||
private static string SearchIndexFilepath = string.Empty;
|
private static string SearchIndexFilepath = string.Empty;
|
||||||
private static TextReader Source = Console.In;
|
private static TextReader Source = Console.In;
|
||||||
private static TextReader SourceOld = null, SourceNew = null;
|
private static TextReader SourceOld = null, SourceNew = null;
|
||||||
|
|
||||||
|
private static string Query = string.Empty;
|
||||||
private static int ResultsLimit = -1;
|
private static int ResultsLimit = -1;
|
||||||
private static int ResultsOffset = 0;
|
private static int ResultsOffset = 0;
|
||||||
private static OutputModes OutputMode = OutputModes.Text;
|
|
||||||
|
|
||||||
public static int Main(string[] args)
|
public static int Main(string[] args)
|
||||||
{
|
{
|
||||||
|
@ -92,6 +98,10 @@ namespace SearchBoxCLI
|
||||||
ResultsOffset = int.Parse(args[++i]);
|
ResultsOffset = int.Parse(args[++i]);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case "query":
|
||||||
|
Query = args[++i];
|
||||||
|
break;
|
||||||
|
|
||||||
case "format":
|
case "format":
|
||||||
OutputMode = (OutputModes)Enum.Parse(typeof(OutputModes), args[++i], true);
|
OutputMode = (OutputModes)Enum.Parse(typeof(OutputModes), args[++i], true);
|
||||||
break;
|
break;
|
||||||
|
@ -105,7 +115,7 @@ namespace SearchBoxCLI
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (Extras.Count < 1) return HandleHelp();
|
if (Extras.Count < 1) return HandleHelp();
|
||||||
string modeText = Extras.First(); Extras.RemoveAt(0);
|
string modeText = Extras.First().Replace("context", "generatecontext"); Extras.RemoveAt(0);
|
||||||
Mode = (OperatingModes)Enum.Parse(typeof(OperatingModes), modeText, true);
|
Mode = (OperatingModes)Enum.Parse(typeof(OperatingModes), modeText, true);
|
||||||
|
|
||||||
switch (Mode) {
|
switch (Mode) {
|
||||||
|
@ -113,10 +123,12 @@ namespace SearchBoxCLI
|
||||||
case OperatingModes.Add: return HandleAdd();
|
case OperatingModes.Add: return HandleAdd();
|
||||||
case OperatingModes.Remove: return HandleRemove();
|
case OperatingModes.Remove: return HandleRemove();
|
||||||
case OperatingModes.Query: return HandleQuery();
|
case OperatingModes.Query: return HandleQuery();
|
||||||
}
|
case OperatingModes.GenerateContext: return HandleContextGeneration();
|
||||||
|
default:
|
||||||
|
Console.Error.WriteLine($"Error: Don't know how to handle mode {Mode}.");
|
||||||
return 128;
|
return 128;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static int HandleHelp()
|
private static int HandleHelp()
|
||||||
{
|
{
|
||||||
|
@ -126,26 +138,27 @@ namespace SearchBoxCLI
|
||||||
Console.WriteLine();
|
Console.WriteLine();
|
||||||
Console.WriteLine("Usage:");
|
Console.WriteLine("Usage:");
|
||||||
Console.WriteLine(" ./SearchBox.exe {mode} [options]");
|
Console.WriteLine(" ./SearchBox.exe {mode} [options]");
|
||||||
Console.WriteLine(" ./SearchBox.exe query \"{query string}\" [options]");
|
|
||||||
Console.WriteLine();
|
Console.WriteLine();
|
||||||
Console.WriteLine("Modes:");
|
Console.WriteLine("Modes:");
|
||||||
Console.WriteLine(" query Query a pre-existing inverted search index");
|
Console.WriteLine(" query Query a pre-existing inverted search index");
|
||||||
|
Console.WriteLine(" context Generate a context string similar to a search result on the internet");
|
||||||
Console.WriteLine(" index Generate a raw index of the source document.");
|
Console.WriteLine(" index Generate a raw index of the source document.");
|
||||||
Console.WriteLine(" add Add a named document to a search index.");
|
Console.WriteLine(" add Add a named document to a search index.");
|
||||||
Console.WriteLine(" remove Remove a named document from a search index.");
|
Console.WriteLine(" remove Remove a named document from a search index.");
|
||||||
Console.WriteLine(" update Update a named document in a search index.");
|
Console.WriteLine(" update Update a named document in a search index.");
|
||||||
Console.WriteLine();
|
Console.WriteLine();
|
||||||
Console.WriteLine("Options:");
|
Console.WriteLine("Options:");
|
||||||
Console.WriteLine(" --format Sets the format of the output. Possible values: text (default), json {query,index}");
|
|
||||||
Console.WriteLine(" --source, -s Specifies the path to the source document {index, add}");
|
|
||||||
Console.WriteLine(" --old-source Specifies the path to the old version of the source document to update {update}");
|
|
||||||
Console.WriteLine(" --new-source Specifies the path to the new version of the source document to update {update}");
|
|
||||||
Console.WriteLine(" --name, -n Sets the name of the source document {add, remove}");
|
|
||||||
Console.WriteLine(" --index Specifies the location of the search index to use {add, remove, update}");
|
|
||||||
Console.WriteLine(" --tags Sets the tags to associate with the document. {add, update}");
|
|
||||||
Console.WriteLine(" --batch Enters a mode where the operations to process are specified via the source (by default stdin; change with --source as usual) - one per line in the format \"{filename}|{name}|{tags}\" {add}");
|
Console.WriteLine(" --batch Enters a mode where the operations to process are specified via the source (by default stdin; change with --source as usual) - one per line in the format \"{filename}|{name}|{tags}\" {add}");
|
||||||
|
Console.WriteLine(" --format Sets the format of the output. Possible values: text (default), json, html (context generation only) {query, index, context}");
|
||||||
|
Console.WriteLine(" --index Specifies the location of the search index to use {add, remove, update}");
|
||||||
|
Console.WriteLine(" --name, -n Sets the name of the source document {add, remove, title}");
|
||||||
|
Console.WriteLine(" --new-source Specifies the path to the new version of the source document to update {update}");
|
||||||
Console.WriteLine(" --limit Limits the number of results returned, -1 = no limit {query}");
|
Console.WriteLine(" --limit Limits the number of results returned, -1 = no limit {query}");
|
||||||
Console.WriteLine(" --offset Skips the specified number of results from the beginning of the results list {query}");
|
Console.WriteLine(" --offset Skips the specified number of results from the beginning of the results list {query}");
|
||||||
|
Console.WriteLine(" --old-source Specifies the path to the old version of the source document to update {update}");
|
||||||
|
Console.WriteLine(" --query Specifies the query string {query, context}");
|
||||||
|
Console.WriteLine(" --source, -s Specifies the path to the source document {index, add,context}");
|
||||||
|
Console.WriteLine(" --tags Sets the tags to associate with the document. {add, update}");
|
||||||
Console.WriteLine();
|
Console.WriteLine();
|
||||||
Console.WriteLine("Examples:");
|
Console.WriteLine("Examples:");
|
||||||
Console.WriteLine(" cat books/complex_knots.txt | ./SearchBox.exe add --name \"Complex Knots: How to do and undo them\"");
|
Console.WriteLine(" cat books/complex_knots.txt | ./SearchBox.exe add --name \"Complex Knots: How to do and undo them\"");
|
||||||
|
@ -155,13 +168,11 @@ namespace SearchBoxCLI
|
||||||
|
|
||||||
private static int HandleAdd()
|
private static int HandleAdd()
|
||||||
{
|
{
|
||||||
if (Name == string.Empty && !Batch)
|
if (Name == string.Empty && !Batch) {
|
||||||
{
|
|
||||||
Console.Error.WriteLine("Error: The document name must be specified when reading from stdin!");
|
Console.Error.WriteLine("Error: The document name must be specified when reading from stdin!");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
if (SearchIndexFilepath == string.Empty)
|
if (SearchIndexFilepath == string.Empty) {
|
||||||
{
|
|
||||||
Console.Error.WriteLine("Error: No search index file path specified.");
|
Console.Error.WriteLine("Error: No search index file path specified.");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
@ -209,7 +220,7 @@ namespace SearchBoxCLI
|
||||||
|
|
||||||
private static int HandleRemove()
|
private static int HandleRemove()
|
||||||
{
|
{
|
||||||
if (Name == string.Empty) {
|
if (string.IsNullOrEmpty(Name)) {
|
||||||
Console.Error.WriteLine("Error: The document name must be specified when removing a document!");
|
Console.Error.WriteLine("Error: The document name must be specified when removing a document!");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
@ -232,16 +243,20 @@ namespace SearchBoxCLI
|
||||||
|
|
||||||
private static int HandleQuery()
|
private static int HandleQuery()
|
||||||
{
|
{
|
||||||
if (Extras.Count < 1) {
|
if (string.IsNullOrEmpty(Query)) {
|
||||||
Console.Error.WriteLine("Error: No query specified!");
|
Console.Error.WriteLine("Error: No query specified!");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
if (SearchIndexFilepath == string.Empty) {
|
||||||
|
Console.Error.WriteLine("Error: No search index file path specified.");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
SearchBox searchBox = JsonConvert.DeserializeObject<SearchBox>(
|
SearchBox searchBox = JsonConvert.DeserializeObject<SearchBox>(
|
||||||
File.ReadAllText(SearchIndexFilepath)
|
File.ReadAllText(SearchIndexFilepath)
|
||||||
);
|
);
|
||||||
|
|
||||||
IEnumerable<SearchResult> resultsRaw = searchBox.Query(Extras[0], new QuerySettings()).Skip(ResultsOffset);
|
IEnumerable<SearchResult> resultsRaw = searchBox.Query(Query, new QuerySettings()).Skip(ResultsOffset);
|
||||||
List<SearchResult> results = new List<SearchResult>(
|
List<SearchResult> results = new List<SearchResult>(
|
||||||
ResultsLimit > 0 ? resultsRaw.Take(ResultsLimit) : resultsRaw
|
ResultsLimit > 0 ? resultsRaw.Take(ResultsLimit) : resultsRaw
|
||||||
);
|
);
|
||||||
|
@ -262,6 +277,43 @@ namespace SearchBoxCLI
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Handles the "context" operating mode: prints a search-result-style
/// context string for the document called <c>Name</c>, built from the text
/// read from <c>Source</c> and the terms in <c>Query</c>.
/// </summary>
/// <returns>0 on success, or 1 if an argument is missing or invalid.</returns>
private static int HandleContextGeneration()
{
	if (string.IsNullOrEmpty(Name)) {
		Console.Error.WriteLine("Error: No document name specified.");
		return 1;
	}
	if (string.IsNullOrEmpty(Query)) {
		Console.Error.WriteLine("Error: No query specified.");
		return 1;
	}
	if (string.IsNullOrEmpty(SearchIndexFilepath)) {
		Console.Error.WriteLine("Error: No search index file path specified.");
		return 1;
	}

	// Validate the output format *before* loading the index, so that an
	// unsupported format fails fast instead of after reading a large file.
	ContextSettings generationSettings = new ContextSettings();
	switch (OutputMode) {
		case OutputModes.Html:
			generationSettings.Html = true;
			break;
		case OutputModes.Text:
			generationSettings.Html = false;
			break;
		case OutputModes.Json:
			Console.Error.WriteLine("Error: JSON output for context generation is not supported.");
			return 1;
		default:
			// Guards against output modes added to the enum later on
			Console.Error.WriteLine($"Error: Don't know how to handle output mode {OutputMode}.");
			return 1;
	}

	if (!File.Exists(SearchIndexFilepath)) {
		Console.Error.WriteLine($"Error: The search index file '{SearchIndexFilepath}' doesn't exist.");
		return 1;
	}

	SearchBox searchBox = JsonConvert.DeserializeObject<SearchBox>(
		File.ReadAllText(SearchIndexFilepath)
	);
	// DeserializeObject returns null for empty / "null" input
	if (searchBox == null) {
		Console.Error.WriteLine($"Error: Failed to load a search index from '{SearchIndexFilepath}'.");
		return 1;
	}

	Console.WriteLine(searchBox.GenerateContext(Name, Source.ReadToEnd(), Query, generationSettings));

	return 0;
}
|
||||||
|
|
||||||
private static int HandleIndex()
|
private static int HandleIndex()
|
||||||
{
|
{
|
||||||
Index index = new Index(Source.ReadToEnd());
|
Index index = new Index(Source.ReadToEnd());
|
||||||
|
|
|
@ -25,7 +25,7 @@ namespace LibSearchBox
|
||||||
|
|
||||||
// Tokenize the input and file it in our index
|
// Tokenize the input and file it in our index
|
||||||
Tokenizer tokenizer = new Tokenizer(inSource);
|
Tokenizer tokenizer = new Tokenizer(inSource);
|
||||||
foreach (Tuple<int, string> token in tokenizer) {
|
foreach ((int, string) token in tokenizer) {
|
||||||
if (stopwordTester.IsStopword(token.Item2)) continue;
|
if (stopwordTester.IsStopword(token.Item2)) continue;
|
||||||
insert(token.Item2, token.Item1);
|
insert(token.Item2, token.Item1);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,13 +0,0 @@
|
||||||
using System;
|
|
||||||
namespace LibSearchBox
|
|
||||||
{
|
|
||||||
public class QuerySettings
|
|
||||||
{
|
|
||||||
public int WeightTitleMatch = 100;
|
|
||||||
public int WeightTagMatch = 10;
|
|
||||||
|
|
||||||
public QuerySettings()
|
|
||||||
{
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -2,11 +2,45 @@
|
||||||
using System.Collections.Concurrent;
|
using System.Collections.Concurrent;
|
||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
using System.Linq;
|
using System.Linq;
|
||||||
|
using System.Net;
|
||||||
using System.Threading.Tasks;
|
using System.Threading.Tasks;
|
||||||
using Newtonsoft.Json;
|
using Newtonsoft.Json;
|
||||||
|
|
||||||
namespace LibSearchBox
|
namespace LibSearchBox
|
||||||
{
|
{
|
||||||
|
/// <summary>
/// Tunable weights that are applied when ranking the results of a query.
/// </summary>
public class QuerySettings
{
	/// <summary>
	/// The amount added to a result's rank when a query term matches its title.
	/// </summary>
	public int WeightTitleMatch = 100;

	/// <summary>
	/// The weighting associated with a query term matching a tag.
	/// </summary>
	public int WeightTagMatch = 10;

	/// <summary>
	/// Creates a new settings object populated with the default weights.
	/// </summary>
	public QuerySettings() { }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Options that control how a context string (the excerpt shown alongside a
/// search result) is generated.
/// </summary>
public class ContextSettings
{
	/// <summary>
	/// How many characters of surrounding text to include on each side of
	/// every matching term.
	/// </summary>
	public int ContextCharacters = 75;

	/// <summary>
	/// Upper bound on the length of the generated context string.
	/// </summary>
	public int MaxLength = 250;

	/// <summary>
	/// The string placed between adjacent snippets in the generated context.
	/// </summary>
	public string Separator = " … ";

	/// <summary>
	/// When true, the generated context is output as HTML rather than plain text.
	/// </summary>
	public bool Html = false;
}
|
||||||
|
|
||||||
public class SearchBoxException : Exception { public SearchBoxException(string message) : base(message) { } }
|
public class SearchBoxException : Exception { public SearchBoxException(string message) : base(message) { } }
|
||||||
|
|
||||||
[JsonObject(MemberSerialization.OptIn)]
|
[JsonObject(MemberSerialization.OptIn)]
|
||||||
|
@ -80,7 +114,7 @@ namespace LibSearchBox
|
||||||
ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages = new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();
|
ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages = new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();
|
||||||
|
|
||||||
Tokenizer tokenizer = new Tokenizer(query);
|
Tokenizer tokenizer = new Tokenizer(query);
|
||||||
foreach(Tuple<int, string> token in tokenizer.IterateTokens())
|
foreach((int, string) token in tokenizer.IterateTokens())
|
||||||
{
|
{
|
||||||
ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token.Item2);
|
ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token.Item2);
|
||||||
|
|
||||||
|
@ -95,13 +129,17 @@ namespace LibSearchBox
|
||||||
}
|
}
|
||||||
|
|
||||||
ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
|
ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
|
||||||
Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) => {
|
Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) =>
|
||||||
|
{
|
||||||
int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?
|
int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?
|
||||||
if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo)) {
|
if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo))
|
||||||
|
{
|
||||||
if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
|
if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
|
||||||
}
|
}
|
||||||
else {
|
else
|
||||||
foreach (Tuple<int, string> token in tokenizer.IterateTokens()) {
|
{
|
||||||
|
foreach ((int, string) token in tokenizer.IterateTokens())
|
||||||
|
{
|
||||||
if (metaInfo.SearchableTitle.Contains(token.Item2))
|
if (metaInfo.SearchableTitle.Contains(token.Item2))
|
||||||
rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched
|
rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched
|
||||||
|
|
||||||
|
@ -111,13 +149,7 @@ namespace LibSearchBox
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
List<SearchOffset> offsets = new List<SearchOffset>();
|
List<SearchOffset> offsets = getPageOffsets(pageDef.Key, tokenizer);
|
||||||
foreach (Tuple<int, string> token in tokenizer.IterateTokens()) {
|
|
||||||
ConcurrentDictionary<int, List<int>> tokenQuery = index.Query(token.Item2);
|
|
||||||
if (!tokenQuery.ContainsKey(pageDef.Key)) continue; // Don't bother if this page doesn't contain this token
|
|
||||||
offsets.AddRange(tokenQuery[pageDef.Key].Select((int offset) => new SearchOffset(token.Item2, offset)));
|
|
||||||
}
|
|
||||||
offsets.Sort((SearchOffset x, SearchOffset y) => x.Offset - y.Offset);
|
|
||||||
|
|
||||||
resultsRaw.Add(new SearchResult(
|
resultsRaw.Add(new SearchResult(
|
||||||
idMap.GetPageName(pageDef.Key),
|
idMap.GetPageName(pageDef.Key),
|
||||||
|
@ -131,6 +163,114 @@ namespace LibSearchBox
|
||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Collects the offsets at which the tokens produced by the given tokenizer
/// occur in the specified page, according to the index.
/// </summary>
/// <param name="pageId">The id of the page to look up offsets for.</param>
/// <param name="tokenizer">The tokenizer that yields the terms to look up.</param>
/// <returns>The matching offsets, sorted in ascending order of offset.</returns>
private List<SearchOffset> getPageOffsets(int pageId, Tokenizer tokenizer)
{
	List<SearchOffset> offsets = new List<SearchOffset>();
	foreach ((int, string) token in tokenizer.IterateTokens())
	{
		ConcurrentDictionary<int, List<int>> tokenQuery = index.Query(token.Item2);
		// A single TryGetValue avoids the double lookup of ContainsKey + indexer,
		// and can't throw if the entry disappears between the check and the read.
		if (!tokenQuery.TryGetValue(pageId, out List<int> pageOffsets))
			continue; // Don't bother if this page doesn't contain this token
		offsets.AddRange(pageOffsets.Select((int offset) => new SearchOffset(token.Item2, offset)));
	}
	// CompareTo rather than subtraction: it can't overflow
	offsets.Sort((SearchOffset x, SearchOffset y) => x.Offset.CompareTo(y.Offset));
	return offsets;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Generates a context string for a page, similar to the excerpt shown
/// beneath a search result: snippets of the source text around each
/// occurrence of the query's terms, joined by the configured separator.
/// </summary>
/// <param name="pageName">The name of the page to generate the context for.</param>
/// <param name="source">The text of the page to cut the snippets from.</param>
/// <param name="query">The query string whose terms should be located.</param>
/// <param name="settings">The settings that control the generation.</param>
/// <returns>
/// The generated context, or an empty string if none of the query's terms
/// could be located in the source text.
/// </returns>
/// <exception cref="ArgumentNullException">If source or settings is null.</exception>
public string GenerateContext(string pageName, string source, string query, ContextSettings settings)
{
	if (source == null) throw new ArgumentNullException(nameof(source));
	if (settings == null) throw new ArgumentNullException(nameof(settings));

	int pageId = idMap.GetId(pageName);

	Tokenizer tokenizer = new Tokenizer(query);
	List<SearchOffset> offsets = getPageOffsets(pageId, tokenizer);

	int currentLength = 0;
	List<(int, int)> tokenLocations = new List<(int, int)>(); // offset, length
	List<(int, int)> snippets = new List<(int, int)>(); // from, to
	for (int i = 0; i < offsets.Count; i++)
	{
		// Don't go over the maximum length
		// FUTURE: Would it be faster to keep track of this as we go? It's probably not worth it though, as we're not going to generate *that* many at once - we'll have to see.
		if (currentLength > settings.MaxLength)
			break;

		int tokenOffset = offsets[i].Offset;
		// Ignore offsets that don't fall inside the source text (e.g. if the
		// index is out of date) - they'd otherwise yield negative-length snippets
		if (tokenOffset < 0 || tokenOffset >= source.Length)
			continue;
		int tokenLength = Math.Min(offsets[i].Term.Length, source.Length - tokenOffset);

		// Generate the next snippet
		(int, int) nextSnippet = (
			Math.Max(0, tokenOffset - settings.ContextCharacters),
			Math.Min(source.Length, tokenOffset + tokenLength + settings.ContextCharacters)
		);

		tokenLocations.Add((tokenOffset, tokenLength));

		// If the next snippet overlaps with the previous one, then combine the 2
		if (snippets.Count > 0 && snippets[snippets.Count - 1].Item2 > nextSnippet.Item1) {
			// NOTE: This *might* exceed the MaxLength a bit
			(int, int) lastSnippet = snippets[snippets.Count - 1];
			// Only ever extend the previous snippet - a shorter term that starts
			// later can finish *before* the previous snippet's end
			int mergedEnd = Math.Max(lastSnippet.Item2, nextSnippet.Item2);
			currentLength += mergedEnd - lastSnippet.Item2;
			snippets[snippets.Count - 1] = (lastSnippet.Item1, mergedEnd);
			continue;
		}

		// No overlap! Add it to the list
		snippets.Add(nextSnippet);
		currentLength += nextSnippet.Item2 - nextSnippet.Item1;
	}

	// Nothing matched, so there's nothing to show
	if (snippets.Count == 0)
		return string.Empty;

	List<string> snippetsText = new List<string>();
	foreach ((int, int) snippet in snippets)
	{
		string text = source.Substring(snippet.Item1, snippet.Item2 - snippet.Item1);
		if (string.IsNullOrWhiteSpace(text))
			continue;

		snippetsText.Add(
			settings.Html
				? highlightSnippet(source, snippet.Item1, snippet.Item2, tokenLocations)
				: text
		);
	}
	if (snippetsText.Count == 0)
		return string.Empty;

	// Add the separator at the beginning and end if we aren't at the bounds of the source document
	// (an empty element makes string.Join emit the separator there)
	if (snippets[0].Item1 > 0)
		snippetsText.Insert(0, "");
	if (snippets[snippets.Count - 1].Item2 < source.Length)
		snippetsText.Add("");

	return string.Join(settings.Separator, snippetsText);
}

/// <summary>
/// HTML-encodes the region [from, to) of the source text, wrapping every
/// token that lies inside it in a &lt;span class='token'&gt; element.
/// </summary>
/// <param name="source">The full source text.</param>
/// <param name="from">The (inclusive) index in source at which the region starts.</param>
/// <param name="to">The (exclusive) index in source at which the region ends.</param>
/// <param name="tokenLocations">The (offset, length) of each token, relative to the start of source, in ascending order of offset.</param>
/// <returns>The HTML for the specified region.</returns>
private static string highlightSnippet(string source, int from, int to, List<(int, int)> tokenLocations)
{
	List<string> parts = new List<string>();
	int cursor = from; // How far through the source text we've got - always an absolute index
	foreach ((int, int) tokenDef in tokenLocations)
	{
		int tokenStart = tokenDef.Item1;
		int tokenEnd = Math.Min(to, tokenStart + tokenDef.Item2);
		// Skip tokens that belong to another snippet, or that overlap with one we've already highlighted
		if (tokenStart < cursor || tokenEnd <= tokenStart)
			continue;

		// The bit before the token
		parts.Add(WebUtility.HtmlEncode(source.Substring(cursor, tokenStart - cursor)));
		// The token itself
		parts.Add($"<span class='token'>{WebUtility.HtmlEncode(source.Substring(tokenStart, tokenEnd - tokenStart))}</span>");

		// Update our marker as to where we've got up to
		cursor = tokenEnd;
	}
	// Whatever is left over after the last token
	parts.Add(WebUtility.HtmlEncode(source.Substring(cursor, to - cursor)));

	return string.Join("", parts);
}
|
||||||
|
|
||||||
|
|
||||||
#endregion
|
#endregion
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -36,6 +36,10 @@
|
||||||
<Reference Include="Newtonsoft.Json">
|
<Reference Include="Newtonsoft.Json">
|
||||||
<HintPath>..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll</HintPath>
|
<HintPath>..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll</HintPath>
|
||||||
</Reference>
|
</Reference>
|
||||||
|
<Reference Include="System.ValueTuple">
|
||||||
|
<HintPath>..\packages\System.ValueTuple.4.5.0\lib\net47\System.ValueTuple.dll</HintPath>
|
||||||
|
</Reference>
|
||||||
|
<Reference Include="mscorlib" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<Compile Include="Properties\AssemblyInfo.cs" />
|
<Compile Include="Properties\AssemblyInfo.cs" />
|
||||||
|
@ -50,7 +54,6 @@
|
||||||
<Compile Include="Utilities\BiDictionary.cs" />
|
<Compile Include="Utilities\BiDictionary.cs" />
|
||||||
<Compile Include="DocumentMeta.cs" />
|
<Compile Include="DocumentMeta.cs" />
|
||||||
<Compile Include="SearchResult.cs" />
|
<Compile Include="SearchResult.cs" />
|
||||||
<Compile Include="QuerySettings.cs" />
|
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<Folder Include="EmbeddedFiles\" />
|
<Folder Include="EmbeddedFiles\" />
|
||||||
|
|
|
@ -16,7 +16,7 @@ namespace LibSearchBox
|
||||||
HidePunctuation = 4,
|
HidePunctuation = 4,
|
||||||
DecodeHtmlEntities = 8
|
DecodeHtmlEntities = 8
|
||||||
}
|
}
|
||||||
public class Tokenizer : IEnumerable<Tuple<int, string>>
|
public class Tokenizer : IEnumerable<(int, string)>
|
||||||
{
|
{
|
||||||
private static Regex splitter = new Regex(
|
private static Regex splitter = new Regex(
|
||||||
@"((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|",
|
@"((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|",
|
||||||
|
@ -40,7 +40,7 @@ namespace LibSearchBox
|
||||||
source = inSource;
|
source = inSource;
|
||||||
}
|
}
|
||||||
|
|
||||||
public IEnumerable<Tuple<int, string>> IterateTokens()
|
public IEnumerable<(int, string)> IterateTokens()
|
||||||
{
|
{
|
||||||
int index = 0;
|
int index = 0;
|
||||||
string[] parts = splitter.Split(source);
|
string[] parts = splitter.Split(source);
|
||||||
|
@ -50,12 +50,11 @@ namespace LibSearchBox
|
||||||
|
|
||||||
if(Verbose) Console.WriteLine("#{0} @ {1}: '{2}'", i, index, parts[i]);
|
if(Verbose) Console.WriteLine("#{0} @ {1}: '{2}'", i, index, parts[i]);
|
||||||
|
|
||||||
// FUTURE: We should swap this out for System.ValueTuple, as it's easier on the garbage collector.
|
yield return (index, parts[i]);
|
||||||
yield return new Tuple<int, string>(index, parts[i]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public IEnumerator<Tuple<int, string>> GetEnumerator() {
|
public IEnumerator<(int, string)> GetEnumerator() {
|
||||||
return IterateTokens().GetEnumerator();
|
return IterateTokens().GetEnumerator();
|
||||||
}
|
}
|
||||||
IEnumerator IEnumerable.GetEnumerator()
|
IEnumerator IEnumerable.GetEnumerator()
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
<?xml version="1.0" encoding="utf-8"?>
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
<packages>
|
<packages>
|
||||||
|
<package id="System.ValueTuple" version="4.5.0" targetFramework="net47" />
|
||||||
<package id="UnidecodeSharpFork" version="1.0.0" targetFramework="net47" />
|
<package id="UnidecodeSharpFork" version="1.0.0" targetFramework="net47" />
|
||||||
</packages>
|
</packages>
|
Loading…
Reference in a new issue