// SearchBox/SearchBox/SearchBox.cs

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using Newtonsoft.Json;
using UnidecodeSharpFork;

namespace LibSearchBox
{
	public class QuerySettings
	{
		/// <summary>
		/// The weight added to a page's rank for each query token found in its title.
		/// </summary>
		public int WeightTitleMatch = 100;
		/// <summary>
		/// The weight added to a page's rank for each query token found in one of its tags.
		/// </summary>
		public int WeightTagMatch = 10;
	}
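
	// A hypothetical tuning sketch (values are illustrative, not defaults):
	// each weight is added to a page's rank once per matching query token, so
	// raising WeightTitleMatch makes title hits dominate tag hits even further.
	//
	//     QuerySettings settings = new QuerySettings {
	//         WeightTitleMatch = 200,
	//         WeightTagMatch = 5
	//     };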
	public class ContextSettings
	{
		/// <summary>
		/// The number of characters that should be displayed either side of a
		/// matching term.
		/// </summary>
		public int ContextCharacters = 75;
		/// <summary>
		/// The maximum length of the generated context string.
		/// </summary>
		public int MaxLength = 250;
		/// <summary>
		/// The separator to use between snippets in the generated context.
		/// </summary>
		public string Separator = " … ";
		/// <summary>
		/// Whether to output the generated context in HTML.
		/// </summary>
		public bool Html = false;
	}
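
	// A minimal context-tuning sketch (values are illustrative): shorter
	// snippets, a tighter overall cap, and HTML <strong> highlighting of
	// matched terms.
	//
	//     ContextSettings ctxSettings = new ContextSettings {
	//         ContextCharacters = 40,
	//         MaxLength = 200,
	//         Html = true
	//     };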
	public class SearchBoxException : Exception
	{
		public SearchBoxException(string message) : base(message) { }
	}

	[JsonObject(MemberSerialization.OptIn)]
	public class SearchBox
	{
		private IdMap idMap = new IdMap();

		[JsonProperty("ids")]
		public Dictionary<int, string> IdMap {
			get {
				Dictionary<int, string> result = idMap.MapOut;
				if (result.Count == 0) return null;
				return result;
			}
			set {
				if (value == null) return;
				idMap.Import(value);
			}
		}

		[JsonProperty]
		public ConcurrentDictionary<int, DocumentMeta> metaTable = new ConcurrentDictionary<int, DocumentMeta>();
		[JsonProperty]
		public InvertedIndex index = new InvertedIndex();
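
		// Persistence sketch: since only the members marked [JsonProperty] above
		// are serialised (MemberSerialization.OptIn), a SearchBox should survive
		// a plain Json.NET round-trip. `box` is assumed to be an existing instance.
		//
		//     string json = JsonConvert.SerializeObject(box);
		//     SearchBox restored = JsonConvert.DeserializeObject<SearchBox>(json);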
		public bool Verbose { get; set; } = false;

		#region Index Management

		/// <summary>
		/// Adds a document to the search index.
		/// </summary>
		public void AddDocument(string title, IEnumerable<string> tags, string content)
		{
			DocumentMeta info = new DocumentMeta(title, tags);
			int id = idMap.GetId(info.Title);
			metaTable.AddOrUpdate(id, info, (key, oldValue) => info);

			Index upsideIndex = new Index(content);
			index.AddIndex(id, upsideIndex);
		}
		/// <summary>
		/// Updates a document's tags and content. The old content is required in
		/// order to locate and replace the existing entries in the inverted index.
		/// </summary>
		public void UpdateDocument(string title, IEnumerable<string> newTags, string oldContent, string newContent)
		{
			int id = idMap.GetId(title);
			DocumentMeta info = metaTable[id];
			info.ReplaceTags(newTags);

			Index oldIndex = new Index(oldContent), newIndex = new Index(newContent);
			if (!index.ReplaceIndex(id, oldIndex, newIndex))
				throw new SearchBoxException($"Error: Failed to replace index for document with title {title}.");
		}
		/// <summary>
		/// Removes the document with the given title from the search index.
		/// </summary>
		public void RemoveDocument(string title)
		{
			int id = idMap.DeletePageName(title);
			metaTable.TryRemove(id, out DocumentMeta noop);
			if (!index.RemoveById(id))
				throw new SearchBoxException($"Failed to remove page with title '{title}' from inverted index.");
		}
		#endregion
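
		// Usage sketch for the index-management methods above (titles, tags and
		// content are made up for illustration):
		//
		//     SearchBox box = new SearchBox();
		//     box.AddDocument("Cats", new[] { "animals" }, "Cats are furry.");
		//     box.UpdateDocument("Cats", new[] { "animals", "pets" },
		//         "Cats are furry.", "Cats are furry and aloof.");
		//     box.RemoveDocument("Cats");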
		#region Query

		public List<SearchResult> Query(string query, QuerySettings settings)
		{
			// pageId => token -> count
			ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages = new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();
			Tokenizer tokenizer = new Tokenizer(query);
			foreach ((int, string) token in tokenizer.IterateTokens())
			{
				ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token.Item2);
				if (tokenResults == null)
					continue; // index.Query may return null for unknown tokens (see getPageOffsets below)
				Parallel.ForEach(tokenResults, (KeyValuePair<int, List<int>> pageTokenDef) => {
					// pageTokenDef: pageId => list of token offsets
					ConcurrentDictionary<string, int> pageData = matchingPages.GetOrAdd(pageTokenDef.Key, new ConcurrentDictionary<string, int>());
					if (!pageData.TryAdd(token.Item2, pageTokenDef.Value.Count))
						throw new SearchBoxException("Error: Failed to add token count to page data in search " +
							"results - the key already exists (are there duplicate tokens for this page id " +
							"in the inverted index?)");
				});
			}

			ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
			Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) =>
			{
				int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?

				if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo))
				{
					if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
				}
				else
				{
					// Boost the rank for tokens that match the title or tags
					foreach ((int, string) token in tokenizer.IterateTokens())
					{
						if (metaInfo.SearchableTitle.Contains(token.Item2))
							rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched
						foreach (string nextTag in metaInfo.SearchableTags)
							if (nextTag.Contains(token.Item2))
								rank += settings.WeightTagMatch;
					}
				}

				List<SearchOffset> offsets = getPageOffsets(pageDef.Key, tokenizer);
				resultsRaw.Add(new SearchResult(
					idMap.GetPageName(pageDef.Key),
					rank,
					offsets
				));
			});

			List<SearchResult> results = new List<SearchResult>(resultsRaw.AsEnumerable());
			// Sort by rank, descending. CompareTo is used instead of rounding the
			// difference to an int, which collapsed close-but-unequal ranks to "equal".
			results.Sort((SearchResult a, SearchResult b) => b.Rank.CompareTo(a.Rank));
			return results;
		}
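
		// Query sketch (illustrative): results come back sorted by descending
		// rank. Only Rank is referenced in this file; the page-name property on
		// SearchResult is an assumption here.
		//
		//     List<SearchResult> results = box.Query("furry cats", new QuerySettings());
		//     foreach (SearchResult result in results)
		//         Console.WriteLine($"{result.Rank}: {result.PageName}");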
		/// <summary>
		/// Fetches the sorted offsets of every query token in the given page.
		/// </summary>
		private List<SearchOffset> getPageOffsets(int pageId, Tokenizer tokenizer)
		{
			List<SearchOffset> offsets = new List<SearchOffset>();
			foreach ((int, string) token in tokenizer.IterateTokens())
			{
				ConcurrentDictionary<int, List<int>> tokenQuery = index.Query(token.Item2);
				if (tokenQuery == null || !tokenQuery.TryGetValue(pageId, out List<int> tokenOffsets))
					continue; // Don't bother if this page doesn't contain this token
				offsets.AddRange(tokenOffsets.Select((int offset) => new SearchOffset(token.Item2, offset)));
			}
			offsets.Sort((SearchOffset x, SearchOffset y) => x.Offset - y.Offset);
			return offsets;
		}
		/// <summary>
		/// Generates a context string of short snippets around each query match
		/// in the given source text.
		/// </summary>
		public string GenerateContext(string pageName, string source, string query, ContextSettings settings)
		{
			int pageId = idMap.GetId(pageName);
			Tokenizer tokenizer = new Tokenizer(query);
			List<SearchOffset> offsets = getPageOffsets(pageId, tokenizer);

			int currentLength = 0;
			List<(int, int)> tokenLocations = new List<(int, int)>(); // offset, length
			List<(int, int)> snippets = new List<(int, int)>(); // from, to
			for (int i = 0; i < offsets.Count; i++)
			{
				// Don't go over the maximum length
				// FUTURE: Would it be faster to keep track of this as we go? It's probably not worth it though, as we're not going to be generating *that* many at once - we'll have to see.
				if (currentLength > settings.MaxLength)
					break;

				// Generate the next snippet
				(int, int) nextSnippet = (
					Math.Max(0, offsets[i].Offset - settings.ContextCharacters),
					Math.Min(source.Length, offsets[i].Offset + offsets[i].Term.Length + settings.ContextCharacters)
				);
				tokenLocations.Add((offsets[i].Offset, offsets[i].Term.Length));

				// If the next snippet overlaps with the previous one, combine the two
				if (snippets.Count > 0 && snippets.Last().Item2 > nextSnippet.Item1) {
					// BUG: This *might* exceed the MaxLength a bit
					// Pop the last snippet from the list
					(int, int) lastSnippet = snippets[snippets.Count - 1];
					snippets.RemoveAt(snippets.Count - 1);
					currentLength += nextSnippet.Item2 - lastSnippet.Item2;
					lastSnippet.Item2 = nextSnippet.Item2; // Extend it to cover the new snippet too
					snippets.Add(lastSnippet); // Push it back on again
					continue;
				}

				// No overlap! Add it to the list
				snippets.Add(nextSnippet);
				currentLength += nextSnippet.Item2 - nextSnippet.Item1;
			}

			List<string> snippetsText = new List<string>(
				snippets.Select(((int, int) snippet) => {
					string result = source.Substring(snippet.Item1, snippet.Item2 - snippet.Item1);
					if (settings.Html) {
						result = WebUtility.HtmlEncode(result);
						foreach ((int, string) nextToken in tokenizer.IterateTokens()) {
							// $0 re-inserts the matched text itself, preserving its original casing
							result = Regex.Replace(
								result,
								Regex.Escape(nextToken.Item2),
								"<strong class='query-term'>$0</strong>",
								RegexOptions.IgnoreCase // Also ignores accents, apparently
							);
						}
					}
					return result;
				})
				.Where((string snippet) => !string.IsNullOrWhiteSpace(snippet))
			);

			// Add the separator at the beginning and end if we aren't at the bounds of the source document
			if (snippets.Count > 0 && snippets.First().Item1 > 0)
				snippetsText.Insert(0, "");
			if (snippets.Count > 0 && snippets.Last().Item2 < source.Length)
				snippetsText.Add("");

			return string.Join(
				settings.Separator,
				snippetsText
			);
		}
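
		// Context sketch (illustrative): the caller supplies the raw page source,
		// since SearchBox stores only the inverted index and not the text itself.
		//
		//     string context = box.GenerateContext(
		//         "Cats", catsPageSource, // catsPageSource: the page's text, fetched by the caller
		//         "furry", new ContextSettings { Html = true }
		//     );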
		#endregion
	}
}