// LibSearchBox core: query settings, context-generation settings, and the SearchBox index itself.
using System;
|
|
using System.Collections.Concurrent;
|
|
using System.Collections.Generic;
|
|
using System.Linq;
|
|
using System.Net;
|
|
using System.Text.RegularExpressions;
|
|
using System.Threading.Tasks;
|
|
using Newtonsoft.Json;
|
|
using UnidecodeSharpFork;
|
|
|
|
namespace LibSearchBox
|
|
{
|
|
/// <summary>
/// Tunable weights that control how query results are ranked.
/// </summary>
public class QuerySettings
{
    /// <summary>
    /// Rank bonus added when a query token appears in a document's title.
    /// </summary>
    public int WeightTitleMatch = 100;
    /// <summary>
    /// Rank bonus added for each tag of a document that contains a query token.
    /// </summary>
    public int WeightTagMatch = 10;

    // Note: the explicit empty parameterless constructor was removed — the
    // compiler generates an identical one automatically.
}
|
|
|
|
/// <summary>
/// Options controlling how a context (snippet) string is generated for a
/// search result.
/// </summary>
public class ContextSettings
{
    /// <summary>
    /// How many characters of surrounding text to include on each side of a
    /// matched term.
    /// </summary>
    public int ContextCharacters = 75;

    /// <summary>
    /// An upper bound on the total length of the assembled context string.
    /// </summary>
    public int MaxLength = 250;

    /// <summary>
    /// The string inserted between adjacent snippets when joining them
    /// together.
    /// </summary>
    public string Separator = " … ";

    /// <summary>
    /// When true, the generated context is emitted as HTML (with matched
    /// terms highlighted); when false, plain text.
    /// </summary>
    public bool Html = false;
}
|
|
|
|
/// <summary>
/// Thrown when a SearchBox index operation fails (e.g. replacing or removing
/// a document's entries in the inverted index).
/// </summary>
public class SearchBoxException : Exception
{
    public SearchBoxException(string message) : base(message)
    {
    }
}
|
|
|
|
[JsonObject(MemberSerialization.OptIn)]
public class SearchBox
{
    /// <summary>Maps page titles to stable numeric document ids (and back).</summary>
    private IdMap idMap = new IdMap();

    /// <summary>
    /// The id → page-title map, exposed for serialisation.
    /// Returns null when empty so Json.NET omits the property entirely;
    /// a null assignment (property absent in the JSON) is ignored.
    /// </summary>
    [JsonProperty("ids")]
    public Dictionary<int, string> IdMap {
        get {
            Dictionary<int, string> result = idMap.MapOut;
            if (result.Count == 0) return null;
            return result;
        }
        set {
            if (value == null) return;
            idMap.Import(value);
        }
    }

    /// <summary>Per-document metadata (title, tags), keyed by document id.</summary>
    [JsonProperty]
    public ConcurrentDictionary<int, DocumentMeta> metaTable = new ConcurrentDictionary<int, DocumentMeta>();
    /// <summary>The inverted index mapping tokens to document ids and token offsets.</summary>
    [JsonProperty]
    public InvertedIndex index = new InvertedIndex();

    /// <summary>Whether to write diagnostic warnings to standard error.</summary>
    public bool Verbose { get; set; } = false;

    public SearchBox()
    {
    }

    #region Index Management

    /// <summary>
    /// Adds a new document to the search index.
    /// If a document with the same title already exists, its metadata is replaced.
    /// </summary>
    /// <param name="title">The document's title; also used to derive its id.</param>
    /// <param name="tags">The tags attached to the document.</param>
    /// <param name="content">The raw textual content to index.</param>
    public void AddDocument(string title, IEnumerable<string> tags, string content)
    {
        DocumentMeta info = new DocumentMeta(title, tags);
        int id = idMap.GetId(info.Title);
        metaTable.AddOrUpdate(id, info, (key, oldValue) => info);
        Index upsideIndex = new Index(content);
        index.AddIndex(id, upsideIndex);
    }

    /// <summary>
    /// Updates an already-indexed document's tags and content in place.
    /// </summary>
    /// <param name="title">The title of the document to update.</param>
    /// <param name="newTags">The replacement tag set.</param>
    /// <param name="oldContent">The document's previous content, needed to locate the old index entries.</param>
    /// <param name="newContent">The document's new content.</param>
    /// <exception cref="SearchBoxException">Thrown when the inverted index rejects the replacement.</exception>
    public void UpdateDocument(string title, IEnumerable<string> newTags, string oldContent, string newContent)
    {
        int id = idMap.GetId(title);
        DocumentMeta info = metaTable[id];
        info.ReplaceTags(newTags);

        Index oldIndex = new Index(oldContent), newIndex = new Index(newContent);
        if (!index.ReplaceIndex(id, oldIndex, newIndex))
            throw new SearchBoxException($"Error: Failed to replace index for document with title {title}.");
    }

    /// <summary>
    /// Removes a document (metadata, id mapping, and index entries) by title.
    /// </summary>
    /// <param name="title">The title of the document to remove.</param>
    /// <exception cref="SearchBoxException">Thrown when the inverted index has no entries for the document.</exception>
    public void RemoveDocument(string title)
    {
        int id = idMap.DeletePageName(title);
        metaTable.TryRemove(id, out _);
        if (!index.RemoveById(id))
            throw new SearchBoxException($"Failed to remove page with title '{title}' from inverted index.");
    }

    #endregion

    #region Query

    /// <summary>
    /// Executes a search query against the index.
    /// </summary>
    /// <param name="query">The raw query string; it is tokenized internally.</param>
    /// <param name="settings">Weights used to rank title and tag matches.</param>
    /// <returns>Matching documents, sorted by descending rank.</returns>
    public List<SearchResult> Query(string query, QuerySettings settings)
    {
        // pageId => token -> count
        ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages = new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();

        Tokenizer tokenizer = new Tokenizer(query);
        foreach ((int, string) token in tokenizer.IterateTokens())
        {
            ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token.Item2);
            // Guard against tokens absent from the index (getPageOffsets below
            // makes the same null check).
            if (tokenResults == null)
                continue;

            Parallel.ForEach(tokenResults, (KeyValuePair<int, List<int>> pageTokenDef) => {
                // pageTokenDef: pageId => List of token offsets
                // Factory overload avoids allocating a dictionary when the key already exists.
                ConcurrentDictionary<string, int> pageData = matchingPages.GetOrAdd(pageTokenDef.Key, (key) => new ConcurrentDictionary<string, int>());
                if (!pageData.TryAdd(token.Item2, pageTokenDef.Value.Count))
                    throw new SearchBoxException("Error: Failed to add token count to page data in search " +
                        "results - the key already exists (are there duplicate tokens for this page id " +
                        "in the inverted index?");
            });
        }

        ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
        Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) =>
        {
            int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?
            if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo))
            {
                if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
            }
            else
            {
                foreach ((int, string) token in tokenizer.IterateTokens())
                {
                    if (metaInfo.SearchableTitle.Contains(token.Item2))
                        rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched

                    foreach (string nextTag in metaInfo.SearchableTags)
                        if (nextTag.Contains(token.Item2))
                            rank += settings.WeightTagMatch;
                }
            }

            List<SearchOffset> offsets = getPageOffsets(pageDef.Key, tokenizer);

            resultsRaw.Add(new SearchResult(
                idMap.GetPageName(pageDef.Key),
                rank,
                offsets
            ));
        });

        List<SearchResult> results = new List<SearchResult>(resultsRaw.AsEnumerable());
        // CompareTo gives a consistent, transitive ordering. The previous
        // (int)Math.Round(b.Rank - a.Rank) comparer treated ranks within 0.5
        // of each other as equal, which is non-transitive and can make
        // List<T>.Sort throw "comparer returns inconsistent results".
        results.Sort((SearchResult a, SearchResult b) => b.Rank.CompareTo(a.Rank));
        return results;
    }

    /// <summary>
    /// Collects the offsets of every query token that occurs in the given page.
    /// </summary>
    /// <param name="pageId">The id of the page to inspect.</param>
    /// <param name="tokenizer">The tokenized query.</param>
    /// <returns>All matching (term, offset) pairs, sorted by ascending offset.</returns>
    private List<SearchOffset> getPageOffsets(int pageId, Tokenizer tokenizer)
    {
        List<SearchOffset> offsets = new List<SearchOffset>();
        foreach ((int, string) token in tokenizer.IterateTokens())
        {
            ConcurrentDictionary<int, List<int>> tokenQuery = index.Query(token.Item2);
            if (tokenQuery == null || !tokenQuery.ContainsKey(pageId))
                continue; // Don't bother if this page doesn't contain this token
            offsets.AddRange(tokenQuery[pageId].Select((int offset) => new SearchOffset(token.Item2, offset)));
        }
        // CompareTo instead of subtraction: x.Offset - y.Offset can overflow
        // for extreme values, producing a wrong sign.
        offsets.Sort((SearchOffset x, SearchOffset y) => x.Offset.CompareTo(y.Offset));
        return offsets;
    }

    /// <summary>
    /// Generates a human-readable context string (a sequence of snippets
    /// around matched terms) for a page.
    /// </summary>
    /// <param name="pageName">The title of the page the context is for.</param>
    /// <param name="source">The page's full source text.</param>
    /// <param name="query">The raw query whose terms should be highlighted.</param>
    /// <param name="settings">Snippet sizing, separator, and HTML options.</param>
    /// <returns>The joined context string; snippets are separated by <see cref="ContextSettings.Separator"/>.</returns>
    public string GenerateContext(string pageName, string source, string query, ContextSettings settings)
    {
        int pageId = idMap.GetId(pageName);

        Tokenizer tokenizer = new Tokenizer(query);
        List<SearchOffset> offsets = getPageOffsets(pageId, tokenizer);

        int currentLength = 0;
        List<(int, int)> tokenLocations = new List<(int, int)>(); // offset, length
        List<(int, int)> snippets = new List<(int, int)>(); // from, to
        for (int i = 0; i < offsets.Count; i++)
        {
            // Don't go over the maximum length
            // FUTURE: Would it be faster to keep track of this as we go? It's probably not worth it though, as we're not going to be generate *that* many at once - we'll have to see.
            if (currentLength > settings.MaxLength)
                break;

            // Generate the next snippet, clamped to the bounds of the source text
            (int, int) nextSnippet = (
                Math.Max(0, offsets[i].Offset - settings.ContextCharacters),
                Math.Min(source.Length, offsets[i].Offset + offsets[i].Term.Length + settings.ContextCharacters)
            );

            tokenLocations.Add((offsets[i].Offset, offsets[i].Term.Length));
            // If the next snippet overlaps with the previous one, then combine the 2
            if (snippets.Count > 0 && snippets.Last().Item2 > nextSnippet.Item1) {
                // BUG: This *might* exceed the MaxLength a bit
                // Pop the last snippet from the list
                (int, int) lastSnippet = snippets[snippets.Count - 1];
                snippets.RemoveAt(snippets.Count - 1);
                // Only ever extend the previous snippet. A short term that sits
                // just behind a longer one can yield nextSnippet.Item2 <
                // lastSnippet.Item2, and blindly assigning it would *shrink*
                // the merged snippet (and make currentLength go backwards).
                int mergedEnd = Math.Max(lastSnippet.Item2, nextSnippet.Item2);
                currentLength += mergedEnd - lastSnippet.Item2;

                lastSnippet.Item2 = mergedEnd; // Fiddle it a bit
                snippets.Add(lastSnippet); // Push it back on again

                continue;
            }

            // No overlap! Add it to the list
            snippets.Add(nextSnippet);
            currentLength += nextSnippet.Item2 - nextSnippet.Item1;
        }

        List<string> snippetsText = new List<string>(
            snippets.Select(((int, int) snippet) => {
                string result = source.Substring(snippet.Item1, snippet.Item2 - snippet.Item1);

                if (settings.Html) {
                    result = WebUtility.HtmlEncode(result);

                    // Wrap every occurrence of each query term in a <strong> tag.
                    // NOTE(review): terms containing HTML-escapable characters
                    // (&, <, >) won't match the encoded text — confirm the
                    // tokenizer never emits such terms.
                    foreach ((int, string) nextToken in tokenizer.IterateTokens()) {
                        result = Regex.Replace(
                            result,
                            Regex.Escape(nextToken.Item2),
                            $"<strong class='query-term'>{nextToken.Item2}</strong>",
                            RegexOptions.IgnoreCase // Also ignores accents, apparently
                        );
                    }
                }

                return result;
            })
            .Where((string snippet) => !string.IsNullOrWhiteSpace(snippet))
        );

        // Add the separator at the beginning and end if we aren't at the bounds of the source document
        if (snippets.Count > 0 && snippets.First().Item1 > 0)
            snippetsText.Insert(0, "");
        if (snippets.Count > 0 && snippets.Last().Item2 < source.Length)
            snippetsText.Add("");

        return string.Join(
            settings.Separator,
            snippetsText
        );
    }

    #endregion
}
|
|
}
|