125 lines
4.1 KiB
C#
125 lines
4.1 KiB
C#
using System;
|
|
using System.Collections.Concurrent;
|
|
using System.Collections.Generic;
|
|
using System.Linq;
|
|
using System.Threading.Tasks;
|
|
using Newtonsoft.Json;
|
|
|
|
namespace LibSearchBox
|
|
{
|
|
/// <summary>
/// Exception thrown by <see cref="SearchBox"/> when an index operation fails.
/// </summary>
public class SearchBoxException : Exception
{
    /// <summary>Creates a new exception carrying the given message.</summary>
    /// <param name="message">A description of what went wrong.</param>
    public SearchBoxException(string message) : base(message) { }

    /// <summary>
    /// Creates a new exception carrying the given message and the exception
    /// that caused it, so that the original failure is not lost when wrapping.
    /// </summary>
    /// <param name="message">A description of what went wrong.</param>
    /// <param name="innerException">The underlying exception that triggered this one.</param>
    public SearchBoxException(string message, Exception innerException) : base(message, innerException) { }
}
|
|
|
|
[JsonObject(MemberSerialization.OptIn)]
public class SearchBox
{
    // Translates between page names and the integer ids used as keys in
    // metaTable and the inverted index. It carries no [JsonProperty] itself;
    // its contents are exposed to the serialiser through the IdMap property.
    private readonly IdMap idMap = new IdMap();

    /// <summary>
    /// Serialisation surface for the internal id map (JSON property "ids").
    /// </summary>
    /// <remarks>
    /// The getter returns <c>idMap.MapOut</c>, or <c>null</c> when that
    /// dictionary has no entries. The setter passes the value to
    /// <c>idMap.Import</c> and silently ignores <c>null</c>.
    /// NOTE(review): returning null for an empty map is presumably meant to
    /// keep empty maps out of the serialised output - confirm against the
    /// serialiser settings in use.
    /// </remarks>
    [JsonProperty("ids")]
    public Dictionary<int, string> IdMap
    {
        get
        {
            Dictionary<int, string> exported = idMap.MapOut;
            return exported.Count == 0 ? null : exported;
        }
        set
        {
            if (value != null)
            {
                idMap.Import(value);
            }
        }
    }

    /// <summary>Document metadata, keyed by document id.</summary>
    [JsonProperty]
    public ConcurrentDictionary<int, DocumentMeta> metaTable = new ConcurrentDictionary<int, DocumentMeta>();

    /// <summary>The inverted index that <see cref="Query"/> looks tokens up in.</summary>
    [JsonProperty]
    public InvertedIndex index = new InvertedIndex();

    /// <summary>
    /// When true, <see cref="Query"/> writes a warning to standard error for
    /// every matching page id that has no entry in <see cref="metaTable"/>.
    /// </summary>
    public bool Verbose { get; set; } = false;

    /// <summary>Creates an empty search box.</summary>
    public SearchBox()
    {
    }

    #region Index Management

    /// <summary>
    /// Registers a document's metadata and adds its content to the inverted index.
    /// </summary>
    /// <param name="title">The document title.</param>
    /// <param name="tags">The tags attached to the document.</param>
    /// <param name="content">The document body to index.</param>
    public void AddDocument(string title, IEnumerable<string> tags, string content)
    {
        DocumentMeta info = new DocumentMeta(title, tags);

        // The id is looked up via info.Title rather than the raw title argument.
        int id = idMap.GetId(info.Title);

        // Any metadata already stored under this id is overwritten.
        metaTable.AddOrUpdate(id, info, (key, oldValue) => info);

        Index upsideIndex = new Index(content);
        index.AddIndex(id, upsideIndex);
    }

    /// <summary>
    /// Replaces the tags and the indexed content of an existing document.
    /// </summary>
    /// <param name="title">The title of the document to update.</param>
    /// <param name="newTags">The tags that replace the document's current tags.</param>
    /// <param name="oldContent">The content that was previously indexed for the document.</param>
    /// <param name="newContent">The content to index in its place.</param>
    /// <exception cref="KeyNotFoundException">
    /// No metadata is stored for the id that the title resolves to.
    /// </exception>
    /// <exception cref="SearchBoxException">The inverted index reported that the replacement failed.</exception>
    public void UpdateDocument(string title, IEnumerable<string> newTags, string oldContent, string newContent)
    {
        int id = idMap.GetId(title);

        // The tags are replaced before the index is touched, so they stay
        // replaced even if the index replacement below fails.
        DocumentMeta info = metaTable[id];
        info.ReplaceTags(newTags);

        Index oldIndex = new Index(oldContent);
        Index newIndex = new Index(newContent);
        if (!index.ReplaceIndex(id, oldIndex, newIndex))
        {
            throw new SearchBoxException($"Error: Failed to replace index for document with title {title}.");
        }
    }

    /// <summary>
    /// Removes a document from the id map, the metadata table and the inverted index.
    /// </summary>
    /// <param name="title">The title of the document to remove.</param>
    /// <exception cref="SearchBoxException">The inverted index reported that the removal failed.</exception>
    public void RemoveDocument(string title)
    {
        int id = idMap.DeletePageName(title);

        // Best effort: missing metadata is not treated as an error.
        metaTable.TryRemove(id, out DocumentMeta _);

        if (!index.RemoveById(id))
        {
            throw new SearchBoxException($"Failed to remove page with title '{title}' from inverted index.");
        }
    }

    #endregion

    #region Query

    /// <summary>
    /// Searches the index for the tokens in <paramref name="query"/> and returns
    /// one ranked result per matching page.
    /// </summary>
    /// <remarks>
    /// A page's rank is the total number of occurrences of the query tokens
    /// recorded for it in the inverted index, plus
    /// <c>settings.WeightTitleMatch</c> for each query token contained in the
    /// page's searchable title and <c>settings.WeightTagMatch</c> for each
    /// (token, tag) pair where the tag contains the token.
    /// </remarks>
    /// <param name="query">The raw query string; it is split up by <c>Tokenizer</c>.</param>
    /// <param name="settings">Supplies the title and tag match weights.</param>
    /// <returns>The matching pages, ordered by ascending rank.</returns>
    /// <exception cref="AggregateException">
    /// Raised by <c>Parallel.ForEach</c>, wrapping a <see cref="SearchBoxException"/>,
    /// if a token count cannot be recorded for a page.
    /// </exception>
    public List<SearchResult> Query(string query, QuerySettings settings)
    {
        // pageId => (token => number of occurrences of that token in the page)
        ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages =
            new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();

        // Tokenise once up front. The list is only read from here on, so it
        // can be shared between the parallel workers below instead of each
        // worker re-running the tokenizer.
        Tokenizer tokenizer = new Tokenizer(query);
        List<string> queryTokens = new List<string>();
        foreach (Tuple<int, string> token in tokenizer.IterateTokens())
        {
            queryTokens.Add(token.Item2);
        }

        // Look each *distinct* token up exactly once. Page ids are unique
        // within a single lookup result, so looking the same token up twice
        // is the only way the TryAdd below can collide - a query that merely
        // repeats a word must not fail the search.
        // NOTE(review): assumes Tokenizer does not already de-duplicate; if it
        // does, Distinct() is a harmless no-op.
        foreach (string token in queryTokens.Distinct())
        {
            ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token);

            Parallel.ForEach(tokenResults, (KeyValuePair<int, List<int>> pageTokenDef) => {
                // pageTokenDef: pageId => List of token offsets
                // The factory overload avoids allocating a throwaway
                // dictionary for pages that already have an entry.
                ConcurrentDictionary<string, int> pageData = matchingPages.GetOrAdd(
                    pageTokenDef.Key,
                    (int pageId) => new ConcurrentDictionary<string, int>()
                );

                if (!pageData.TryAdd(token, pageTokenDef.Value.Count))
                {
                    throw new SearchBoxException("Error: Failed to add token count to page data in search " +
                        "results - the key already exists (are there duplicate tokens for this page id " +
                        "in the inverted index?");
                }
            });
        }

        ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
        Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) => {
            int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?

            if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo))
            {
                // Without metadata the page is still returned, just without
                // any title / tag boost.
                if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
            }
            else
            {
                // Deliberately iterates every token occurrence (not the
                // distinct set) so that the boosts are unchanged for repeated
                // query words.
                foreach (string token in queryTokens)
                {
                    if (metaInfo.SearchableTitle.Contains(token))
                    {
                        rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched
                    }

                    foreach (string nextTag in metaInfo.SearchableTags)
                    {
                        if (nextTag.Contains(token))
                        {
                            rank += settings.WeightTagMatch;
                        }
                    }
                }
            }

            resultsRaw.Add(new SearchResult(idMap.GetPageName(pageDef.Key), rank));
        });

        // OrderBy returns a new ordered sequence and leaves its source
        // untouched, so the ordered sequence itself has to be materialised
        // and returned.
        // NOTE(review): this keeps the ascending order of the original
        // OrderBy call. If callers expect the best match first, switch to
        // OrderByDescending.
        return resultsRaw.OrderBy((SearchResult result) => result.Rank).ToList();
    }

    #endregion
}
|
|
}
|