A standalone full-text search engine written in C#.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

133 lines
4.5 KiB

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Newtonsoft.Json;
namespace LibSearchBox
{
/// <summary>
/// Exception type used by <see cref="SearchBox"/> to report failures while
/// updating, removing or querying indexed documents.
/// </summary>
public class SearchBoxException : Exception
{
    /// <summary>
    /// Creates a new exception carrying the given message.
    /// </summary>
    /// <param name="message">Human-readable description of what went wrong.</param>
    public SearchBoxException(string message)
        : base(message)
    {
    }
}
/// <summary>
/// Full-text search facade: maps document titles to integer ids, keeps
/// per-document metadata (title and tags) and maintains an inverted index of
/// document content that can be queried for ranked results.
/// </summary>
/// <remarks>
/// Serialisation is opt-in: only members carrying <c>[JsonProperty]</c> are
/// written out (the id map, the metadata table and the inverted index).
/// </remarks>
[JsonObject(MemberSerialization.OptIn)]
public class SearchBox
{
    // Title <-> id mapping. Exposed to the serialiser through the IdMap property below.
    private IdMap idMap = new IdMap();

    /// <summary>
    /// Serialisable view of the id map (id => title).
    /// </summary>
    /// <value>
    /// On get: the exported map, or <c>null</c> when the map is empty.
    /// On set: imports the supplied map; a <c>null</c> value is ignored.
    /// </value>
    [JsonProperty("ids")]
    public Dictionary<int, string> IdMap {
        get {
            Dictionary<int, string> result = idMap.MapOut;
            // NOTE(review): null is presumably returned so the serialiser can omit an empty map - confirm.
            if (result.Count == 0) return null;
            return result;
        }
        set {
            if (value == null) return;
            idMap.Import(value);
        }
    }

    /// <summary>Document id => metadata for that document.</summary>
    [JsonProperty]
    public ConcurrentDictionary<int, DocumentMeta> metaTable = new ConcurrentDictionary<int, DocumentMeta>();

    /// <summary>The inverted index that content tokens are looked up in.</summary>
    [JsonProperty]
    public InvertedIndex index = new InvertedIndex();

    /// <summary>
    /// When true, warnings (such as missing metadata during a query) are
    /// written to standard error.
    /// </summary>
    public bool Verbose { get; set; } = false;

    /// <summary>Creates an empty search box.</summary>
    public SearchBox()
    {
    }

    #region Index Management

    /// <summary>
    /// Adds a document to the search box, replacing any metadata already
    /// stored under the id that the title maps to.
    /// </summary>
    /// <param name="title">The document's title; used to obtain its id.</param>
    /// <param name="tags">Tags to associate with the document.</param>
    /// <param name="content">The document content to index.</param>
    public void AddDocument(string title, IEnumerable<string> tags, string content)
    {
        DocumentMeta info = new DocumentMeta(title, tags);
        int id = idMap.GetId(info.Title);
        metaTable.AddOrUpdate(id, info, (key, oldValue) => info);

        Index upsideIndex = new Index(content);
        index.AddIndex(id, upsideIndex);
    }

    /// <summary>
    /// Replaces the tags and indexed content of an existing document.
    /// </summary>
    /// <param name="title">Title of the document to update.</param>
    /// <param name="newTags">The tags that replace the document's current tags.</param>
    /// <param name="oldContent">The content that is currently indexed for the document.</param>
    /// <param name="newContent">The content to index in its place.</param>
    /// <exception cref="KeyNotFoundException">
    /// No metadata entry exists for the id that <paramref name="title"/> maps to.
    /// </exception>
    /// <exception cref="SearchBoxException">The inverted index reported that the replacement failed.</exception>
    public void UpdateDocument(string title, IEnumerable<string> newTags, string oldContent, string newContent)
    {
        int id = idMap.GetId(title);
        DocumentMeta info = metaTable[id];
        info.ReplaceTags(newTags);

        Index oldIndex = new Index(oldContent), newIndex = new Index(newContent);
        if (!index.ReplaceIndex(id, oldIndex, newIndex))
            throw new SearchBoxException($"Error: Failed to replace index for document with title {title}.");
    }

    /// <summary>
    /// Removes a document's id mapping, metadata and inverted-index entries.
    /// </summary>
    /// <param name="title">Title of the document to remove.</param>
    /// <exception cref="SearchBoxException">The inverted index reported that the removal failed.</exception>
    public void RemoveDocument(string title)
    {
        int id = idMap.DeletePageName(title);
        // A missing metadata entry is tolerated; only the index removal is treated as fatal.
        metaTable.TryRemove(id, out _);
        if (!index.RemoveById(id))
            throw new SearchBoxException($"Failed to remove page with title '{title}' from inverted index.");
    }

    #endregion

    #region Query

    /// <summary>
    /// Runs a query against the index and returns the matching documents,
    /// best match (highest rank) first.
    /// </summary>
    /// <remarks>
    /// A document's rank is the total number of occurrences of the query
    /// tokens in its content, plus <c>settings.WeightTitleMatch</c> for each
    /// token contained in its searchable title and
    /// <c>settings.WeightTagMatch</c> for each (token, tag) pair where the
    /// tag contains the token. A document is returned if it contains at
    /// least one of the query tokens; it does not need to contain all of them.
    /// Exceptions raised inside the parallel loops surface wrapped in an
    /// <see cref="AggregateException"/>.
    /// </remarks>
    /// <param name="query">The raw query string; it is split into tokens by <c>Tokenizer</c>.</param>
    /// <param name="settings">Weights applied for title and tag matches.</param>
    /// <returns>One result per matching document, ordered by descending rank.</returns>
    public List<SearchResult> Query(string query, QuerySettings settings)
    {
        // Tokenise once and drop repeated tokens. A token repeated in the query would
        // otherwise be counted twice and trip the duplicate-key check below.
        Tokenizer tokenizer = new Tokenizer(query);
        List<string> tokens = tokenizer.IterateTokens()
            .Select((Tuple<int, string> token) => token.Item2)
            .Distinct()
            .ToList();

        // token => (pageId => offsets). Each token is looked up in the index exactly once
        // and the result is reused when collecting offsets further down.
        List<KeyValuePair<string, ConcurrentDictionary<int, List<int>>>> tokenHits =
            new List<KeyValuePair<string, ConcurrentDictionary<int, List<int>>>>(tokens.Count);

        // pageId => (token => occurrence count)
        ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages =
            new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();

        foreach (string token in tokens)
        {
            ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token);
            tokenHits.Add(new KeyValuePair<string, ConcurrentDictionary<int, List<int>>>(token, tokenResults));

            Parallel.ForEach(tokenResults, (KeyValuePair<int, List<int>> pageTokenDef) => {
                // pageTokenDef: pageId => List of token offsets
                // The factory overload avoids allocating a dictionary that is thrown away
                // whenever the page already has an entry.
                ConcurrentDictionary<string, int> pageData = matchingPages.GetOrAdd(
                    pageTokenDef.Key,
                    (int pageId) => new ConcurrentDictionary<string, int>()
                );
                if (!pageData.TryAdd(token, pageTokenDef.Value.Count))
                    throw new SearchBoxException("Error: Failed to add token count to page data in search " +
                        "results - the key already exists (are there duplicate tokens for this page id " +
                        "in the inverted index?");
            });
        }

        ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
        Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) => {
            int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?

            if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo)) {
                // No metadata: the page is still returned, just without title/tag boosts.
                if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
            }
            else {
                foreach (string token in tokens) {
                    if (metaInfo.SearchableTitle.Contains(token))
                        rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched
                    foreach (string nextTag in metaInfo.SearchableTags)
                        if (nextTag.Contains(token))
                            rank += settings.WeightTagMatch;
                }
            }

            List<SearchOffset> offsets = new List<SearchOffset>();
            foreach (KeyValuePair<string, ConcurrentDictionary<int, List<int>>> hit in tokenHits) {
                // A page is listed as soon as it matches *any* token, so it may well be
                // absent from the results of this particular token - skip it in that case.
                if (!hit.Value.TryGetValue(pageDef.Key, out List<int> tokenOffsets))
                    continue;
                offsets.AddRange(tokenOffsets.Select((int offset) => new SearchOffset(hit.Key, offset)));
            }
            offsets.Sort((SearchOffset x, SearchOffset y) => x.Offset.CompareTo(y.Offset));

            resultsRaw.Add(new SearchResult(
                idMap.GetPageName(pageDef.Key),
                rank,
                offsets
            ));
        });

        // LINQ ordering returns a new sequence rather than sorting in place, so the
        // ordered sequence itself has to be materialised and returned.
        return resultsRaw
            .OrderByDescending((SearchResult result) => result.Rank)
            .ToList();
    }

    #endregion
}
}