using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;

using Newtonsoft.Json;

namespace LibSearchBox
{
	/// <summary>
	/// Thrown when a SearchBox operation fails (e.g. an index replacement or removal).
	/// </summary>
	public class SearchBoxException : Exception
	{
		public SearchBoxException(string message) : base(message) { }
	}

	/// <summary>
	/// A serialisable full-text search engine: maintains a page-name → id map,
	/// per-document metadata, and an inverted index, and answers ranked queries.
	/// NOTE(review): the generic type arguments in this file were reconstructed
	/// from usage (the extracted source had them stripped) — verify against the
	/// declarations of IdMap, InvertedIndex, Tokenizer, DocumentMeta, etc.
	/// </summary>
	[JsonObject(MemberSerialization.OptIn)]
	public class SearchBox
	{
		private IdMap idMap = new IdMap();

		/// <summary>
		/// JSON bridge for the page-name → id map. The getter returns null when
		/// the map is empty so Json.NET omits the "ids" property entirely; the
		/// setter ignores null so a missing property leaves the map untouched.
		/// </summary>
		[JsonProperty("ids")]
		public Dictionary<string, int> IdMap {
			get {
				Dictionary<string, int> result = idMap.MapOut;
				if (result.Count == 0) return null;
				return result;
			}
			set {
				if (value == null) return;
				idMap.Import(value);
			}
		}

		/// <summary>Per-document metadata (title, tags), keyed by page id.</summary>
		[JsonProperty]
		public ConcurrentDictionary<int, DocumentMeta> metaTable = new ConcurrentDictionary<int, DocumentMeta>();
		/// <summary>The inverted index over document content.</summary>
		[JsonProperty]
		public InvertedIndex index = new InvertedIndex();

		/// <summary>Whether to write warnings (e.g. missing metadata) to stderr.</summary>
		public bool Verbose { get; set; } = false;

		public SearchBox()
		{
		}

		#region Index Management

		/// <summary>
		/// Adds a document to the search index, replacing any metadata already
		/// registered under the same title.
		/// </summary>
		/// <param name="title">The document's title (its identity in the id map).</param>
		/// <param name="tags">The document's tags.</param>
		/// <param name="content">The document content to tokenise and index.</param>
		public void AddDocument(string title, IEnumerable<string> tags, string content)
		{
			DocumentMeta info = new DocumentMeta(title, tags);
			int id = idMap.GetId(info.Title);
			metaTable.AddOrUpdate(id, info, (key, oldValue) => info);

			Index upsideIndex = new Index(content);
			index.AddIndex(id, upsideIndex);
		}

		/// <summary>
		/// Updates an existing document's tags and content.
		/// </summary>
		/// <param name="title">The title of the document to update.</param>
		/// <param name="newTags">The replacement tag set.</param>
		/// <param name="oldContent">The previous content (needed to diff the index).</param>
		/// <param name="newContent">The new content.</param>
		/// <exception cref="SearchBoxException">Thrown when the inverted index refuses the replacement.</exception>
		public void UpdateDocument(string title, IEnumerable<string> newTags, string oldContent, string newContent)
		{
			int id = idMap.GetId(title);
			// Throws KeyNotFoundException if the title was never added — by design.
			DocumentMeta info = metaTable[id];
			info.ReplaceTags(newTags);

			Index oldIndex = new Index(oldContent), newIndex = new Index(newContent);
			if (!index.ReplaceIndex(id, oldIndex, newIndex))
				throw new SearchBoxException($"Error: Failed to replace index for document with title {title}.");
		}

		/// <summary>
		/// Removes a document from the id map, the metadata table, and the inverted index.
		/// </summary>
		/// <param name="title">The title of the document to remove.</param>
		/// <exception cref="SearchBoxException">Thrown when the inverted-index removal fails.</exception>
		public void RemoveDocument(string title)
		{
			int id = idMap.DeletePageName(title);
			metaTable.TryRemove(id, out DocumentMeta noop);
			if (!index.RemoveById(id))
				throw new SearchBoxException($"Failed to remove page with title '{title}' from inverted index.");
		}

		#endregion

		#region Query

		/// <summary>
		/// Executes a search query: pages are ranked by total token frequency
		/// plus weighted title/tag matches, and returned sorted by rank.
		/// </summary>
		/// <param name="query">The raw query string to tokenise.</param>
		/// <param name="settings">Ranking weights for title and tag matches.</param>
		/// <returns>The matching documents, sorted by ascending rank.</returns>
		public List<SearchResult> Query(string query, QuerySettings settings)
		{
			// pageId => token -> count
			ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages = new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();

			Tokenizer tokenizer = new Tokenizer(query);

			// Cache per-token index lookups so the offsets pass below doesn't
			// re-query the inverted index once per (page, token) pair.
			Dictionary<string, ConcurrentDictionary<int, List<int>>> tokenResultsCache = new Dictionary<string, ConcurrentDictionary<int, List<int>>>();

			foreach (Tuple<int, string> token in tokenizer.IterateTokens())
			{
				ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token.Item2);
				tokenResultsCache[token.Item2] = tokenResults;
				Parallel.ForEach(tokenResults, (KeyValuePair<int, List<int>> pageTokenDef) => {
					// pageTokenDef: pageId => list of token offsets
					ConcurrentDictionary<string, int> pageData = matchingPages.GetOrAdd(pageTokenDef.Key, (key) => new ConcurrentDictionary<string, int>());
					if (!pageData.TryAdd(token.Item2, pageTokenDef.Value.Count))
						throw new SearchBoxException("Error: Failed to add token count to page data in search " +
							"results - the key already exists (are there duplicate tokens for this page id " +
							"in the inverted index?)");
				});
			}

			ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
			Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) => {
				// Base rank: total occurrences of all query tokens on this page.
				int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?

				if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo)) {
					if (Verbose)
						Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
				}
				else {
					// Boost the rank for query tokens appearing in the title or tags.
					foreach (Tuple<int, string> token in tokenizer.IterateTokens()) {
						if (metaInfo.SearchableTitle.Contains(token.Item2))
							rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched

						foreach (string nextTag in metaInfo.SearchableTags)
							if (nextTag.Contains(token.Item2))
								rank += settings.WeightTagMatch;
					}
				}

				List<SearchOffset> offsets = new List<SearchOffset>();
				foreach (Tuple<int, string> token in tokenizer.IterateTokens()) {
					// A page that matched one query token needn't contain every
					// token — guard the lookup instead of letting the indexer
					// throw KeyNotFoundException (bug in the original).
					if (!tokenResultsCache[token.Item2].TryGetValue(pageDef.Key, out List<int> tokenOffsets))
						continue;
					offsets.AddRange(tokenOffsets.Select((int offset) => new SearchOffset(token.Item2, offset)));
				}
				offsets.Sort((SearchOffset x, SearchOffset y) => x.Offset - y.Offset);

				resultsRaw.Add(new SearchResult(
					idMap.GetPageName(pageDef.Key),
					rank,
					offsets
				));
			});

			// Materialise the ordering: the original discarded OrderBy's return
			// value (LINQ doesn't sort in place), returning unsorted results.
			// NOTE(review): ascending rank preserved from the written lambda —
			// confirm descending (best match first) isn't what callers expect.
			List<SearchResult> results = resultsRaw.OrderBy((SearchResult result) => result.Rank).ToList();
			return results;
		}

		#endregion
	}
}