using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using Newtonsoft.Json;
using UnidecodeSharpFork;

namespace LibSearchBox
{
    /// <summary>
    /// Tuning weights applied while ranking search results.
    /// </summary>
    public class QuerySettings
    {
        /// <summary>Rank bonus added when a query token appears in a document's title.</summary>
        public int WeightTitleMatch = 100;
        /// <summary>Rank bonus added when a query token appears in one of a document's tags.</summary>
        public int WeightTagMatch = 10;

        public QuerySettings() { }
    }

    /// <summary>
    /// Settings that control how result context snippets are generated.
    /// </summary>
    public class ContextSettings
    {
        /// <summary>
        /// The number of characters that should be displayed either side of a
        /// matching term.
        /// </summary>
        public int ContextCharacters = 75;
        /// <summary>
        /// The maximum length of the generated context string.
        /// </summary>
        public int MaxLength = 250;
        /// <summary>
        /// The separator to use between snippets in the generated context.
        /// </summary>
        public string Separator = " … ";
        /// <summary>
        /// Whether to output the generated context in HTML.
        /// </summary>
        public bool Html = false;
    }

    /// <summary>
    /// Thrown when a <see cref="SearchBox"/> detects an internal consistency error.
    /// </summary>
    public class SearchBoxException : Exception
    {
        public SearchBoxException(string message) : base(message) { }
    }

    /// <summary>
    /// An in-memory full-text search engine. Maintains an inverted index plus
    /// per-document metadata, and answers ranked queries against them.
    /// Serializable via Newtonsoft.Json (opt-in members only).
    /// </summary>
    [JsonObject(MemberSerialization.OptIn)]
    public class SearchBox
    {
        private IdMap idMap = new IdMap();

        // NOTE(review): element types here were reconstructed from usage
        // (idMap.GetId(string) => int, idMap.GetPageName(int) => string) —
        // confirm against IdMap.MapOut / IdMap.Import.
        [JsonProperty("ids")]
        public Dictionary<string, int> IdMap
        {
            get
            {
                Dictionary<string, int> result = idMap.MapOut;
                if (result.Count == 0) return null; // null => property omitted from the serialized JSON
                return result;
            }
            set
            {
                if (value == null) return; // Property absent from the JSON being imported
                idMap.Import(value);
            }
        }

        // pageId => metadata (title + tags) for that document
        [JsonProperty]
        public ConcurrentDictionary<int, DocumentMeta> metaTable = new ConcurrentDictionary<int, DocumentMeta>();
        [JsonProperty]
        public InvertedIndex index = new InvertedIndex();

        /// <summary>Whether to write warnings (e.g. missing metadata) to stderr.</summary>
        public bool Verbose { get; set; } = false;

        public SearchBox() { }

        #region Index Management

        /// <summary>
        /// Adds a document to the search index. If a document with the same
        /// title already exists, its metadata is replaced.
        /// </summary>
        /// <param name="title">The document's title.</param>
        /// <param name="tags">The document's tags.</param>
        /// <param name="content">The document's content, which is tokenized and indexed.</param>
        public void AddDocument(string title, IEnumerable<string> tags, string content)
        {
            DocumentMeta info = new DocumentMeta(title, tags);
            int id = idMap.GetId(info.Title);
            metaTable.AddOrUpdate(id, info, (key, oldValue) => info);

            Index upsideIndex = new Index(content);
            index.AddIndex(id, upsideIndex);
        }

        /// <summary>
        /// Updates a previously-added document's tags and content.
        /// The old content is required so that the stale entries can be located
        /// and replaced in the inverted index.
        /// </summary>
        /// <param name="title">The title of the document to update.</param>
        /// <param name="newTags">The replacement set of tags.</param>
        /// <param name="oldContent">The document's previous content.</param>
        /// <param name="newContent">The document's new content.</param>
        /// <exception cref="SearchBoxException">
        /// Thrown if no metadata exists for the title, or the index replacement fails.
        /// </exception>
        public void UpdateDocument(string title, IEnumerable<string> newTags, string oldContent, string newContent)
        {
            int id = idMap.GetId(title);
            // Surface a domain exception rather than leaking KeyNotFoundException
            if (!metaTable.TryGetValue(id, out DocumentMeta info))
                throw new SearchBoxException($"Error: Failed to find meta info for document with title {title}.");
            info.ReplaceTags(newTags);

            Index oldIndex = new Index(oldContent), newIndex = new Index(newContent);
            if (!index.ReplaceIndex(id, oldIndex, newIndex))
                throw new SearchBoxException($"Error: Failed to replace index for document with title {title}.");
        }

        /// <summary>
        /// Removes a document from the id map, metadata table, and inverted index.
        /// </summary>
        /// <param name="title">The title of the document to remove.</param>
        /// <exception cref="SearchBoxException">Thrown if the inverted-index removal fails.</exception>
        public void RemoveDocument(string title)
        {
            int id = idMap.DeletePageName(title);
            metaTable.TryRemove(id, out _); // Metadata may legitimately be absent; removal is best-effort
            if (!index.RemoveById(id))
                throw new SearchBoxException($"Failed to remove page with title '{title}' from inverted index.");
        }

        #endregion

        #region Query

        /// <summary>
        /// Runs a query against the index and returns matching documents,
        /// ranked by token frequency plus title/tag match bonuses.
        /// </summary>
        /// <param name="query">The raw query string; it is tokenized internally.</param>
        /// <param name="settings">The ranking weights to apply.</param>
        /// <returns>Matching results, sorted by descending rank.</returns>
        public List<SearchResult> Query(string query, QuerySettings settings)
        {
            // pageId => (token => count)
            ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages =
                new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();
            Tokenizer tokenizer = new Tokenizer(query);
            foreach ((int, string) token in tokenizer.IterateTokens())
            {
                // NOTE(review): index.Query's value type reconstructed from usage
                // (.Count here, .Select over int offsets below) — confirm against InvertedIndex.
                ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token.Item2);
                Parallel.ForEach(tokenResults, (KeyValuePair<int, List<int>> pageTokenDef) =>
                {
                    // pageTokenDef: pageId => List of token offsets
                    ConcurrentDictionary<string, int> pageData =
                        matchingPages.GetOrAdd(pageTokenDef.Key, new ConcurrentDictionary<string, int>());
                    if (!pageData.TryAdd(token.Item2, pageTokenDef.Value.Count))
                        throw new SearchBoxException("Error: Failed to add token count to page data in search " +
                            "results - the key already exists (are there duplicate tokens for this page id " +
                            "in the inverted index?)");
                });
            }

            ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
            Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) =>
            {
                int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?

                if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo))
                {
                    if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
                }
                else
                {
                    foreach ((int, string) token in tokenizer.IterateTokens())
                    {
                        if (metaInfo.SearchableTitle.Contains(token.Item2))
                            rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched
                        foreach (string nextTag in metaInfo.SearchableTags)
                            if (nextTag.Contains(token.Item2))
                                rank += settings.WeightTagMatch;
                    }
                }

                List<SearchOffset> offsets = getPageOffsets(pageDef.Key, tokenizer);

                resultsRaw.Add(new SearchResult(
                    idMap.GetPageName(pageDef.Key),
                    rank,
                    offsets
                ));
            });

            List<SearchResult> results = new List<SearchResult>(resultsRaw);
            // Sort descending by rank. CompareTo is used instead of rounding a
            // subtraction: (int)Math.Round(b.Rank - a.Rank) collapses rank
            // differences smaller than 0.5 to "equal" when Rank is fractional.
            results.Sort((SearchResult a, SearchResult b) => b.Rank.CompareTo(a.Rank));
            return results;
        }

        /// <summary>
        /// Collects the offsets, across all query tokens, at which any token
        /// occurs in the given page.
        /// </summary>
        /// <param name="pageId">The id of the page to inspect.</param>
        /// <param name="tokenizer">The tokenized query.</param>
        /// <returns>The matching offsets, sorted ascending by offset.</returns>
        private List<SearchOffset> getPageOffsets(int pageId, Tokenizer tokenizer)
        {
            List<SearchOffset> offsets = new List<SearchOffset>();
            foreach ((int, string) token in tokenizer.IterateTokens())
            {
                ConcurrentDictionary<int, List<int>> tokenQuery = index.Query(token.Item2);
                // Single lookup via TryGetValue (was ContainsKey + indexer);
                // skip pages that don't contain this token at all
                if (tokenQuery == null || !tokenQuery.TryGetValue(pageId, out List<int> pageOffsets))
                    continue;
                offsets.AddRange(pageOffsets.Select((int offset) => new SearchOffset(token.Item2, offset)));
            }
            // CompareTo instead of x - y: the subtraction comparator overflows
            // for offsets more than int.MaxValue apart
            offsets.Sort((SearchOffset x, SearchOffset y) => x.Offset.CompareTo(y.Offset));
            return offsets;
        }

        /// <summary>
        /// Generates a context string for a given search result: short snippets
        /// of the source text surrounding each query-token match, joined by
        /// <see cref="ContextSettings.Separator"/>.
        /// </summary>
        /// <param name="pageName">The name of the page the source text belongs to.</param>
        /// <param name="source">The full source text of the page.</param>
        /// <param name="query">The raw query string.</param>
        /// <param name="settings">Controls snippet sizing and HTML output.</param>
        /// <returns>The generated context string.</returns>
        public string GenerateContext(string pageName, string source, string query, ContextSettings settings)
        {
            int pageId = idMap.GetId(pageName);
            Tokenizer tokenizer = new Tokenizer(query);
            List<SearchOffset> offsets = getPageOffsets(pageId, tokenizer);

            int currentLength = 0;
            List<(int, int)> snippets = new List<(int, int)>(); // from, to

            for (int i = 0; i < offsets.Count; i++)
            {
                // Don't go over the maximum length
                // FUTURE: Would it be faster to keep track of this as we go? It's probably not worth it though, as we're not going to be generating *that* many at once - we'll have to see.
                if (currentLength > settings.MaxLength)
                    break;

                // Generate the next snippet, clamped to the bounds of the source text
                (int, int) nextSnippet = (
                    Math.Max(0, offsets[i].Offset - settings.ContextCharacters),
                    Math.Min(source.Length, offsets[i].Offset + offsets[i].Term.Length + settings.ContextCharacters)
                );

                // If the next snippet overlaps with the previous one, then combine the 2
                if (snippets.Count > 0 && snippets.Last().Item2 > nextSnippet.Item1)
                {
                    // BUG: This *might* exceed the MaxLength a bit
                    // Pop the last snippet from the list
                    (int, int) lastSnippet = snippets[snippets.Count - 1];
                    snippets.RemoveAt(snippets.Count - 1);
                    // Only extend the snippet if the new one actually ends later.
                    // A fully-contained match would otherwise SHRINK the snippet
                    // and subtract from currentLength.
                    if (nextSnippet.Item2 > lastSnippet.Item2)
                    {
                        currentLength += nextSnippet.Item2 - lastSnippet.Item2;
                        lastSnippet.Item2 = nextSnippet.Item2;
                    }
                    snippets.Add(lastSnippet); // Push it back on again
                    continue;
                }

                // No overlap! Add it to the list
                snippets.Add(nextSnippet);
                currentLength += nextSnippet.Item2 - nextSnippet.Item1;
            }

            List<string> snippetsText = new List<string>(
                snippets.Select(((int, int) snippet) =>
                {
                    string result = source.Substring(snippet.Item1, snippet.Item2 - snippet.Item1);
                    if (settings.Html)
                    {
                        result = WebUtility.HtmlEncode(result);
                        foreach ((int, string) nextToken in tokenizer.IterateTokens())
                        {
                            // NOTE(review): the original highlight markup around the
                            // token was lost in transit; <strong> restores the evident
                            // intent — confirm the exact tag/classes against upstream.
                            result = Regex.Replace(
                                result,
                                Regex.Escape(nextToken.Item2),
                                $"<strong>{nextToken.Item2}</strong>",
                                RegexOptions.IgnoreCase // Also ignores accents, apparently
                            );
                        }
                    }
                    return result;
                })
                .Where((string snippet) => !string.IsNullOrWhiteSpace(snippet))
            );

            // Add the separator at the beginning and end if we aren't at the
            // bounds of the source document (an empty entry makes string.Join
            // emit a leading/trailing separator)
            if (snippets.Count > 0 && snippets.First().Item1 > 0)
                snippetsText.Insert(0, "");
            if (snippets.Count > 0 && snippets.Last().Item2 < source.Length)
                snippetsText.Add("");

            return string.Join(
                settings.Separator,
                snippetsText
            );
        }

        #endregion
    }
}