// LibSearchBox core: query settings, context-generation settings, and the SearchBox index itself.
using System;
|
|
using System.Collections.Concurrent;
|
|
using System.Collections.Generic;
|
|
using System.Linq;
|
|
using System.Net;
|
|
using System.Text.RegularExpressions;
|
|
using System.Threading.Tasks;
|
|
using Newtonsoft.Json;
|
|
using UnidecodeSharpFork;
|
|
|
|
namespace LibSearchBox
|
|
{
|
|
/// <summary>
/// Tunable weights that control how query results are ranked.
/// </summary>
public class QuerySettings
{
    /// <summary>
    /// Rank bonus added when a query token appears in a document's title.
    /// </summary>
    public int WeightTitleMatch = 100;
    /// <summary>
    /// Rank bonus added for each tag of a document that contains a query token.
    /// </summary>
    public int WeightTagMatch = 10;

    // Note: the explicit empty parameterless constructor was removed — the
    // compiler generates an identical one automatically.
}
|
|
|
|
/// <summary>
/// Options controlling how a context (snippet) string is generated for a
/// search result.
/// </summary>
public class ContextSettings
{
    /// <summary>
    /// How many characters of surrounding text to include on each side of a
    /// matched term.
    /// </summary>
    public int ContextCharacters = 75;

    /// <summary>
    /// An upper bound on the total length of the assembled context string.
    /// </summary>
    public int MaxLength = 250;

    /// <summary>
    /// The string inserted between adjacent snippets when joining them
    /// together.
    /// </summary>
    public string Separator = " … ";

    /// <summary>
    /// When true, the generated context is emitted as HTML (with matched
    /// terms highlighted); when false, plain text.
    /// </summary>
    public bool Html = false;
}
|
|
|
|
/// <summary>
/// Thrown when a SearchBox index operation fails (e.g. replacing or removing
/// a document's entries in the inverted index).
/// </summary>
public class SearchBoxException : Exception
{
    public SearchBoxException(string message) : base(message)
    {
    }
}
|
|
|
|
[JsonObject(MemberSerialization.OptIn)]
public class SearchBox
{
    /// <summary>Maps page titles to stable numeric document ids (and back).</summary>
    private IdMap idMap = new IdMap();

    /// <summary>
    /// The id → page-title map, exposed for serialisation.
    /// Returns null when empty so Json.NET omits the property entirely;
    /// a null assignment (property absent in the JSON) is ignored.
    /// </summary>
    [JsonProperty("ids")]
    public Dictionary<int, string> IdMap {
        get {
            Dictionary<int, string> result = idMap.MapOut;
            if (result.Count == 0) return null;
            return result;
        }
        set {
            if (value == null) return;
            idMap.Import(value);
        }
    }

    /// <summary>Per-document metadata (title, tags), keyed by document id.</summary>
    [JsonProperty]
    public ConcurrentDictionary<int, DocumentMeta> metaTable = new ConcurrentDictionary<int, DocumentMeta>();
    /// <summary>The inverted index mapping tokens to document ids and token offsets.</summary>
    [JsonProperty]
    public InvertedIndex index = new InvertedIndex();

    /// <summary>Whether to write diagnostic warnings to standard error.</summary>
    public bool Verbose { get; set; } = false;

    public SearchBox()
    {
    }

    #region Index Management

    /// <summary>
    /// Adds a new document to the search index.
    /// If a document with the same title already exists, its metadata is replaced.
    /// </summary>
    /// <param name="title">The document's title; also used to derive its id.</param>
    /// <param name="tags">The tags attached to the document.</param>
    /// <param name="content">The raw textual content to index.</param>
    public void AddDocument(string title, IEnumerable<string> tags, string content)
    {
        DocumentMeta info = new DocumentMeta(title, tags);
        int id = idMap.GetId(info.Title);
        metaTable.AddOrUpdate(id, info, (key, oldValue) => info);
        Index upsideIndex = new Index(content);
        index.AddIndex(id, upsideIndex);
    }

    /// <summary>
    /// Updates an already-indexed document's tags and content in place.
    /// </summary>
    /// <param name="title">The title of the document to update.</param>
    /// <param name="newTags">The replacement tag set.</param>
    /// <param name="oldContent">The document's previous content, needed to locate the old index entries.</param>
    /// <param name="newContent">The document's new content.</param>
    /// <exception cref="SearchBoxException">Thrown when the inverted index rejects the replacement.</exception>
    public void UpdateDocument(string title, IEnumerable<string> newTags, string oldContent, string newContent)
    {
        int id = idMap.GetId(title);
        DocumentMeta info = metaTable[id];
        info.ReplaceTags(newTags);

        Index oldIndex = new Index(oldContent), newIndex = new Index(newContent);
        if (!index.ReplaceIndex(id, oldIndex, newIndex))
            throw new SearchBoxException($"Error: Failed to replace index for document with title {title}.");
    }

    /// <summary>
    /// Removes a document (metadata, id mapping, and index entries) by title.
    /// </summary>
    /// <param name="title">The title of the document to remove.</param>
    /// <exception cref="SearchBoxException">Thrown when the inverted index has no entries for the document.</exception>
    public void RemoveDocument(string title)
    {
        int id = idMap.DeletePageName(title);
        metaTable.TryRemove(id, out _);
        if (!index.RemoveById(id))
            throw new SearchBoxException($"Failed to remove page with title '{title}' from inverted index.");
    }

    #endregion

    #region Query

    /// <summary>
    /// Executes a search query against the index.
    /// </summary>
    /// <param name="query">The raw query string; it is tokenized internally.</param>
    /// <param name="settings">Weights used to rank title and tag matches.</param>
    /// <returns>Matching documents, sorted by descending rank.</returns>
    public List<SearchResult> Query(string query, QuerySettings settings)
    {
        // pageId => token -> count
        ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages = new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();

        Tokenizer tokenizer = new Tokenizer(query);
        foreach ((int, string) token in tokenizer.IterateTokens())
        {
            ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token.Item2);
            // Guard against tokens absent from the index (getPageOffsets below
            // makes the same null check).
            if (tokenResults == null)
                continue;

            Parallel.ForEach(tokenResults, (KeyValuePair<int, List<int>> pageTokenDef) => {
                // pageTokenDef: pageId => List of token offsets
                // Factory overload avoids allocating a dictionary when the key already exists.
                ConcurrentDictionary<string, int> pageData = matchingPages.GetOrAdd(pageTokenDef.Key, (key) => new ConcurrentDictionary<string, int>());
                if (!pageData.TryAdd(token.Item2, pageTokenDef.Value.Count))
                    throw new SearchBoxException("Error: Failed to add token count to page data in search " +
                        "results - the key already exists (are there duplicate tokens for this page id " +
                        "in the inverted index?");
            });
        }

        ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
        Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) =>
        {
            int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?
            if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo))
            {
                if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
            }
            else
            {
                foreach ((int, string) token in tokenizer.IterateTokens())
                {
                    if (metaInfo.SearchableTitle.Contains(token.Item2))
                        rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched

                    foreach (string nextTag in metaInfo.SearchableTags)
                        if (nextTag.Contains(token.Item2))
                            rank += settings.WeightTagMatch;
                }
            }

            List<SearchOffset> offsets = getPageOffsets(pageDef.Key, tokenizer);

            resultsRaw.Add(new SearchResult(
                idMap.GetPageName(pageDef.Key),
                rank,
                offsets
            ));
        });

        List<SearchResult> results = new List<SearchResult>(resultsRaw.AsEnumerable());
        // CompareTo gives a consistent, transitive ordering. The previous
        // (int)Math.Round(b.Rank - a.Rank) comparer treated ranks within 0.5
        // of each other as equal, which is non-transitive and can make
        // List<T>.Sort throw "comparer returns inconsistent results".
        results.Sort((SearchResult a, SearchResult b) => b.Rank.CompareTo(a.Rank));
        return results;
    }

    /// <summary>
    /// Collects the offsets of every query token that occurs in the given page.
    /// </summary>
    /// <param name="pageId">The id of the page to inspect.</param>
    /// <param name="tokenizer">The tokenized query.</param>
    /// <returns>All matching (term, offset) pairs, sorted by ascending offset.</returns>
    private List<SearchOffset> getPageOffsets(int pageId, Tokenizer tokenizer)
    {
        List<SearchOffset> offsets = new List<SearchOffset>();
        foreach ((int, string) token in tokenizer.IterateTokens())
        {
            ConcurrentDictionary<int, List<int>> tokenQuery = index.Query(token.Item2);
            if (tokenQuery == null || !tokenQuery.ContainsKey(pageId))
                continue; // Don't bother if this page doesn't contain this token
            offsets.AddRange(tokenQuery[pageId].Select((int offset) => new SearchOffset(token.Item2, offset)));
        }
        // CompareTo instead of subtraction: x.Offset - y.Offset can overflow
        // for extreme values, producing a wrong sign.
        offsets.Sort((SearchOffset x, SearchOffset y) => x.Offset.CompareTo(y.Offset));
        return offsets;
    }

    /// <summary>
    /// Generates a human-readable context string (a sequence of snippets
    /// around matched terms) for a page.
    /// </summary>
    /// <param name="pageName">The title of the page the context is for.</param>
    /// <param name="source">The page's full source text.</param>
    /// <param name="query">The raw query whose terms should be highlighted.</param>
    /// <param name="settings">Snippet sizing, separator, and HTML options.</param>
    /// <returns>The joined context string; snippets are separated by <see cref="ContextSettings.Separator"/>.</returns>
    public string GenerateContext(string pageName, string source, string query, ContextSettings settings)
    {
        int pageId = idMap.GetId(pageName);

        Tokenizer tokenizer = new Tokenizer(query);
        List<SearchOffset> offsets = getPageOffsets(pageId, tokenizer);

        int currentLength = 0;
        List<(int, int)> tokenLocations = new List<(int, int)>(); // offset, length
        List<(int, int)> snippets = new List<(int, int)>(); // from, to
        for (int i = 0; i < offsets.Count; i++)
        {
            // Don't go over the maximum length
            // FUTURE: Would it be faster to keep track of this as we go? It's probably not worth it though, as we're not going to be generate *that* many at once - we'll have to see.
            if (currentLength > settings.MaxLength)
                break;

            // Generate the next snippet, clamped to the bounds of the source text
            (int, int) nextSnippet = (
                Math.Max(0, offsets[i].Offset - settings.ContextCharacters),
                Math.Min(source.Length, offsets[i].Offset + offsets[i].Term.Length + settings.ContextCharacters)
            );

            tokenLocations.Add((offsets[i].Offset, offsets[i].Term.Length));
            // If the next snippet overlaps with the previous one, then combine the 2
            if (snippets.Count > 0 && snippets.Last().Item2 > nextSnippet.Item1) {
                // BUG: This *might* exceed the MaxLength a bit
                // Pop the last snippet from the list
                (int, int) lastSnippet = snippets[snippets.Count - 1];
                snippets.RemoveAt(snippets.Count - 1);
                // Only ever extend the previous snippet. A short term that sits
                // just behind a longer one can yield nextSnippet.Item2 <
                // lastSnippet.Item2, and blindly assigning it would *shrink*
                // the merged snippet (and make currentLength go backwards).
                int mergedEnd = Math.Max(lastSnippet.Item2, nextSnippet.Item2);
                currentLength += mergedEnd - lastSnippet.Item2;

                lastSnippet.Item2 = mergedEnd; // Fiddle it a bit
                snippets.Add(lastSnippet); // Push it back on again

                continue;
            }

            // No overlap! Add it to the list
            snippets.Add(nextSnippet);
            currentLength += nextSnippet.Item2 - nextSnippet.Item1;
        }

        List<string> snippetsText = new List<string>(
            snippets.Select(((int, int) snippet) => {
                string result = source.Substring(snippet.Item1, snippet.Item2 - snippet.Item1);

                if (settings.Html) {
                    result = WebUtility.HtmlEncode(result);

                    // Wrap every occurrence of each query term in a <strong> tag.
                    // NOTE(review): terms containing HTML-escapable characters
                    // (&, <, >) won't match the encoded text — confirm the
                    // tokenizer never emits such terms.
                    foreach ((int, string) nextToken in tokenizer.IterateTokens()) {
                        result = Regex.Replace(
                            result,
                            Regex.Escape(nextToken.Item2),
                            $"<strong class='query-term'>{nextToken.Item2}</strong>",
                            RegexOptions.IgnoreCase // Also ignores accents, apparently
                        );
                    }
                }

                return result;
            })
            .Where((string snippet) => !string.IsNullOrWhiteSpace(snippet))
        );

        // Add the separator at the beginning and end if we aren't at the bounds of the source document
        if (snippets.Count > 0 && snippets.First().Item1 > 0)
            snippetsText.Insert(0, "");
        if (snippets.Count > 0 && snippets.Last().Item2 < source.Length)
            snippetsText.Add("");

        return string.Join(
            settings.Separator,
            snippetsText
        );
    }

    #endregion
}
|
|
}
|