A standalone full-text search engine written in C#.
133 lines
4.5 KiB

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Newtonsoft.Json;
namespace LibSearchBox
/// <summary>
/// Exception type thrown by <see cref="SearchBox"/> when an operation on the index fails.
/// </summary>
public class SearchBoxException : Exception
{
    /// <summary>
    /// Creates a new exception carrying the given message.
    /// </summary>
    /// <param name="message">Human-readable description of the failure.</param>
    public SearchBoxException(string message)
        : base(message)
    {
    }
}
/// <summary>
/// Full-text search engine. Holds an id map, a table of per-document metadata
/// and an inverted index, and exposes methods to add, update, remove and query documents.
/// </summary>
public class SearchBox
// Hands out integer ids for document titles (used via GetId / DeletePageName in the methods of this class).
private IdMap idMap = new IdMap();
/// <summary>
/// Dictionary view of the id map. The getter yields null when the map is empty;
/// assigning null to the setter is ignored.
/// </summary>
public Dictionary<int, string> IdMap {
get {
Dictionary<int, string> result = idMap.MapOut;
// An empty map is reported as null rather than as an empty dictionary.
// NOTE(review): presumably so a serialiser can omit it - confirm.
if (result.Count == 0) return null;
return result;
set {
// Nothing to do for a null value.
if (value == null) return;
// Document metadata, keyed by document id.
public ConcurrentDictionary<int, DocumentMeta> metaTable = new ConcurrentDictionary<int, DocumentMeta>();
// The inverted index that AddDocument / UpdateDocument / RemoveDocument modify and Query reads.
public InvertedIndex index = new InvertedIndex();
/// <summary>
/// When true, Query writes a warning to standard error for pages whose metadata cannot be found.
/// Defaults to false.
/// </summary>
public bool Verbose { get; set; } = false;
/// <summary>
/// Creates a new SearchBox instance.
/// </summary>
public SearchBox()
#region Index Management
/// <summary>
/// Adds a document to the search box: stores its metadata under the id obtained
/// from the id map (overwriting any metadata already stored under that id) and
/// adds its content to the inverted index.
/// </summary>
/// <param name="title">Title of the document.</param>
/// <param name="tags">Tags stored in the document's metadata.</param>
/// <param name="content">Text content to index.</param>
public void AddDocument(string title, IEnumerable<string> tags, string content)
{
    DocumentMeta meta = new DocumentMeta(title, tags);

    // The id is looked up using the title as held by the metadata object.
    int documentId = idMap.GetId(meta.Title);

    // Insert the metadata, or replace whatever was stored under this id before.
    metaTable.AddOrUpdate(documentId, meta, (existingId, existingMeta) => meta);

    index.AddIndex(documentId, new Index(content));
}
/// <summary>
/// Updates an already-indexed document: replaces its old content with the new
/// content in the inverted index and stores the new tags in its metadata.
/// </summary>
/// <param name="title">Title of the document to update.</param>
/// <param name="newTags">Replacement tags for the document. If null, the stored metadata is left untouched.</param>
/// <param name="oldContent">The content currently indexed for the document.</param>
/// <param name="newContent">The content to index in its place.</param>
/// <exception cref="KeyNotFoundException">No metadata is stored for the document's id.</exception>
/// <exception cref="SearchBoxException">The inverted index reported that the replacement failed.</exception>
public void UpdateDocument(string title, IEnumerable<string> newTags, string oldContent, string newContent)
{
    int id = idMap.GetId(title);

    // Fail before touching the index when the document has no metadata entry.
    // The exception type matches what the metaTable indexer lookup would throw.
    if (!metaTable.ContainsKey(id))
    {
        throw new KeyNotFoundException($"No metadata is stored for document with title {title}.");
    }

    Index oldIndex = new Index(oldContent), newIndex = new Index(newContent);
    if (!index.ReplaceIndex(id, oldIndex, newIndex))
    {
        throw new SearchBoxException($"Error: Failed to replace index for document with title {title}.");
    }

    // Apply the new tags. Without this step newTags was ignored, so the metadata
    // (and therefore the tag-based ranking in Query) kept the old tags.
    // The metadata is rebuilt the same way AddDocument builds it.
    if (newTags != null)
    {
        metaTable[id] = new DocumentMeta(title, newTags);
    }
}
/// <summary>
/// Removes a document: deletes its title from the id map, drops its metadata
/// entry and removes its id from the inverted index.
/// </summary>
/// <param name="title">Title of the document to remove.</param>
/// <exception cref="SearchBoxException">The inverted index reported that the removal failed.</exception>
public void RemoveDocument(string title)
{
    int removedId = idMap.DeletePageName(title);

    // Whether a metadata entry actually existed is not checked.
    metaTable.TryRemove(removedId, out _);

    bool removedFromIndex = index.RemoveById(removedId);
    if (removedFromIndex)
    {
        return;
    }

    throw new SearchBoxException($"Failed to remove page with title '{title}' from inverted index.");
}
#region Query
/// <summary>
/// Runs a search query against the inverted index and builds a result for each matching page.
/// A page's rank starts as the total number of query-token occurrences on that page and is
/// increased by the configured weights for every token found in its title or in one of its tags.
/// </summary>
/// <param name="query">The query string; it is split into tokens with <c>Tokenizer</c>.</param>
/// <param name="settings">Supplies <c>WeightTitleMatch</c> and <c>WeightTagMatch</c>.</param>
/// <returns>The search results collected for the matching pages.</returns>
/// <exception cref="SearchBoxException">A token count was already recorded for a page when trying to add it.</exception>
public List<SearchResult> Query(string query, QuerySettings settings)
// pageId => token -> count
ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages = new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();
Tokenizer tokenizer = new Tokenizer(query);
// Pass 1: look up every query token in the inverted index and record, per page,
// how many offsets (occurrences) the index holds for it.
foreach(Tuple<int, string> token in tokenizer.IterateTokens())
ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token.Item2);
// Pages are processed in parallel, hence the concurrent collections.
Parallel.ForEach(tokenResults, (KeyValuePair<int, List<int>> pageTokenDef) => {
// pageTokenDef: pageId => List of token offsets
ConcurrentDictionary<string, int> pageData = matchingPages.GetOrAdd(pageTokenDef.Key, new ConcurrentDictionary<string, int>());
// TryAdd returns false when this token already has a count for the page.
// NOTE(review): a query that repeats the same word would presumably end up here too - confirm how Tokenizer treats duplicates.
if (!pageData.TryAdd(token.Item2, pageTokenDef.Value.Count))
throw new SearchBoxException("Error: Failed to add token count to page data in search " +
"results - the key already exists (are there duplicate tokens for this page id " +
"in the inverted index?");
// Pass 2: compute the rank of each matching page and collect its result.
ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) => {
// Base rank: sum of the per-token occurrence counts recorded in pass 1.
int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?
// Without metadata only a warning is emitted (when Verbose is set); the title/tag bonuses need the metadata.
if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo)) {
if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
else {
// Each query token found in the title, and in each tag, adds the corresponding weight to the rank.
foreach (Tuple<int, string> token in tokenizer.IterateTokens()) {
if (metaInfo.SearchableTitle.Contains(token.Item2))
rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched
foreach (string nextTag in metaInfo.SearchableTags)
if (nextTag.Contains(token.Item2))
rank += settings.WeightTagMatch;
// Gather the offsets of every query token on this page.
List<SearchOffset> offsets = new List<SearchOffset>();
foreach (Tuple<int, string> token in tokenizer.IterateTokens())
// NOTE(review): this queries the index again for each token and indexes the result by page id;
// if a page matched only some of the query tokens the lookup would presumably throw KeyNotFoundException - confirm.
offsets.AddRange(index.Query(token.Item2)[pageDef.Key].Select((int offset) => new SearchOffset(token.Item2, offset)));
// Order the offsets by ascending Offset value.
offsets.Sort((SearchOffset x, SearchOffset y) => x.Offset - y.Offset);
// Results are collected in a ConcurrentBag because this runs inside Parallel.ForEach.
resultsRaw.Add(new SearchResult(
// Copy the collected results into a list ordered by ascending Rank and return it.
// Enumerable.OrderBy does not sort in place - it returns a new ordered sequence -
// so the ordered sequence has to be materialised; calling OrderBy on the list and
// ignoring its return value leaves the list in the bag's arbitrary order.
List<SearchResult> results = resultsRaw
    .OrderBy((SearchResult result) => result.Rank)
    .ToList();
return results;