Add initial Query() method to SearchBox, but it's untested.

This commit is contained in:
Starbeamrainbowlabs 2018-09-18 19:49:03 +01:00
parent 0dd0de0b52
commit 2a5a50458f
Signed by: sbrl
GPG key ID: 1BE5172E637709C2
6 changed files with 113 additions and 2 deletions

View file

@ -1,5 +1,7 @@
using System;
using System.Collections.Generic;
using Newtonsoft.Json;
using UnidecodeSharpFork;
namespace LibSearchBox
{
@ -8,6 +10,21 @@ namespace LibSearchBox
public string Title { get; set; }
public List<string> Tags { get; private set; } = new List<string>();
/// <summary>
/// The title in the form used for matching: lower-cased via ToLower() and
/// then passed through Unidecode(). Computed on every access and excluded
/// from JSON serialisation.
/// </summary>
[JsonIgnore]
public string SearchableTitle => Title.ToLower().Unidecode();
/// <summary>
/// Lazily yields each entry of Tags lower-cased via ToLower() and then
/// passed through Unidecode(). The sequence is re-computed on every
/// enumeration and is excluded from JSON serialisation.
/// </summary>
[JsonIgnore]
public IEnumerable<string> SearchableTags {
    get {
        foreach (string rawTag in Tags) {
            string lowered = rawTag.ToLower();
            yield return lowered.Unidecode();
        }
    }
}
public DocumentMeta(string inTitle, IEnumerable<string> inTags)
{
Title = inTitle;

View file

@ -56,5 +56,11 @@ namespace LibSearchBox
}
return true;
}
/// <summary>
/// Looks up a single token in the inverted index.
/// </summary>
/// <param name="normalisedToken">
/// The token to look up. NOTE(review): the name implies the caller has
/// already normalised it; no normalisation is performed here.
/// </param>
/// <returns>
/// Whatever the index's TryGetValue() wrote to its out parameter: the entry
/// stored for the token, or - for the standard dictionary types - null when
/// the token is not present. Callers must handle the null case.
/// </returns>
public ConcurrentDictionary<int, List<int>> Query(string normalisedToken)
{
    ConcurrentDictionary<int, List<int>> tokenEntry;
    // The boolean result is deliberately ignored; a miss surfaces as the
    // default value left in tokenEntry.
    invertedIndex.TryGetValue(normalisedToken, out tokenEntry);
    return tokenEntry;
}
}
}

View file

@ -0,0 +1,13 @@
using System;
namespace LibSearchBox
{
/// <summary>
/// Tunable weights that SearchBox.Query() adds to a page's rank.
/// Instances are created with the defaults below via the implicit
/// public parameterless constructor.
/// </summary>
public class QuerySettings
{
    /// <summary>
    /// Amount added to a page's rank for a query token found in its title.
    /// </summary>
    public int WeightTitleMatch = 100;

    /// <summary>
    /// Amount added to a page's rank for a query token found in one of its tags.
    /// </summary>
    public int WeightTagMatch = 10;
}
}

View file

@ -1,6 +1,8 @@
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Newtonsoft.Json;
namespace LibSearchBox
@ -30,11 +32,15 @@ namespace LibSearchBox
/// <summary>
/// The inverted index that Query() looks each query token up in.
/// Tagged with [JsonProperty] so it is included when this object is
/// serialised to JSON.
/// </summary>
[JsonProperty]
public InvertedIndex index = new InvertedIndex();
/// <summary>
/// When true, Query() writes a warning to the standard error stream if it
/// cannot find the metadata for a matching page. Defaults to false.
/// </summary>
public bool Verbose { get; set; } = false;
/// <summary>
/// Creates a new instance. The constructor body is empty; all state comes
/// from the field and property initialisers.
/// </summary>
public SearchBox()
{
}
#region Index Management
public void AddDocument(string title, IEnumerable<string> tags, string content)
{
DocumentMeta info = new DocumentMeta(title, tags);
@ -63,5 +69,56 @@ namespace LibSearchBox
throw new SearchBoxException($"Failed to remove page with title '{title}' from inverted index.");
}
#endregion
#region Query
/// <summary>
/// Searches the index for pages matching a query string and ranks them.
/// </summary>
/// <remarks>
/// A page's rank is the sum, over every distinct query token found in the
/// inverted index for that page, of the number of occurrences recorded for
/// it. If the page's metadata is available, the rank is then increased by
/// <c>settings.WeightTitleMatch</c> for each query token contained in the
/// page's searchable title, and by <c>settings.WeightTagMatch</c> for each
/// (query token, tag) pair where the searchable tag contains the token.
/// </remarks>
/// <param name="query">The raw query string; it is split up by Tokenizer.</param>
/// <param name="settings">
/// The ranking weights to apply. If null, a default-constructed
/// QuerySettings is used.
/// </param>
/// <returns>
/// One SearchResult per matching page, best (highest) rank first. Pages with
/// equal rank are ordered by page name (ordinal) so the output is
/// deterministic. Empty if nothing matched.
/// </returns>
/// <exception cref="SearchBoxException">
/// Thrown (wrapped in an AggregateException by Parallel.ForEach) if the
/// inverted index yields the same page id twice for a single token.
/// </exception>
public List<SearchResult> Query(string query, QuerySettings settings)
{
    if (settings == null)
        settings = new QuerySettings();

    // Tokenise once up-front and drop repeated tokens. A repeated token
    // (e.g. "cat cat") would otherwise hit the duplicate-key check below
    // and abort the whole query. Materialising the list also means the
    // parallel ranking pass does not have to re-run the tokenizer for
    // every page.
    List<string> queryTokens = new Tokenizer(query).IterateTokens()
        .Select((Tuple<int, string> token) => token.Item2)
        .Distinct()
        .ToList();

    // pageId => token => count
    ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages = new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();
    foreach (string token in queryTokens)
    {
        ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token);
        // A null result is treated as "token not in the index": it simply
        // contributes nothing. (Passing null on to Parallel.ForEach would
        // throw an ArgumentNullException.)
        if (tokenResults == null)
            continue;

        Parallel.ForEach(tokenResults, (KeyValuePair<int, List<int>> pageTokenDef) => {
            // pageTokenDef: pageId => List of token offsets
            // The factory overload avoids allocating a throw-away
            // dictionary when the page already has an entry.
            ConcurrentDictionary<string, int> pageData = matchingPages.GetOrAdd(
                pageTokenDef.Key,
                (int pageId) => new ConcurrentDictionary<string, int>()
            );
            if (!pageData.TryAdd(token, pageTokenDef.Value.Count))
                throw new SearchBoxException("Error: Failed to add token count to page data in search " +
                    "results - the key already exists (are there duplicate tokens for this page id " +
                    "in the inverted index?");
        });
    }

    ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
    Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) => {
        int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?

        if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo)) {
            if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
        }
        else {
            // Both properties re-normalise their text on every access, so
            // read them once per page rather than once per token.
            string searchableTitle = metaInfo.SearchableTitle;
            List<string> searchableTags = metaInfo.SearchableTags.ToList();

            foreach (string token in queryTokens) {
                if (searchableTitle.Contains(token))
                    rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched

                foreach (string nextTag in searchableTags) {
                    if (nextTag.Contains(token))
                        rank += settings.WeightTagMatch;
                }
            }
        }

        resultsRaw.Add(new SearchResult(idMap.GetPageName(pageDef.Key), rank));
    });

    // OrderBy() returns a new sequence rather than sorting in place, so its
    // result has to be the thing that is returned. Highest rank first; the
    // page-name tie-break removes the nondeterminism of ConcurrentBag order.
    return resultsRaw
        .OrderByDescending((SearchResult result) => result.Rank)
        .ThenBy((SearchResult result) => result.PageName, StringComparer.Ordinal)
        .ToList();
}
#endregion
}
}

View file

@ -5,7 +5,7 @@
<Platform Condition=" '$(Platform)' == '' ">x86</Platform>
<ProjectGuid>{5243F60A-F822-4C52-A333-E4089754EC6A}</ProjectGuid>
<OutputType>Library</OutputType>
<RootNamespace>SearchBox</RootNamespace>
<RootNamespace>LibSearchBox</RootNamespace>
<AssemblyName>SearchBox</AssemblyName>
<TargetFrameworkVersion>v4.7</TargetFrameworkVersion>
</PropertyGroup>
@ -49,6 +49,8 @@
<Compile Include="IdMap.cs" />
<Compile Include="Utilities\BiDictionary.cs" />
<Compile Include="DocumentMeta.cs" />
<Compile Include="SearchResult.cs" />
<Compile Include="QuerySettings.cs" />
</ItemGroup>
<ItemGroup>
<Folder Include="EmbeddedFiles\" />

16
SearchBox/SearchResult.cs Normal file
View file

@ -0,0 +1,16 @@
using System;
namespace LibSearchBox
{
/// <summary>
/// A single search hit: the name of a page together with the rank it was given.
/// Both values are fixed at construction; the setters are private.
/// </summary>
public class SearchResult
{
    /// <summary>
    /// Creates a search result.
    /// </summary>
    /// <param name="inPageName">The name of the page this result refers to.</param>
    /// <param name="inRank">The rank assigned to the page.</param>
    public SearchResult(string inPageName, float inRank)
    {
        this.PageName = inPageName;
        this.Rank = inRank;
    }

    /// <summary>The name of the page this result refers to.</summary>
    public string PageName { get; private set; }

    /// <summary>The rank assigned to the page.</summary>
    public float Rank { get; private set; }
}
}