Add initial Query() method to SearchBox, but it's untested.
This commit is contained in:
parent
0dd0de0b52
commit
2a5a50458f
6 changed files with 113 additions and 2 deletions
|
@ -1,5 +1,7 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Newtonsoft.Json;
|
||||
using UnidecodeSharpFork;
|
||||
|
||||
namespace LibSearchBox
|
||||
{
|
||||
|
@ -8,6 +10,21 @@ namespace LibSearchBox
|
|||
public string Title { get; set; }
|
||||
public List<string> Tags { get; private set; } = new List<string>();
|
||||
|
||||
/// <summary>
/// The page title in the form used for matching against query tokens:
/// lower-cased, then passed through <c>Unidecode()</c>.
/// </summary>
/// <remarks>
/// Recomputed from <see cref="Title"/> on every access and excluded from JSON serialisation.
/// NOTE(review): Unidecode() comes from UnidecodeSharpFork and presumably transliterates
/// to plain ASCII - confirm against that library.
/// </remarks>
[JsonIgnore]
public string SearchableTitle => Title.ToLower().Unidecode();
|
||||
|
||||
/// <summary>
/// The page's tags in the form used for matching against query tokens:
/// each tag lower-cased, then passed through <c>Unidecode()</c>.
/// </summary>
/// <remarks>
/// This is a lazy iterator: each tag is converted only as the caller enumerates the
/// sequence, and the work is repeated on every enumeration. Excluded from JSON serialisation.
/// </remarks>
[JsonIgnore]
public IEnumerable<string> SearchableTags
{
    get
    {
        foreach (string rawTag in Tags)
        {
            string searchableTag = rawTag.ToLower().Unidecode();
            yield return searchableTag;
        }
    }
}
|
||||
|
||||
public DocumentMeta(string inTitle, IEnumerable<string> inTags)
|
||||
{
|
||||
Title = inTitle;
|
||||
|
|
|
@ -56,5 +56,11 @@ namespace LibSearchBox
|
|||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/// <summary>
/// Looks up a single token in the inverted index.
/// </summary>
/// <param name="normalisedToken">The token to look up.</param>
/// <returns>
/// The entry stored for the token (keyed by page id; the caller treats each value as the
/// list of token offsets within that page), or null if the token is not in the index.
/// </returns>
public ConcurrentDictionary<int, List<int>> Query(string normalisedToken)
{
    ConcurrentDictionary<int, List<int>> pagesForToken;
    bool tokenIsIndexed = invertedIndex.TryGetValue(normalisedToken, out pagesForToken);

    // Callers must be prepared for null: an unknown token is not an error here.
    return tokenIsIndexed ? pagesForToken : null;
}
|
||||
}
|
||||
}
|
||||
|
|
13
SearchBox/QuerySettings.cs
Normal file
13
SearchBox/QuerySettings.cs
Normal file
|
@ -0,0 +1,13 @@
|
|||
using System;
|
||||
namespace LibSearchBox
|
||||
{
|
||||
/// <summary>
/// Tunable weights that control how search results are ranked by a query.
/// </summary>
/// <remarks>
/// Instances are created with the default weights below via the implicit
/// public parameterless constructor.
/// </remarks>
public class QuerySettings
{
    /// <summary>
    /// Amount added to a page's rank for a query token that is found in the page's title.
    /// </summary>
    public int WeightTitleMatch = 100;

    /// <summary>
    /// Amount added to a page's rank for a query token that is found in one of the page's tags.
    /// </summary>
    public int WeightTagMatch = 10;
}
|
||||
}
|
|
@ -1,6 +1,8 @@
|
|||
using System;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Threading.Tasks;
|
||||
using Newtonsoft.Json;
|
||||
|
||||
namespace LibSearchBox
|
||||
|
@ -30,11 +32,15 @@ namespace LibSearchBox
|
|||
[JsonProperty]
|
||||
public InvertedIndex index = new InvertedIndex();
|
||||
|
||||
public bool Verbose { get; set; } = false;
|
||||
|
||||
/// <summary>
/// Creates a new search box. The constructor does no work of its own; members such as
/// <c>index</c> and <c>Verbose</c> get their starting values from their initialisers.
/// </summary>
public SearchBox() { }
|
||||
|
||||
#region Index Management
|
||||
|
||||
public void AddDocument(string title, IEnumerable<string> tags, string content)
|
||||
{
|
||||
DocumentMeta info = new DocumentMeta(title, tags);
|
||||
|
@ -63,5 +69,56 @@ namespace LibSearchBox
|
|||
throw new SearchBoxException($"Failed to remove page with title '{title}' from inverted index.");
|
||||
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Query
|
||||
|
||||
/// <summary>
/// Runs a search query against the index and returns the matching pages, ranked.
/// </summary>
/// <remarks>
/// A page's rank is the sum, over the distinct query tokens found for that page in the
/// inverted index, of the number of offsets recorded for the token. On top of that,
/// <see cref="QuerySettings.WeightTitleMatch"/> is added for every distinct query token
/// contained in the page's searchable title, and <see cref="QuerySettings.WeightTagMatch"/>
/// for every (token, tag) pair where the searchable tag contains the token.
/// Tokens that do not appear in the index simply match no pages.
/// </remarks>
/// <param name="query">The query string. It is split into tokens by <c>Tokenizer</c>.</param>
/// <param name="settings">
/// The ranking weights to apply. If null, a default <see cref="QuerySettings"/> is used.
/// </param>
/// <returns>
/// One result per matching page, ordered from highest rank to lowest.
/// The list is empty when no page matches.
/// </returns>
/// <exception cref="AggregateException">
/// Wraps a <c>SearchBoxException</c> raised by a parallel worker if a token count could not
/// be recorded for a page because an entry for that token already exists.
/// </exception>
public List<SearchResult> Query(string query, QuerySettings settings)
{
    // Fall back to the default weights instead of failing later inside a parallel worker.
    if (settings == null)
        settings = new QuerySettings();

    // Materialise the distinct query tokens once, up front. This means that:
    //  - a token repeated in the query is only counted once (previously the second
    //    occurrence made TryAdd below fail and throw a misleading exception), and
    //  - the tokenizer is never enumerated concurrently from the parallel loops.
    List<string> queryTokens = new List<string>();
    HashSet<string> seenTokens = new HashSet<string>();
    Tokenizer tokenizer = new Tokenizer(query);
    foreach (Tuple<int, string> token in tokenizer.IterateTokens())
    {
        if (seenTokens.Add(token.Item2))
            queryTokens.Add(token.Item2);
    }

    // pageId => token -> count
    ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages = new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();

    foreach (string token in queryTokens)
    {
        ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token);

        // The index returns null for a token it has never seen. Such a token matches
        // no pages; passing null on to Parallel.ForEach would throw.
        if (tokenResults == null)
            continue;

        Parallel.ForEach(tokenResults, (KeyValuePair<int, List<int>> pageTokenDef) => {
            // pageTokenDef: pageId => List of token offsets
            // The factory overload avoids allocating a dictionary that is thrown away
            // whenever the page already has an entry.
            ConcurrentDictionary<string, int> pageData = matchingPages.GetOrAdd(
                pageTokenDef.Key,
                (int pageId) => new ConcurrentDictionary<string, int>()
            );
            if (!pageData.TryAdd(token, pageTokenDef.Value.Count))
                throw new SearchBoxException("Error: Failed to add token count to page data in search " +
                    "results - the key already exists (are there duplicate tokens for this page id " +
                    "in the inverted index?");
        });
    }

    ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
    Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) => {
        int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?
        if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo)) {
            // Without meta info the page is still returned, just without title / tag bonuses.
            if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
        }
        else {
            // Compute the searchable title once per page rather than once per token.
            string searchableTitle = metaInfo.SearchableTitle;
            foreach (string token in queryTokens) {
                if (searchableTitle.Contains(token))
                    rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched

                foreach (string nextTag in metaInfo.SearchableTags)
                    if (nextTag.Contains(token))
                        rank += settings.WeightTagMatch;
            }
        }

        resultsRaw.Add(new SearchResult(idMap.GetPageName(pageDef.Key), rank));
    });

    // OrderBy / OrderByDescending do not sort in place - they return a new sequence,
    // which has to be captured. Best matches (highest rank) come first.
    List<SearchResult> results = resultsRaw
        .OrderByDescending((SearchResult result) => result.Rank)
        .ToList();
    return results;
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<Platform Condition=" '$(Platform)' == '' ">x86</Platform>
|
||||
<ProjectGuid>{5243F60A-F822-4C52-A333-E4089754EC6A}</ProjectGuid>
|
||||
<OutputType>Library</OutputType>
|
||||
<RootNamespace>SearchBox</RootNamespace>
|
||||
<RootNamespace>LibSearchBox</RootNamespace>
|
||||
<AssemblyName>SearchBox</AssemblyName>
|
||||
<TargetFrameworkVersion>v4.7</TargetFrameworkVersion>
|
||||
</PropertyGroup>
|
||||
|
@ -49,6 +49,8 @@
|
|||
<Compile Include="IdMap.cs" />
|
||||
<Compile Include="Utilities\BiDictionary.cs" />
|
||||
<Compile Include="DocumentMeta.cs" />
|
||||
<Compile Include="SearchResult.cs" />
|
||||
<Compile Include="QuerySettings.cs" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Folder Include="EmbeddedFiles\" />
|
||||
|
|
16
SearchBox/SearchResult.cs
Normal file
16
SearchBox/SearchResult.cs
Normal file
|
@ -0,0 +1,16 @@
|
|||
using System;
|
||||
|
||||
namespace LibSearchBox
|
||||
{
|
||||
/// <summary>
/// A single entry in a list of search results: a page name paired with its rank.
/// </summary>
public class SearchResult
{
    /// <summary>
    /// The name of the page this result refers to.
    /// </summary>
    public string PageName { get; private set; }

    /// <summary>
    /// The rank that was calculated for the page.
    /// </summary>
    public float Rank { get; private set; }

    /// <summary>
    /// Creates a new search result.
    /// </summary>
    /// <param name="inPageName">The name of the page.</param>
    /// <param name="inRank">The rank calculated for the page.</param>
    public SearchResult(string inPageName, float inRank)
    {
        this.Rank = inRank;
        this.PageName = inPageName;
    }
}
|
||||
}
|
Loading…
Reference in a new issue