Add initial Query() method to SearchBox, but it's untested.
This commit is contained in:
parent
0dd0de0b52
commit
2a5a50458f
6 changed files with 113 additions and 2 deletions
|
@ -1,5 +1,7 @@
|
||||||
using System;
|
using System;
|
||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
|
using Newtonsoft.Json;
|
||||||
|
using UnidecodeSharpFork;
|
||||||
|
|
||||||
namespace LibSearchBox
|
namespace LibSearchBox
|
||||||
{
|
{
|
||||||
|
@ -8,6 +10,21 @@ namespace LibSearchBox
|
||||||
public string Title { get; set; }
|
public string Title { get; set; }
|
||||||
public List<string> Tags { get; private set; } = new List<string>();
|
public List<string> Tags { get; private set; } = new List<string>();
|
||||||
|
|
||||||
|
/// <summary>
/// The title in the form used for matching: lower-cased, then passed through
/// <c>Unidecode()</c>. Marked <c>[JsonIgnore]</c>, so it is recomputed from
/// <see cref="Title"/> on every read rather than being serialised.
/// </summary>
[JsonIgnore]
public string SearchableTitle => Title.ToLower().Unidecode();
|
||||||
|
|
||||||
|
/// <summary>
/// Lazily yields every entry of <see cref="Tags"/> in the form used for matching:
/// lower-cased, then passed through <c>Unidecode()</c>. Marked <c>[JsonIgnore]</c>,
/// so it is never serialised; each enumeration re-reads <see cref="Tags"/>.
/// </summary>
[JsonIgnore]
public IEnumerable<string> SearchableTags
{
    get
    {
        foreach (string tag in Tags)
        {
            yield return tag.ToLower().Unidecode();
        }
    }
}
|
||||||
|
|
||||||
public DocumentMeta(string inTitle, IEnumerable<string> inTags)
|
public DocumentMeta(string inTitle, IEnumerable<string> inTags)
|
||||||
{
|
{
|
||||||
Title = inTitle;
|
Title = inTitle;
|
||||||
|
|
|
@ -56,5 +56,11 @@ namespace LibSearchBox
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Looks up a single token in the inverted index.
/// </summary>
/// <param name="normalisedToken">The token to look up, used verbatim as the index key.</param>
/// <returns>
/// Whatever the index holds for the token: a map from page id to the list of
/// offsets at which the token occurs on that page. If the lookup fails, the value
/// left in the out variable is returned unchanged.
/// NOTE(review): presumably that is null for an unknown token - confirm against
/// the type of <c>invertedIndex</c>; callers should be prepared for it.
/// </returns>
public ConcurrentDictionary<int, List<int>> Query(string normalisedToken)
{
    ConcurrentDictionary<int, List<int>> pagesForToken;
    // The boolean outcome is deliberately ignored; the caller gets the out value as-is.
    invertedIndex.TryGetValue(normalisedToken, out pagesForToken);
    return pagesForToken;
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
13
SearchBox/QuerySettings.cs
Normal file
13
SearchBox/QuerySettings.cs
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
using System;
|
||||||
|
namespace LibSearchBox
|
||||||
|
{
|
||||||
|
/// <summary>
/// Tunable weights that control how much title and tag matches contribute to a
/// search result's rank. Instances start out with the default weights below.
/// </summary>
public class QuerySettings
{
    /// <summary>
    /// Amount added to a page's rank for a query token that occurs in the page's
    /// searchable title. Defaults to 100.
    /// </summary>
    public int WeightTitleMatch = 100;

    /// <summary>
    /// Amount added to a page's rank for a query token that occurs in one of the
    /// page's searchable tags. Defaults to 10.
    /// </summary>
    public int WeightTagMatch = 10;

    // No explicit constructor: the compiler-generated public parameterless
    // constructor is equivalent to the empty one this class used to declare.
}
|
||||||
|
}
|
|
@ -1,6 +1,8 @@
|
||||||
using System;
|
using System;
|
||||||
using System.Collections.Concurrent;
|
using System.Collections.Concurrent;
|
||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Threading.Tasks;
|
||||||
using Newtonsoft.Json;
|
using Newtonsoft.Json;
|
||||||
|
|
||||||
namespace LibSearchBox
|
namespace LibSearchBox
|
||||||
|
@ -30,11 +32,15 @@ namespace LibSearchBox
|
||||||
[JsonProperty]
|
[JsonProperty]
|
||||||
public InvertedIndex index = new InvertedIndex();
|
public InvertedIndex index = new InvertedIndex();
|
||||||
|
|
||||||
|
public bool Verbose { get; set; } = false;
|
||||||
|
|
||||||
public SearchBox()
|
public SearchBox()
|
||||||
{
|
{
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#region Index Management
|
||||||
|
|
||||||
public void AddDocument(string title, IEnumerable<string> tags, string content)
|
public void AddDocument(string title, IEnumerable<string> tags, string content)
|
||||||
{
|
{
|
||||||
DocumentMeta info = new DocumentMeta(title, tags);
|
DocumentMeta info = new DocumentMeta(title, tags);
|
||||||
|
@ -63,5 +69,56 @@ namespace LibSearchBox
|
||||||
throw new SearchBoxException($"Failed to remove page with title '{title}' from inverted index.");
|
throw new SearchBoxException($"Failed to remove page with title '{title}' from inverted index.");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
|
||||||
|
#region Query
|
||||||
|
|
||||||
|
/// <summary>
/// Runs a search query against the index and returns the matching pages, ranked.
/// </summary>
/// <remarks>
/// A page's rank is the total number of occurrences of the query's tokens on that
/// page, plus <see cref="QuerySettings.WeightTitleMatch"/> for every query token
/// contained in the page's searchable title, plus
/// <see cref="QuerySettings.WeightTagMatch"/> for every (token, tag) pair in which
/// the searchable tag contains the token. A token repeated in the query is only
/// considered once.
/// </remarks>
/// <param name="query">The query string; it is split into tokens by <c>Tokenizer</c>.</param>
/// <param name="settings">Ranking weights. When null, a default <see cref="QuerySettings"/> is used.</param>
/// <returns>The matching pages, ordered from the highest rank to the lowest.</returns>
/// <exception cref="SearchBoxException">
/// Raised if a token count is already recorded for a page when it is about to be
/// added. It is thrown inside a parallel loop, so it reaches the caller wrapped in
/// an <see cref="AggregateException"/>.
/// </exception>
public List<SearchResult> Query(string query, QuerySettings settings)
{
    // Fall back to the default weights instead of failing with a
    // NullReferenceException part-way through the parallel ranking pass.
    if (settings == null)
        settings = new QuerySettings();

    // Tokenise once, up front, keeping the first occurrence of each token.
    // Repeated tokens would otherwise hit the TryAdd check below and abort the
    // whole query, and re-tokenising for every page is wasted work.
    Tokenizer tokenizer = new Tokenizer(query);
    List<string> queryTokens = new List<string>();
    HashSet<string> seenTokens = new HashSet<string>();
    foreach (Tuple<int, string> token in tokenizer.IterateTokens())
    {
        if (seenTokens.Add(token.Item2))
            queryTokens.Add(token.Item2);
    }

    // pageId => token => number of occurrences of that token on the page
    ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages =
        new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();

    foreach (string token in queryTokens)
    {
        ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token);
        // A token that isn't in the index simply contributes nothing;
        // Parallel.ForEach would throw on a null source.
        if (tokenResults == null)
            continue;

        Parallel.ForEach(tokenResults, (KeyValuePair<int, List<int>> pageTokenDef) => {
            // pageTokenDef: pageId => list of token offsets
            ConcurrentDictionary<string, int> pageData = matchingPages.GetOrAdd(
                pageTokenDef.Key,
                (int pageId) => new ConcurrentDictionary<string, int>()
            );
            if (!pageData.TryAdd(token, pageTokenDef.Value.Count))
                throw new SearchBoxException("Error: Failed to add token count to page data in search " +
                    "results - the key already exists (are there duplicate tokens for this page id " +
                    "in the inverted index?");
        });
    }

    ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
    Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) => {
        int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?

        if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo))
        {
            // Without meta info the page is still returned, ranked on token counts alone.
            if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
        }
        else
        {
            // Normalise the title once per page rather than once per token.
            string searchableTitle = metaInfo.SearchableTitle;
            foreach (string token in queryTokens)
            {
                if (searchableTitle.Contains(token))
                    rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched

                foreach (string nextTag in metaInfo.SearchableTags)
                {
                    if (nextTag.Contains(token))
                        rank += settings.WeightTagMatch;
                }
            }
        }

        resultsRaw.Add(new SearchResult(idMap.GetPageName(pageDef.Key), rank));
    });

    List<SearchResult> results = new List<SearchResult>(resultsRaw);
    // Sort in place, best match first. The previous OrderBy() call returned a new
    // sequence that was thrown away, so the results were never actually sorted.
    results.Sort((SearchResult a, SearchResult b) => b.Rank.CompareTo(a.Rank));
    return results;
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
<Platform Condition=" '$(Platform)' == '' ">x86</Platform>
|
<Platform Condition=" '$(Platform)' == '' ">x86</Platform>
|
||||||
<ProjectGuid>{5243F60A-F822-4C52-A333-E4089754EC6A}</ProjectGuid>
|
<ProjectGuid>{5243F60A-F822-4C52-A333-E4089754EC6A}</ProjectGuid>
|
||||||
<OutputType>Library</OutputType>
|
<OutputType>Library</OutputType>
|
||||||
<RootNamespace>SearchBox</RootNamespace>
|
<RootNamespace>LibSearchBox</RootNamespace>
|
||||||
<AssemblyName>SearchBox</AssemblyName>
|
<AssemblyName>SearchBox</AssemblyName>
|
||||||
<TargetFrameworkVersion>v4.7</TargetFrameworkVersion>
|
<TargetFrameworkVersion>v4.7</TargetFrameworkVersion>
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
@ -49,6 +49,8 @@
|
||||||
<Compile Include="IdMap.cs" />
|
<Compile Include="IdMap.cs" />
|
||||||
<Compile Include="Utilities\BiDictionary.cs" />
|
<Compile Include="Utilities\BiDictionary.cs" />
|
||||||
<Compile Include="DocumentMeta.cs" />
|
<Compile Include="DocumentMeta.cs" />
|
||||||
|
<Compile Include="SearchResult.cs" />
|
||||||
|
<Compile Include="QuerySettings.cs" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<Folder Include="EmbeddedFiles\" />
|
<Folder Include="EmbeddedFiles\" />
|
||||||
|
|
16
SearchBox/SearchResult.cs
Normal file
16
SearchBox/SearchResult.cs
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
using System;
|
||||||
|
|
||||||
|
namespace LibSearchBox
|
||||||
|
{
|
||||||
|
/// <summary>
/// A single entry in a list of search results: the name of a page together with
/// the rank it was given for the query.
/// </summary>
public class SearchResult
{
    /// <summary>The name of the page this result refers to.</summary>
    public string PageName { get; private set; }

    /// <summary>The rank assigned to the page for the query.</summary>
    public float Rank { get; private set; }

    /// <summary>
    /// Creates a result for the given page and rank.
    /// </summary>
    /// <param name="inPageName">The name of the page; stored as <see cref="PageName"/>.</param>
    /// <param name="inRank">The rank of the page; stored as <see cref="Rank"/>.</param>
    public SearchResult(string inPageName, float inRank)
    {
        this.Rank = inRank;
        this.PageName = inPageName;
    }
}
|
||||||
|
}
|
Loading…
Reference in a new issue