Add Id mapping system & start on implementing an inverted index
This commit is contained in:
parent
e8069c93ff
commit
c46a789c28
6 changed files with 360 additions and 1 deletions
49
SearchBox/IdMapper.cs
Normal file
49
SearchBox/IdMapper.cs
Normal file
|
@ -0,0 +1,49 @@
|
||||||
|
using System;
|
||||||
|
using Stackoverflow.Utilities;
|
||||||
|
|
||||||
|
namespace SearchBox
|
||||||
|
{
|
||||||
|
/// <summary>
/// Thrown when a lookup is made for an id that has no entry in the id map.
/// </summary>
public class IdNotFoundException : Exception
{
	/// <summary>
	/// Creates a new exception carrying the given message.
	/// </summary>
	/// <param name="message">Human-readable description of the failed lookup.</param>
	public IdNotFoundException(string message)
		: base(message)
	{
	}
}
|
||||||
|
|
||||||
|
/// <summary>
/// Maps page names to integer ids and back again.
/// Ids are handed out sequentially, starting at 0, the first time a page name is
/// passed to <see cref="GetId"/>. Every page name is normalised to Unicode
/// Normalization Form C before it is stored or looked up, so that canonically
/// equivalent spellings of a name always resolve to the same id.
/// </summary>
public class IdMapper
{
	// The id that will be given to the next previously-unseen page name.
	// It only ever increases, so the ids of deleted pages are not reused.
	private int nextId = 0;

	/// <summary>
	/// The underlying two-way id &lt;-&gt; page name map.
	/// NOTE(review): entries added here directly bypass normalisation - prefer the methods below.
	/// </summary>
	public BiDictionary<int, string> map = new BiDictionary<int, string>();

	public IdMapper()
	{
	}

	/// <summary>
	/// Gets the id for a page name, allocating a new id if the name hasn't been seen before.
	/// </summary>
	/// <param name="pageName">The page name to look up.</param>
	/// <returns>The id associated with the (normalised) page name.</returns>
	/// <exception cref="ArgumentNullException">If pageName is null.</exception>
	public int GetId(string pageName)
	{
		pageName = NormalisePageName(pageName, nameof(pageName));

		int result;
		if (!map.TryGetBySecond(pageName, out result)) {
			result = nextId++;
			map.Add(result, pageName);
		}
		return result;
	}

	/// <summary>
	/// Gets the page name that is associated with an id.
	/// </summary>
	/// <param name="id">The id to look up.</param>
	/// <returns>The page name stored against the id.</returns>
	/// <exception cref="IdNotFoundException">If the id isn't present in the map.</exception>
	public string GetPageName(int id)
	{
		string result;
		if (!map.TryGetByFirst(id, out result))
			throw new IdNotFoundException($"Error: Couldn't find {id} in the ID map.");
		return result;
	}

	/// <summary>
	/// Renames a page, keeping its existing id.
	/// The map is left untouched if the rename can't be completed.
	/// </summary>
	/// <param name="oldPageName">The name the page currently has.</param>
	/// <param name="newPageName">The name the page should have from now on.</param>
	/// <exception cref="ArgumentNullException">If either name is null.</exception>
	/// <exception cref="ArgumentException">
	/// If oldPageName isn't in the map, or newPageName already belongs to a different page.
	/// </exception>
	public void MovePageName(string oldPageName, string newPageName)
	{
		// Names are stored in normalised form by GetId, so they must be looked up that way too
		oldPageName = NormalisePageName(oldPageName, nameof(oldPageName));
		newPageName = NormalisePageName(newPageName, nameof(newPageName));

		// Throws an ArgumentException if the old name isn't mapped
		int id = map.GetBySecond(oldPageName);

		// Renaming a page to itself is a no-op
		if (oldPageName == newPageName)
			return;

		// Check for a clash *before* removing anything, so a failed move can't lose the old mapping
		int existingId;
		if (map.TryGetBySecond(newPageName, out existingId))
			throw new ArgumentException($"Error: Can't move a page to '{newPageName}', as that name already has an id.", nameof(newPageName));

		map.RemoveBySecond(oldPageName);
		map.Add(id, newPageName);
	}

	/// <summary>
	/// Removes a page name (and its id) from the map.
	/// </summary>
	/// <param name="pageName">The page name to remove.</param>
	/// <exception cref="ArgumentNullException">If pageName is null.</exception>
	/// <exception cref="ArgumentException">If pageName isn't in the map.</exception>
	public void DeletePageName(string pageName)
	{
		map.RemoveBySecond(NormalisePageName(pageName, nameof(pageName)));
	}

	// Validates a page name and converts it to Unicode Normalization Form C.
	private static string NormalisePageName(string pageName, string paramName)
	{
		if (pageName == null)
			throw new ArgumentNullException(paramName);
		return pageName.Normalize(System.Text.NormalizationForm.FormC);
	}
}
|
||||||
|
}
|
|
@ -1,4 +1,5 @@
|
||||||
using System;
|
using System;
|
||||||
|
using System.Collections;
|
||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
using System.IO;
|
using System.IO;
|
||||||
using System.Text;
|
using System.Text;
|
||||||
|
@ -12,7 +13,7 @@ namespace SearchBox
|
||||||
ExcludeStopwords = 1
|
ExcludeStopwords = 1
|
||||||
}
|
}
|
||||||
|
|
||||||
public class Index
|
public class Index : IEnumerable<KeyValuePair<string, List<int>>>
|
||||||
{
|
{
|
||||||
private Dictionary<string, List<int>> index = new Dictionary<string, List<int>>();
|
private Dictionary<string, List<int>> index = new Dictionary<string, List<int>>();
|
||||||
private StopwordTester stopwordTester;
|
private StopwordTester stopwordTester;
|
||||||
|
@ -53,6 +54,22 @@ namespace SearchBox
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/// <summary>
/// Lazily walks every entry held in the backing dictionary.
/// </summary>
/// <returns>Each key/value pair of the index, in the dictionary's own enumeration order.</returns>
public IEnumerable<KeyValuePair<string, List<int>>> IterateItems()
{
	foreach (var entry in index)
	{
		yield return entry;
	}
}
|
||||||
|
|
||||||
|
/// <summary>
/// Gets a typed enumerator over the entries of the index, so that it can be used in a foreach loop.
/// </summary>
/// <returns>An enumerator over the sequence produced by IterateItems().</returns>
public IEnumerator<KeyValuePair<string, List<int>>> GetEnumerator()
{
	IEnumerable<KeyValuePair<string, List<int>>> items = IterateItems();
	return items.GetEnumerator();
}
|
||||||
|
/// <summary>
/// Non-generic enumerator, required by IEnumerable. Simply hands back the typed enumerator.
/// </summary>
IEnumerator IEnumerable.GetEnumerator()
{
	IEnumerator<KeyValuePair<string, List<int>>> typedEnumerator = GetEnumerator();
	return typedEnumerator;
}
|
||||||
|
|
||||||
public override string ToString()
|
public override string ToString()
|
||||||
{
|
{
|
||||||
StringBuilder result = new StringBuilder("Index: \n");
|
StringBuilder result = new StringBuilder("Index: \n");
|
||||||
|
@ -67,5 +84,6 @@ namespace SearchBox
|
||||||
{
|
{
|
||||||
return new Index(File.ReadAllText(filename));
|
return new Index(File.ReadAllText(filename));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
23
SearchBox/InvertedIndex.cs
Normal file
23
SearchBox/InvertedIndex.cs
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
|
||||||
|
namespace SearchBox
|
||||||
|
{
|
||||||
|
/// <summary>
/// The beginnings of an inverted index that is built up out of individual Index instances.
/// This is a work in progress: nothing is stored in it yet.
/// </summary>
public class InvertedIndex
{
	// Keyed by string; presumably token -> (page id -> offsets of that token in the page) - TODO confirm once AddIndex is implemented.
	private Dictionary<string, Dictionary<int, List<int>>> invertedIndex = new Dictionary<string, Dictionary<int, List<int>>>();

	public InvertedIndex()
	{
	}

	/// <summary>
	/// Walks over every entry of the given index.
	/// Currently the entries are not stored anywhere.
	/// </summary>
	/// <param name="newIndex">The index to add.</param>
	/// <returns>Always true at present.</returns>
	public bool AddIndex(Index newIndex)
	{
		foreach (var token in newIndex)
		{
			// Not implemented yet: the entries are enumerated, but not merged into invertedIndex.
		}

		return true;
	}
}
|
||||||
|
}
|
|
@ -42,6 +42,9 @@
|
||||||
<Compile Include="Index.cs" />
|
<Compile Include="Index.cs" />
|
||||||
<Compile Include="Utilities\StringPlus.cs" />
|
<Compile Include="Utilities\StringPlus.cs" />
|
||||||
<Compile Include="StopwordTester.cs" />
|
<Compile Include="StopwordTester.cs" />
|
||||||
|
<Compile Include="InvertedIndex.cs" />
|
||||||
|
<Compile Include="IdMapper.cs" />
|
||||||
|
<Compile Include="Utilities\BiDictionary.cs" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<Folder Include="EmbeddedFiles\" />
|
<Folder Include="EmbeddedFiles\" />
|
||||||
|
|
|
@ -50,6 +50,7 @@ namespace SearchBox
|
||||||
|
|
||||||
if(Verbose) Console.WriteLine("#{0} @ {1}: '{2}'", i, index, parts[i]);
|
if(Verbose) Console.WriteLine("#{0} @ {1}: '{2}'", i, index, parts[i]);
|
||||||
|
|
||||||
|
// FUTURE: We should swap this out for System.ValueTuple, as it's easier on the garbage collector.
|
||||||
yield return new Tuple<int, string>(index, parts[i]);
|
yield return new Tuple<int, string>(index, parts[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
265
SearchBox/Utilities/BiDictionary.cs
Normal file
265
SearchBox/Utilities/BiDictionary.cs
Normal file
|
@ -0,0 +1,265 @@
|
||||||
|
using System;
|
||||||
|
using System.Collections;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
|
||||||
|
namespace Stackoverflow.Utilities
|
||||||
|
{
|
||||||
|
/// <summary>
|
||||||
|
/// This is a dictionary guaranteed to have only one of each value and key.
|
||||||
|
/// It may be searched either by TFirst or by TSecond, giving a unique answer because it is 1 to 1.
|
||||||
|
/// It implements garbage-collector-friendly IEnumerable.
|
||||||
|
/// </summary>
|
||||||
|
/// <remarks>From https://stackoverflow.com/a/35949314/1460422</remarks>
|
||||||
|
/// <typeparam name="TFirst">The type of the "key"</typeparam>
|
||||||
|
/// <typeparam name="TSecond">The type of the "value"</typeparam>
|
||||||
|
public class BiDictionary<TFirst, TSecond> : IEnumerable<BiDictionary<TFirst, TSecond>.Pair>
{
	/// <summary>
	/// A single record in the dictionary.
	/// </summary>
	public struct Pair
	{
		public TFirst First;
		public TSecond Second;
	}

	/// <summary>
	/// Enumerates the records of a BiDictionary by wrapping the enumerator
	/// of its first -> second dictionary.
	/// </summary>
	public struct Enumerator : IEnumerator<Pair>, IEnumerator
	{
		// Deliberately NOT readonly: Dictionary<,>.Enumerator is a mutable struct, and calling
		// MoveNext() through a readonly field would only ever advance a temporary copy.
		private Dictionary<TFirst, TSecond>.Enumerator _dictEnumerator;

		public Enumerator(Dictionary<TFirst, TSecond>.Enumerator dictEnumerator)
		{
			_dictEnumerator = dictEnumerator;
		}

		/// <summary>
		/// The record the enumerator is currently positioned on.
		/// </summary>
		public Pair Current {
			get {
				KeyValuePair<TFirst, TSecond> entry = _dictEnumerator.Current;
				Pair pair;
				pair.First = entry.Key;
				pair.Second = entry.Value;
				return pair;
			}
		}

		object IEnumerator.Current {
			get {
				return Current;
			}
		}

		public void Dispose()
		{
			_dictEnumerator.Dispose();
		}

		public bool MoveNext()
		{
			return _dictEnumerator.MoveNext();
		}

		/// <summary>
		/// Not supported - always throws.
		/// </summary>
		/// <exception cref="NotSupportedException">Always.</exception>
		public void Reset()
		{
			throw new NotSupportedException();
		}
	}

	// The two dictionaries always hold the same records, one keyed in each direction.
	private readonly Dictionary<TFirst, TSecond> _firstToSecond = new Dictionary<TFirst, TSecond>();
	private readonly Dictionary<TSecond, TFirst> _secondToFirst = new Dictionary<TSecond, TFirst>();

	#region Exception throwing methods

	/// <summary>
	/// Adds the pair to the dictionary.
	/// Throws an exception if either element is already in the dictionary.
	/// </summary>
	/// <param name="first">The first element of the new record.</param>
	/// <param name="second">The second element of the new record.</param>
	/// <exception cref="ArgumentException">If first or second is already present.</exception>
	public void Add(TFirst first, TSecond second)
	{
		if (!TryAdd(first, second))
			throw new ArgumentException("Duplicate first or second");
	}

	/// <summary>
	/// Finds the TSecond corresponding to the TFirst first.
	/// Throws an exception if first is not in the dictionary.
	/// </summary>
	/// <param name="first">The key to search for.</param>
	/// <returns>The value corresponding to first.</returns>
	/// <exception cref="ArgumentException">If first is not in the dictionary.</exception>
	public TSecond GetByFirst(TFirst first)
	{
		TSecond second;
		if (!_firstToSecond.TryGetValue(first, out second))
			throw new ArgumentException("No record with the given first element is in the dictionary.", nameof(first));

		return second;
	}

	/// <summary>
	/// Finds the TFirst corresponding to the TSecond second.
	/// Throws an exception if second is not in the dictionary.
	/// </summary>
	/// <param name="second">The key to search for.</param>
	/// <returns>The value corresponding to second.</returns>
	/// <exception cref="ArgumentException">If second is not in the dictionary.</exception>
	public TFirst GetBySecond(TSecond second)
	{
		TFirst first;
		if (!_secondToFirst.TryGetValue(second, out first))
			throw new ArgumentException("No record with the given second element is in the dictionary.", nameof(second));

		return first;
	}

	/// <summary>
	/// Removes the record containing first.
	/// Throws an exception if first is not in the dictionary.
	/// </summary>
	/// <param name="first">The key of the record to delete.</param>
	/// <exception cref="ArgumentException">If first is not in the dictionary.</exception>
	public void RemoveByFirst(TFirst first)
	{
		if (!TryRemoveByFirst(first))
			throw new ArgumentException("No record with the given first element is in the dictionary.", nameof(first));
	}

	/// <summary>
	/// Removes the record containing second.
	/// Throws an exception if second is not in the dictionary.
	/// </summary>
	/// <param name="second">The key of the record to delete.</param>
	/// <exception cref="ArgumentException">If second is not in the dictionary.</exception>
	public void RemoveBySecond(TSecond second)
	{
		if (!TryRemoveBySecond(second))
			throw new ArgumentException("No record with the given second element is in the dictionary.", nameof(second));
	}

	#endregion

	#region Try methods

	/// <summary>
	/// Tries to add the pair to the dictionary.
	/// Returns false if either element is already in the dictionary.
	/// </summary>
	/// <param name="first">The first element of the new record.</param>
	/// <param name="second">The second element of the new record.</param>
	/// <returns>true if successfully added, false if either element is already in the dictionary</returns>
	public bool TryAdd(TFirst first, TSecond second)
	{
		// Both directions are checked before either is touched, so the two dictionaries can't get out of step
		if (_firstToSecond.ContainsKey(first) || _secondToFirst.ContainsKey(second))
			return false;

		_firstToSecond.Add(first, second);
		_secondToFirst.Add(second, first);
		return true;
	}

	/// <summary>
	/// Finds the TSecond corresponding to the TFirst first.
	/// Returns false if first is not in the dictionary.
	/// </summary>
	/// <param name="first">The key to search for.</param>
	/// <param name="second">The corresponding value.</param>
	/// <returns>true if first is in the dictionary, false otherwise</returns>
	public bool TryGetByFirst(TFirst first, out TSecond second)
	{
		return _firstToSecond.TryGetValue(first, out second);
	}

	/// <summary>
	/// Finds the TFirst corresponding to the TSecond second.
	/// Returns false if second is not in the dictionary.
	/// </summary>
	/// <param name="second">The key to search for.</param>
	/// <param name="first">The corresponding value.</param>
	/// <returns>true if second is in the dictionary, false otherwise</returns>
	public bool TryGetBySecond(TSecond second, out TFirst first)
	{
		return _secondToFirst.TryGetValue(second, out first);
	}

	/// <summary>
	/// Removes the record containing first, if there is one.
	/// </summary>
	/// <param name="first">The key of the record to delete.</param>
	/// <returns>false if first is not in the dictionary, otherwise true</returns>
	public bool TryRemoveByFirst(TFirst first)
	{
		TSecond second;
		if (!_firstToSecond.TryGetValue(first, out second))
			return false;

		_firstToSecond.Remove(first);
		_secondToFirst.Remove(second);
		return true;
	}

	/// <summary>
	/// Removes the record containing second, if there is one.
	/// </summary>
	/// <param name="second">The key of the record to delete.</param>
	/// <returns>false if second is not in the dictionary, otherwise true</returns>
	public bool TryRemoveBySecond(TSecond second)
	{
		TFirst first;
		if (!_secondToFirst.TryGetValue(second, out first))
			return false;

		_secondToFirst.Remove(second);
		_firstToSecond.Remove(first);
		return true;
	}

	#endregion

	/// <summary>
	/// The number of pairs stored in the dictionary
	/// </summary>
	public Int32 Count {
		get { return _firstToSecond.Count; }
	}

	/// <summary>
	/// Removes all items from the dictionary.
	/// </summary>
	public void Clear()
	{
		_firstToSecond.Clear();
		_secondToFirst.Clear();
	}

	/// <summary>
	/// Gets a struct enumerator over the records of the dictionary.
	/// </summary>
	public Enumerator GetEnumerator()
	{
		return new Enumerator(_firstToSecond.GetEnumerator());
	}

	IEnumerator<Pair> IEnumerable<Pair>.GetEnumerator()
	{
		return GetEnumerator();
	}

	IEnumerator IEnumerable.GetEnumerator()
	{
		return GetEnumerator();
	}
}
|
||||||
|
}
|
Loading…
Reference in a new issue