Add Id mapping system & start on implementing an inverted index
This commit is contained in:
parent
e8069c93ff
commit
c46a789c28
6 changed files with 360 additions and 1 deletions
49
SearchBox/IdMapper.cs
Normal file
49
SearchBox/IdMapper.cs
Normal file
|
@ -0,0 +1,49 @@
|
|||
using System;
|
||||
using Stackoverflow.Utilities;
|
||||
|
||||
namespace SearchBox
|
||||
{
|
||||
/// <summary>
/// Exception raised when a lookup in the ID map fails because the requested
/// ID has no entry.
/// </summary>
public class IdNotFoundException : Exception
{
    /// <summary>
    /// Creates a new exception carrying the given human-readable message.
    /// </summary>
    /// <param name="message">A description of the lookup that failed.</param>
    public IdNotFoundException(string message)
        : base(message)
    {
    }
}
|
||||
|
||||
/// <summary>
/// Assigns compact integer IDs to page names and translates between the two.
/// </summary>
/// <remarks>
/// IDs start at 0 and increase by one for every previously unseen page name;
/// the ID of a deleted page is not handed out again. Every page name is
/// Unicode-normalised to form C before it is stored or looked up, so that
/// differently-encoded spellings of the same name share one ID.
/// </remarks>
public class IdMapper
{
    // The ID that will be given to the next previously unseen page name.
    private int nextId = 0;

    /// <summary>
    /// The underlying two-way map between IDs and (normalised) page names.
    /// </summary>
    public BiDictionary<int, string> map = new BiDictionary<int, string>();

    /// <summary>
    /// Creates a new, empty ID mapper.
    /// </summary>
    public IdMapper()
    {
    }

    /// <summary>
    /// Gets the ID for the given page name, allocating a fresh ID if the
    /// page name has not been seen before.
    /// </summary>
    /// <param name="pageName">The page name to fetch the ID for.</param>
    /// <returns>The ID associated with the page name.</returns>
    /// <exception cref="ArgumentNullException">If pageName is null.</exception>
    public int GetId(string pageName)
    {
        pageName = NormalisePageName(pageName, nameof(pageName));

        int result;
        if (!map.TryGetBySecond(pageName, out result))
        {
            result = nextId++;
            map.Add(result, pageName);
        }
        return result;
    }

    /// <summary>
    /// Gets the page name associated with the given ID.
    /// </summary>
    /// <param name="id">The ID to look up.</param>
    /// <returns>The (normalised) page name registered under that ID.</returns>
    /// <exception cref="IdNotFoundException">If the ID is not in the map.</exception>
    public string GetPageName(int id)
    {
        string result;
        if (!map.TryGetByFirst(id, out result))
            throw new IdNotFoundException($"Error: Couldn't find {id} in the ID map.");
        return result;
    }

    /// <summary>
    /// Renames a page, keeping its existing ID.
    /// </summary>
    /// <param name="oldPageName">The current name of the page.</param>
    /// <param name="newPageName">The name the page should have from now on.</param>
    /// <exception cref="ArgumentNullException">If either name is null.</exception>
    /// <exception cref="ArgumentException">
    /// If oldPageName is not in the map, or newPageName already belongs to
    /// a different page. In both cases the map is left untouched.
    /// </exception>
    public void MovePageName(string oldPageName, string newPageName)
    {
        // Stored names are normalised, so lookups have to be as well.
        oldPageName = NormalisePageName(oldPageName, nameof(oldPageName));
        newPageName = NormalisePageName(newPageName, nameof(newPageName));

        // Throws ArgumentException if the old page name is unknown.
        int id = map.GetBySecond(oldPageName);

        // Validate the target *before* removing anything: otherwise a
        // collision would throw after the old entry is gone and the page
        // would silently lose its ID.
        int existingId;
        if (map.TryGetBySecond(newPageName, out existingId))
        {
            if (existingId == id)
                return; // Renaming a page to itself is a no-op.

            throw new ArgumentException(
                $"Error: Can't rename a page to '{newPageName}', as that name is already in the ID map.",
                nameof(newPageName)
            );
        }

        map.RemoveBySecond(oldPageName);
        map.Add(id, newPageName);
    }

    /// <summary>
    /// Removes a page name (and its ID) from the map.
    /// </summary>
    /// <param name="pageName">The page name to remove.</param>
    /// <exception cref="ArgumentNullException">If pageName is null.</exception>
    /// <exception cref="ArgumentException">If pageName is not in the map.</exception>
    public void DeletePageName(string pageName)
    {
        map.RemoveBySecond(NormalisePageName(pageName, nameof(pageName)));
    }

    // Applies the Unicode normalisation used for every key in the map,
    // rejecting null with a descriptive exception.
    private static string NormalisePageName(string pageName, string paramName)
    {
        if (pageName == null)
            throw new ArgumentNullException(paramName);
        return pageName.Normalize(System.Text.NormalizationForm.FormC);
    }
}
|
||||
}
|
|
@ -1,4 +1,5 @@
|
|||
using System;
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Text;
|
||||
|
@ -12,7 +13,7 @@ namespace SearchBox
|
|||
ExcludeStopwords = 1
|
||||
}
|
||||
|
||||
public class Index
|
||||
public class Index : IEnumerable<KeyValuePair<string, List<int>>>
|
||||
{
|
||||
private Dictionary<string, List<int>> index = new Dictionary<string, List<int>>();
|
||||
private StopwordTester stopwordTester;
|
||||
|
@ -53,6 +54,22 @@ namespace SearchBox
|
|||
}
|
||||
|
||||
|
||||
|
||||
/// <summary>
/// Lazily yields every entry held in the backing dictionary, one
/// key/value pair at a time.
/// </summary>
/// <returns>A sequence of the key/value pairs stored in this index.</returns>
public IEnumerable<KeyValuePair<string, List<int>>> IterateItems()
{
    foreach (var entry in index)
    {
        yield return entry;
    }
}
|
||||
|
||||
/// <summary>
/// Gets a typed enumerator over the entries produced by IterateItems().
/// </summary>
/// <returns>An enumerator over the key/value pairs in this index.</returns>
public IEnumerator<KeyValuePair<string, List<int>>> GetEnumerator()
    => IterateItems().GetEnumerator();
|
||||
/// <summary>
/// Non-generic enumerator; simply forwards to the generic GetEnumerator().
/// </summary>
IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
StringBuilder result = new StringBuilder("Index: \n");
|
||||
|
@ -67,5 +84,6 @@ namespace SearchBox
|
|||
{
|
||||
return new Index(File.ReadAllText(filename));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
23
SearchBox/InvertedIndex.cs
Normal file
23
SearchBox/InvertedIndex.cs
Normal file
|
@ -0,0 +1,23 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace SearchBox
|
||||
{
|
||||
/// <summary>
/// The beginnings of an inverted index. At present nothing is ever written
/// to the backing store: AddIndex only walks the index it is given.
/// </summary>
public class InvertedIndex
{
    // Nested lookup: string key -> int key -> list of ints.
    // NOTE(review): presumably token -> page ID -> offsets; confirm once
    // AddIndex is implemented.
    private Dictionary<string, Dictionary<int, List<int>>> invertedIndex =
        new Dictionary<string, Dictionary<int, List<int>>>();

    /// <summary>
    /// Creates a new, empty inverted index.
    /// </summary>
    public InvertedIndex()
    {
    }

    /// <summary>
    /// Enumerates every entry of the given index. Not yet implemented
    /// beyond that: no entry is stored.
    /// </summary>
    /// <param name="newIndex">The index whose entries are enumerated.</param>
    /// <returns>Always true.</returns>
    public bool AddIndex(Index newIndex)
    {
        foreach (var entry in newIndex)
        {
            // TODO: Merge the entry into invertedIndex.
        }

        return true;
    }
}
|
||||
}
|
|
@ -42,6 +42,9 @@
|
|||
<Compile Include="Index.cs" />
|
||||
<Compile Include="Utilities\StringPlus.cs" />
|
||||
<Compile Include="StopwordTester.cs" />
|
||||
<Compile Include="InvertedIndex.cs" />
|
||||
<Compile Include="IdMapper.cs" />
|
||||
<Compile Include="Utilities\BiDictionary.cs" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Folder Include="EmbeddedFiles\" />
|
||||
|
|
|
@ -50,6 +50,7 @@ namespace SearchBox
|
|||
|
||||
if(Verbose) Console.WriteLine("#{0} @ {1}: '{2}'", i, index, parts[i]);
|
||||
|
||||
// FUTURE: We should swap this out for System.ValueTuple, as it's easier on the garbage collector.
|
||||
yield return new Tuple<int, string>(index, parts[i]);
|
||||
}
|
||||
}
|
||||
|
|
265
SearchBox/Utilities/BiDictionary.cs
Normal file
265
SearchBox/Utilities/BiDictionary.cs
Normal file
|
@ -0,0 +1,265 @@
|
|||
using System;
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace Stackoverflow.Utilities
|
||||
{
|
||||
/// <summary>
/// This is a dictionary guaranteed to have only one of each value and key.
/// It may be searched either by TFirst or by TSecond, giving a unique answer because it is 1 to 1.
/// It implements garbage-collector-friendly IEnumerable.
/// </summary>
/// <remarks>From https://stackoverflow.com/a/35949314/1460422</remarks>
/// <typeparam name="TFirst">The type of the "key"</typeparam>
/// <typeparam name="TSecond">The type of the "value"</typeparam>
public class BiDictionary<TFirst, TSecond> : IEnumerable<BiDictionary<TFirst, TSecond>.Pair>
{
    // The two lookup directions. They are always updated together so that
    // each pair is present in both, or in neither.
    private readonly Dictionary<TFirst, TSecond> _firstToSecond = new Dictionary<TFirst, TSecond>();
    private readonly Dictionary<TSecond, TFirst> _secondToFirst = new Dictionary<TSecond, TFirst>();

    /// <summary>
    /// A single (first, second) association, as yielded when enumerating.
    /// </summary>
    public struct Pair
    {
        public TFirst First;
        public TSecond Second;
    }

    /// <summary>
    /// Struct enumerator that wraps the enumerator of the first-to-second
    /// dictionary and exposes each entry as a <see cref="Pair"/>.
    /// </summary>
    public struct Enumerator : IEnumerator<Pair>, IEnumerator
    {
        // Deliberately not readonly: this is a mutable struct, and MoveNext()
        // has to advance this very copy.
        private Dictionary<TFirst, TSecond>.Enumerator _dictEnumerator;

        public Enumerator(Dictionary<TFirst, TSecond>.Enumerator dictEnumerator)
        {
            _dictEnumerator = dictEnumerator;
        }

        /// <summary>The pair at the current position of the enumerator.</summary>
        public Pair Current {
            get {
                KeyValuePair<TFirst, TSecond> entry = _dictEnumerator.Current;
                Pair pair;
                pair.First = entry.Key;
                pair.Second = entry.Value;
                return pair;
            }
        }

        object IEnumerator.Current {
            get { return Current; }
        }

        public void Dispose()
        {
            _dictEnumerator.Dispose();
        }

        public bool MoveNext()
        {
            return _dictEnumerator.MoveNext();
        }

        /// <summary>Not supported; always throws.</summary>
        /// <exception cref="NotSupportedException">Always.</exception>
        public void Reset()
        {
            throw new NotSupportedException();
        }
    }

    #region Exception throwing methods

    /// <summary>
    /// Adds the pair to the dictionary.
    /// Throws an exception if either element is already in the dictionary.
    /// </summary>
    /// <param name="first">the first element of the pair</param>
    /// <param name="second">the second element of the pair</param>
    /// <exception cref="ArgumentException">If first or second is already present.</exception>
    public void Add(TFirst first, TSecond second)
    {
        if (!TryAdd(first, second))
            throw new ArgumentException("Duplicate first or second");
    }

    /// <summary>
    /// Find the TSecond corresponding to the TFirst first.
    /// Throws an exception if first is not in the dictionary.
    /// </summary>
    /// <param name="first">the key to search for</param>
    /// <returns>the value corresponding to first</returns>
    /// <exception cref="ArgumentException">If first is not in the dictionary.</exception>
    public TSecond GetByFirst(TFirst first)
    {
        TSecond second;
        if (!_firstToSecond.TryGetValue(first, out second))
            throw new ArgumentException("The given first element is not in the dictionary.", nameof(first));

        return second;
    }

    /// <summary>
    /// Find the TFirst corresponding to the TSecond second.
    /// Throws an exception if second is not in the dictionary.
    /// </summary>
    /// <param name="second">the key to search for</param>
    /// <returns>the value corresponding to second</returns>
    /// <exception cref="ArgumentException">If second is not in the dictionary.</exception>
    public TFirst GetBySecond(TSecond second)
    {
        TFirst first;
        if (!_secondToFirst.TryGetValue(second, out first))
            throw new ArgumentException("The given second element is not in the dictionary.", nameof(second));

        return first;
    }

    /// <summary>
    /// Remove the record containing first.
    /// If first is not in the dictionary, throws an Exception.
    /// </summary>
    /// <param name="first">the key of the record to delete</param>
    /// <exception cref="ArgumentException">If first is not in the dictionary.</exception>
    public void RemoveByFirst(TFirst first)
    {
        if (!TryRemoveByFirst(first))
            throw new ArgumentException("The given first element is not in the dictionary.", nameof(first));
    }

    /// <summary>
    /// Remove the record containing second.
    /// If second is not in the dictionary, throws an Exception.
    /// </summary>
    /// <param name="second">the key of the record to delete</param>
    /// <exception cref="ArgumentException">If second is not in the dictionary.</exception>
    public void RemoveBySecond(TSecond second)
    {
        if (!TryRemoveBySecond(second))
            throw new ArgumentException("The given second element is not in the dictionary.", nameof(second));
    }

    #endregion

    #region Try methods

    /// <summary>
    /// Tries to add the pair to the dictionary.
    /// Returns false if either element is already in the dictionary.
    /// </summary>
    /// <param name="first">the first element of the pair</param>
    /// <param name="second">the second element of the pair</param>
    /// <returns>true if successfully added, false if either element is already in the dictionary</returns>
    public bool TryAdd(TFirst first, TSecond second)
    {
        // Check both directions up front so a failed add never leaves the
        // two dictionaries out of step.
        if (_firstToSecond.ContainsKey(first) || _secondToFirst.ContainsKey(second))
            return false;

        _firstToSecond.Add(first, second);
        _secondToFirst.Add(second, first);
        return true;
    }

    /// <summary>
    /// Find the TSecond corresponding to the TFirst first.
    /// Returns false if first is not in the dictionary.
    /// </summary>
    /// <param name="first">the key to search for</param>
    /// <param name="second">the corresponding value</param>
    /// <returns>true if first is in the dictionary, false otherwise</returns>
    public bool TryGetByFirst(TFirst first, out TSecond second)
    {
        return _firstToSecond.TryGetValue(first, out second);
    }

    /// <summary>
    /// Find the TFirst corresponding to the TSecond second.
    /// Returns false if second is not in the dictionary.
    /// </summary>
    /// <param name="second">the key to search for</param>
    /// <param name="first">the corresponding value</param>
    /// <returns>true if second is in the dictionary, false otherwise</returns>
    public bool TryGetBySecond(TSecond second, out TFirst first)
    {
        return _secondToFirst.TryGetValue(second, out first);
    }

    /// <summary>
    /// Remove the record containing first, if there is one.
    /// </summary>
    /// <param name="first">the key of the record to delete</param>
    /// <returns>If first is not in the dictionary, returns false, otherwise true</returns>
    public bool TryRemoveByFirst(TFirst first)
    {
        TSecond second;
        if (!_firstToSecond.TryGetValue(first, out second))
            return false;

        _firstToSecond.Remove(first);
        _secondToFirst.Remove(second);
        return true;
    }

    /// <summary>
    /// Remove the record containing second, if there is one.
    /// </summary>
    /// <param name="second">the key of the record to delete</param>
    /// <returns>If second is not in the dictionary, returns false, otherwise true</returns>
    public bool TryRemoveBySecond(TSecond second)
    {
        TFirst first;
        if (!_secondToFirst.TryGetValue(second, out first))
            return false;

        _secondToFirst.Remove(second);
        _firstToSecond.Remove(first);
        return true;
    }

    #endregion

    /// <summary>
    /// The number of pairs stored in the dictionary
    /// </summary>
    public Int32 Count {
        get { return _firstToSecond.Count; }
    }

    /// <summary>
    /// Removes all items from the dictionary.
    /// </summary>
    public void Clear()
    {
        _firstToSecond.Clear();
        _secondToFirst.Clear();
    }

    /// <summary>
    /// Gets a struct enumerator over all the pairs in the dictionary.
    /// </summary>
    /// <returns>An enumerator positioned before the first pair.</returns>
    public Enumerator GetEnumerator()
    {
        return new Enumerator(_firstToSecond.GetEnumerator());
    }

    IEnumerator<Pair> IEnumerable<Pair>.GetEnumerator()
    {
        return GetEnumerator();
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }
}
|
||||
}
|
Loading…
Reference in a new issue