Add Id mapping system & start on implementing an inverted index

This commit is contained in:
Starbeamrainbowlabs 2018-09-02 17:45:13 +01:00
parent e8069c93ff
commit c46a789c28
Signed by: sbrl
GPG key ID: 1BE5172E637709C2
6 changed files with 360 additions and 1 deletions

49
SearchBox/IdMapper.cs Normal file
View file

@ -0,0 +1,49 @@
using System;
using Stackoverflow.Utilities;
namespace SearchBox
{
/// <summary>
/// Thrown when an id is looked up that has no entry in the id map.
/// </summary>
public class IdNotFoundException : Exception
{
    /// <summary>Creates the exception with the default message.</summary>
    public IdNotFoundException()
    {
    }

    /// <summary>Creates the exception with the given message.</summary>
    /// <param name="message">A description of the failed lookup.</param>
    public IdNotFoundException(string message) : base(message)
    {
    }

    /// <summary>
    /// Creates the exception with the given message, wrapping the exception
    /// that caused the lookup to fail.
    /// </summary>
    /// <param name="message">A description of the failed lookup.</param>
    /// <param name="innerException">The underlying cause.</param>
    public IdNotFoundException(string message, Exception innerException) : base(message, innerException)
    {
    }
}
/// <summary>
/// Hands out integer ids for page names and translates between the two.
/// Page names are Unicode-normalised (form C) before they touch the map, so
/// every method treats different encodings of the same name as one page.
/// </summary>
public class IdMapper
{
    // The id that will be handed to the next previously-unseen page name.
    private int nextId = 0;

    /// <summary>The underlying id &lt;-&gt; page name map.</summary>
    public BiDictionary<int, string> map = new BiDictionary<int, string>();

    public IdMapper()
    {
    }

    /// <summary>
    /// Gets the id for a page name, assigning a fresh one if the name has not
    /// been seen before.
    /// </summary>
    /// <param name="pageName">The page name to look up.</param>
    /// <returns>The id associated with the page name.</returns>
    /// <exception cref="ArgumentNullException">If pageName is null.</exception>
    public int GetId(string pageName)
    {
        pageName = NormalizePageName(pageName, nameof(pageName));

        int result;
        if (!map.TryGetBySecond(pageName, out result))
        {
            result = nextId++;
            map.Add(result, pageName);
        }
        return result;
    }

    /// <summary>
    /// Gets the page name associated with an id.
    /// </summary>
    /// <param name="id">The id to look up.</param>
    /// <returns>The (normalised) page name stored for the id.</returns>
    /// <exception cref="IdNotFoundException">If the id is not in the map.</exception>
    public string GetPageName(int id)
    {
        string result;
        if (!map.TryGetByFirst(id, out result))
            throw new IdNotFoundException($"Error: Couldn't find {id} in the ID map.");
        return result;
    }

    /// <summary>
    /// Renames a page, keeping its id.
    /// </summary>
    /// <param name="oldPageName">The current name of the page.</param>
    /// <param name="newPageName">The name the page should have afterwards.</param>
    /// <exception cref="ArgumentNullException">If either name is null.</exception>
    /// <exception cref="ArgumentException">
    /// If oldPageName is not in the map, or newPageName already belongs to a
    /// different page. The map is left untouched in both cases.
    /// </exception>
    public void MovePageName(string oldPageName, string newPageName)
    {
        // Names are stored normalised by GetId, so look them up the same way.
        oldPageName = NormalizePageName(oldPageName, nameof(oldPageName));
        newPageName = NormalizePageName(newPageName, nameof(newPageName));

        int id = map.GetBySecond(oldPageName);

        // Check for a clash *before* removing anything: otherwise a failed
        // Add would leave the old mapping deleted and the id lost.
        int clashingId;
        if (newPageName != oldPageName && map.TryGetBySecond(newPageName, out clashingId))
            throw new ArgumentException($"Error: Can't rename to '{newPageName}', as that page name is already in the ID map.", nameof(newPageName));

        map.RemoveBySecond(oldPageName);
        map.Add(id, newPageName);
    }

    /// <summary>
    /// Removes a page name (and its id) from the map.
    /// </summary>
    /// <param name="pageName">The page name to remove.</param>
    /// <exception cref="ArgumentNullException">If pageName is null.</exception>
    /// <exception cref="ArgumentException">If pageName is not in the map.</exception>
    public void DeletePageName(string pageName)
    {
        map.RemoveBySecond(NormalizePageName(pageName, nameof(pageName)));
    }

    // Applies the Unicode normalisation used for every key stored in the map.
    private static string NormalizePageName(string pageName, string paramName)
    {
        if (pageName == null)
            throw new ArgumentNullException(paramName);
        return pageName.Normalize(System.Text.NormalizationForm.FormC);
    }
}
}

View file

@ -1,4 +1,5 @@
using System; using System;
using System.Collections;
using System.Collections.Generic; using System.Collections.Generic;
using System.IO; using System.IO;
using System.Text; using System.Text;
@ -12,7 +13,7 @@ namespace SearchBox
ExcludeStopwords = 1 ExcludeStopwords = 1
} }
public class Index public class Index : IEnumerable<KeyValuePair<string, List<int>>>
{ {
private Dictionary<string, List<int>> index = new Dictionary<string, List<int>>(); private Dictionary<string, List<int>> index = new Dictionary<string, List<int>>();
private StopwordTester stopwordTester; private StopwordTester stopwordTester;
@ -53,6 +54,22 @@ namespace SearchBox
} }
/// <summary>
/// Lazily yields every key/value pair currently held in the backing dictionary.
/// </summary>
/// <returns>A sequence of the entries in this index.</returns>
public IEnumerable<KeyValuePair<string, List<int>>> IterateItems()
{
    foreach (var entry in index)
    {
        yield return entry;
    }
}
/// <summary>
/// Gets a typed enumerator over the entries of this index, so that it can be
/// used directly in a foreach loop.
/// </summary>
/// <returns>An enumerator over the sequence produced by IterateItems().</returns>
public IEnumerator<KeyValuePair<string, List<int>>> GetEnumerator()
{
    IEnumerable<KeyValuePair<string, List<int>>> items = IterateItems();
    return items.GetEnumerator();
}
/// <summary>
/// Non-generic enumerator; simply forwards to the generic GetEnumerator().
/// </summary>
IEnumerator IEnumerable.GetEnumerator()
{
    IEnumerator<KeyValuePair<string, List<int>>> typedEnumerator = GetEnumerator();
    return typedEnumerator;
}
public override string ToString() public override string ToString()
{ {
StringBuilder result = new StringBuilder("Index: \n"); StringBuilder result = new StringBuilder("Index: \n");
@ -67,5 +84,6 @@ namespace SearchBox
{ {
return new Index(File.ReadAllText(filename)); return new Index(File.ReadAllText(filename));
} }
} }
} }

View file

@ -0,0 +1,23 @@
using System;
using System.Collections.Generic;
namespace SearchBox
{
/// <summary>
/// Work-in-progress inverted index. At present it only holds an empty
/// backing store; nothing is written to it yet.
/// </summary>
public class InvertedIndex
{
    // Nested lookup: string key -> (int key -> list of ints).
    // NOTE(review): presumably token -> (page id -> offsets); confirm once AddIndex is filled in.
    private Dictionary<string, Dictionary<int, List<int>>> invertedIndex =
        new Dictionary<string, Dictionary<int, List<int>>>();

    public InvertedIndex()
    {
    }

    /// <summary>
    /// Walks every entry of the given index. The loop body is currently empty,
    /// so nothing is stored.
    /// </summary>
    /// <param name="newIndex">The index whose entries are enumerated.</param>
    /// <returns>Always true.</returns>
    public bool AddIndex(Index newIndex)
    {
        foreach (KeyValuePair<string, List<int>> entry in newIndex)
        {
            // Not implemented yet.
        }

        return true;
    }
}
}

View file

@ -42,6 +42,9 @@
<Compile Include="Index.cs" /> <Compile Include="Index.cs" />
<Compile Include="Utilities\StringPlus.cs" /> <Compile Include="Utilities\StringPlus.cs" />
<Compile Include="StopwordTester.cs" /> <Compile Include="StopwordTester.cs" />
<Compile Include="InvertedIndex.cs" />
<Compile Include="IdMapper.cs" />
<Compile Include="Utilities\BiDictionary.cs" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<Folder Include="EmbeddedFiles\" /> <Folder Include="EmbeddedFiles\" />

View file

@ -50,6 +50,7 @@ namespace SearchBox
if(Verbose) Console.WriteLine("#{0} @ {1}: '{2}'", i, index, parts[i]); if(Verbose) Console.WriteLine("#{0} @ {1}: '{2}'", i, index, parts[i]);
// FUTURE: We should swap this out for System.ValueTuple, as it's easier on the garbage collector.
yield return new Tuple<int, string>(index, parts[i]); yield return new Tuple<int, string>(index, parts[i]);
} }
} }

View file

@ -0,0 +1,265 @@
using System;
using System.Collections;
using System.Collections.Generic;
namespace Stackoverflow.Utilities
{
/// <summary>
/// This is a dictionary guaranteed to have only one of each value and key.
/// It may be searched either by TFirst or by TSecond, giving a unique answer because it is 1 to 1.
/// Enumeration goes through a struct enumerator, so a plain foreach does not allocate one on the heap.
/// </summary>
/// <remarks>From https://stackoverflow.com/a/35949314/1460422</remarks>
/// <typeparam name="TFirst">The type of the "key"</typeparam>
/// <typeparam name="TSecond">The type of the "value"</typeparam>
public class BiDictionary<TFirst, TSecond> : IEnumerable<BiDictionary<TFirst, TSecond>.Pair>
{
    // Two mirrored lookups; every mutation must update both to keep them in sync.
    private readonly Dictionary<TFirst, TSecond> _firstToSecond = new Dictionary<TFirst, TSecond>();
    private readonly Dictionary<TSecond, TFirst> _secondToFirst = new Dictionary<TSecond, TFirst>();

    /// <summary>
    /// One first/second association, as produced during enumeration.
    /// </summary>
    public struct Pair
    {
        public TFirst First;
        public TSecond Second;
    }

    /// <summary>
    /// Struct enumerator that wraps the enumerator of the first-to-second
    /// dictionary and exposes each entry as a <see cref="Pair"/>.
    /// </summary>
    public struct Enumerator : IEnumerator<Pair>, IEnumerator
    {
        // Deliberately not readonly: Dictionary.Enumerator is a mutable struct
        // and MoveNext() must advance this copy in place.
        private Dictionary<TFirst, TSecond>.Enumerator _dictEnumerator;

        public Enumerator(Dictionary<TFirst, TSecond>.Enumerator dictEnumerator)
        {
            _dictEnumerator = dictEnumerator;
        }

        /// <summary>The pair at the current position.</summary>
        public Pair Current {
            get {
                KeyValuePair<TFirst, TSecond> entry = _dictEnumerator.Current;
                Pair pair;
                pair.First = entry.Key;
                pair.Second = entry.Value;
                return pair;
            }
        }

        object IEnumerator.Current {
            get {
                return Current;
            }
        }

        public void Dispose()
        {
            _dictEnumerator.Dispose();
        }

        public bool MoveNext()
        {
            return _dictEnumerator.MoveNext();
        }

        /// <summary>Not supported; always throws.</summary>
        /// <exception cref="NotSupportedException">Always.</exception>
        public void Reset()
        {
            throw new NotSupportedException();
        }
    }

    #region Exception throwing methods

    /// <summary>
    /// Adds the pair to the dictionary.
    /// Throws an exception if either element is already in the dictionary.
    /// </summary>
    /// <param name="first">The first element of the pair.</param>
    /// <param name="second">The second element of the pair.</param>
    /// <exception cref="ArgumentException">If first or second is already present.</exception>
    public void Add(TFirst first, TSecond second)
    {
        if (!TryAdd(first, second))
            throw new ArgumentException("Duplicate first or second");
    }

    /// <summary>
    /// Find the TSecond corresponding to the TFirst first.
    /// Throws an exception if first is not in the dictionary.
    /// </summary>
    /// <param name="first">the key to search for</param>
    /// <returns>the value corresponding to first</returns>
    /// <exception cref="ArgumentException">If first is not in the dictionary.</exception>
    public TSecond GetByFirst(TFirst first)
    {
        TSecond second;
        if (!_firstToSecond.TryGetValue(first, out second))
            throw new ArgumentException("The given first element is not in the dictionary.", nameof(first));
        return second;
    }

    /// <summary>
    /// Find the TFirst corresponding to the TSecond second.
    /// Throws an exception if second is not in the dictionary.
    /// </summary>
    /// <param name="second">the key to search for</param>
    /// <returns>the value corresponding to second</returns>
    /// <exception cref="ArgumentException">If second is not in the dictionary.</exception>
    public TFirst GetBySecond(TSecond second)
    {
        TFirst first;
        if (!_secondToFirst.TryGetValue(second, out first))
            throw new ArgumentException("The given second element is not in the dictionary.", nameof(second));
        return first;
    }

    /// <summary>
    /// Remove the record containing first.
    /// If first is not in the dictionary, throws an exception.
    /// </summary>
    /// <param name="first">the key of the record to delete</param>
    /// <exception cref="ArgumentException">If first is not in the dictionary.</exception>
    public void RemoveByFirst(TFirst first)
    {
        if (!TryRemoveByFirst(first))
            throw new ArgumentException("The given first element is not in the dictionary.", nameof(first));
    }

    /// <summary>
    /// Remove the record containing second.
    /// If second is not in the dictionary, throws an exception.
    /// </summary>
    /// <param name="second">the key of the record to delete</param>
    /// <exception cref="ArgumentException">If second is not in the dictionary.</exception>
    public void RemoveBySecond(TSecond second)
    {
        if (!TryRemoveBySecond(second))
            throw new ArgumentException("The given second element is not in the dictionary.", nameof(second));
    }

    #endregion

    #region Try methods

    /// <summary>
    /// Tries to add the pair to the dictionary.
    /// Returns false if either element is already in the dictionary.
    /// </summary>
    /// <param name="first">The first element of the pair.</param>
    /// <param name="second">The second element of the pair.</param>
    /// <returns>true if successfully added, false if either element is already in the dictionary</returns>
    public bool TryAdd(TFirst first, TSecond second)
    {
        // Both sides are checked before either is written, so a rejected
        // pair never leaves the two lookups out of sync.
        if (_firstToSecond.ContainsKey(first) || _secondToFirst.ContainsKey(second))
            return false;

        _firstToSecond.Add(first, second);
        _secondToFirst.Add(second, first);
        return true;
    }

    /// <summary>
    /// Find the TSecond corresponding to the TFirst first.
    /// Returns false if first is not in the dictionary.
    /// </summary>
    /// <param name="first">the key to search for</param>
    /// <param name="second">the corresponding value</param>
    /// <returns>true if first is in the dictionary, false otherwise</returns>
    public bool TryGetByFirst(TFirst first, out TSecond second)
    {
        return _firstToSecond.TryGetValue(first, out second);
    }

    /// <summary>
    /// Find the TFirst corresponding to the TSecond second.
    /// Returns false if second is not in the dictionary.
    /// </summary>
    /// <param name="second">the key to search for</param>
    /// <param name="first">the corresponding value</param>
    /// <returns>true if second is in the dictionary, false otherwise</returns>
    public bool TryGetBySecond(TSecond second, out TFirst first)
    {
        return _secondToFirst.TryGetValue(second, out first);
    }

    /// <summary>
    /// Remove the record containing first, if there is one.
    /// </summary>
    /// <param name="first">the key of the record to delete</param>
    /// <returns>false if first is not in the dictionary, otherwise true</returns>
    public bool TryRemoveByFirst(TFirst first)
    {
        TSecond second;
        if (!_firstToSecond.TryGetValue(first, out second))
            return false;

        _firstToSecond.Remove(first);
        _secondToFirst.Remove(second);
        return true;
    }

    /// <summary>
    /// Remove the record containing second, if there is one.
    /// </summary>
    /// <param name="second">the key of the record to delete</param>
    /// <returns>false if second is not in the dictionary, otherwise true</returns>
    public bool TryRemoveBySecond(TSecond second)
    {
        TFirst first;
        if (!_secondToFirst.TryGetValue(second, out first))
            return false;

        _secondToFirst.Remove(second);
        _firstToSecond.Remove(first);
        return true;
    }

    #endregion

    /// <summary>
    /// The number of pairs stored in the dictionary
    /// </summary>
    public Int32 Count {
        get { return _firstToSecond.Count; }
    }

    /// <summary>
    /// Removes all items from the dictionary.
    /// </summary>
    public void Clear()
    {
        _firstToSecond.Clear();
        _secondToFirst.Clear();
    }

    /// <summary>
    /// Gets a struct enumerator over the stored pairs.
    /// </summary>
    public Enumerator GetEnumerator()
    {
        return new Enumerator(_firstToSecond.GetEnumerator());
    }

    IEnumerator<Pair> IEnumerable<Pair>.GetEnumerator()
    {
        return GetEnumerator();
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }
}
}