using System;
using System.Collections.Generic;
using System.Linq;
namespace MarkovGrams
{
    /// <summary>
    /// Selects the unit that n-grams are built from.
    /// </summary>
    public enum GenerationMode
    {
        /// <summary>
        /// Each n-gram is a run of consecutive characters of the input string.
        /// </summary>
        CharacterLevel,
        /// <summary>
        /// Each n-gram is a run of consecutive space-separated words of the
        /// input string, joined back together with single spaces.
        /// </summary>
        WordLevel
    }

    /// <summary>
    /// A collection of methods to generate various different types of n-grams.
    /// </summary>
    public static class NGrams
    {
        /// <summary>
        /// Generates a list of n-grams from the given list of words.
        /// </summary>
        /// <param name="words">The words to turn into n-grams.</param>
        /// <param name="order">The order of n-gram to generate. Must be at least 1.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <param name="distinct">
        /// When true (the default), duplicate n-grams are removed from the combined result.
        /// When false, the per-word results are concatenated as-is; each word's own
        /// n-grams are still de-duplicated by <see cref="GenerateFlat(string, int, GenerationMode)"/>.
        /// </param>
        /// <returns>The n-grams found in the given list of words.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="words"/> or one of its items is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="order"/> is less than 1.</exception>
        public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, GenerationMode mode, bool distinct = true)
        {
            if (words == null)
            {
                throw new ArgumentNullException(nameof(words));
            }

            List<string> results = new List<string>();
            foreach (string word in words)
            {
                results.AddRange(GenerateFlat(word, order, mode));
            }

            if (distinct)
            {
                return results.Distinct();
            }
            return results;
        }

        /// <summary>
        /// Generates a unique list of n-grams from the given string.
        /// </summary>
        /// <param name="str">The string to n-gram-ise.</param>
        /// <param name="order">The order of n-gram to generate. Must be at least 1.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <returns>
        /// A unique list of n-grams found in the specified string, in order of first
        /// appearance. Empty if the string holds fewer than <paramref name="order"/> units.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="order"/> is less than 1.</exception>
        public static IEnumerable<string> GenerateFlat(string str, int order, GenerationMode mode)
        {
            return ExtractNGrams(str, order, mode).Distinct();
        }

        /// <summary>
        /// Generates a dictionary of weighted n-grams from the given list of words.
        /// The key is the n-gram itself, and the value is the number of times that
        /// n-gram occurs across all the words.
        /// </summary>
        /// <param name="words">The words to n-gram-ise.</param>
        /// <param name="order">The order of n-grams to generate. Must be at least 1.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <returns>The weighted dictionary of n-grams.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="words"/> or one of its items is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="order"/> is less than 1.</exception>
        public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order, GenerationMode mode)
        {
            if (words == null)
            {
                throw new ArgumentNullException(nameof(words));
            }

            Dictionary<string, int> results = new Dictionary<string, int>();
            foreach (string word in words)
            {
                // Accumulate every word into the same dictionary that is returned.
                GenerateWeighted(word, order, mode, results);
            }
            return results;
        }

        /// <summary>
        /// Counts the n-grams in the specified string and adds those counts to an
        /// existing weighted dictionary.
        /// </summary>
        /// <param name="str">The string to n-gram-ise.</param>
        /// <param name="order">The order of n-grams to generate. Must be at least 1.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <param name="results">
        /// The dictionary to update in place. Each occurrence of an n-gram increments
        /// its entry by one; missing entries start from zero.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="str"/> or <paramref name="results"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="order"/> is less than 1.</exception>
        public static void GenerateWeighted(string str, int order, GenerationMode mode, Dictionary<string, int> results)
        {
            if (results == null)
            {
                throw new ArgumentNullException(nameof(results));
            }

            // ExtractNGrams validates str/order before anything is written to results.
            foreach (string ngram in ExtractNGrams(str, order, mode))
            {
                // TryGetValue leaves count at 0 when the key is absent (single lookup).
                int count;
                results.TryGetValue(ngram, out count);
                results[ngram] = count + 1;
            }
        }

        /// <summary>
        /// Lists every n-gram of the given order in the string, in order of
        /// appearance and including duplicates.
        /// </summary>
        /// <param name="str">The string to n-gram-ise.</param>
        /// <param name="order">The number of units (characters or words) per n-gram.</param>
        /// <param name="mode">
        /// <see cref="GenerationMode.CharacterLevel"/> slices characters; any other value
        /// splits on single spaces and slices the resulting words.
        /// </param>
        /// <returns>All full-length n-grams; empty if the input is shorter than <paramref name="order"/>.</returns>
        private static List<string> ExtractNGrams(string str, int order, GenerationMode mode)
        {
            if (str == null)
            {
                throw new ArgumentNullException(nameof(str));
            }
            if (order < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(order), order, "The n-gram order must be at least 1.");
            }

            List<string> ngrams = new List<string>();
            if (mode == GenerationMode.CharacterLevel)
            {
                // "<=" so that the final n-gram (ending on the last character) is included.
                for (int i = 0; i <= str.Length - order; i++)
                {
                    ngrams.Add(str.Substring(i, order));
                }
            }
            else
            {
                // Splitting on a single space keeps empty entries, so consecutive
                // spaces yield empty "words".
                string[] parts = str.Split(' ');
                // Stop once fewer than 'order' words remain so no short n-grams are emitted.
                for (int i = 0; i <= parts.Length - order; i++)
                {
                    ngrams.Add(string.Join(" ", parts, i, order));
                }
            }
            return ngrams;
        }
    }
}