using System;
using System.Collections.Generic;
using System.Linq;

namespace MarkovGrams
{
    /// <summary>
    /// Selects the unit of text that n-grams are built from.
    /// </summary>
    public enum GenerationMode
    {
        /// <summary>Each n-gram is a run of consecutive characters.</summary>
        CharacterLevel,

        /// <summary>Each n-gram is a run of consecutive space-separated words.</summary>
        WordLevel
    }

    /// <summary>
    /// A collection of methods to generate various different types of n-grams.
    /// </summary>
    public static class NGrams
    {
        /// <summary>
        /// Generates a list of n-grams from the given list of words.
        /// </summary>
        /// <param name="words">The words to turn into n-grams.</param>
        /// <param name="order">The order of n-gram to generate (units per n-gram). Must be at least 1.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <param name="distinct">
        /// When true (the default), duplicates are removed across the whole result.
        /// When false, n-grams shared between different words are kept, but each
        /// individual word still contributes each of its n-grams only once.
        /// </param>
        /// <returns>The n-grams found in the given list of words.</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="words"/> or one of its items is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">If <paramref name="order"/> is less than 1.</exception>
        public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, GenerationMode mode, bool distinct = true)
        {
            if (words == null)
            {
                throw new ArgumentNullException("words");
            }

            List<string> results = new List<string>();
            foreach (string word in words)
            {
                results.AddRange(GenerateFlat(word, order, mode));
            }

            if (distinct)
            {
                return results.Distinct();
            }
            return results;
        }

        /// <summary>
        /// Generates a unique list of n-grams from the given string.
        /// </summary>
        /// <param name="str">The string to n-gram-ise.</param>
        /// <param name="order">The order of n-gram to generate (units per n-gram). Must be at least 1.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <returns>
        /// A unique list of n-grams found in the specified string, in order of first
        /// appearance. Empty if the string contains fewer than <paramref name="order"/> units.
        /// </returns>
        /// <exception cref="ArgumentNullException">If <paramref name="str"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">If <paramref name="order"/> is less than 1.</exception>
        public static IEnumerable<string> GenerateFlat(string str, int order, GenerationMode mode)
        {
            return ExtractNGrams(str, order, mode).Distinct();
        }

        /// <summary>
        /// Generates a dictionary of weighted n-grams from the given list of words.
        /// The key is the n-gram itself, and the value is the number of times that
        /// n-gram occurs across all of the words.
        /// </summary>
        /// <param name="words">The words to n-gram-ise.</param>
        /// <param name="order">The order of n-grams to generate. Must be at least 1.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <returns>The weighted dictionary of n-grams.</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="words"/> or one of its items is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">If <paramref name="order"/> is less than 1.</exception>
        public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order, GenerationMode mode)
        {
            if (words == null)
            {
                throw new ArgumentNullException("words");
            }

            Dictionary<string, int> results = new Dictionary<string, int>();
            foreach (string word in words)
            {
                // Every word accumulates into the same dictionary.
                GenerateWeighted(word, order, mode, results);
            }
            return results;
        }

        /// <summary>
        /// Counts the n-grams in the specified string, adding the counts to an
        /// existing weighted dictionary.
        /// </summary>
        /// <param name="str">The string to n-gram-ise.</param>
        /// <param name="order">The order of n-grams to generate. Must be at least 1.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <param name="results">
        /// The dictionary to update in place. Each n-gram occurrence increments the
        /// value stored under that n-gram, starting from 0 for unseen n-grams.
        /// </param>
        /// <exception cref="ArgumentNullException">If <paramref name="str"/> or <paramref name="results"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">If <paramref name="order"/> is less than 1.</exception>
        public static void GenerateWeighted(string str, int order, GenerationMode mode, Dictionary<string, int> results)
        {
            if (results == null)
            {
                throw new ArgumentNullException("results");
            }

            foreach (string ngram in ExtractNGrams(str, order, mode))
            {
                // TryGetValue leaves count at 0 when the n-gram has not been seen yet.
                int count;
                results.TryGetValue(ngram, out count);
                results[ngram] = count + 1;
            }
        }

        /// <summary>
        /// Lists every n-gram in the string, in order and including duplicates.
        /// Shared by the flat and weighted generators so that both agree on what
        /// counts as an n-gram.
        /// </summary>
        private static List<string> ExtractNGrams(string str, int order, GenerationMode mode)
        {
            if (str == null)
            {
                throw new ArgumentNullException("str");
            }
            if (order < 1)
            {
                throw new ArgumentOutOfRangeException("order", order, "The n-gram order must be at least 1.");
            }

            List<string> ngrams = new List<string>();
            if (mode == GenerationMode.CharacterLevel)
            {
                // The last valid start index is Length - order, hence the inclusive bound.
                for (int i = 0; i <= str.Length - order; i++)
                {
                    ngrams.Add(str.Substring(i, order));
                }
            }
            else
            {
                string[] parts = str.Split(' ');
                // Bound by the number of words (not characters), and stop early
                // enough that every n-gram contains exactly `order` words.
                for (int i = 0; i <= parts.Length - order; i++)
                {
                    ngrams.Add(string.Join(" ", parts, i, order));
                }
            }
            return ngrams;
        }
    }
}