using System;
using System.Collections.Generic;
using System.Linq;

namespace MarkovGrams
{
    /// <summary>
    /// Selects the unit that n-grams are built from.
    /// </summary>
    public enum GenerationMode
    {
        /// <summary>Each n-gram is a run of consecutive characters.</summary>
        CharacterLevel,

        /// <summary>Each n-gram is a run of consecutive space-separated words.</summary>
        WordLevel
    }

    /// <summary>
    /// A collection of methods to generate various different types of n-grams.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the generic type arguments were missing from the text under review.
    /// They are reconstructed here as <c>string</c> elements and <c>int</c> weights -
    /// confirm the weight type against the callers of <see cref="GenerateWeighted(IEnumerable{string}, int, GenerationMode)"/>.
    /// </remarks>
    public static class NGrams
    {
        /// <summary>
        /// Verbosity flag. It is not read by any method in this class;
        /// presumably it is consumed by callers - TODO confirm.
        /// </summary>
        public static bool Verbose { get; set; } = true;

        /// <summary>
        /// Generates a list of the n-grams found in the given list of words.
        /// </summary>
        /// <param name="words">The words to turn into n-grams.</param>
        /// <param name="order">The order (length in characters or words) of n-gram to generate. Must be at least 1.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <param name="distinct">
        /// When true, duplicates are removed across the whole result. When false, n-grams that
        /// occur in more than one word are repeated, but duplicates within a single word are
        /// still collapsed.
        /// </param>
        /// <returns>The n-grams found in the given list of words.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="order"/> is less than 1.</exception>
        public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, GenerationMode mode, bool distinct = true)
        {
            ValidateArguments(words, order);

            List<string> results = new List<string>();
            foreach (string word in words)
            {
                results.AddRange(GenerateFlat(word, order, mode));
            }

            return distinct ? results.Distinct() : results;
        }

        /// <summary>
        /// Generates a unique list of n-grams from the given string.
        /// </summary>
        /// <param name="str">The string to n-gram-ise.</param>
        /// <param name="order">The order of n-gram to generate.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <returns>A unique list of n-grams found in the specified string.</returns>
        private static IEnumerable<string> GenerateFlat(string str, int order, GenerationMode mode)
        {
            return EnumerateNGrams(str, order, mode).Distinct();
        }

        /// <summary>
        /// Generates a dictionary of weighted n-grams from the given list of words.
        /// The key is the n-gram itself, and the value is the number of times it occurs
        /// across all the words.
        /// </summary>
        /// <param name="words">The words to n-gram-ise.</param>
        /// <param name="order">The order of n-grams to generate. Must be at least 1.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <returns>The weighted dictionary of n-grams.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="order"/> is less than 1.</exception>
        public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order, GenerationMode mode)
        {
            ValidateArguments(words, order);

            Dictionary<string, int> results = new Dictionary<string, int>();
            foreach (string word in words)
            {
                GenerateWeighted(word, order, mode, results);
            }

            return results;
        }

        /// <summary>
        /// Adds the n-grams of the specified string to an existing weighted dictionary.
        /// </summary>
        /// <param name="str">The string to n-gram-ise.</param>
        /// <param name="order">The order of n-grams to generate.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <param name="results">The dictionary whose counts are incremented in place.</param>
        private static void GenerateWeighted(string str, int order, GenerationMode mode, Dictionary<string, int> results)
        {
            foreach (string ngram in EnumerateNGrams(str, order, mode))
            {
                // A missing key leaves count at 0, so one lookup covers both the
                // "first occurrence" and the "seen before" cases.
                int count;
                results.TryGetValue(ngram, out count);
                results[ngram] = count + 1;
            }
        }

        /// <summary>
        /// Yields every n-gram of the given string in order of appearance, duplicates included.
        /// Shared by the flat and weighted generators so that both agree on what counts as an n-gram.
        /// </summary>
        /// <param name="str">The string to n-gram-ise.</param>
        /// <param name="order">The order of n-grams to generate.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <returns>A lazily-evaluated sequence of n-grams.</returns>
        private static IEnumerable<string> EnumerateNGrams(string str, int order, GenerationMode mode)
        {
            if (mode == GenerationMode.CharacterLevel)
            {
                // "<=" so that the final n-gram (the one ending on the last character) is included.
                for (int i = 0; i <= str.Length - order; i++)
                {
                    yield return str.Substring(i, order);
                }

                yield break;
            }

            string[] parts = str.Split(' ');

            // "<=" so that the final n-gram (the one ending on the last word) is included.
            for (int i = 0; i <= parts.Length - order; i++)
            {
                string ngram = string.Join(" ", parts, i, order).Trim();

                // Consecutive spaces produce empty parts, which can leave an n-gram blank.
                if (ngram.Length == 0)
                {
                    continue;
                }

                yield return ngram;
            }
        }

        /// <summary>
        /// Checks the arguments shared by the public entry points.
        /// </summary>
        /// <param name="words">The word list supplied by the caller.</param>
        /// <param name="order">The n-gram order supplied by the caller.</param>
        private static void ValidateArguments(IEnumerable<string> words, int order)
        {
            if (words == null)
            {
                throw new ArgumentNullException(nameof(words));
            }

            if (order < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(order), order, "The n-gram order must be at least 1.");
            }
        }
    }
}