|
|
@ -4,6 +4,12 @@ using System.Linq; |
|
|
|
|
|
|
|
namespace MarkovGrams |
|
|
|
{ |
|
|
|
/// <summary>
/// Selects the unit that the n-gram generators in this namespace build n-grams from.
/// </summary>
public enum GenerationMode |
|
|
|
{ |
|
|
|
/// <summary>N-grams are substrings of <c>order</c> consecutive characters of the input string.</summary>
CharacterLevel, |
|
|
|
/// <summary>The input string is split on single spaces and n-grams are built from the resulting parts, joined by a single space.</summary>
WordLevel |
|
|
|
} |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// A collection of methods to generate various different types of n-grams.
|
|
|
|
/// </summary>
|
|
|
@ -15,12 +21,12 @@ namespace MarkovGrams |
|
|
|
/// <param name="words">The words to turn into n-grams.</param>
|
|
|
|
/// <param name="order">The order of n-gram to generate.</param>
|
|
|
|
/// <returns>A unique list of n-grams found in the given list of words.</returns>
|
|
|
|
public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, bool distinct = true) |
|
|
|
public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, GenerationMode mode, bool distinct = true) |
|
|
|
{ |
|
|
|
List<string> results = new List<string>(); |
|
|
|
foreach (string word in words) |
|
|
|
{ |
|
|
|
results.AddRange(GenerateFlat(word, order)); |
|
|
|
results.AddRange(GenerateFlat(word, order, mode)); |
|
|
|
} |
|
|
|
if (distinct) return results.Distinct(); |
|
|
|
return results; |
|
|
@ -32,12 +38,17 @@ namespace MarkovGrams |
|
|
|
/// <param name="str">The string to n-gram-ise.</param>
|
|
|
|
/// <param name="order">The order of n-gram to generate.</param>
|
|
|
|
/// <returns>A unique list of n-grams found in the specified string.</returns>
|
|
|
|
/// <param name="mode">Whether n-grams are built from characters (<c>CharacterLevel</c>) or from space-separated parts of the string (any other value).</param>
// NOTE(review): this view interleaves the removed and the added lines of a diff without +/- markers.
// The two signatures below are the old (2-parameter) and new (3-parameter) forms of the same method,
// and the first for-loop is the old body that the if/else further down replaces.
public static IEnumerable<string> GenerateFlat(string str, int order) |
|
|
|
public static IEnumerable<string> GenerateFlat(string str, int order, GenerationMode mode) |
|
|
|
{ |
|
|
|
List<string> results = new List<string>(); |
|
|
|
for(int i = 0; i < str.Length - order; i++) |
|
|
|
{ |
|
|
|
results.Add(str.Substring(i, order)); |
|
|
|
// Character mode: collect each substring of length `order` starting at index i.
if (mode == GenerationMode.CharacterLevel) { |
|
|
|
// NOTE(review): the bound `i < str.Length - order` stops before the last full window
// (the one starting at str.Length - order), e.g. "abc" with order 2 yields only "ab".
// Confirm whether skipping the final n-gram is intended.
for (int i = 0; i < str.Length - order; i++) |
|
|
|
results.Add(str.Substring(i, order)); |
|
|
|
} |
|
|
|
// Any other mode: split the string on single spaces and build n-grams from the pieces.
else { |
|
|
|
string[] parts = str.Split(" ".ToCharArray()); |
|
|
|
// Iterates over every part index, so windows near the end contain fewer than `order` items.
for (int i = 0; i < parts.Length; i++) |
|
|
|
// NOTE(review): this enumerates `str` (its characters), not `parts`, so the joined value is
// up to `order` characters separated by spaces rather than up to `order` words.
// `parts` is otherwise only used for its Length; `parts.Skip(i)` was probably intended - confirm.
results.Add(string.Join(" ", str.Skip(i).Take(order))); |
|
|
|
} |
|
|
|
// Duplicates are always removed here, regardless of mode.
return results.Distinct(); |
|
|
|
} |
|
|
@ -49,19 +60,11 @@ namespace MarkovGrams |
|
|
|
/// <param name="words">The words to n-gram-ise.</param>
|
|
|
|
/// <param name="order">The order of ngrams to generate.</param>
|
|
|
|
/// <returns>The weighted dictionary of ngrams.</returns>
|
|
|
|
/// <param name="mode">The generation mode forwarded to the per-string <c>GenerateWeighted</c> call.</param>
// NOTE(review): this view interleaves the removed and the added lines of a diff without +/- markers.
// The two signatures below are the old and new forms of the same method. The nested foreach that
// merges `wordNgrams` into `results` looks like the old body, and the single GenerateWeighted(...)
// call near the end looks like its replacement - confirm against the real file.
public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order) |
|
|
|
public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order, GenerationMode mode) |
|
|
|
{ |
|
|
|
Dictionary<string, int> results = new Dictionary<string, int>(); |
|
|
|
foreach(string word in words) |
|
|
|
{ |
|
|
|
Dictionary<string, int> wordNgrams = GenerateWeighted(word, order); |
|
|
|
// Add this word's n-gram counts to the running totals.
foreach(KeyValuePair<string, int> ngram in wordNgrams) |
|
|
|
{ |
|
|
|
// First sighting of this n-gram: start its total at zero before adding.
if(!results.ContainsKey(ngram.Key)) |
|
|
|
results[ngram.Key] = 0; |
|
|
|
results[ngram.Key] += ngram.Value; |
|
|
|
} |
|
|
|
} |
|
|
|
// NOTE(review): the only 3+-parameter string overload visible in this view is
// GenerateWeighted(string, int, GenerationMode, Dictionary<string, int>), which takes a fourth
// `results` argument. This call passes three, so `results` would not be filled (or the call would
// not compile) unless another overload exists outside this view. As rendered it also sits after
// the foreach has closed, where `word` is out of scope. Confirm both.
GenerateWeighted(word, order, mode); |
|
|
|
return results; |
|
|
|
} |
|
|
|
/// <summary>
|
|
|
@ -70,17 +73,16 @@ namespace MarkovGrams |
|
|
|
/// <param name="str">The string to n-gram-ise.</param>
|
|
|
|
/// <param name="order">The order of n-grams to generate.</param>
|
|
|
|
/// <returns>The weighted dictionary of ngrams.</returns>
|
|
|
|
/// <param name="mode">Whether n-grams are character substrings (<c>CharacterLevel</c>) or space-joined parts of the string split on spaces.</param>
/// <param name="results">The dictionary whose per-n-gram counts are incremented in place.</param>
// NOTE(review): this view interleaves the removed and the added lines of a diff without +/- markers.
// The two signatures below are the old form (returns a new dictionary) and the new form (void,
// counts into the caller-supplied `results`). The local `results` declaration, the first
// `string ngram` line and `return results;` look like old-version lines - confirm against the real file.
// NOTE(review): the new form is void, so any existing <returns> documentation above this method is stale.
public static Dictionary<string, int> GenerateWeighted(string str, int order) |
|
|
|
public static void GenerateWeighted(string str, int order, GenerationMode mode, Dictionary<string, int> results) |
|
|
|
{ |
|
|
|
Dictionary<string, int> results = new Dictionary<string, int>(); |
|
|
|
// Only split into parts when working at word level; left null in any other mode.
string[] parts = mode == GenerationMode.WordLevel ? str.Split(" ".ToCharArray()) : null; |
|
|
|
// NOTE(review): the bound uses the character length of `str` in both modes. In word mode `i`
// indexes `parts`, so the loop count is tied to characters rather than words, and indices past
// parts.Length produce empty joins. Confirm whether parts.Length was intended for word mode.
// In character mode the bound also stops before the window starting at str.Length - order.
for(int i = 0; i < str.Length - order; i++) |
|
|
|
{ |
|
|
|
string ngram = str.Substring(i, order); |
|
|
|
// Character mode: substring of length `order` at i. Otherwise: up to `order` parts from index i, space-joined.
// `parts` is null for any mode other than WordLevel and CharacterLevel.
string ngram = mode == GenerationMode.CharacterLevel ? str.Substring(i, order) : string.Join(" ", parts.Skip(i).Take(order)); |
|
|
|
// First sighting of this n-gram: start its count at zero before incrementing.
if(!results.ContainsKey(ngram)) |
|
|
|
results[ngram] = 0; |
|
|
|
results[ngram]++; |
|
|
|
} |
|
|
|
// NOTE(review): conflicts with the `void` signature above; presumably a removed line in the diff.
return results; |
|
|
|
} |
|
|
|
} |
|
|
|
} |