MarkovGrams/MarkovGrams/NGrams.cs

87 lines
2.9 KiB
C#

using System;
using System.Collections.Generic;
using System.Linq;
namespace MarkovGrams
{
/// <summary>
/// A collection of methods to generate various different types of n-grams.
/// An n-gram here is a run of <c>order</c> consecutive characters taken from a string.
/// </summary>
public static class NGrams
{
    /// <summary>
    /// Generates a list of the n-grams found in the given list of words.
    /// </summary>
    /// <param name="words">The words to turn into n-grams.</param>
    /// <param name="order">The order (length in characters) of n-gram to generate. Must be at least 1.</param>
    /// <param name="distinct">
    /// When true (the default) every n-gram is returned only once. When false every
    /// occurrence is returned, including repeats within a single word.
    /// </param>
    /// <returns>The n-grams found in the given list of words.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="words"/> is null or contains a null entry.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="order"/> is less than 1.</exception>
    public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, bool distinct = true)
    {
        if (words == null)
        {
            throw new ArgumentNullException(nameof(words));
        }
        EnsureValidOrder(order);

        List<string> results = new List<string>();
        foreach (string word in words)
        {
            // Collect the raw (non-deduplicated) n-grams so that distinct == false
            // really does keep every occurrence.
            results.AddRange(Slice(word, order));
        }

        if (distinct)
        {
            return results.Distinct();
        }
        return results;
    }

    /// <summary>
    /// Generates a unique list of n-grams from the given string.
    /// </summary>
    /// <param name="str">The string to n-gram-ise.</param>
    /// <param name="order">The order (length in characters) of n-gram to generate. Must be at least 1.</param>
    /// <returns>
    /// A unique list of n-grams found in the specified string. Empty when the string
    /// is shorter than <paramref name="order"/>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="order"/> is less than 1.</exception>
    public static IEnumerable<string> GenerateFlat(string str, int order)
    {
        return Slice(str, order).Distinct();
    }

    /// <summary>
    /// Generates a dictionary of weighted n-grams from the given list of words.
    /// The key is the n-gram itself, and the value is the linear weight of the n-gram
    /// (the total number of times it occurs across all the words).
    /// </summary>
    /// <param name="words">The words to n-gram-ise.</param>
    /// <param name="order">The order (length in characters) of n-grams to generate. Must be at least 1.</param>
    /// <returns>The weighted dictionary of n-grams.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="words"/> is null or contains a null entry.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="order"/> is less than 1.</exception>
    public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order)
    {
        if (words == null)
        {
            throw new ArgumentNullException(nameof(words));
        }
        EnsureValidOrder(order);

        Dictionary<string, int> results = new Dictionary<string, int>();
        foreach (string word in words)
        {
            Tally(results, Slice(word, order));
        }
        return results;
    }

    /// <summary>
    /// Generates a dictionary of weighted n-grams from the specified string.
    /// The key is the n-gram itself, and the value is the number of times it occurs.
    /// </summary>
    /// <param name="str">The string to n-gram-ise.</param>
    /// <param name="order">The order (length in characters) of n-grams to generate. Must be at least 1.</param>
    /// <returns>The weighted dictionary of n-grams.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="order"/> is less than 1.</exception>
    public static Dictionary<string, int> GenerateWeighted(string str, int order)
    {
        Dictionary<string, int> results = new Dictionary<string, int>();
        Tally(results, Slice(str, order));
        return results;
    }

    /// <summary>
    /// Rejects n-gram orders that cannot produce a meaningful n-gram.
    /// </summary>
    private static void EnsureValidOrder(int order)
    {
        if (order < 1)
        {
            throw new ArgumentOutOfRangeException(nameof(order), order, "The n-gram order must be at least 1.");
        }
    }

    /// <summary>
    /// Returns every n-gram in the string, in left-to-right order, duplicates included.
    /// </summary>
    private static List<string> Slice(string str, int order)
    {
        if (str == null)
        {
            throw new ArgumentNullException(nameof(str));
        }
        EnsureValidOrder(order);

        // The last valid start index is Length - order (inclusive); stopping one
        // short of it would drop the final n-gram of the string.
        int lastStart = str.Length - order;
        List<string> ngrams = new List<string>(Math.Max(0, lastStart + 1));
        for (int i = 0; i <= lastStart; i++)
        {
            ngrams.Add(str.Substring(i, order));
        }
        return ngrams;
    }

    /// <summary>
    /// Adds one to the count of each given n-gram in the supplied dictionary.
    /// </summary>
    private static void Tally(Dictionary<string, int> counts, IEnumerable<string> ngrams)
    {
        foreach (string ngram in ngrams)
        {
            // TryGetValue leaves seen at 0 for a new key, so one lookup covers both cases.
            int seen;
            counts.TryGetValue(ngram, out seen);
            counts[ngram] = seen + 1;
        }
    }
}
}