|
|
|
|
using System;
|
|
|
|
|
using System.Collections.Generic;
|
|
|
|
|
using System.Linq;
|
|
|
|
|
|
|
|
|
|
namespace MarkovGrams
|
|
|
|
|
{
|
|
|
|
|
/// <summary>
/// Selects the unit from which n-grams are assembled.
/// </summary>
public enum GenerationMode
{
    /// <summary>
    /// N-grams are runs of consecutive characters.
    /// </summary>
    CharacterLevel = 0,

    /// <summary>
    /// N-grams are runs of consecutive space-separated words.
    /// </summary>
    WordLevel = 1
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
|
/// A collection of methods to generate various different types of n-grams.
|
|
|
|
|
/// </summary>
|
|
|
|
|
public static class NGrams
{
    /// <summary>
    /// Verbosity switch, enabled by default. Nothing in this class reads it;
    /// presumably other parts of the project consult it (TODO confirm).
    /// </summary>
    public static bool Verbose { get; set; } = true;

    /// <summary>
    /// Generates a flat list of n-grams from the given list of words.
    /// </summary>
    /// <param name="words">The words to turn into n-grams. Null or empty entries contribute nothing.</param>
    /// <param name="order">The order (length) of n-gram to generate. Must be at least 1.</param>
    /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
    /// <param name="distinct">
    /// When true (the default) the combined result contains each n-gram only once.
    /// When false, n-grams shared between different words are repeated, but
    /// duplicates within a single word are still collapsed.
    /// </param>
    /// <returns>The n-grams found in the given list of words.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="order"/> is less than 1.</exception>
    public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, GenerationMode mode, bool distinct = true)
    {
        ValidateArguments(words, order);

        List<string> results = new List<string>();
        foreach (string word in words)
        {
            // Each word's n-grams are de-duplicated before being appended, so
            // only duplicates *across* words can survive when distinct is false.
            results.AddRange(EnumerateNGrams(word, order, mode).Distinct());
        }

        if (distinct)
        {
            return results.Distinct();
        }
        return results;
    }

    /// <summary>
    /// Generates a dictionary of weighted n-grams from the given list of words.
    /// The key is the n-gram itself, and the value is the number of times that
    /// n-gram occurs across all of the words.
    /// </summary>
    /// <param name="words">The words to n-gram-ise. Null or empty entries contribute nothing.</param>
    /// <param name="order">The order (length) of n-grams to generate. Must be at least 1.</param>
    /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
    /// <returns>The weighted dictionary of n-grams.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="order"/> is less than 1.</exception>
    public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order, GenerationMode mode)
    {
        ValidateArguments(words, order);

        Dictionary<string, int> results = new Dictionary<string, int>();
        foreach (string word in words)
        {
            foreach (string ngram in EnumerateNGrams(word, order, mode))
            {
                // TryGetValue leaves count at 0 for an unseen key, so a single
                // lookup covers both the "new" and the "existing" case.
                int count;
                results.TryGetValue(ngram, out count);
                results[ngram] = count + 1;
            }
        }
        return results;
    }

    /// <summary>
    /// Checks the arguments shared by the public entry points.
    /// </summary>
    /// <param name="words">The word list supplied by the caller.</param>
    /// <param name="order">The n-gram order supplied by the caller.</param>
    private static void ValidateArguments(IEnumerable<string> words, int order)
    {
        if (words == null)
        {
            throw new ArgumentNullException(nameof(words));
        }
        if (order < 1)
        {
            throw new ArgumentOutOfRangeException(nameof(order), order, "The n-gram order must be at least 1.");
        }
    }

    /// <summary>
    /// Enumerates every n-gram of the given order in the specified string, in
    /// order of appearance and including duplicates.
    /// </summary>
    /// <param name="str">The string to n-gram-ise. Null or empty yields nothing.</param>
    /// <param name="order">The order of n-gram to generate (already validated as at least 1).</param>
    /// <param name="mode">
    /// <see cref="GenerationMode.CharacterLevel"/> slides a window over the
    /// characters; any other value slides a window over space-separated words.
    /// </param>
    /// <returns>Every complete n-gram found in the string.</returns>
    private static IEnumerable<string> EnumerateNGrams(string str, int order, GenerationMode mode)
    {
        if (string.IsNullOrEmpty(str))
        {
            yield break;
        }

        if (mode == GenerationMode.CharacterLevel)
        {
            // "<=" so that the final window (ending on the last character) is
            // included; a string exactly `order` long yields itself.
            for (int i = 0; i <= str.Length - order; i++)
            {
                yield return str.Substring(i, order);
            }
        }
        else
        {
            // Dropping empty entries keeps repeated spaces from producing
            // empty "words", so every n-gram holds exactly `order` words.
            string[] parts = str.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            for (int i = 0; i <= parts.Length - order; i++)
            {
                yield return string.Join(" ", parts, i, order);
            }
        }
    }
}
|
|
|
|
|
}
|