MarkovGrams/MarkovGrams/NGrams.cs

106 lines
3.5 KiB
C#

using System;
using System.Collections.Generic;
using System.Linq;
namespace MarkovGrams
{
    /// <summary>
    /// Selects the unit that n-grams are built from.
    /// </summary>
    public enum GenerationMode
    {
        /// <summary>Each n-gram is a run of consecutive characters.</summary>
        CharacterLevel,
        /// <summary>Each n-gram is a run of consecutive space-separated words.</summary>
        WordLevel
    }

    /// <summary>
    /// A collection of methods to generate various different types of n-grams.
    /// </summary>
    public static class NGrams
    {
        /// <summary>
        /// Whether status messages are written to the console. Defaults to <c>true</c>.
        /// </summary>
        public static bool Verbose { get; set; } = true;

        /// <summary>
        /// Generates a list of the n-grams found in the given list of words.
        /// </summary>
        /// <param name="words">The words to turn into n-grams.</param>
        /// <param name="order">The order (length) of n-gram to generate. Must be at least 1.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <param name="distinct">
        /// When <c>true</c> (the default) duplicates are removed from the whole result.
        /// When <c>false</c> the same n-gram may appear once per word that contains it;
        /// each individual word still contributes any given n-gram only once.
        /// </param>
        /// <returns>The n-grams found in the given list of words.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="order"/> is less than 1.</exception>
        public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, GenerationMode mode, bool distinct = true)
        {
            ValidateArguments(words, order);

            List<string> results = new List<string>();
            foreach (string word in words)
            {
                results.AddRange(GenerateFlat(word, order, mode));
            }

            if (distinct)
            {
                // Materialise so callers can enumerate repeatedly without re-running Distinct().
                return results.Distinct().ToList();
            }
            return results;
        }

        /// <summary>
        /// Generates a unique list of n-grams from the given string.
        /// </summary>
        /// <param name="str">The string to n-gram-ise.</param>
        /// <param name="order">The order of n-gram to generate.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <returns>A unique list of n-grams found in the specified string.</returns>
        private static IEnumerable<string> GenerateFlat(string str, int order, GenerationMode mode)
        {
            return EnumerateNGrams(str, order, mode).Distinct().ToList();
        }

        /// <summary>
        /// Generates a dictionary of weighted n-grams from the given list of words.
        /// The key is the ngram itself, and the value is the linear weight of the ngram
        /// (the number of times it occurs across all of the words).
        /// </summary>
        /// <param name="words">The words to n-gram-ise.</param>
        /// <param name="order">The order of ngrams to generate. Must be at least 1.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <returns>The weighted dictionary of ngrams.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="order"/> is less than 1.</exception>
        public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order, GenerationMode mode)
        {
            ValidateArguments(words, order);

            Dictionary<string, int> results = new Dictionary<string, int>();
            foreach (string word in words)
            {
                GenerateWeighted(word, order, mode, results);
            }

            // Honour the Verbose switch instead of always writing to the console.
            if (Verbose)
            {
                Console.WriteLine(" - done");
            }
            return results;
        }

        /// <summary>
        /// Adds the occurrence count of every n-gram in the specified string to an existing dictionary.
        /// </summary>
        /// <param name="str">The string to n-gram-ise.</param>
        /// <param name="order">The order of n-grams to generate.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <param name="results">The dictionary of n-gram counts to update in place.</param>
        private static void GenerateWeighted(string str, int order, GenerationMode mode, Dictionary<string, int> results)
        {
            foreach (string ngram in EnumerateNGrams(str, order, mode))
            {
                // TryGetValue leaves count at 0 for an unseen n-gram, so one lookup covers both cases.
                int count;
                results.TryGetValue(ngram, out count);
                results[ngram] = count + 1;
            }
        }

        /// <summary>
        /// Yields every n-gram of the given order in the string, in order of appearance and
        /// including repeats. Strings with fewer than <paramref name="order"/> units yield nothing.
        /// </summary>
        /// <param name="str">The string to n-gram-ise.</param>
        /// <param name="order">The number of characters or words in each n-gram.</param>
        /// <param name="mode">Whether n-grams are built from characters or from space-separated words.</param>
        /// <returns>A lazily-evaluated sequence of n-grams.</returns>
        private static IEnumerable<string> EnumerateNGrams(string str, int order, GenerationMode mode)
        {
            if (mode == GenerationMode.CharacterLevel)
            {
                // "<=" so that the final window (ending on the last character) is included.
                for (int i = 0; i <= str.Length - order; i++)
                {
                    yield return str.Substring(i, order);
                }
            }
            else
            {
                string[] parts = str.Split(' ');
                // "<=" so that the final window (ending on the last word) is included.
                for (int i = 0; i <= parts.Length - order; i++)
                {
                    yield return string.Join(" ", parts, i, order);
                }
            }
        }

        /// <summary>
        /// Checks the arguments shared by the public generator methods.
        /// </summary>
        /// <param name="words">The word list supplied by the caller.</param>
        /// <param name="order">The n-gram order supplied by the caller.</param>
        private static void ValidateArguments(IEnumerable<string> words, int order)
        {
            if (words == null)
            {
                throw new ArgumentNullException(nameof(words));
            }
            if (order < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(order), order, "The n-gram order must be at least 1.");
            }
        }
    }
}