Implement word-level ngrams
This commit is contained in:
parent
1cab57f1b6
commit
c55c8f4e8a
4 changed files with 69 additions and 35 deletions
|
@ -4,6 +4,12 @@ using System.Linq;
|
||||||
|
|
||||||
namespace MarkovGrams
|
namespace MarkovGrams
|
||||||
{
|
{
|
||||||
|
public enum GenerationMode
|
||||||
|
{
|
||||||
|
CharacterLevel,
|
||||||
|
WordLevel
|
||||||
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// A collection of methods to generate various different types of n-grams.
|
/// A collection of methods to generate various different types of n-grams.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
@ -15,12 +21,12 @@ namespace MarkovGrams
|
||||||
/// <param name="words">The words to turn into n-grams.</param>
|
/// <param name="words">The words to turn into n-grams.</param>
|
||||||
/// <param name="order">The order of n-gram to generate.</param>
|
/// <param name="order">The order of n-gram to generate.</param>
|
||||||
/// <returns>A unique list of n-grams found in the given list of words.</returns>
|
/// <returns>A unique list of n-grams found in the given list of words.</returns>
|
||||||
public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, bool distinct = true)
|
public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, GenerationMode mode, bool distinct = true)
|
||||||
{
|
{
|
||||||
List<string> results = new List<string>();
|
List<string> results = new List<string>();
|
||||||
foreach (string word in words)
|
foreach (string word in words)
|
||||||
{
|
{
|
||||||
results.AddRange(GenerateFlat(word, order));
|
results.AddRange(GenerateFlat(word, order, mode));
|
||||||
}
|
}
|
||||||
if (distinct) return results.Distinct();
|
if (distinct) return results.Distinct();
|
||||||
return results;
|
return results;
|
||||||
|
@ -32,13 +38,18 @@ namespace MarkovGrams
|
||||||
/// <param name="str">The string to n-gram-ise.</param>
|
/// <param name="str">The string to n-gram-ise.</param>
|
||||||
/// <param name="order">The order of n-gram to generate.</param>
|
/// <param name="order">The order of n-gram to generate.</param>
|
||||||
/// <returns>A unique list of n-grams found in the specified string.</returns>
|
/// <returns>A unique list of n-grams found in the specified string.</returns>
|
||||||
public static IEnumerable<string> GenerateFlat(string str, int order)
|
public static IEnumerable<string> GenerateFlat(string str, int order, GenerationMode mode)
|
||||||
{
|
{
|
||||||
List<string> results = new List<string>();
|
List<string> results = new List<string>();
|
||||||
for(int i = 0; i < str.Length - order; i++)
|
if (mode == GenerationMode.CharacterLevel) {
|
||||||
{
|
for (int i = 0; i < str.Length - order; i++)
|
||||||
results.Add(str.Substring(i, order));
|
results.Add(str.Substring(i, order));
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
string[] parts = str.Split(" ".ToCharArray());
|
||||||
|
for (int i = 0; i < parts.Length; i++)
|
||||||
|
results.Add(string.Join(" ", str.Skip(i).Take(order)));
|
||||||
|
}
|
||||||
return results.Distinct();
|
return results.Distinct();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -49,19 +60,11 @@ namespace MarkovGrams
|
||||||
/// <param name="words">The words to n-gram-ise.</param>
|
/// <param name="words">The words to n-gram-ise.</param>
|
||||||
/// <param name="order">The order of ngrams to generate.</param>
|
/// <param name="order">The order of ngrams to generate.</param>
|
||||||
/// <returns>The weighted dictionary of ngrams.</returns>
|
/// <returns>The weighted dictionary of ngrams.</returns>
|
||||||
public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order)
|
public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order, GenerationMode mode)
|
||||||
{
|
{
|
||||||
Dictionary<string, int> results = new Dictionary<string, int>();
|
Dictionary<string, int> results = new Dictionary<string, int>();
|
||||||
foreach(string word in words)
|
foreach(string word in words)
|
||||||
{
|
GenerateWeighted(word, order, mode);
|
||||||
Dictionary<string, int> wordNgrams = GenerateWeighted(word, order);
|
|
||||||
foreach(KeyValuePair<string, int> ngram in wordNgrams)
|
|
||||||
{
|
|
||||||
if(!results.ContainsKey(ngram.Key))
|
|
||||||
results[ngram.Key] = 0;
|
|
||||||
results[ngram.Key] += ngram.Value;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
/// <summary>
|
/// <summary>
|
||||||
|
@ -70,17 +73,16 @@ namespace MarkovGrams
|
||||||
/// <param name="str">The string to n-gram-ise.</param>
|
/// <param name="str">The string to n-gram-ise.</param>
|
||||||
/// <param name="order">The order of n-grams to generate.</param>
|
/// <param name="order">The order of n-grams to generate.</param>
|
||||||
/// <returns>The weighted dictionary of ngrams.</returns>
|
/// <returns>The weighted dictionary of ngrams.</returns>
|
||||||
public static Dictionary<string, int> GenerateWeighted(string str, int order)
|
public static void GenerateWeighted(string str, int order, GenerationMode mode, Dictionary<string, int> results)
|
||||||
{
|
{
|
||||||
Dictionary<string, int> results = new Dictionary<string, int>();
|
string[] parts = mode == GenerationMode.WordLevel ? str.Split(" ".ToCharArray()) : null;
|
||||||
for(int i = 0; i < str.Length - order; i++)
|
for(int i = 0; i < str.Length - order; i++)
|
||||||
{
|
{
|
||||||
string ngram = str.Substring(i, order);
|
string ngram = mode == GenerationMode.CharacterLevel ? str.Substring(i, order) : string.Join(" ", parts.Skip(i).Take(order));
|
||||||
if(!results.ContainsKey(ngram))
|
if(!results.ContainsKey(ngram))
|
||||||
results[ngram] = 0;
|
results[ngram] = 0;
|
||||||
results[ngram]++;
|
results[ngram]++;
|
||||||
}
|
}
|
||||||
return results;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,7 +20,8 @@ namespace MarkovGrams
|
||||||
{
|
{
|
||||||
public static int Main(string[] args)
|
public static int Main(string[] args)
|
||||||
{
|
{
|
||||||
Mode mode = Mode.None;
|
Mode operationMode = Mode.None;
|
||||||
|
GenerationMode generationMode = GenerationMode.CharacterLevel;
|
||||||
List<string> extras = new List<string>();
|
List<string> extras = new List<string>();
|
||||||
StreamReader wordlistSource = new StreamReader(Console.OpenStandardInput());
|
StreamReader wordlistSource = new StreamReader(Console.OpenStandardInput());
|
||||||
int order = 3, length = 8, count = 10;
|
int order = 3, length = 8, count = 10;
|
||||||
|
@ -62,8 +63,11 @@ namespace MarkovGrams
|
||||||
case "start-uppercase":
|
case "start-uppercase":
|
||||||
startOnUppercase = true;
|
startOnUppercase = true;
|
||||||
break;
|
break;
|
||||||
|
case "words":
|
||||||
|
generationMode = GenerationMode.WordLevel;
|
||||||
|
break;
|
||||||
case "help":
|
case "help":
|
||||||
mode = Mode.Help;
|
operationMode = Mode.Help;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
Console.Error.WriteLine($"Error: Unknown option '{args[i]}'.");
|
Console.Error.WriteLine($"Error: Unknown option '{args[i]}'.");
|
||||||
|
@ -71,8 +75,8 @@ namespace MarkovGrams
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(mode != Mode.Help && extras.Count > 0)
|
if(operationMode != Mode.Help && extras.Count > 0)
|
||||||
mode = (Mode)Enum.Parse(typeof(Mode), extras.ShiftAt(0).Replace("markov-w", "weightedmarkov"), true);
|
operationMode = (Mode)Enum.Parse(typeof(Mode), extras.ShiftAt(0).Replace("markov-w", "weightedmarkov"), true);
|
||||||
|
|
||||||
|
|
||||||
// ------------------------------------------------------------------------------------------
|
// ------------------------------------------------------------------------------------------
|
||||||
|
@ -86,12 +90,13 @@ namespace MarkovGrams
|
||||||
return new string[] { word.Trim() };
|
return new string[] { word.Trim() };
|
||||||
});
|
});
|
||||||
|
|
||||||
switch (mode)
|
switch (operationMode)
|
||||||
{
|
{
|
||||||
case Mode.Markov:
|
case Mode.Markov:
|
||||||
Stopwatch utimer = Stopwatch.StartNew();
|
Stopwatch utimer = Stopwatch.StartNew();
|
||||||
UnweightedMarkovChain unweightedChain = new UnweightedMarkovChain(
|
UnweightedMarkovChain unweightedChain = new UnweightedMarkovChain(
|
||||||
NGrams.GenerateFlat(words, order)
|
NGrams.GenerateFlat(words, order, generationMode),
|
||||||
|
generationMode
|
||||||
);
|
);
|
||||||
unweightedChain.StartOnUppercase = startOnUppercase;
|
unweightedChain.StartOnUppercase = startOnUppercase;
|
||||||
|
|
||||||
|
@ -103,7 +108,8 @@ namespace MarkovGrams
|
||||||
case Mode.WeightedMarkov:
|
case Mode.WeightedMarkov:
|
||||||
Stopwatch wtimer = Stopwatch.StartNew();
|
Stopwatch wtimer = Stopwatch.StartNew();
|
||||||
WeightedMarkovChain weightedChain = new WeightedMarkovChain(
|
WeightedMarkovChain weightedChain = new WeightedMarkovChain(
|
||||||
NGrams.GenerateWeighted(words, order)
|
NGrams.GenerateWeighted(words, order, generationMode),
|
||||||
|
generationMode
|
||||||
);
|
);
|
||||||
weightedChain.StartOnUppercase = startOnUppercase;
|
weightedChain.StartOnUppercase = startOnUppercase;
|
||||||
|
|
||||||
|
@ -113,7 +119,7 @@ namespace MarkovGrams
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case Mode.NGrams:
|
case Mode.NGrams:
|
||||||
foreach (string ngram in NGrams.GenerateFlat(words, order, ngramsUnique))
|
foreach (string ngram in NGrams.GenerateFlat(words, order, generationMode, ngramsUnique))
|
||||||
Console.WriteLine(ngram);
|
Console.WriteLine(ngram);
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
@ -137,6 +143,7 @@ namespace MarkovGrams
|
||||||
Console.WriteLine(" --order {number} Use the specified order when generating n-grams (default: 3)");
|
Console.WriteLine(" --order {number} Use the specified order when generating n-grams (default: 3)");
|
||||||
Console.WriteLine(" --length {number} The target length of word to generate (Not available in ngrams mode)");
|
Console.WriteLine(" --length {number} The target length of word to generate (Not available in ngrams mode)");
|
||||||
Console.WriteLine(" --count {number} The number of words to generate (Not valid in ngrams mode)");
|
Console.WriteLine(" --count {number} The number of words to generate (Not valid in ngrams mode)");
|
||||||
|
Console.WriteLine(" --words Generate ngrams on word-level instead of character-level (Applies to all modes)");
|
||||||
Console.WriteLine(" --no-split Don't split input words on whitespace - treat each line as a single word");
|
Console.WriteLine(" --no-split Don't split input words on whitespace - treat each line as a single word");
|
||||||
Console.WriteLine(" --lowercase Convert the input to lowercase before processing");
|
Console.WriteLine(" --lowercase Convert the input to lowercase before processing");
|
||||||
Console.WriteLine(" --start-uppercase Start the generating a word only with n-grams that start with a capital letter");
|
Console.WriteLine(" --start-uppercase Start the generating a word only with n-grams that start with a capital letter");
|
||||||
|
|
|
@ -25,13 +25,22 @@ namespace MarkovGrams
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public bool StartOnUppercase = false;
|
public bool StartOnUppercase = false;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// The generation mode to use when running the Markov Chain.
|
||||||
|
/// </summary>
|
||||||
|
/// <remarks>
|
||||||
|
/// The input n-grams must have been generated using the same mode specified here.
|
||||||
|
/// </remarks>
|
||||||
|
public GenerationMode Mode { get; private set; } = GenerationMode.CharacterLevel;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Creates a new character-based markov chain.
|
/// Creates a new character-based markov chain.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="inNgrams">The ngrams to populate the new markov chain with.</param>
|
/// <param name="inNgrams">The ngrams to populate the new markov chain with.</param>
|
||||||
public UnweightedMarkovChain(IEnumerable<string> inNgrams)
|
public UnweightedMarkovChain(IEnumerable<string> inNgrams, GenerationMode inMode)
|
||||||
{
|
{
|
||||||
ngrams = new List<string>(inNgrams);
|
ngrams = new List<string>(inNgrams);
|
||||||
|
Mode = inMode;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
|
@ -63,7 +72,7 @@ namespace MarkovGrams
|
||||||
while(result.Length < length)
|
while(result.Length < length)
|
||||||
{
|
{
|
||||||
// The substring that the next ngram in the chain needs to start with
|
// The substring that the next ngram in the chain needs to start with
|
||||||
string nextStartsWith = lastNgram.Substring(1);
|
string nextStartsWith = Mode == GenerationMode.CharacterLevel ? lastNgram.Substring(1) : lastNgram.Split(' ')[0];
|
||||||
// Get a list of possible n-grams we could choose from next
|
// Get a list of possible n-grams we could choose from next
|
||||||
List<string> nextNgrams = ngrams.FindAll(gram => gram.StartsWith(nextStartsWith));
|
List<string> nextNgrams = ngrams.FindAll(gram => gram.StartsWith(nextStartsWith));
|
||||||
// If there aren't any choices left, we can't exactly keep adding to the new string any more :-(
|
// If there aren't any choices left, we can't exactly keep adding to the new string any more :-(
|
||||||
|
@ -72,7 +81,10 @@ namespace MarkovGrams
|
||||||
// Pick a random n-gram from the list
|
// Pick a random n-gram from the list
|
||||||
string nextNgram = nextNgrams.ElementAt(rand.Next(0, nextNgrams.Count));
|
string nextNgram = nextNgrams.ElementAt(rand.Next(0, nextNgrams.Count));
|
||||||
// Add the last character from the n-gram to the string we're building
|
// Add the last character from the n-gram to the string we're building
|
||||||
|
if (Mode == GenerationMode.CharacterLevel)
|
||||||
result += nextNgram[nextNgram.Length - 1];
|
result += nextNgram[nextNgram.Length - 1];
|
||||||
|
else
|
||||||
|
result += string.Join(" ", nextNgram.Split(' ').Skip(1));
|
||||||
lastNgram = nextNgram;
|
lastNgram = nextNgram;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -24,17 +24,27 @@ namespace MarkovGrams
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public bool StartOnUppercase = false;
|
public bool StartOnUppercase = false;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// The generation mode to use when running the Markov Chain.
|
||||||
|
/// </summary>
|
||||||
|
/// <remarks>
|
||||||
|
/// The input n-grams must have been generated using the same mode specified here.
|
||||||
|
/// </remarks>
|
||||||
|
public GenerationMode Mode { get; private set; } = GenerationMode.CharacterLevel;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Creates a new character-based markov chain.
|
/// Creates a new character-based markov chain.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="inNgrams">The ngrams to populate the new markov chain with.</param>
|
/// <param name="inNgrams">The ngrams to populate the new markov chain with.</param>
|
||||||
public WeightedMarkovChain(Dictionary<string, double> inNgrams) {
|
public WeightedMarkovChain(Dictionary<string, double> inNgrams, GenerationMode inMode) {
|
||||||
ngrams = inNgrams;
|
ngrams = inNgrams;
|
||||||
|
Mode = inMode;
|
||||||
}
|
}
|
||||||
public WeightedMarkovChain(Dictionary<string, int> inNgrams) {
|
public WeightedMarkovChain(Dictionary<string, int> inNgrams, GenerationMode inMode) {
|
||||||
ngrams = new Dictionary<string, double>();
|
ngrams = new Dictionary<string, double>();
|
||||||
foreach (KeyValuePair<string, int> ngram in inNgrams)
|
foreach (KeyValuePair<string, int> ngram in inNgrams)
|
||||||
ngrams[ngram.Key] = ngram.Value;
|
ngrams[ngram.Key] = ngram.Value;
|
||||||
|
Mode = inMode;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
|
@ -77,7 +87,7 @@ namespace MarkovGrams
|
||||||
{
|
{
|
||||||
wrandom.ClearContents();
|
wrandom.ClearContents();
|
||||||
// The substring that the next ngram in the chain needs to start with
|
// The substring that the next ngram in the chain needs to start with
|
||||||
string nextStartsWith = lastNgram.Substring(1);
|
string nextStartsWith = Mode == GenerationMode.CharacterLevel ? lastNgram.Substring(1) : lastNgram.Split(' ')[0];
|
||||||
// Get a list of possible n-grams we could choose from next
|
// Get a list of possible n-grams we could choose from next
|
||||||
Dictionary<string, double> convNextNgrams = new Dictionary<string, double>();
|
Dictionary<string, double> convNextNgrams = new Dictionary<string, double>();
|
||||||
ngrams.Where(gram_data => gram_data.Key.StartsWith(nextStartsWith))
|
ngrams.Where(gram_data => gram_data.Key.StartsWith(nextStartsWith))
|
||||||
|
@ -89,7 +99,10 @@ namespace MarkovGrams
|
||||||
// Pick a random n-gram from the list
|
// Pick a random n-gram from the list
|
||||||
string nextNgram = wrandom.Next();
|
string nextNgram = wrandom.Next();
|
||||||
// Add the last character from the n-gram to the string we're building
|
// Add the last character from the n-gram to the string we're building
|
||||||
|
if (Mode == GenerationMode.CharacterLevel)
|
||||||
result += nextNgram[nextNgram.Length - 1];
|
result += nextNgram[nextNgram.Length - 1];
|
||||||
|
else
|
||||||
|
result += string.Join(" ", nextNgram.Split(' ').Skip(1));
|
||||||
lastNgram = nextNgram;
|
lastNgram = nextNgram;
|
||||||
}
|
}
|
||||||
wrandom.ClearContents();
|
wrandom.ClearContents();
|
||||||
|
|
Loading…
Reference in a new issue