diff --git a/MarkovGrams/NGrams.cs b/MarkovGrams/NGrams.cs
index 913ac49..2c7bba5 100644
--- a/MarkovGrams/NGrams.cs
+++ b/MarkovGrams/NGrams.cs
@@ -4,6 +4,12 @@ using System.Linq;
namespace MarkovGrams
{
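+ /// <summary>
+ /// Selects whether n-grams are built from runs of characters or from runs of whitespace-separated words.
+ /// </summary>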
+ public enum GenerationMode
+ {
+ CharacterLevel,
+ WordLevel
+ }
+
/// <summary>
/// A collection of methods to generate various different types of n-grams.
/// </summary>
@@ -15,12 +21,12 @@ namespace MarkovGrams
/// <param name="words">The words to turn into n-grams.</param>
/// <param name="order">The order of n-gram to generate.</param>
/// <returns>A unique list of n-grams found in the given list of words.</returns>
- public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, bool distinct = true)
+ public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, GenerationMode mode, bool distinct = true)
{
List<string> results = new List<string>();
foreach (string word in words)
{
- results.AddRange(GenerateFlat(word, order));
+ results.AddRange(GenerateFlat(word, order, mode));
}
if (distinct) return results.Distinct();
return results;
@@ -32,12 +38,17 @@ namespace MarkovGrams
/// <param name="str">The string to n-gram-ise.</param>
/// <param name="order">The order of n-gram to generate.</param>
/// <returns>A unique list of n-grams found in the specified string.</returns>
- public static IEnumerable<string> GenerateFlat(string str, int order)
+ public static IEnumerable<string> GenerateFlat(string str, int order, GenerationMode mode)
{
List<string> results = new List<string>();
- for(int i = 0; i < str.Length - order; i++)
- {
- results.Add(str.Substring(i, order));
+ if (mode == GenerationMode.CharacterLevel) {
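+ // Character level: slide a window of 'order' characters along the string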
+ for (int i = 0; i < str.Length - order; i++)
+ results.Add(str.Substring(i, order));
+ }
+ else {
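+ // Word level: split on spaces, then slide a window of 'order' words along the token list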
+ string[] parts = str.Split(" ".ToCharArray());
+ for (int i = 0; i < parts.Length - order; i++)
+ results.Add(string.Join(" ", parts.Skip(i).Take(order)));
}
return results.Distinct();
}
@@ -49,19 +60,11 @@ namespace MarkovGrams
/// <param name="words">The words to n-gram-ise.</param>
/// <param name="order">The order of n-grams to generate.</param>
/// <returns>The weighted dictionary of n-grams.</returns>
- public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order)
+ public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order, GenerationMode mode)
{
Dictionary<string, int> results = new Dictionary<string, int>();
foreach(string word in words)
- {
- Dictionary<string, int> wordNgrams = GenerateWeighted(word, order);
- foreach(KeyValuePair<string, int> ngram in wordNgrams)
- {
- if(!results.ContainsKey(ngram.Key))
- results[ngram.Key] = 0;
- results[ngram.Key] += ngram.Value;
- }
- }
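+ // Counts for each word are accumulated directly into the shared results dictionary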
+ GenerateWeighted(word, order, mode, results);
return results;
}
/// <summary>
@@ -70,17 +73,16 @@ namespace MarkovGrams
/// <param name="str">The string to n-gram-ise.</param>
/// <param name="order">The order of n-grams to generate.</param>
/// <returns>The weighted dictionary of n-grams.</returns>
- public static Dictionary<string, int> GenerateWeighted(string str, int order)
+ public static void GenerateWeighted(string str, int order, GenerationMode mode, Dictionary<string, int> results)
{
- Dictionary<string, int> results = new Dictionary<string, int>();
+ string[] parts = mode == GenerationMode.WordLevel ? str.Split(" ".ToCharArray()) : null;
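+ // parts is only needed at word level; character level indexes straight into the string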
- for(int i = 0; i < str.Length - order; i++)
+ int bound = mode == GenerationMode.CharacterLevel ? str.Length - order : parts.Length - order;
+ for(int i = 0; i < bound; i++)
{
- string ngram = str.Substring(i, order);
+ string ngram = mode == GenerationMode.CharacterLevel ? str.Substring(i, order) : string.Join(" ", parts.Skip(i).Take(order));
if(!results.ContainsKey(ngram))
results[ngram] = 0;
results[ngram]++;
}
- return results;
}
}
}
diff --git a/MarkovGrams/Program.cs b/MarkovGrams/Program.cs
index c29a865..405b36a 100644
--- a/MarkovGrams/Program.cs
+++ b/MarkovGrams/Program.cs
@@ -20,7 +20,8 @@ namespace MarkovGrams
{
public static int Main(string[] args)
{
- Mode mode = Mode.None;
+ Mode operationMode = Mode.None;
+ GenerationMode generationMode = GenerationMode.CharacterLevel;
List<string> extras = new List<string>();
StreamReader wordlistSource = new StreamReader(Console.OpenStandardInput());
int order = 3, length = 8, count = 10;
@@ -62,8 +63,11 @@ namespace MarkovGrams
case "start-uppercase":
startOnUppercase = true;
break;
+ case "words":
+ generationMode = GenerationMode.WordLevel;
+ break;
case "help":
- mode = Mode.Help;
+ operationMode = Mode.Help;
break;
default:
Console.Error.WriteLine($"Error: Unknown option '{args[i]}'.");
@@ -71,8 +75,8 @@ namespace MarkovGrams
}
}
- if(mode != Mode.Help && extras.Count > 0)
- mode = (Mode)Enum.Parse(typeof(Mode), extras.ShiftAt(0).Replace("markov-w", "weightedmarkov"), true);
+ if(operationMode != Mode.Help && extras.Count > 0)
+ operationMode = (Mode)Enum.Parse(typeof(Mode), extras.ShiftAt(0).Replace("markov-w", "weightedmarkov"), true);
// ------------------------------------------------------------------------------------------
@@ -86,12 +90,13 @@ namespace MarkovGrams
return new string[] { word.Trim() };
});
- switch (mode)
+ switch (operationMode)
{
case Mode.Markov:
Stopwatch utimer = Stopwatch.StartNew();
UnweightedMarkovChain unweightedChain = new UnweightedMarkovChain(
- NGrams.GenerateFlat(words, order)
+ NGrams.GenerateFlat(words, order, generationMode),
+ generationMode
);
unweightedChain.StartOnUppercase = startOnUppercase;
@@ -103,7 +108,8 @@ namespace MarkovGrams
case Mode.WeightedMarkov:
Stopwatch wtimer = Stopwatch.StartNew();
WeightedMarkovChain weightedChain = new WeightedMarkovChain(
- NGrams.GenerateWeighted(words, order)
+ NGrams.GenerateWeighted(words, order, generationMode),
+ generationMode
);
weightedChain.StartOnUppercase = startOnUppercase;
@@ -113,7 +119,7 @@ namespace MarkovGrams
break;
case Mode.NGrams:
- foreach (string ngram in NGrams.GenerateFlat(words, order, ngramsUnique))
+ foreach (string ngram in NGrams.GenerateFlat(words, order, generationMode, ngramsUnique))
Console.WriteLine(ngram);
break;
@@ -137,6 +143,7 @@ namespace MarkovGrams
Console.WriteLine(" --order {number} Use the specified order when generating n-grams (default: 3)");
Console.WriteLine(" --length {number} The target length of word to generate (Not available in ngrams mode)");
Console.WriteLine(" --count {number} The number of words to generate (Not valid in ngrams mode)");
+ Console.WriteLine(" --words Generate ngrams on word-level instead of character-level (Applies to all modes)");
Console.WriteLine(" --no-split Don't split input words on whitespace - treat each line as a single word");
Console.WriteLine(" --lowercase Convert the input to lowercase before processing");
Console.WriteLine(" --start-uppercase Start the generating a word only with n-grams that start with a capital letter");
diff --git a/MarkovGrams/UnweightedMarkovChain.cs b/MarkovGrams/UnweightedMarkovChain.cs
index 415048d..d5899e7 100644
--- a/MarkovGrams/UnweightedMarkovChain.cs
+++ b/MarkovGrams/UnweightedMarkovChain.cs
@@ -25,13 +25,22 @@ namespace MarkovGrams
/// </summary>
public bool StartOnUppercase = false;
+ /// <summary>
+ /// The generation mode to use when running the Markov Chain.
+ /// </summary>
+ /// <remarks>
+ /// The input n-grams must have been generated using the same mode specified here.
+ /// </remarks>
+ public GenerationMode Mode { get; private set; } = GenerationMode.CharacterLevel;
+
/// <summary>
/// Creates a new character-based markov chain.
/// </summary>
/// <param name="inNgrams">The ngrams to populate the new markov chain with.</param>
- public UnweightedMarkovChain(IEnumerable<string> inNgrams)
+ public UnweightedMarkovChain(IEnumerable<string> inNgrams, GenerationMode inMode)
{
ngrams = new List<string>(inNgrams);
+ Mode = inMode;
}
/// <summary>
@@ -63,7 +72,7 @@ namespace MarkovGrams
while(result.Length < length)
{
// The substring that the next ngram in the chain needs to start with
- string nextStartsWith = lastNgram.Substring(1);
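+ // At word level, the overlap is every word except the first: the analogue of Substring(1) at character level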
+ string nextStartsWith = Mode == GenerationMode.CharacterLevel ? lastNgram.Substring(1) : string.Join(" ", lastNgram.Split(' ').Skip(1));
// Get a list of possible n-grams we could choose from next
List<string> nextNgrams = ngrams.FindAll(gram => gram.StartsWith(nextStartsWith));
// If there aren't any choices left, we can't exactly keep adding to the new string any more :-(
@@ -72,7 +81,10 @@ namespace MarkovGrams
// Pick a random n-gram from the list
string nextNgram = nextNgrams.ElementAt(rand.Next(0, nextNgrams.Count));
// Add the last character from the n-gram to the string we're building
- result += nextNgram[nextNgram.Length - 1];
+ if (Mode == GenerationMode.CharacterLevel)
+ result += nextNgram[nextNgram.Length - 1];
+ else
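+ // At word level, append just the final word of the chosen n-gram, separated by a space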
+ result += " " + nextNgram.Split(' ').Last();
lastNgram = nextNgram;
}
diff --git a/MarkovGrams/WeightedMarkovChain.cs b/MarkovGrams/WeightedMarkovChain.cs
index 3ff9f13..3e30d9e 100644
--- a/MarkovGrams/WeightedMarkovChain.cs
+++ b/MarkovGrams/WeightedMarkovChain.cs
@@ -24,17 +24,27 @@ namespace MarkovGrams
/// </summary>
public bool StartOnUppercase = false;
+ /// <summary>
+ /// The generation mode to use when running the Markov Chain.
+ /// </summary>
+ /// <remarks>
+ /// The input n-grams must have been generated using the same mode specified here.
+ /// </remarks>
+ public GenerationMode Mode { get; private set; } = GenerationMode.CharacterLevel;
+
/// <summary>
/// Creates a new character-based markov chain.
/// </summary>
/// <param name="inNgrams">The ngrams to populate the new markov chain with.</param>
- public WeightedMarkovChain(Dictionary<string, double> inNgrams) {
+ public WeightedMarkovChain(Dictionary<string, double> inNgrams, GenerationMode inMode) {
ngrams = inNgrams;
+ Mode = inMode;
}
- public WeightedMarkovChain(Dictionary<string, int> inNgrams) {
+ public WeightedMarkovChain(Dictionary<string, int> inNgrams, GenerationMode inMode) {
ngrams = new Dictionary<string, double>();
foreach (KeyValuePair<string, int> ngram in inNgrams)
ngrams[ngram.Key] = ngram.Value;
+ Mode = inMode;
}
/// <summary>
@@ -77,7 +87,7 @@ namespace MarkovGrams
{
wrandom.ClearContents();
// The substring that the next ngram in the chain needs to start with
- string nextStartsWith = lastNgram.Substring(1);
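+ // At word level, the overlap is every word except the first: the analogue of Substring(1) at character level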
+ string nextStartsWith = Mode == GenerationMode.CharacterLevel ? lastNgram.Substring(1) : string.Join(" ", lastNgram.Split(' ').Skip(1));
// Get a list of possible n-grams we could choose from next
Dictionary<string, double> convNextNgrams = new Dictionary<string, double>();
ngrams.Where(gram_data => gram_data.Key.StartsWith(nextStartsWith))
@@ -89,7 +99,10 @@ namespace MarkovGrams
// Pick a random n-gram from the list
string nextNgram = wrandom.Next();
// Add the last character from the n-gram to the string we're building
- result += nextNgram[nextNgram.Length - 1];
+ if (Mode == GenerationMode.CharacterLevel)
+ result += nextNgram[nextNgram.Length - 1];
+ else
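+ // At word level, append just the final word of the chosen n-gram, separated by a space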
+ result += " " + nextNgram.Split(' ').Last();
lastNgram = nextNgram;
}
wrandom.ClearContents();