Compare commits

...

2 commits

8 changed files with 183 additions and 109 deletions

View file

@ -39,6 +39,7 @@
<Compile Include="Utilities\WeightedRandom.cs" /> <Compile Include="Utilities\WeightedRandom.cs" />
<Compile Include="WeightedMarkovChain.cs" /> <Compile Include="WeightedMarkovChain.cs" />
<Compile Include="Utilities\LinqExtensions.cs" /> <Compile Include="Utilities\LinqExtensions.cs" />
<Compile Include="Utilities\StreamReaderExtensions.cs" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<Folder Include="Utilities\" /> <Folder Include="Utilities\" />

View file

@ -15,14 +15,15 @@ namespace MarkovGrams
/// <param name="words">The words to turn into n-grams.</param> /// <param name="words">The words to turn into n-grams.</param>
/// <param name="order">The order of n-gram to generate.</param> /// <param name="order">The order of n-gram to generate.</param>
/// <returns>A unique list of n-grams found in the given list of words.</returns> /// <returns>A unique list of n-grams found in the given list of words.</returns>
/// <param name="distinct">When true (the default) duplicate n-grams are removed from the result; when false every n-gram is returned, duplicates included.</param>
public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, bool distinct = true)
{
    // Collect the n-grams of every word into a concrete list up front, so the
    // incoming word sequence is walked exactly once, inside this call.
    List<string> allNgrams = words
        .SelectMany(word => GenerateFlat(word, order))
        .ToList();

    // Distinct() is a lazy view over the already-built list.
    return distinct ? allNgrams.Distinct() : allNgrams;
}
/// <summary> /// <summary>

View file

@ -3,122 +3,139 @@ using System.Collections.Generic;
using System.Diagnostics; using System.Diagnostics;
using System.IO; using System.IO;
using System.Linq; using System.Linq;
using MarkovGrams.Utilities;
namespace MarkovGrams namespace MarkovGrams
{ {
/// <summary>
/// The operating modes that can be selected on the command line.
/// </summary>
public enum Mode
{
    /// <summary>Print the usage text.</summary>
    Help = 0,
    /// <summary>Output raw n-grams.</summary>
    NGrams = 1,
    /// <summary>Generate words with an unweighted markov chain.</summary>
    Markov = 2,
    /// <summary>Generate words with a weighted markov chain.</summary>
    WeightedMarkov = 3
}
class MainClass class MainClass
{ {
public static int Main(string[] args) public static int Main(string[] args)
{ {
if(args.Length < 1) List<string> extras = new List<string>();
StreamReader wordlistSource = new StreamReader(Console.OpenStandardInput());
int order = 3, length = 8, count = 10;
bool splitOnWhitespace = true,
ngramsUnique = true,
convertLowercase = false,
startOnUppercase = false;
for (int i = 0; i < args.Length; i++)
{ {
Console.WriteLine("Usage:"); if (!args[i].StartsWith("-"))
Console.WriteLine(" ./MarkovGrams.exe <command>"); {
Console.WriteLine(); extras.Add(args[i]);
Console.WriteLine("Available commands:"); continue;
Console.WriteLine(" markov:"); }
Console.WriteLine(" Generate new words using an unweighted markov chain.");
Console.WriteLine(" markov-w:"); switch (args[i].TrimStart("-".ToCharArray()))
Console.WriteLine(" Generate new words using a weighted markov chain."); {
Console.WriteLine(" ngrams:"); case "wordlist":
Console.WriteLine(" Generate raw unique n-grams"); wordlistSource = new StreamReader(args[++i]);
Console.WriteLine(); break;
Console.WriteLine("Type just ./MarovGrams.exe <command> to see command-specific help."); case "order":
return 1; order = int.Parse(args[++i]);
break;
case "length":
length = int.Parse(args[++i]);
break;
case "count":
count = int.Parse(args[++i]);
break;
case "no-split":
splitOnWhitespace = false;
break;
case "no-unique":
ngramsUnique = false;
break;
case "lowercase":
convertLowercase = true;
break;
case "start-uppercase":
startOnUppercase = true;
break;
default:
Console.Error.WriteLine($"Error: Unknown option '{args[i]}'.");
return 1;
}
} }
string mode = args[0];
string wordlistFilename; Mode mode = extras.Count > 0 ? (Mode)Enum.Parse(typeof(Mode), extras.ShiftAt(0).Replace("markov-w", "weightedmarkov"), true) : Mode.Help;
int order;
IEnumerable<string> words, ngrams;
switch(mode)
// ------------------------------------------------------------------------------------------
IEnumerable<string> words = wordlistSource.ReadAllLines().SelectMany((string word) => {
word = word.Trim();
if (convertLowercase)
word = word.ToLower();
if (splitOnWhitespace)
return word.Split(' ');
return new string[] { word.Trim() };
});
switch (mode)
{ {
case "markov": case Mode.Markov:
if(args.Length != 5)
{
Console.WriteLine("markov command usage:");
Console.WriteLine(" ./MarkovGrams.exe markov <wordlist.txt> <order> <length> <count>");
Console.WriteLine();
Console.WriteLine("<wordlist.txt> The path to the wordlist to read from.");
Console.WriteLine("<order> The order of the n-grams to use.");
Console.WriteLine("<length> The length of word to generate.");
Console.WriteLine("<count> The number of words to generate.");
Console.WriteLine();
return 1;
}
wordlistFilename = args[1];
order = int.Parse(args[2]);
int desiredStringLength = int.Parse(args[3]);
int count = int.Parse(args[4]);
words = File.ReadLines(wordlistFilename).SelectMany(word => word.Trim().Split(' '));
ngrams = NGrams.GenerateFlat(words, order);
Stopwatch utimer = Stopwatch.StartNew(); Stopwatch utimer = Stopwatch.StartNew();
UnweightedMarkovChain chain = new UnweightedMarkovChain(ngrams); UnweightedMarkovChain unweightedChain = new UnweightedMarkovChain(
NGrams.GenerateFlat(words, order)
);
unweightedChain.StartOnUppercase = startOnUppercase;
for(int i = 0; i < count; i++) for (int i = 0; i < count; i++)
Console.WriteLine(chain.Generate(desiredStringLength)); Console.WriteLine(unweightedChain.Generate(length));
Console.Error.WriteLine($"{count} words in {utimer.ElapsedMilliseconds}ms"); Console.Error.WriteLine($"{count} words in {utimer.ElapsedMilliseconds}ms");
break; break;
case "markov-w": case Mode.WeightedMarkov:
if (args.Length != 5)
{
Console.WriteLine("markov-w command usage:");
Console.WriteLine(" ./MarkovGrams.exe markov-w <wordlist.txt> <order> <length> <count>");
Console.WriteLine();
Console.WriteLine("<wordlist.txt> The path to the wordlist to read from.");
Console.WriteLine("<order> The order of the n-grams to use.");
Console.WriteLine("<length> The length of word to generate.");
Console.WriteLine("<count> The number of words to generate.");
Console.WriteLine();
return 1;
}
wordlistFilename = args[1];
order = int.Parse(args[2]);
int weightedDesiredStringLength = int.Parse(args[3]);
int weightedCount = int.Parse(args[4]);
words = File.ReadLines(wordlistFilename).SelectMany(word => word.Trim().Split(' '));
ngrams = NGrams.GenerateFlat(words, order);
Stopwatch wtimer = Stopwatch.StartNew(); Stopwatch wtimer = Stopwatch.StartNew();
WeightedMarkovChain weightedChain = new WeightedMarkovChain(ngrams); WeightedMarkovChain weightedChain = new WeightedMarkovChain(
NGrams.GenerateWeighted(words, order)
);
weightedChain.StartOnUppercase = startOnUppercase;
for (int i = 0; i < weightedCount; i++) for (int i = 0; i < count; i++)
Console.WriteLine(weightedChain.Generate(weightedDesiredStringLength)); Console.WriteLine(weightedChain.Generate(length));
Console.Error.WriteLine($"{weightedCount} words in {wtimer.ElapsedMilliseconds}ms"); Console.Error.WriteLine($"{count} words in {wtimer.ElapsedMilliseconds}ms");
break; break;
case "ngrams":
if(args.Length != 3)
{
Console.WriteLine("ngrams command usage:");
Console.WriteLine(" ./MarkovGrams.exe <wordlist.txt> <order>");
Console.WriteLine();
Console.WriteLine("<wordlist.txt> The path to the wordlist to read from.");
Console.WriteLine("<order> The order of n-grams to generate.");
Console.WriteLine();
return 1;
}
wordlistFilename = args[1]; case Mode.NGrams:
order = int.Parse(args[2]); foreach (string ngram in NGrams.GenerateFlat(words, order, ngramsUnique))
words = File.ReadLines(wordlistFilename).SelectMany(word => word.Trim().Split(' '));
ngrams = NGrams.GenerateFlat(words, order);
foreach(string ngram in ngrams)
Console.WriteLine(ngram); Console.WriteLine(ngram);
break; break;
case Mode.Help:
default: default:
Console.WriteLine("Unknown command {0}."); Console.WriteLine("Usage:");
Console.WriteLine("Available commands:"); Console.WriteLine(" ./MarkovGrams.exe <mode> [options]");
Console.WriteLine(" markov Generate words with a markov chain");
Console.WriteLine(" ngrams Generate unique ngrams from wordlists");
Console.WriteLine(); Console.WriteLine();
Console.WriteLine("Available modes:");
Console.WriteLine(" markov:");
Console.WriteLine(" Generate new words using an unweighted markov chain.");
Console.WriteLine(" markov-w:");
Console.WriteLine(" Generate new words using a weighted markov chain.");
Console.WriteLine(" ngrams:");
Console.WriteLine(" Generate raw unique n-grams");
Console.WriteLine();
Console.WriteLine("Available options:");
Console.WriteLine(" --wordlist {filename} Read the wordlist from the specified filename instead of stdin");
Console.WriteLine(" --order {number} Use the specified order when generating n-grams (default: 3)");
Console.WriteLine(" --length {number} The target length of word to generate (Not available in ngrams mode)");
Console.WriteLine(" --count {number} The number of words to generate (Not valid in ngrams mode)");
Console.WriteLine(" --no-split Don't split input words on whitespace - treat each line as a single word");
Console.WriteLine(" --lowercase Convert the input to lowercase before processing");
Console.WriteLine(" --start-uppercase Start the generating a word only with n-grams that start with a capital letter");
Console.WriteLine(" --no-unique Don't remove duplicates from the list of ngrams (Only valid in ngrams mode)");
Console.WriteLine("Type just ./MarkovGrams.exe <mode> to see mode-specific help.");
return 1; return 1;
} }

View file

@ -12,12 +12,18 @@ namespace MarkovGrams
/// <summary> /// <summary>
/// The random number generator /// The random number generator
/// </summary> /// </summary>
Random rand = new Random(); private Random rand = new Random();
/// <summary> /// <summary>
/// The ngrams that this markov chain currently contains. /// The ngrams that this markov chain currently contains.
/// </summary> /// </summary>
List<string> ngrams; private List<string> ngrams;
/// <summary>
/// Whether to always start generating a new word from an n-gram that starts with
/// an uppercase letter.
/// </summary>
public bool StartOnUppercase = false;
/// <summary> /// <summary>
/// Creates a new character-based markov chain. /// Creates a new character-based markov chain.
@ -34,7 +40,10 @@ namespace MarkovGrams
/// <returns>A random ngram from this UnweightedMarkovChain's cache of ngrams.</returns> /// <returns>A random ngram from this UnweightedMarkovChain's cache of ngrams.</returns>
/// <exception cref="InvalidOperationException">Thrown when there is no n-gram to choose from (taking StartOnUppercase into account).</exception>
public string RandomNgram()
{
    // Build the candidate list once. When StartOnUppercase is set the filter
    // is materialised, so it is not re-run for the count and again for the
    // lookup; otherwise the backing list is used directly.
    // Empty n-grams have no first character and are never valid uppercase starts.
    List<string> candidates = StartOnUppercase
        ? ngrams.Where(ngram => ngram.Length > 0 && char.IsUpper(ngram[0])).ToList()
        : ngrams;

    if (candidates.Count == 0)
        throw new InvalidOperationException($"Error: No valid starting ngrams were found (StartOnUppercase: {StartOnUppercase}).");

    return candidates[rand.Next(0, candidates.Count)];
}
/// <summary> /// <summary>

View file

@ -1,5 +1,6 @@
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Linq;
namespace MarkovGrams.Utilities namespace MarkovGrams.Utilities
{ {
@ -12,5 +13,12 @@ namespace MarkovGrams.Utilities
action(item); action(item);
} }
} }
/// <summary>
/// Removes the element at the given index from the list and returns it.
/// </summary>
/// <param name="list">The list to remove the element from. It is modified in place.</param>
/// <param name="index">The zero-based index of the element to take.</param>
/// <returns>The element that was at <paramref name="index"/> before removal.</returns>
public static T ShiftAt<T>(this List<T> list, int index)
{
    // Read first, then remove: the indexer throws for a bad index before anything is modified.
    T removed = list[index];
    list.RemoveAt(index);
    return removed;
}
} }
} }

View file

@ -0,0 +1,19 @@
using System;
using System.Collections.Generic;
using System.IO;
namespace MarkovGrams.Utilities
{
/// <summary>
/// Extension methods for <see cref="StreamReader"/>.
/// </summary>
public static class StreamReaderExtensions
{
    /// <summary>
    /// Lazily yields the remaining lines of the reader, one at a time, until
    /// ReadLine() returns null.
    /// </summary>
    /// <param name="streamReader">The reader to pull lines from.</param>
    /// <returns>A sequence of the lines read, in order.</returns>
    public static IEnumerable<string> ReadAllLines(this StreamReader streamReader)
    {
        for (string current = streamReader.ReadLine(); current != null; current = streamReader.ReadLine())
        {
            yield return current;
        }
    }
}
}

View file

@ -16,6 +16,11 @@ namespace SBRL.Algorithms
/// <changelog> /// <changelog>
/// v0.1 - 20th May 2017: /// v0.1 - 20th May 2017:
/// - Creation! :D /// - Creation! :D
/// v0.2 - 17th February 2018:
/// - Add Count property
/// - Add SetContents and ClearContents methods
/// - Add empty constructor
/// - Next() will now throw an InvalidOperationException if the generator's internal weights list is empty
/// </changelog> /// </changelog>
public class WeightedRandom<ItemType> public class WeightedRandom<ItemType>
{ {

View file

@ -16,32 +16,46 @@ namespace MarkovGrams
/// <summary> /// <summary>
/// The ngrams that this markov chain currently contains. /// The ngrams that this markov chain currently contains.
/// </summary> /// </summary>
Dictionary<string, double> ngrams; private Dictionary<string, double> ngrams;
/// <summary>
/// Whether to always start generating a new word from an n-gram that starts with
/// an uppercase letter.
/// </summary>
public bool StartOnUppercase = false;
/// <summary> /// <summary>
/// Creates a new character-based markov chain. /// Creates a new character-based markov chain.
/// </summary> /// </summary>
/// <param name="inNgrams">The ngrams to populate the new markov chain with.</param> /// <param name="inNgrams">The ngrams to populate the new markov chain with.</param>
public WeightedMarkovChain(Dictionary<string, double> inNgrams)
{
    // The dictionary is stored as given (no copy is made).
    this.ngrams = inNgrams;
}

/// <summary>
/// Creates a new character-based markov chain from integer weights.
/// </summary>
/// <param name="inNgrams">The ngrams to populate the new markov chain with, mapped to their integer weights.</param>
public WeightedMarkovChain(Dictionary<string, int> inNgrams)
{
    // Widen each integer weight to a double in a fresh dictionary.
    Dictionary<string, double> widened = new Dictionary<string, double>();
    foreach (KeyValuePair<string, int> entry in inNgrams)
    {
        widened[entry.Key] = entry.Value;
    }
    this.ngrams = widened;
}
/// <summary> /// <summary>
/// Returns a random ngram that's currently loaded into this WeightedMarkovChain. /// Returns a random ngram that's currently loaded into this WeightedMarkovChain.
/// </summary> /// </summary>
/// <returns>A random ngram from this UnweightMarkovChain's cache of ngrams.</returns> /// <returns>A random ngram from this WeightedMarkovChain's cache of ngrams.</returns>
/// <exception cref="InvalidOperationException">Thrown when StartOnUppercase is set and no ngram starts with an uppercase letter.</exception>
public string RandomNgram()
{
    // The weighted picker is only (re)filled while it is empty.
    // NOTE(review): because of this, changing StartOnUppercase after the picker
    // has been filled has no effect for as long as it stays non-empty - confirm
    // that this is intended.
    if (wrandom.Count == 0)
    {
        if (StartOnUppercase)
        {
            // Keep only ngrams whose first character is uppercase. Empty keys
            // have no first character and are never valid uppercase starts.
            Dictionary<string, double> uppercaseStarts = ngrams
                .Where(entry => entry.Key.Length > 0 && char.IsUpper(entry.Key[0]))
                .ToDictionary(entry => entry.Key, entry => entry.Value);

            if (uppercaseStarts.Count == 0)
                throw new InvalidOperationException($"Error: No valid starting ngrams were found (StartOnUppercase: {StartOnUppercase}).");

            wrandom.SetContents(uppercaseStarts);
        }
        else
        {
            wrandom.SetContents(ngrams);
        }
    }

    return wrandom.Next();
}