Compare commits
2 commits
e7f67d7f5e
...
2d70038e4c
Author | SHA1 | Date | |
---|---|---|---|
2d70038e4c | |||
0aab41f1b7 |
8 changed files with 183 additions and 109 deletions
|
@ -39,6 +39,7 @@
|
||||||
<Compile Include="Utilities\WeightedRandom.cs" />
|
<Compile Include="Utilities\WeightedRandom.cs" />
|
||||||
<Compile Include="WeightedMarkovChain.cs" />
|
<Compile Include="WeightedMarkovChain.cs" />
|
||||||
<Compile Include="Utilities\LinqExtensions.cs" />
|
<Compile Include="Utilities\LinqExtensions.cs" />
|
||||||
|
<Compile Include="Utilities\StreamReaderExtensions.cs" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<Folder Include="Utilities\" />
|
<Folder Include="Utilities\" />
|
||||||
|
|
|
@ -15,14 +15,15 @@ namespace MarkovGrams
|
||||||
/// <param name="words">The words to turn into n-grams.</param>
|
/// <param name="words">The words to turn into n-grams.</param>
|
||||||
/// <param name="order">The order of n-grams to generate.</param>
|
/// <param name="order">The order of n-grams to generate.</param>
|
||||||
/// <returns>A unique list of n-grams found in the given list of words.</returns>
|
/// <returns>A unique list of n-grams found in the given list of words.</returns>
|
||||||
public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order)
|
public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, bool distinct = true)
|
||||||
{
|
{
|
||||||
List<string> results = new List<string>();
|
List<string> results = new List<string>();
|
||||||
foreach(string word in words)
|
foreach (string word in words)
|
||||||
{
|
{
|
||||||
results.AddRange(GenerateFlat(word, order));
|
results.AddRange(GenerateFlat(word, order));
|
||||||
}
|
}
|
||||||
return results.Distinct();
|
if (distinct) return results.Distinct();
|
||||||
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
|
|
|
@ -3,122 +3,139 @@ using System.Collections.Generic;
|
||||||
using System.Diagnostics;
|
using System.Diagnostics;
|
||||||
using System.IO;
|
using System.IO;
|
||||||
using System.Linq;
|
using System.Linq;
|
||||||
|
using MarkovGrams.Utilities;
|
||||||
|
|
||||||
namespace MarkovGrams
|
namespace MarkovGrams
|
||||||
{
|
{
|
||||||
|
public enum Mode
|
||||||
|
{
|
||||||
|
Help,
|
||||||
|
NGrams,
|
||||||
|
Markov,
|
||||||
|
WeightedMarkov
|
||||||
|
}
|
||||||
|
|
||||||
class MainClass
|
class MainClass
|
||||||
{
|
{
|
||||||
public static int Main(string[] args)
|
public static int Main(string[] args)
|
||||||
{
|
{
|
||||||
if(args.Length < 1)
|
List<string> extras = new List<string>();
|
||||||
|
StreamReader wordlistSource = new StreamReader(Console.OpenStandardInput());
|
||||||
|
int order = 3, length = 8, count = 10;
|
||||||
|
bool splitOnWhitespace = true,
|
||||||
|
ngramsUnique = true,
|
||||||
|
convertLowercase = false,
|
||||||
|
startOnUppercase = false;
|
||||||
|
for (int i = 0; i < args.Length; i++)
|
||||||
{
|
{
|
||||||
Console.WriteLine("Usage:");
|
if (!args[i].StartsWith("-"))
|
||||||
Console.WriteLine(" ./MarkovGrams.exe <command>");
|
{
|
||||||
Console.WriteLine();
|
extras.Add(args[i]);
|
||||||
Console.WriteLine("Available commands:");
|
continue;
|
||||||
Console.WriteLine(" markov:");
|
}
|
||||||
Console.WriteLine(" Generate new words using an unweighted markov chain.");
|
|
||||||
Console.WriteLine(" markov-w:");
|
switch (args[i].TrimStart("-".ToCharArray()))
|
||||||
Console.WriteLine(" Generate new words using a weighted markov chain.");
|
{
|
||||||
Console.WriteLine(" ngrams:");
|
case "wordlist":
|
||||||
Console.WriteLine(" Generate raw unique n-grams");
|
wordlistSource = new StreamReader(args[++i]);
|
||||||
Console.WriteLine();
|
break;
|
||||||
Console.WriteLine("Type just ./MarovGrams.exe <command> to see command-specific help.");
|
case "order":
|
||||||
return 1;
|
order = int.Parse(args[++i]);
|
||||||
|
break;
|
||||||
|
case "length":
|
||||||
|
length = int.Parse(args[++i]);
|
||||||
|
break;
|
||||||
|
case "count":
|
||||||
|
count = int.Parse(args[++i]);
|
||||||
|
break;
|
||||||
|
case "no-split":
|
||||||
|
splitOnWhitespace = false;
|
||||||
|
break;
|
||||||
|
case "no-unique":
|
||||||
|
ngramsUnique = false;
|
||||||
|
break;
|
||||||
|
case "lowercase":
|
||||||
|
convertLowercase = true;
|
||||||
|
break;
|
||||||
|
case "start-uppercase":
|
||||||
|
startOnUppercase = true;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
Console.Error.WriteLine($"Error: Unknown option '{args[i]}'.");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
string mode = args[0];
|
|
||||||
string wordlistFilename;
|
|
||||||
int order;
|
|
||||||
IEnumerable<string> words, ngrams;
|
|
||||||
|
|
||||||
switch(mode)
|
Mode mode = extras.Count > 0 ? (Mode)Enum.Parse(typeof(Mode), extras.ShiftAt(0).Replace("markov-w", "weightedmarkov"), true) : Mode.Help;
|
||||||
|
|
||||||
|
|
||||||
|
// ------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
IEnumerable<string> words = wordlistSource.ReadAllLines().SelectMany((string word) => {
|
||||||
|
word = word.Trim();
|
||||||
|
if (convertLowercase)
|
||||||
|
word = word.ToLower();
|
||||||
|
if (splitOnWhitespace)
|
||||||
|
return word.Split(' ');
|
||||||
|
return new string[] { word.Trim() };
|
||||||
|
});
|
||||||
|
|
||||||
|
switch (mode)
|
||||||
{
|
{
|
||||||
case "markov":
|
case Mode.Markov:
|
||||||
if(args.Length != 5)
|
|
||||||
{
|
|
||||||
Console.WriteLine("markov command usage:");
|
|
||||||
Console.WriteLine(" ./MarkovGrams.exe markov <wordlist.txt> <order> <length> <count>");
|
|
||||||
Console.WriteLine();
|
|
||||||
Console.WriteLine("<wordlist.txt> The path to the wordlist to read from.");
|
|
||||||
Console.WriteLine("<order> The order of the n-grams to use.");
|
|
||||||
Console.WriteLine("<length> The length of word to generate.");
|
|
||||||
Console.WriteLine("<count> The number of words to generate.");
|
|
||||||
Console.WriteLine();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
wordlistFilename = args[1];
|
|
||||||
order = int.Parse(args[2]);
|
|
||||||
int desiredStringLength = int.Parse(args[3]);
|
|
||||||
int count = int.Parse(args[4]);
|
|
||||||
|
|
||||||
words = File.ReadLines(wordlistFilename).SelectMany(word => word.Trim().Split(' '));
|
|
||||||
ngrams = NGrams.GenerateFlat(words, order);
|
|
||||||
|
|
||||||
Stopwatch utimer = Stopwatch.StartNew();
|
Stopwatch utimer = Stopwatch.StartNew();
|
||||||
UnweightedMarkovChain chain = new UnweightedMarkovChain(ngrams);
|
UnweightedMarkovChain unweightedChain = new UnweightedMarkovChain(
|
||||||
|
NGrams.GenerateFlat(words, order)
|
||||||
|
);
|
||||||
|
unweightedChain.StartOnUppercase = startOnUppercase;
|
||||||
|
|
||||||
for(int i = 0; i < count; i++)
|
for (int i = 0; i < count; i++)
|
||||||
Console.WriteLine(chain.Generate(desiredStringLength));
|
Console.WriteLine(unweightedChain.Generate(length));
|
||||||
Console.Error.WriteLine($"{count} words in {utimer.ElapsedMilliseconds}ms");
|
Console.Error.WriteLine($"{count} words in {utimer.ElapsedMilliseconds}ms");
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case "markov-w":
|
case Mode.WeightedMarkov:
|
||||||
if (args.Length != 5)
|
|
||||||
{
|
|
||||||
Console.WriteLine("markov-w command usage:");
|
|
||||||
Console.WriteLine(" ./MarkovGrams.exe markov-w <wordlist.txt> <order> <length> <count>");
|
|
||||||
Console.WriteLine();
|
|
||||||
Console.WriteLine("<wordlist.txt> The path to the wordlist to read from.");
|
|
||||||
Console.WriteLine("<order> The order of the n-grams to use.");
|
|
||||||
Console.WriteLine("<length> The length of word to generate.");
|
|
||||||
Console.WriteLine("<count> The number of words to generate.");
|
|
||||||
Console.WriteLine();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
wordlistFilename = args[1];
|
|
||||||
order = int.Parse(args[2]);
|
|
||||||
int weightedDesiredStringLength = int.Parse(args[3]);
|
|
||||||
int weightedCount = int.Parse(args[4]);
|
|
||||||
|
|
||||||
words = File.ReadLines(wordlistFilename).SelectMany(word => word.Trim().Split(' '));
|
|
||||||
ngrams = NGrams.GenerateFlat(words, order);
|
|
||||||
|
|
||||||
Stopwatch wtimer = Stopwatch.StartNew();
|
Stopwatch wtimer = Stopwatch.StartNew();
|
||||||
WeightedMarkovChain weightedChain = new WeightedMarkovChain(ngrams);
|
WeightedMarkovChain weightedChain = new WeightedMarkovChain(
|
||||||
|
NGrams.GenerateWeighted(words, order)
|
||||||
|
);
|
||||||
|
weightedChain.StartOnUppercase = startOnUppercase;
|
||||||
|
|
||||||
for (int i = 0; i < weightedCount; i++)
|
for (int i = 0; i < count; i++)
|
||||||
Console.WriteLine(weightedChain.Generate(weightedDesiredStringLength));
|
Console.WriteLine(weightedChain.Generate(length));
|
||||||
Console.Error.WriteLine($"{weightedCount} words in {wtimer.ElapsedMilliseconds}ms");
|
Console.Error.WriteLine($"{count} words in {wtimer.ElapsedMilliseconds}ms");
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case "ngrams":
|
case Mode.NGrams:
|
||||||
if(args.Length != 3)
|
foreach (string ngram in NGrams.GenerateFlat(words, order, ngramsUnique))
|
||||||
{
|
|
||||||
Console.WriteLine("ngrams command usage:");
|
|
||||||
Console.WriteLine(" ./MarkovGrams.exe <wordlist.txt> <order>");
|
|
||||||
Console.WriteLine();
|
|
||||||
Console.WriteLine("<wordlist.txt> The path to the wordlist to read from.");
|
|
||||||
Console.WriteLine("<order> The order of n-grams to generate.");
|
|
||||||
Console.WriteLine();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
wordlistFilename = args[1];
|
|
||||||
order = int.Parse(args[2]);
|
|
||||||
words = File.ReadLines(wordlistFilename).SelectMany(word => word.Trim().Split(' '));
|
|
||||||
ngrams = NGrams.GenerateFlat(words, order);
|
|
||||||
|
|
||||||
foreach(string ngram in ngrams)
|
|
||||||
Console.WriteLine(ngram);
|
Console.WriteLine(ngram);
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case Mode.Help:
|
||||||
default:
|
default:
|
||||||
Console.WriteLine("Unknown command {0}.");
|
Console.WriteLine("Usage:");
|
||||||
Console.WriteLine("Available commands:");
|
Console.WriteLine(" ./MarkovGrams.exe <mode> [options]");
|
||||||
Console.WriteLine(" markov Generate words with a markov chain");
|
|
||||||
Console.WriteLine(" ngrams Generate unique ngrams from wordlists");
|
|
||||||
Console.WriteLine();
|
Console.WriteLine();
|
||||||
|
Console.WriteLine("Available modes:");
|
||||||
|
Console.WriteLine(" markov:");
|
||||||
|
Console.WriteLine(" Generate new words using an unweighted markov chain.");
|
||||||
|
Console.WriteLine(" markov-w:");
|
||||||
|
Console.WriteLine(" Generate new words using a weighted markov chain.");
|
||||||
|
Console.WriteLine(" ngrams:");
|
||||||
|
Console.WriteLine(" Generate raw unique n-grams");
|
||||||
|
Console.WriteLine();
|
||||||
|
Console.WriteLine("Available options:");
|
||||||
|
Console.WriteLine(" --wordlist {filename} Read the wordlist from the specified filename instead of stdin");
|
||||||
|
Console.WriteLine(" --order {number} Use the specified order when generating n-grams (default: 3)");
|
||||||
|
Console.WriteLine(" --length {number} The target length of word to generate (Not available in ngrams mode)");
|
||||||
|
Console.WriteLine(" --count {number} The number of words to generate (Not valid in ngrams mode)");
|
||||||
|
Console.WriteLine(" --no-split Don't split input words on whitespace - treat each line as a single word");
|
||||||
|
Console.WriteLine(" --lowercase Convert the input to lowercase before processing");
|
||||||
|
Console.WriteLine(" --start-uppercase Start the generating a word only with n-grams that start with a capital letter");
|
||||||
|
Console.WriteLine(" --no-unique Don't remove duplicates from the list of ngrams (Only valid in ngrams mode)");
|
||||||
|
Console.WriteLine("Type just ./MarkovGrams.exe <mode> to see mode-specific help.");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -12,12 +12,18 @@ namespace MarkovGrams
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// The random number generator
|
/// The random number generator
|
||||||
/// </summary>
|
/// </summary>
|
||||||
Random rand = new Random();
|
private Random rand = new Random();
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// The ngrams that this markov chain currently contains.
|
/// The ngrams that this markov chain currently contains.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
List<string> ngrams;
|
private List<string> ngrams;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Whether to always start generating a new word from an n-gram that starts with
|
||||||
|
/// an uppercase letter.
|
||||||
|
/// </summary>
|
||||||
|
public bool StartOnUppercase = false;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Creates a new character-based markov chain.
|
/// Creates a new character-based markov chain.
|
||||||
|
@ -34,7 +40,10 @@ namespace MarkovGrams
|
||||||
/// <returns>A random ngram from this UnweightedMarkovChain's cache of ngrams.</returns>
|
/// <returns>A random ngram from this UnweightedMarkovChain's cache of ngrams.</returns>
|
||||||
public string RandomNgram()
|
public string RandomNgram()
|
||||||
{
|
{
|
||||||
return ngrams[rand.Next(0, ngrams.Count)];
|
IEnumerable<string> validNGrams = StartOnUppercase ? ngrams.Where((ngram) => char.IsUpper(ngram[0])) : ngrams;
|
||||||
|
if (validNGrams.Count() == 0)
|
||||||
|
throw new Exception($"Error: No valid starting ngrams were found (StartOnUppercase: {StartOnUppercase}).");
|
||||||
|
return validNGrams.ElementAt(rand.Next(0, validNGrams.Count()));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
using System;
|
using System;
|
||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
|
||||||
namespace MarkovGrams.Utilities
|
namespace MarkovGrams.Utilities
|
||||||
{
|
{
|
||||||
|
@ -12,5 +13,12 @@ namespace MarkovGrams.Utilities
|
||||||
action(item);
|
action(item);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static T ShiftAt<T>(this List<T> list, int index)
|
||||||
|
{
|
||||||
|
T item = list[index];
|
||||||
|
list.RemoveAt(index);
|
||||||
|
return item;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
19
MarkovGrams/Utilities/StreamReaderExtensions.cs
Normal file
19
MarkovGrams/Utilities/StreamReaderExtensions.cs
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.IO;
|
||||||
|
|
||||||
|
namespace MarkovGrams.Utilities
|
||||||
|
{
|
||||||
|
public static class StreamReaderExtensions
|
||||||
|
{
|
||||||
|
public static IEnumerable<string> ReadAllLines(this StreamReader streamReader)
|
||||||
|
{
|
||||||
|
string line;
|
||||||
|
while ((line = streamReader.ReadLine()) != null) {
|
||||||
|
yield return line;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
|
@ -16,6 +16,11 @@ namespace SBRL.Algorithms
|
||||||
/// <changelog>
|
/// <changelog>
|
||||||
/// v0.1 - 20th May 2017:
|
/// v0.1 - 20th May 2017:
|
||||||
/// - Creation! :D
|
/// - Creation! :D
|
||||||
|
/// v0.2 - 17th February 2018:
|
||||||
|
/// - Add Count property
|
||||||
|
/// - Add SetContents and ClearContents methods
|
||||||
|
/// - Add empty constructor
|
||||||
|
/// - Next() will now throw an InvalidOperationException if the generator's internal weights list is empty
|
||||||
/// </changelog>
|
/// </changelog>
|
||||||
public class WeightedRandom<ItemType>
|
public class WeightedRandom<ItemType>
|
||||||
{
|
{
|
||||||
|
|
|
@ -16,32 +16,46 @@ namespace MarkovGrams
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// The ngrams that this markov chain currently contains.
|
/// The ngrams that this markov chain currently contains.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
Dictionary<string, double> ngrams;
|
private Dictionary<string, double> ngrams;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Whether to always start generating a new word from an n-gram that starts with
|
||||||
|
/// an uppercase letter.
|
||||||
|
/// </summary>
|
||||||
|
public bool StartOnUppercase = false;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Creates a new character-based markov chain.
|
/// Creates a new character-based markov chain.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="inNgrams">The ngrams to populate the new markov chain with.</param>
|
/// <param name="inNgrams">The ngrams to populate the new markov chain with.</param>
|
||||||
public WeightedMarkovChain(IEnumerable<string> inNgrams)
|
public WeightedMarkovChain(Dictionary<string, double> inNgrams) {
|
||||||
{
|
ngrams = inNgrams;
|
||||||
|
}
|
||||||
|
public WeightedMarkovChain(Dictionary<string, int> inNgrams) {
|
||||||
ngrams = new Dictionary<string, double>();
|
ngrams = new Dictionary<string, double>();
|
||||||
foreach (string ngram in inNgrams)
|
foreach (KeyValuePair<string, int> ngram in inNgrams)
|
||||||
{
|
ngrams[ngram.Key] = ngram.Value;
|
||||||
if (ngrams.ContainsKey(ngram))
|
|
||||||
ngrams[ngram]++;
|
|
||||||
else
|
|
||||||
ngrams.Add(ngram, 1);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Returns a random ngram that's currently loaded into this WeightedMarkovChain.
|
/// Returns a random ngram that's currently loaded into this WeightedMarkovChain.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <returns>A random ngram from this UnweightMarkovChain's cache of ngrams.</returns>
|
/// <returns>A random ngram from this WeightedMarkovChain's cache of ngrams.</returns>
|
||||||
public string RandomNgram()
|
public string RandomNgram()
|
||||||
{
|
{
|
||||||
if (wrandom.Count == 0)
|
if (wrandom.Count == 0) {
|
||||||
wrandom.SetContents(ngrams);
|
if (!StartOnUppercase)
|
||||||
|
wrandom.SetContents(ngrams);
|
||||||
|
else {
|
||||||
|
Dictionary<string, double> filteredNGrams = new Dictionary<string, double>();
|
||||||
|
foreach (KeyValuePair<string, double> pair in ngrams.Where((pair) => char.IsUpper(pair.Key[0])))
|
||||||
|
filteredNGrams.Add(pair.Key, pair.Value);
|
||||||
|
if (filteredNGrams.Count() == 0)
|
||||||
|
throw new Exception($"Error: No valid starting ngrams were found (StartOnUppercase: {StartOnUppercase}).");
|
||||||
|
wrandom.SetContents(filteredNGrams);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return wrandom.Next();
|
return wrandom.Next();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue