Compare commits

..

No commits in common. "c55c8f4e8a37c5e0a8b3a4717f2c59b58c3cb6ed" and "0a1eed722f34725dd74426b9bbac1549eebb06f3" have entirely different histories.

6 changed files with 35 additions and 42660 deletions

View file

@ -4,12 +4,6 @@ using System.Linq;
namespace MarkovGrams
{
public enum GenerationMode
{
CharacterLevel,
WordLevel
}
/// <summary>
/// A collection of methods to generate various different types of n-grams.
/// </summary>
@ -21,12 +15,12 @@ namespace MarkovGrams
/// <param name="words">The words to turn into n-grams.</param>
/// <param name="order">The order of n-gram to generate.</param>
/// <returns>The n-grams found in the given list of words; duplicates are removed unless distinct is false.</returns>
public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, GenerationMode mode, bool distinct = true)
public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, bool distinct = true)
{
List<string> results = new List<string>();
foreach (string word in words)
{
results.AddRange(GenerateFlat(word, order, mode));
results.AddRange(GenerateFlat(word, order));
}
if (distinct) return results.Distinct();
return results;
@ -38,18 +32,13 @@ namespace MarkovGrams
/// <param name="str">The string to n-gram-ise.</param>
/// <param name="order">The order of n-gram to generate.</param>
/// <returns>A unique list of n-grams found in the specified string.</returns>
public static IEnumerable<string> GenerateFlat(string str, int order, GenerationMode mode)
public static IEnumerable<string> GenerateFlat(string str, int order)
{
List<string> results = new List<string>();
if (mode == GenerationMode.CharacterLevel) {
for (int i = 0; i < str.Length - order; i++)
for(int i = 0; i < str.Length - order; i++)
{
results.Add(str.Substring(i, order));
}
else {
string[] parts = str.Split(" ".ToCharArray());
for (int i = 0; i < parts.Length; i++)
results.Add(string.Join(" ", str.Skip(i).Take(order)));
}
return results.Distinct();
}
@ -60,11 +49,19 @@ namespace MarkovGrams
/// <param name="words">The words to n-gram-ise.</param>
/// <param name="order">The order of ngrams to generate.</param>
/// <returns>The weighted dictionary of ngrams.</returns>
public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order, GenerationMode mode)
public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order)
{
Dictionary<string, int> results = new Dictionary<string, int>();
foreach(string word in words)
GenerateWeighted(word, order, mode);
{
Dictionary<string, int> wordNgrams = GenerateWeighted(word, order);
foreach(KeyValuePair<string, int> ngram in wordNgrams)
{
if(!results.ContainsKey(ngram.Key))
results[ngram.Key] = 0;
results[ngram.Key] += ngram.Value;
}
}
return results;
}
/// <summary>
@ -73,16 +70,17 @@ namespace MarkovGrams
/// <param name="str">The string to n-gram-ise.</param>
/// <param name="order">The order of n-grams to generate.</param>
/// <returns>The weighted dictionary of ngrams.</returns>
public static void GenerateWeighted(string str, int order, GenerationMode mode, Dictionary<string, int> results)
public static Dictionary<string, int> GenerateWeighted(string str, int order)
{
string[] parts = mode == GenerationMode.WordLevel ? str.Split(" ".ToCharArray()) : null;
Dictionary<string, int> results = new Dictionary<string, int>();
for(int i = 0; i < str.Length - order; i++)
{
string ngram = mode == GenerationMode.CharacterLevel ? str.Substring(i, order) : string.Join(" ", parts.Skip(i).Take(order));
string ngram = str.Substring(i, order);
if(!results.ContainsKey(ngram))
results[ngram] = 0;
results[ngram]++;
}
return results;
}
}
}

View file

@ -20,8 +20,7 @@ namespace MarkovGrams
{
public static int Main(string[] args)
{
Mode operationMode = Mode.None;
GenerationMode generationMode = GenerationMode.CharacterLevel;
Mode mode = Mode.None;
List<string> extras = new List<string>();
StreamReader wordlistSource = new StreamReader(Console.OpenStandardInput());
int order = 3, length = 8, count = 10;
@ -63,11 +62,8 @@ namespace MarkovGrams
case "start-uppercase":
startOnUppercase = true;
break;
case "words":
generationMode = GenerationMode.WordLevel;
break;
case "help":
operationMode = Mode.Help;
mode = Mode.Help;
break;
default:
Console.Error.WriteLine($"Error: Unknown option '{args[i]}'.");
@ -75,8 +71,8 @@ namespace MarkovGrams
}
}
if(operationMode != Mode.Help && extras.Count > 0)
operationMode = (Mode)Enum.Parse(typeof(Mode), extras.ShiftAt(0).Replace("markov-w", "weightedmarkov"), true);
if(mode != Mode.Help && extras.Count > 0)
mode = (Mode)Enum.Parse(typeof(Mode), extras.ShiftAt(0).Replace("markov-w", "weightedmarkov"), true);
// ------------------------------------------------------------------------------------------
@ -90,13 +86,12 @@ namespace MarkovGrams
return new string[] { word.Trim() };
});
switch (operationMode)
switch (mode)
{
case Mode.Markov:
Stopwatch utimer = Stopwatch.StartNew();
UnweightedMarkovChain unweightedChain = new UnweightedMarkovChain(
NGrams.GenerateFlat(words, order, generationMode),
generationMode
NGrams.GenerateFlat(words, order)
);
unweightedChain.StartOnUppercase = startOnUppercase;
@ -108,8 +103,7 @@ namespace MarkovGrams
case Mode.WeightedMarkov:
Stopwatch wtimer = Stopwatch.StartNew();
WeightedMarkovChain weightedChain = new WeightedMarkovChain(
NGrams.GenerateWeighted(words, order, generationMode),
generationMode
NGrams.GenerateWeighted(words, order)
);
weightedChain.StartOnUppercase = startOnUppercase;
@ -119,7 +113,7 @@ namespace MarkovGrams
break;
case Mode.NGrams:
foreach (string ngram in NGrams.GenerateFlat(words, order, generationMode, ngramsUnique))
foreach (string ngram in NGrams.GenerateFlat(words, order, ngramsUnique))
Console.WriteLine(ngram);
break;
@ -143,7 +137,6 @@ namespace MarkovGrams
Console.WriteLine(" --order {number} Use the specified order when generating n-grams (default: 3)");
Console.WriteLine(" --length {number} The target length of word to generate (Not available in ngrams mode)");
Console.WriteLine(" --count {number} The number of words to generate (Not valid in ngrams mode)");
Console.WriteLine(" --words Generate ngrams on word-level instead of character-level (Applies to all modes)");
Console.WriteLine(" --no-split Don't split input words on whitespace - treat each line as a single word");
Console.WriteLine(" --lowercase Convert the input to lowercase before processing");
Console.WriteLine(" --start-uppercase Start the generating a word only with n-grams that start with a capital letter");

View file

@ -25,22 +25,13 @@ namespace MarkovGrams
/// </summary>
public bool StartOnUppercase = false;
/// <summary>
/// The generation mode to use when running the Markov Chain.
/// </summary>
/// <remarks>
/// The input n-grams must have been generated using the same mode specified here.
/// </remarks>
public GenerationMode Mode { get; private set; } = GenerationMode.CharacterLevel;
/// <summary>
/// Creates a new character-based markov chain.
/// </summary>
/// <param name="inNgrams">The ngrams to populate the new markov chain with.</param>
public UnweightedMarkovChain(IEnumerable<string> inNgrams, GenerationMode inMode)
public UnweightedMarkovChain(IEnumerable<string> inNgrams)
{
ngrams = new List<string>(inNgrams);
Mode = inMode;
}
/// <summary>
@ -72,7 +63,7 @@ namespace MarkovGrams
while(result.Length < length)
{
// The substring that the next ngram in the chain needs to start with
string nextStartsWith = Mode == GenerationMode.CharacterLevel ? lastNgram.Substring(1) : lastNgram.Split(' ')[0];
string nextStartsWith = lastNgram.Substring(1);
// Get a list of possible n-grams we could choose from next
List<string> nextNgrams = ngrams.FindAll(gram => gram.StartsWith(nextStartsWith));
// If there aren't any choices left, we can't exactly keep adding to the new string any more :-(
@ -81,10 +72,7 @@ namespace MarkovGrams
// Pick a random n-gram from the list
string nextNgram = nextNgrams.ElementAt(rand.Next(0, nextNgrams.Count));
// Add the last character from the n-gram to the string we're building
if (Mode == GenerationMode.CharacterLevel)
result += nextNgram[nextNgram.Length - 1];
else
result += string.Join(" ", nextNgram.Split(' ').Skip(1));
lastNgram = nextNgram;
}

View file

@ -24,27 +24,17 @@ namespace MarkovGrams
/// </summary>
public bool StartOnUppercase = false;
/// <summary>
/// The generation mode to use when running the Markov Chain.
/// </summary>
/// <remarks>
/// The input n-grams must have been generated using the same mode specified here.
/// </remarks>
public GenerationMode Mode { get; private set; } = GenerationMode.CharacterLevel;
/// <summary>
/// Creates a new character-based markov chain.
/// </summary>
/// <param name="inNgrams">The ngrams to populate the new markov chain with.</param>
public WeightedMarkovChain(Dictionary<string, double> inNgrams, GenerationMode inMode) {
public WeightedMarkovChain(Dictionary<string, double> inNgrams) {
ngrams = inNgrams;
Mode = inMode;
}
public WeightedMarkovChain(Dictionary<string, int> inNgrams, GenerationMode inMode) {
public WeightedMarkovChain(Dictionary<string, int> inNgrams) {
ngrams = new Dictionary<string, double>();
foreach (KeyValuePair<string, int> ngram in inNgrams)
ngrams[ngram.Key] = ngram.Value;
Mode = inMode;
}
/// <summary>
@ -87,7 +77,7 @@ namespace MarkovGrams
{
wrandom.ClearContents();
// The substring that the next ngram in the chain needs to start with
string nextStartsWith = Mode == GenerationMode.CharacterLevel ? lastNgram.Substring(1) : lastNgram.Split(' ')[0];
string nextStartsWith = lastNgram.Substring(1);
// Get a list of possible n-grams we could choose from next
Dictionary<string, double> convNextNgrams = new Dictionary<string, double>();
ngrams.Where(gram_data => gram_data.Key.StartsWith(nextStartsWith))
@ -99,10 +89,7 @@ namespace MarkovGrams
// Pick a random n-gram from the list
string nextNgram = wrandom.Next();
// Add the last character from the n-gram to the string we're building
if (Mode == GenerationMode.CharacterLevel)
result += nextNgram[nextNgram.Length - 1];
else
result += string.Join(" ", nextNgram.Split(' ').Skip(1));
lastNgram = nextNgram;
}
wrandom.ClearContents();

File diff suppressed because it is too large. (Load diff)

View file

@ -15,6 +15,3 @@ sort Final-Fantasy-15-Items.txt -o Final-Fantasy-15-Items.txt
### No Man's Sky ###
curl "http://orcz.com/No_Man's_Sky:_Items_List" | xidel --data - --css "table td:first-child a, #mw-content-text > ul > li" | sed -e 's/\s*—.*$//g' | sort >No-Mans-Sky-Items.txt
### Recipes Wikia ###
curl http://recipes.wikia.com/sitemap-newsitemapxml-index.xml | xidel --data - --css "loc" | grep -i NS_0 | xargs -n1 -I{} sh -c 'curl {} | xidel --data - --css "loc"' | sed -e 's/^.*\///g' -e 's/_/ /g' | sort >Dishes.txt