Compare commits

..

No commits in common. "face6935549bd40f2938133b4e3089a14c61b69a" and "c55c8f4e8a37c5e0a8b3a4717f2c59b58c3cb6ed" have entirely different histories.

6 changed files with 2358 additions and 2376 deletions

View file

@ -15,8 +15,6 @@ namespace MarkovGrams
/// </summary> /// </summary>
public static class NGrams public static class NGrams
{ {
public static bool Verbose { get; set; } = true;
/// <summary> /// <summary>
/// Generates a unique list of n-grams that the given list of words. /// Generates a unique list of n-grams that the given list of words.
/// </summary> /// </summary>
@ -40,7 +38,7 @@ namespace MarkovGrams
/// <param name="str">The string to n-gram-ise.</param> /// <param name="str">The string to n-gram-ise.</param>
/// <param name="order">The order of n-gram to generate.</param> /// <param name="order">The order of n-gram to generate.</param>
/// <returns>A unique list of n-grams found in the specified string.</returns> /// <returns>A unique list of n-grams found in the specified string.</returns>
private static IEnumerable<string> GenerateFlat(string str, int order, GenerationMode mode) public static IEnumerable<string> GenerateFlat(string str, int order, GenerationMode mode)
{ {
List<string> results = new List<string>(); List<string> results = new List<string>();
if (mode == GenerationMode.CharacterLevel) { if (mode == GenerationMode.CharacterLevel) {
@ -64,15 +62,9 @@ namespace MarkovGrams
/// <returns>The weighted dictionary of ngrams.</returns> /// <returns>The weighted dictionary of ngrams.</returns>
public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order, GenerationMode mode) public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order, GenerationMode mode)
{ {
List<string> wordList = new List<string>(words);
int wordCount = wordList.Count();
Dictionary<string, int> results = new Dictionary<string, int>(); Dictionary<string, int> results = new Dictionary<string, int>();
int i = 0; foreach(string word in words)
foreach (string word in wordList) { GenerateWeighted(word, order, mode);
GenerateWeighted(word, order, mode, ref results);
i++;
}
Console.WriteLine(" - done");
return results; return results;
} }
/// <summary> /// <summary>
@ -81,20 +73,12 @@ namespace MarkovGrams
/// <param name="str">The string to n-gram-ise.</param> /// <param name="str">The string to n-gram-ise.</param>
/// <param name="order">The order of n-grams to generate.</param> /// <param name="order">The order of n-grams to generate.</param>
/// <returns>The weighted dictionary of ngrams.</returns> /// <returns>The weighted dictionary of ngrams.</returns>
private static void GenerateWeighted(string str, int order, GenerationMode mode, ref Dictionary<string, int> results) public static void GenerateWeighted(string str, int order, GenerationMode mode, Dictionary<string, int> results)
{ {
if (mode == GenerationMode.CharacterLevel) { string[] parts = mode == GenerationMode.WordLevel ? str.Split(" ".ToCharArray()) : null;
for (int i = 0; i < str.Length - order; i++) { for(int i = 0; i < str.Length - order; i++)
string ngram = str.Substring(i, order); {
if (!results.ContainsKey(ngram)) string ngram = mode == GenerationMode.CharacterLevel ? str.Substring(i, order) : string.Join(" ", parts.Skip(i).Take(order));
results[ngram] = 0;
results[ngram]++;
}
}
else {
string[] parts = str.Split(" ".ToCharArray());
for (int i = 0; i < parts.Length - order; i++) {
string ngram = string.Join(" ", parts.Skip(i).Take(order));
if(!results.ContainsKey(ngram)) if(!results.ContainsKey(ngram))
results[ngram] = 0; results[ngram] = 0;
results[ngram]++; results[ngram]++;
@ -102,4 +86,3 @@ namespace MarkovGrams
} }
} }
} }
}

View file

@ -65,7 +65,6 @@ namespace MarkovGrams
break; break;
case "words": case "words":
generationMode = GenerationMode.WordLevel; generationMode = GenerationMode.WordLevel;
splitOnWhitespace = false;
break; break;
case "help": case "help":
operationMode = Mode.Help; operationMode = Mode.Help;
@ -142,9 +141,9 @@ namespace MarkovGrams
Console.WriteLine(" --help Show this message"); Console.WriteLine(" --help Show this message");
Console.WriteLine(" --wordlist {filename} Read the wordlist from the specified filename instead of stdin"); Console.WriteLine(" --wordlist {filename} Read the wordlist from the specified filename instead of stdin");
Console.WriteLine(" --order {number} Use the specified order when generating n-grams (default: 3)"); Console.WriteLine(" --order {number} Use the specified order when generating n-grams (default: 3)");
Console.WriteLine(" --length {number} The target length of word to generate (Not available in ngrams mode; instead specifies the number of words to generate with --words)"); Console.WriteLine(" --length {number} The target length of word to generate (Not available in ngrams mode)");
Console.WriteLine(" --count {number} The number of words to generate (Not valid in ngrams mode)"); Console.WriteLine(" --count {number} The number of words to generate (Not valid in ngrams mode)");
Console.WriteLine(" --words Generate ngrams on word-level instead of character-level (Applies to all modes; implies --no-split)"); Console.WriteLine(" --words Generate ngrams on word-level instead of character-level (Applies to all modes)");
Console.WriteLine(" --no-split Don't split input words on whitespace - treat each line as a single word"); Console.WriteLine(" --no-split Don't split input words on whitespace - treat each line as a single word");
Console.WriteLine(" --lowercase Convert the input to lowercase before processing"); Console.WriteLine(" --lowercase Convert the input to lowercase before processing");
Console.WriteLine(" --start-uppercase Start the generating a word only with n-grams that start with a capital letter"); Console.WriteLine(" --start-uppercase Start the generating a word only with n-grams that start with a capital letter");

View file

@ -69,7 +69,7 @@ namespace MarkovGrams
{ {
string result = RandomNgram(); string result = RandomNgram();
string lastNgram = result; string lastNgram = result;
while((Mode == GenerationMode.CharacterLevel ? result.Length : result.Split(' ').Length) < length) while(result.Length < length)
{ {
// The substring that the next ngram in the chain needs to start with // The substring that the next ngram in the chain needs to start with
string nextStartsWith = Mode == GenerationMode.CharacterLevel ? lastNgram.Substring(1) : lastNgram.Split(' ')[0]; string nextStartsWith = Mode == GenerationMode.CharacterLevel ? lastNgram.Substring(1) : lastNgram.Split(' ')[0];

View file

@ -83,7 +83,7 @@ namespace MarkovGrams
{ {
string result = RandomNgram(); string result = RandomNgram();
string lastNgram = result; string lastNgram = result;
while((Mode == GenerationMode.CharacterLevel ? result.Length : result.Split(' ').Length) < length) while(result.Length < length)
{ {
wrandom.ClearContents(); wrandom.ClearContents();
// The substring that the next ngram in the chain needs to start with // The substring that the next ngram in the chain needs to start with

File diff suppressed because it is too large Load diff

View file

@ -17,4 +17,4 @@ sort Final-Fantasy-15-Items.txt -o Final-Fantasy-15-Items.txt
curl "http://orcz.com/No_Man's_Sky:_Items_List" | xidel --data - --css "table td:first-child a, #mw-content-text > ul > li" | sed -e 's/\s*—.*$//g' | sort >No-Mans-Sky-Items.txt curl "http://orcz.com/No_Man's_Sky:_Items_List" | xidel --data - --css "table td:first-child a, #mw-content-text > ul > li" | sed -e 's/\s*—.*$//g' | sort >No-Mans-Sky-Items.txt
### Recipes Wikia ### ### Recipes Wikia ###
curl http://recipes.wikia.com/sitemap-newsitemapxml-index.xml | xidel --data - --css "loc" | grep -i NS_0 | xargs -n1 -I{} sh -c 'curl {} | xidel --data - --css "loc"' | sed -e 's/^.*\///g' -e 's/_/ /g' | python -c "import urllib, sys; print urllib.unquote(sys.argv[1] if len(sys.argv) > 1 else sys.stdin.read()[0:-1])" | sort >Dishes.txt curl http://recipes.wikia.com/sitemap-newsitemapxml-index.xml | xidel --data - --css "loc" | grep -i NS_0 | xargs -n1 -I{} sh -c 'curl {} | xidel --data - --css "loc"' | sed -e 's/^.*\///g' -e 's/_/ /g' | sort >Dishes.txt