Add initial word-level support
This commit is contained in:
parent
ac184f9102
commit
face693554
4 changed files with 33 additions and 15 deletions
|
@ -15,6 +15,8 @@ namespace MarkovGrams
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public static class NGrams
|
public static class NGrams
|
||||||
{
|
{
|
||||||
|
public static bool Verbose { get; set; } = true;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Generates a unique list of n-grams from the given list of words.
|
/// Generates a unique list of n-grams from the given list of words.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
@ -38,7 +40,7 @@ namespace MarkovGrams
|
||||||
/// <param name="str">The string to n-gram-ise.</param>
|
/// <param name="str">The string to n-gram-ise.</param>
|
||||||
/// <param name="order">The order of n-gram to generate.</param>
|
/// <param name="order">The order of n-gram to generate.</param>
|
||||||
/// <returns>A unique list of n-grams found in the specified string.</returns>
|
/// <returns>A unique list of n-grams found in the specified string.</returns>
|
||||||
public static IEnumerable<string> GenerateFlat(string str, int order, GenerationMode mode)
|
private static IEnumerable<string> GenerateFlat(string str, int order, GenerationMode mode)
|
||||||
{
|
{
|
||||||
List<string> results = new List<string>();
|
List<string> results = new List<string>();
|
||||||
if (mode == GenerationMode.CharacterLevel) {
|
if (mode == GenerationMode.CharacterLevel) {
|
||||||
|
@ -62,9 +64,15 @@ namespace MarkovGrams
|
||||||
/// <returns>The weighted dictionary of ngrams.</returns>
|
/// <returns>The weighted dictionary of ngrams.</returns>
|
||||||
public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order, GenerationMode mode)
|
public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order, GenerationMode mode)
|
||||||
{
|
{
|
||||||
|
List<string> wordList = new List<string>(words);
|
||||||
|
int wordCount = wordList.Count();
|
||||||
Dictionary<string, int> results = new Dictionary<string, int>();
|
Dictionary<string, int> results = new Dictionary<string, int>();
|
||||||
foreach(string word in words)
|
int i = 0;
|
||||||
GenerateWeighted(word, order, mode);
|
foreach (string word in wordList) {
|
||||||
|
GenerateWeighted(word, order, mode, ref results);
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
Console.WriteLine(" - done");
|
||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
/// <summary>
|
/// <summary>
|
||||||
|
@ -73,12 +81,20 @@ namespace MarkovGrams
|
||||||
/// <param name="str">The string to n-gram-ise.</param>
|
/// <param name="str">The string to n-gram-ise.</param>
|
||||||
/// <param name="order">The order of n-grams to generate.</param>
|
/// <param name="order">The order of n-grams to generate.</param>
|
||||||
/// <param name="results">The weighted dictionary of ngrams to add this string's n-gram counts to.</param>
|
/// <param name="results">The weighted dictionary of ngrams to add this string's n-gram counts to.</param>
|
||||||
public static void GenerateWeighted(string str, int order, GenerationMode mode, Dictionary<string, int> results)
|
private static void GenerateWeighted(string str, int order, GenerationMode mode, ref Dictionary<string, int> results)
|
||||||
{
|
{
|
||||||
string[] parts = mode == GenerationMode.WordLevel ? str.Split(" ".ToCharArray()) : null;
|
if (mode == GenerationMode.CharacterLevel) {
|
||||||
for(int i = 0; i < str.Length - order; i++)
|
for (int i = 0; i < str.Length - order; i++) {
|
||||||
{
|
string ngram = str.Substring(i, order);
|
||||||
string ngram = mode == GenerationMode.CharacterLevel ? str.Substring(i, order) : string.Join(" ", parts.Skip(i).Take(order));
|
if (!results.ContainsKey(ngram))
|
||||||
|
results[ngram] = 0;
|
||||||
|
results[ngram]++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
string[] parts = str.Split(" ".ToCharArray());
|
||||||
|
for (int i = 0; i < parts.Length - order; i++) {
|
||||||
|
string ngram = string.Join(" ", parts.Skip(i).Take(order));
|
||||||
if (!results.ContainsKey(ngram))
|
if (!results.ContainsKey(ngram))
|
||||||
results[ngram] = 0;
|
results[ngram] = 0;
|
||||||
results[ngram]++;
|
results[ngram]++;
|
||||||
|
@ -86,3 +102,4 @@ namespace MarkovGrams
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -65,6 +65,7 @@ namespace MarkovGrams
|
||||||
break;
|
break;
|
||||||
case "words":
|
case "words":
|
||||||
generationMode = GenerationMode.WordLevel;
|
generationMode = GenerationMode.WordLevel;
|
||||||
|
splitOnWhitespace = false;
|
||||||
break;
|
break;
|
||||||
case "help":
|
case "help":
|
||||||
operationMode = Mode.Help;
|
operationMode = Mode.Help;
|
||||||
|
@ -141,9 +142,9 @@ namespace MarkovGrams
|
||||||
Console.WriteLine(" --help Show this message");
|
Console.WriteLine(" --help Show this message");
|
||||||
Console.WriteLine(" --wordlist {filename} Read the wordlist from the specified filename instead of stdin");
|
Console.WriteLine(" --wordlist {filename} Read the wordlist from the specified filename instead of stdin");
|
||||||
Console.WriteLine(" --order {number} Use the specified order when generating n-grams (default: 3)");
|
Console.WriteLine(" --order {number} Use the specified order when generating n-grams (default: 3)");
|
||||||
Console.WriteLine(" --length {number} The target length of word to generate (Not available in ngrams mode)");
|
Console.WriteLine(" --length {number} The target length of word to generate (Not available in ngrams mode; instead specifies the number of words to generate with --words)");
|
||||||
Console.WriteLine(" --count {number} The number of words to generate (Not valid in ngrams mode)");
|
Console.WriteLine(" --count {number} The number of words to generate (Not valid in ngrams mode)");
|
||||||
Console.WriteLine(" --words Generate ngrams on word-level instead of character-level (Applies to all modes)");
|
Console.WriteLine(" --words Generate ngrams on word-level instead of character-level (Applies to all modes; implies --no-split)");
|
||||||
Console.WriteLine(" --no-split Don't split input words on whitespace - treat each line as a single word");
|
Console.WriteLine(" --no-split Don't split input words on whitespace - treat each line as a single word");
|
||||||
Console.WriteLine(" --lowercase Convert the input to lowercase before processing");
|
Console.WriteLine(" --lowercase Convert the input to lowercase before processing");
|
||||||
Console.WriteLine(" --start-uppercase Start the generating a word only with n-grams that start with a capital letter");
|
Console.WriteLine(" --start-uppercase Start the generating a word only with n-grams that start with a capital letter");
|
||||||
|
|
|
@ -69,7 +69,7 @@ namespace MarkovGrams
|
||||||
{
|
{
|
||||||
string result = RandomNgram();
|
string result = RandomNgram();
|
||||||
string lastNgram = result;
|
string lastNgram = result;
|
||||||
while(result.Length < length)
|
while((Mode == GenerationMode.CharacterLevel ? result.Length : result.Split(' ').Length) < length)
|
||||||
{
|
{
|
||||||
// The substring that the next ngram in the chain needs to start with
|
// The substring that the next ngram in the chain needs to start with
|
||||||
string nextStartsWith = Mode == GenerationMode.CharacterLevel ? lastNgram.Substring(1) : lastNgram.Split(' ')[0];
|
string nextStartsWith = Mode == GenerationMode.CharacterLevel ? lastNgram.Substring(1) : lastNgram.Split(' ')[0];
|
||||||
|
|
|
@ -83,7 +83,7 @@ namespace MarkovGrams
|
||||||
{
|
{
|
||||||
string result = RandomNgram();
|
string result = RandomNgram();
|
||||||
string lastNgram = result;
|
string lastNgram = result;
|
||||||
while(result.Length < length)
|
while((Mode == GenerationMode.CharacterLevel ? result.Length : result.Split(' ').Length) < length)
|
||||||
{
|
{
|
||||||
wrandom.ClearContents();
|
wrandom.ClearContents();
|
||||||
// The substring that the next ngram in the chain needs to start with
|
// The substring that the next ngram in the chain needs to start with
|
||||||
|
|
Loading…
Reference in a new issue