diff --git a/MarkovGrams/NGrams.cs b/MarkovGrams/NGrams.cs
index 913ac49..2c7bba5 100644
--- a/MarkovGrams/NGrams.cs
+++ b/MarkovGrams/NGrams.cs
@@ -4,6 +4,12 @@ using System.Linq;
 
 namespace MarkovGrams
 {
+	public enum GenerationMode
+	{
+		CharacterLevel,
+		WordLevel
+	}
+	
 	/// <summary>
 	/// A collection of methods to generate various different types of n-grams.
 	/// </summary>
@@ -15,12 +21,12 @@ namespace MarkovGrams
 		/// <param name="words">The words to turn into n-grams.</param>
 		/// <param name="order">The order of n-gram to generate.</param>
 		/// <returns>A unique list of n-grams found in the given list of words.</returns>
-		public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, bool distinct = true)
+		public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, GenerationMode mode, bool distinct = true)
 		{
 			List<string> results = new List<string>();
 			foreach (string word in words)
 			{
-				results.AddRange(GenerateFlat(word, order));
+				results.AddRange(GenerateFlat(word, order, mode));
 			}
 			if (distinct) return results.Distinct();
 			return results;
@@ -32,12 +38,17 @@
 		/// <param name="str">The string to n-gram-ise.</param>
 		/// <param name="order">The order of n-gram to generate.</param>
 		/// <returns>A unique list of n-grams found in the specified string.</returns>
-		public static IEnumerable<string> GenerateFlat(string str, int order)
+		public static IEnumerable<string> GenerateFlat(string str, int order, GenerationMode mode)
 		{
 			List<string> results = new List<string>();
-			for(int i = 0; i < str.Length - order; i++)
-			{
-				results.Add(str.Substring(i, order));
+			if (mode == GenerationMode.CharacterLevel) {
+				for (int i = 0; i < str.Length - order; i++)
+					results.Add(str.Substring(i, order));
+			}
+			else {
+				string[] parts = str.Split(" ".ToCharArray());
+				for (int i = 0; i < parts.Length; i++)
+					results.Add(string.Join(" ", parts.Skip(i).Take(order)));
 			}
 			return results.Distinct();
 		}
@@ -49,19 +60,11 @@
 		/// <param name="words">The words to n-gram-ise.</param>
 		/// <param name="order">The order of ngrams to generate.</param>
 		/// <returns>The weighted dictionary of ngrams.</returns>
-		public static Dictionary<string, uint> GenerateWeighted(IEnumerable<string> words, int order)
+		public static Dictionary<string, uint> GenerateWeighted(IEnumerable<string> words, int order, GenerationMode mode)
 		{
 			Dictionary<string, uint> results = new Dictionary<string, uint>();
 			foreach(string word in words)
-			{
-				Dictionary<string, uint> wordNgrams = GenerateWeighted(word, order);
-				foreach(KeyValuePair<string, uint> ngram in wordNgrams)
-				{
-					if(!results.ContainsKey(ngram.Key))
-						results[ngram.Key] = 0;
-					results[ngram.Key] += ngram.Value;
-				}
-			}
+				GenerateWeighted(word, order, mode, results);
 			return results;
 		}
 		/// <summary>
@@ -70,17 +73,16 @@
 		/// <param name="str">The string to n-gram-ise.</param>
 		/// <param name="order">The order of n-grams to generate.</param>
 		/// <returns>The weighted dictionary of ngrams.</returns>
-		public static Dictionary<string, uint> GenerateWeighted(string str, int order)
+		public static void GenerateWeighted(string str, int order, GenerationMode mode, Dictionary<string, uint> results)
 		{
-			Dictionary<string, uint> results = new Dictionary<string, uint>();
+			string[] parts = mode == GenerationMode.WordLevel ? str.Split(" ".ToCharArray()) : null;
 			for(int i = 0; i < str.Length - order; i++)
 			{
-				string ngram = str.Substring(i, order);
+				string ngram = mode == GenerationMode.CharacterLevel ? str.Substring(i, order) : string.Join(" ", parts.Skip(i).Take(order));
 				if(!results.ContainsKey(ngram))
 					results[ngram] = 0;
 				results[ngram]++;
 			}
-			return results;
 		}
 	}
 }
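
With the NGrams.cs change above, every generator method takes a GenerationMode and can produce either character-level or word-level n-grams. The following is a minimal sketch of how the updated methods might be called; the sample sentence, class name, and variable names are illustrative only, and the Dictionary<string, uint> value type follows the reconstruction used above rather than anything the diff spells out:

    using System;
    using System.Collections.Generic;
    using MarkovGrams;

    class NGramsExample
    {
        static void Main()
        {
            // Illustrative input only - any IEnumerable<string> of words or lines works here.
            IEnumerable<string> input = new[] { "the cat sat on the mat" };

            // Character-level trigrams, e.g. "the", "he ", "e c", ...
            foreach (string ngram in NGrams.GenerateFlat(input, 3, GenerationMode.CharacterLevel))
                Console.WriteLine(ngram);

            // Word-level bigrams, e.g. "the cat", "cat sat", "sat on", ...
            foreach (string ngram in NGrams.GenerateFlat(input, 2, GenerationMode.WordLevel))
                Console.WriteLine(ngram);

            // Weighted variant: maps each n-gram to the number of times it was seen.
            Dictionary<string, uint> weighted = NGrams.GenerateWeighted(input, 2, GenerationMode.WordLevel);
            foreach (KeyValuePair<string, uint> pair in weighted)
                Console.WriteLine($"{pair.Key}: {pair.Value}");
        }
    }
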
diff --git a/MarkovGrams/Program.cs b/MarkovGrams/Program.cs
index c29a865..405b36a 100644
--- a/MarkovGrams/Program.cs
+++ b/MarkovGrams/Program.cs
@@ -20,7 +20,8 @@ namespace MarkovGrams
 	{
 		public static int Main(string[] args)
 		{
-			Mode mode = Mode.None;
+			Mode operationMode = Mode.None;
+			GenerationMode generationMode = GenerationMode.CharacterLevel;
 			List<string> extras = new List<string>();
 			StreamReader wordlistSource = new StreamReader(Console.OpenStandardInput());
 			int order = 3, length = 8, count = 10;
@@ -62,8 +63,11 @@
 					case "start-uppercase":
 						startOnUppercase = true;
 						break;
+					case "words":
+						generationMode = GenerationMode.WordLevel;
+						break;
 					case "help":
-						mode = Mode.Help;
+						operationMode = Mode.Help;
 						break;
 					default:
 						Console.Error.WriteLine($"Error: Unknown option '{args[i]}'.");
@@ -71,8 +75,8 @@
 				}
 			}
 			
-			if(mode != Mode.Help && extras.Count > 0)
-				mode = (Mode)Enum.Parse(typeof(Mode), extras.ShiftAt(0).Replace("markov-w", "weightedmarkov"), true);
+			if(operationMode != Mode.Help && extras.Count > 0)
+				operationMode = (Mode)Enum.Parse(typeof(Mode), extras.ShiftAt(0).Replace("markov-w", "weightedmarkov"), true);
 			
 			// ------------------------------------------------------------------------------------------
 			
@@ -86,12 +90,13 @@
 				return new string[] { word.Trim() };
 			});
 			
-			switch (mode)
+			switch (operationMode)
 			{
 				case Mode.Markov:
 					Stopwatch utimer = Stopwatch.StartNew();
 					UnweightedMarkovChain unweightedChain = new UnweightedMarkovChain(
-						NGrams.GenerateFlat(words, order)
+						NGrams.GenerateFlat(words, order, generationMode),
+						generationMode
 					);
 					
 					unweightedChain.StartOnUppercase = startOnUppercase;
@@ -103,7 +108,8 @@
 				case Mode.WeightedMarkov:
 					Stopwatch wtimer = Stopwatch.StartNew();
 					WeightedMarkovChain weightedChain = new WeightedMarkovChain(
-						NGrams.GenerateWeighted(words, order)
+						NGrams.GenerateWeighted(words, order, generationMode),
+						generationMode
 					);
 					
 					weightedChain.StartOnUppercase = startOnUppercase;
@@ -113,7 +119,7 @@
 					break;
 				
 				case Mode.NGrams:
-					foreach (string ngram in NGrams.GenerateFlat(words, order, ngramsUnique))
+					foreach (string ngram in NGrams.GenerateFlat(words, order, generationMode, ngramsUnique))
 						Console.WriteLine(ngram);
 					break;
 				
@@ -137,6 +143,7 @@
 			Console.WriteLine("    --order {number}     Use the specified order when generating n-grams (default: 3)");
 			Console.WriteLine("    --length {number}    The target length of word to generate (Not available in ngrams mode)");
 			Console.WriteLine("    --count {number}     The number of words to generate (Not valid in ngrams mode)");
+			Console.WriteLine("    --words              Generate n-grams at word-level instead of character-level (Applies to all modes)");
 			Console.WriteLine("    --no-split           Don't split input words on whitespace - treat each line as a single word");
 			Console.WriteLine("    --lowercase          Convert the input to lowercase before processing");
 			Console.WriteLine("    --start-uppercase    Start generating a word only with n-grams that start with a capital letter");
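
The new --words switch only flips generationMode; the same value then has to reach both the n-gram generator and the chain constructor. Below is a sketch of that wiring for the unweighted path, mirroring the Mode.Markov branch above. The Generate(length) call at the end is an assumption about the rest of UnweightedMarkovChain, which this diff does not show:

    using System;
    using System.Collections.Generic;
    using MarkovGrams;

    class WordLevelMarkovExample
    {
        static void Main()
        {
            IEnumerable<string> words = new[] { "the cat sat on the mat", "the dog sat on the rug" };
            GenerationMode mode = GenerationMode.WordLevel; // what --words selects in Program.cs

            // The chain is fed n-grams generated with the same mode it is constructed with.
            UnweightedMarkovChain chain = new UnweightedMarkovChain(
                NGrams.GenerateFlat(words, 2, mode),
                mode
            );
            chain.StartOnUppercase = false;

            // Assumed generation call - the method whose body the diff below modifies;
            // its actual name and signature are not visible in this diff.
            Console.WriteLine(chain.Generate(32));
        }
    }
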
diff --git a/MarkovGrams/UnweightedMarkovChain.cs b/MarkovGrams/UnweightedMarkovChain.cs
index 415048d..d5899e7 100644
--- a/MarkovGrams/UnweightedMarkovChain.cs
+++ b/MarkovGrams/UnweightedMarkovChain.cs
@@ -25,13 +25,22 @@
 		/// </summary>
 		public bool StartOnUppercase = false;
 		
+		/// <summary>
+		/// The generation mode to use when running the Markov Chain.
+		/// </summary>
+		/// <remarks>
+		/// The input n-grams must have been generated using the same mode specified here.
+		/// </remarks>
+		public GenerationMode Mode { get; private set; } = GenerationMode.CharacterLevel;
+		
 		/// <summary>
 		/// Creates a new character-based markov chain.
 		/// </summary>
 		/// <param name="inNgrams">The ngrams to populate the new markov chain with.</param>
-		public UnweightedMarkovChain(IEnumerable<string> inNgrams)
+		public UnweightedMarkovChain(IEnumerable<string> inNgrams, GenerationMode inMode)
 		{
 			ngrams = new List<string>(inNgrams);
+			Mode = inMode;
 		}
 		
 		/// <summary>
@@ -63,7 +72,7 @@
 			while(result.Length < length)
 			{
 				// The substring that the next ngram in the chain needs to start with
-				string nextStartsWith = lastNgram.Substring(1);
+				string nextStartsWith = Mode == GenerationMode.CharacterLevel ? lastNgram.Substring(1) : lastNgram.Split(' ')[0];
 				// Get a list of possible n-grams we could choose from next
 				List<string> nextNgrams = ngrams.FindAll(gram => gram.StartsWith(nextStartsWith));
 				// If there aren't any choices left, we can't exactly keep adding to the new string any more :-(
@@ -72,7 +81,10 @@
 				// Pick a random n-gram from the list
 				string nextNgram = nextNgrams.ElementAt(rand.Next(0, nextNgrams.Count));
 				// Add the last character from the n-gram to the string we're building
-				result += nextNgram[nextNgram.Length - 1];
+				if (Mode == GenerationMode.CharacterLevel)
+					result += nextNgram[nextNgram.Length - 1];
+				else
+					result += string.Join(" ", nextNgram.Split(' ').Skip(1));
 				
 				lastNgram = nextNgram;
 			}
diff --git a/MarkovGrams/WeightedMarkovChain.cs b/MarkovGrams/WeightedMarkovChain.cs
index 3ff9f13..3e30d9e 100644
--- a/MarkovGrams/WeightedMarkovChain.cs
+++ b/MarkovGrams/WeightedMarkovChain.cs
@@ -24,17 +24,27 @@
 		/// </summary>
 		public bool StartOnUppercase = false;
 		
+		/// <summary>
+		/// The generation mode to use when running the Markov Chain.
+		/// </summary>
+		/// <remarks>
+		/// The input n-grams must have been generated using the same mode specified here.
+		/// </remarks>
+		public GenerationMode Mode { get; private set; } = GenerationMode.CharacterLevel;
+		
 		/// <summary>
 		/// Creates a new character-based markov chain.
 		/// </summary>
 		/// <param name="inNgrams">The ngrams to populate the new markov chain with.</param>
-		public WeightedMarkovChain(Dictionary<string, double> inNgrams) {
+		public WeightedMarkovChain(Dictionary<string, double> inNgrams, GenerationMode inMode) {
 			ngrams = inNgrams;
+			Mode = inMode;
 		}
-		public WeightedMarkovChain(Dictionary<string, uint> inNgrams) {
+		public WeightedMarkovChain(Dictionary<string, uint> inNgrams, GenerationMode inMode) {
 			ngrams = new Dictionary<string, double>();
 			foreach (KeyValuePair<string, uint> ngram in inNgrams)
 				ngrams[ngram.Key] = ngram.Value;
+			Mode = inMode;
 		}
 		
 		/// <summary>
@@ -77,7 +87,7 @@
 			{
 				wrandom.ClearContents();
 				// The substring that the next ngram in the chain needs to start with
-				string nextStartsWith = lastNgram.Substring(1);
+				string nextStartsWith = Mode == GenerationMode.CharacterLevel ? lastNgram.Substring(1) : lastNgram.Split(' ')[0];
 				// Get a list of possible n-grams we could choose from next
 				Dictionary<string, double> convNextNgrams = new Dictionary<string, double>();
 				ngrams.Where(gram_data => gram_data.Key.StartsWith(nextStartsWith))
@@ -89,7 +99,10 @@
 				// Pick a random n-gram from the list
 				string nextNgram = wrandom.Next();
 				// Add the last character from the n-gram to the string we're building
-				result += nextNgram[nextNgram.Length - 1];
+				if (Mode == GenerationMode.CharacterLevel)
+					result += nextNgram[nextNgram.Length - 1];
+				else
+					result += string.Join(" ", nextNgram.Split(' ').Skip(1));
 				lastNgram = nextNgram;
 			}
 			wrandom.ClearContents();
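
Both chains now carry a Mode property, and the added remarks require the n-grams fed in to have been generated with that same mode. Here is a short sketch of the weighted path under that constraint; the Dictionary<string, uint> value type and the helper's name are illustrative reconstructions, not something this diff states explicitly:

    using System.Collections.Generic;
    using MarkovGrams;

    class WeightedWordLevelExample
    {
        static WeightedMarkovChain BuildChain(IEnumerable<string> words, int order)
        {
            // Occurrence counts per word-level n-gram (value type assumed to be uint).
            Dictionary<string, uint> weighted = NGrams.GenerateWeighted(words, order, GenerationMode.WordLevel);

            // The uint-accepting constructor copies the counts into the chain's internal
            // dictionary; the mode passed here should match the mode used to build `weighted`,
            // otherwise the Substring(1)/Split(' ') stepping above will not line up with the keys.
            return new WeightedMarkovChain(weighted, GenerationMode.WordLevel);
        }
    }
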