using System;
using System.Collections.Generic;
using System.Linq;
using MarkovGrams.Utilities;
using SBRL.Algorithms;

namespace MarkovGrams
{
    /// <summary>
    /// A weighted markov chain that builds strings by chaining overlapping n-grams,
    /// either character-by-character or word-by-word depending on <see cref="Mode" />.
    /// </summary>
    public class WeightedMarkovChain
    {
        // NOTE(review): the generic type arguments in this file (string keys, double weights,
        // plus an int-weighted constructor overload) were reconstructed from how the values
        // are used - confirm them against WeightedRandom's declaration.

        /// <summary>
        /// The picker used to choose the next n-gram. It is refilled with the current set of
        /// candidates before each pick, and emptied again once generation finishes.
        /// </summary>
        private readonly WeightedRandom<string> wrandom = new WeightedRandom<string>();

        /// <summary>
        /// The ngrams that this markov chain currently contains, mapped to the value
        /// handed to the weighted picker for each one.
        /// </summary>
        private readonly Dictionary<string, double> ngrams;

        /// <summary>
        /// Whether to always start generating a new word from an n-gram that starts with
        /// an uppercase letter.
        /// </summary>
        public bool StartOnUppercase = false;

        /// <summary>
        /// The generation mode to use when running the Markov Chain.
        /// </summary>
        /// <remarks>
        /// The input n-grams must have been generated using the same mode specified here.
        /// </remarks>
        public GenerationMode Mode { get; private set; } = GenerationMode.CharacterLevel;

        /// <summary>
        /// Creates a new weighted markov chain.
        /// </summary>
        /// <remarks>
        /// The dictionary is stored by reference, not copied, so later changes made to it
        /// by the caller are visible to this chain.
        /// </remarks>
        /// <param name="inNgrams">The ngrams to populate the new markov chain with.</param>
        /// <param name="inMode">The generation mode the ngrams were built with.</param>
        /// <exception cref="ArgumentNullException">If <paramref name="inNgrams" /> is null.</exception>
        public WeightedMarkovChain(Dictionary<string, double> inNgrams, GenerationMode inMode)
        {
            if (inNgrams == null)
            {
                throw new ArgumentNullException(nameof(inNgrams));
            }

            ngrams = inNgrams;
            Mode = inMode;
        }

        /// <summary>
        /// Creates a new weighted markov chain from integer-valued ngrams.
        /// </summary>
        /// <remarks>
        /// The entries are copied into a new dictionary, so later changes made to
        /// <paramref name="inNgrams" /> do not affect this chain.
        /// </remarks>
        /// <param name="inNgrams">The ngrams to populate the new markov chain with.</param>
        /// <param name="inMode">The generation mode the ngrams were built with.</param>
        /// <exception cref="ArgumentNullException">If <paramref name="inNgrams" /> is null.</exception>
        public WeightedMarkovChain(Dictionary<string, int> inNgrams, GenerationMode inMode)
        {
            if (inNgrams == null)
            {
                throw new ArgumentNullException(nameof(inNgrams));
            }

            ngrams = new Dictionary<string, double>(inNgrams.Count);
            foreach (KeyValuePair<string, int> ngram in inNgrams)
            {
                ngrams[ngram.Key] = ngram.Value;
            }
            Mode = inMode;
        }

        /// <summary>
        /// Returns a random ngram that's currently loaded into this WeightedMarkovChain.
        /// </summary>
        /// <remarks>
        /// The picker is only (re)filled when it is empty. When <see cref="StartOnUppercase" />
        /// is set, it is filled with just the ngrams whose first character is uppercase.
        /// </remarks>
        /// <returns>A random ngram from this WeightedMarkovChain's cache of ngrams.</returns>
        /// <exception cref="InvalidOperationException">
        /// If <see cref="StartOnUppercase" /> is set, but no ngram starts with an uppercase letter.
        /// </exception>
        public string RandomNgram()
        {
            if (wrandom.Count == 0)
            {
                if (!StartOnUppercase)
                {
                    wrandom.SetContents(ngrams);
                }
                else
                {
                    // The length check stops an empty key from throwing when we look at its first character
                    Dictionary<string, double> filteredNGrams = ngrams
                        .Where(pair => pair.Key.Length > 0 && char.IsUpper(pair.Key[0]))
                        .ToDictionary(pair => pair.Key, pair => pair.Value);

                    if (filteredNGrams.Count == 0)
                    {
                        throw new InvalidOperationException($"Error: No valid starting ngrams were found (StartOnUppercase: {StartOnUppercase}).");
                    }

                    wrandom.SetContents(filteredNGrams);
                }
            }

            return wrandom.Next();
        }

        /// <summary>
        /// Generates a new random string from the currently stored ngrams.
        /// </summary>
        /// <param name="length">
        /// The length of string to generate, counted in characters in character-level mode,
        /// and in space-separated words otherwise.
        /// Note that this is a target, not a fixed value - e.g. passing 2 when the n-gram order is 3 will
        /// result in a string of length 3. Also, depending on the current ngrams this markov chain contains,
        /// it may end up being cut short.
        /// </param>
        /// <returns>A new random string.</returns>
        public string Generate(int length)
        {
            string result = RandomNgram();
            string lastNgram = result;

            while (MeasureLength(result) < length)
            {
                wrandom.ClearContents();

                // The substring that the next ngram in the chain needs to start with
                string nextStartsWith = GetOverlapPrefix(lastNgram);

                // Get a list of possible n-grams we could choose from next.
                // Ordinal comparison: these are keys, not text to be compared linguistically.
                Dictionary<string, double> convNextNgrams = new Dictionary<string, double>();
                foreach (KeyValuePair<string, double> ngramData in ngrams)
                {
                    if (ngramData.Key.StartsWith(nextStartsWith, StringComparison.Ordinal))
                    {
                        convNextNgrams.Add(ngramData.Key, ngramData.Value);
                    }
                }

                // If there aren't any choices left, we can't exactly keep adding to the new string any more :-(
                if (convNextNgrams.Count == 0)
                {
                    break;
                }

                // Pick a random n-gram from the list
                wrandom.SetContents(convNextNgrams);
                string nextNgram = wrandom.Next();

                // Everything but the last unit of the n-gram we picked overlaps with what we've
                // already got, so only that last character / word gets added to the result
                if (Mode == GenerationMode.CharacterLevel)
                {
                    result += nextNgram[nextNgram.Length - 1];
                }
                else
                {
                    result += " " + nextNgram.Split(' ').Last();
                }

                lastNgram = nextNgram;
            }

            // Leave the picker empty so that the next call to RandomNgram() refills it from scratch
            wrandom.ClearContents();
            return result;
        }

        /// <summary>
        /// Measures a string in the units of the current mode: characters in
        /// character-level mode, space-separated words otherwise.
        /// </summary>
        private int MeasureLength(string text)
        {
            return Mode == GenerationMode.CharacterLevel
                ? text.Length
                : text.Split(' ').Length;
        }

        /// <summary>
        /// Works out what the next ngram in the chain has to start with: the given ngram
        /// with its first character (or word) dropped.
        /// </summary>
        private string GetOverlapPrefix(string lastNgram)
        {
            if (Mode == GenerationMode.CharacterLevel)
            {
                return lastNgram.Substring(1);
            }

            string trailingWords = string.Join(" ", lastNgram.Split(' ').Skip(1));

            // The trailing space makes the prefix match whole words only ("the cat " shouldn't
            // match "the cats sat"). Single-word n-grams have no overlap at all, so the prefix
            // stays empty and every n-gram is a candidate.
            return trailingWords.Length == 0 ? trailingWords : trailingWords + " ";
        }
    }
}