Experiments into markov chains, n-grams, and text generation.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

94 lines
3.3 KiB

using System;
using System.Collections.Generic;
using System.Linq;
namespace MarkovGrams
{
    /// <summary>
    /// An unweighted markov chain that builds strings by chaining overlapping n-grams.
    /// </summary>
    /// <remarks>
    /// In <c>GenerationMode.CharacterLevel</c> the n-grams are treated as runs of characters.
    /// In any other mode they are treated as space-separated words.
    /// </remarks>
    public class UnweightedMarkovChain
    {
        /// <summary>
        /// The random number generator used to pick n-grams.
        /// </summary>
        private readonly Random rand = new Random();

        /// <summary>
        /// The n-grams that this markov chain contains. Never holds null or empty entries,
        /// because the constructor filters them out.
        /// </summary>
        private readonly List<string> ngrams;

        /// <summary>
        /// Whether <see cref="RandomNgram"/> (and therefore the start of every generated string)
        /// may only pick n-grams whose first character is an uppercase letter.
        /// </summary>
        public bool StartOnUppercase = false;

        /// <summary>
        /// The generation mode to use when running the Markov Chain.
        /// </summary>
        /// <remarks>
        /// The input n-grams must have been generated using the same mode specified here.
        /// </remarks>
        public GenerationMode Mode { get; private set; } = GenerationMode.CharacterLevel;

        /// <summary>
        /// Creates a new unweighted markov chain.
        /// </summary>
        /// <param name="inNgrams">
        /// The n-grams to populate the new markov chain with. The sequence is copied.
        /// Null and empty entries are skipped, as they can neither start nor extend a chain.
        /// </param>
        /// <param name="inMode">The generation mode that the n-grams were created with.</param>
        /// <exception cref="ArgumentNullException"><paramref name="inNgrams"/> is null.</exception>
        public UnweightedMarkovChain(IEnumerable<string> inNgrams, GenerationMode inMode)
        {
            if (inNgrams == null)
                throw new ArgumentNullException(nameof(inNgrams));

            ngrams = inNgrams.Where(ngram => !string.IsNullOrEmpty(ngram)).ToList();
            Mode = inMode;
        }

        /// <summary>
        /// Returns a random n-gram that's currently loaded into this UnweightedMarkovChain.
        /// </summary>
        /// <returns>
        /// A random n-gram from this chain. When <see cref="StartOnUppercase"/> is set, only
        /// n-grams that begin with an uppercase character are considered.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// There are no n-grams to choose from under the current <see cref="StartOnUppercase"/> setting.
        /// </exception>
        public string RandomNgram()
        {
            // Materialise the candidates once instead of re-running the filter for every
            // Count / ElementAt call.
            List<string> validNGrams = StartOnUppercase
                ? ngrams.FindAll(ngram => char.IsUpper(ngram[0]))
                : ngrams;

            if (validNGrams.Count == 0)
                throw new InvalidOperationException($"Error: No valid starting ngrams were found (StartOnUppercase: {StartOnUppercase}).");

            return validNGrams[rand.Next(0, validNGrams.Count)];
        }

        /// <summary>
        /// Generates a new random string from the currently stored n-grams.
        /// </summary>
        /// <param name="length">
        /// The target length of the string to generate, measured in characters in every mode.
        /// Note that this is a target, not a fixed value - e.g. passing 2 when the n-gram order is 3 will
        /// result in a string of length 3. Also, depending on the n-grams this markov chain contains,
        /// generation may be cut short when no n-gram can follow the previous one.
        /// </param>
        /// <returns>A new random string.</returns>
        /// <exception cref="InvalidOperationException">
        /// No valid starting n-gram is available (see <see cref="RandomNgram"/>).
        /// </exception>
        public string Generate(int length)
        {
            bool characterLevel = Mode == GenerationMode.CharacterLevel;

            string result = RandomNgram();
            string lastNgram = result;
            while (result.Length < length)
            {
                // The part of the previous n-gram that the next one has to overlap with:
                // everything except its first unit (character or word).
                // Substring(1) is safe because empty n-grams are never stored.
                string overlap = characterLevel
                    ? lastNgram.Substring(1)
                    : string.Join(" ", lastNgram.Split(' ').Skip(1));

                // In word mode the overlap must end on a word boundary, otherwise "quick"
                // would also match "quickly ...". An empty overlap (order-1 n-grams) matches anything.
                string requiredPrefix = (characterLevel || overlap.Length == 0)
                    ? overlap
                    : overlap + " ";

                // Get a list of possible n-grams we could choose from next.
                // Ordinal comparison: n-grams are raw data, not culture-dependent text.
                List<string> nextNgrams = ngrams.FindAll(
                    gram => gram.StartsWith(requiredPrefix, StringComparison.Ordinal));

                // If there aren't any choices left, we can't keep adding to the new string.
                if (nextNgrams.Count == 0)
                    break;

                // Pick a random n-gram from the list
                string nextNgram = nextNgrams[rand.Next(0, nextNgrams.Count)];

                // Only the final unit of the chosen n-gram is new; the rest overlaps with
                // what has already been emitted.
                if (characterLevel)
                    result += nextNgram[nextNgram.Length - 1];
                else
                    result += " " + nextNgram.Split(' ').Last();

                lastNgram = nextNgram;
            }
            return result;
        }
    }
}