Experiments with Markov chains, n-grams, and text generation.

WeightedMarkovChain.cs (4.8 KB)

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;

using MarkovGrams.Utilities;
using SBRL.Algorithms;

namespace MarkovGrams
{
	/// <summary>
	/// A weighted markov chain. Operates on either characters or words,
	/// depending on the configured <see cref="Mode" />.
	/// </summary>
	public class WeightedMarkovChain
	{
		private WeightedRandom<string> wrandom = new WeightedRandom<string>();

		/// <summary>
		/// The ngrams that this markov chain currently contains.
		/// </summary>
		private Dictionary<string, double> ngrams;

		/// <summary>
		/// Whether to always start generating a new word from an n-gram that starts with
		/// an uppercase letter.
		/// </summary>
		public bool StartOnUppercase = false;

		/// <summary>
		/// The generation mode to use when running the Markov Chain.
		/// </summary>
		/// <remarks>
		/// The input n-grams must have been generated using the same mode specified here.
		/// </remarks>
		public GenerationMode Mode { get; private set; } = GenerationMode.CharacterLevel;

		/// <summary>
		/// Creates a new weighted markov chain.
		/// </summary>
		/// <param name="inNgrams">The ngrams to populate the new markov chain with.</param>
		/// <param name="inMode">The mode that the given ngrams were generated with.</param>
		public WeightedMarkovChain(Dictionary<string, double> inNgrams, GenerationMode inMode)
		{
			ngrams = inNgrams;
			Mode = inMode;
		}
		/// <summary>
		/// Creates a new weighted markov chain from integer n-gram counts,
		/// converting them to doubles internally.
		/// </summary>
		public WeightedMarkovChain(Dictionary<string, int> inNgrams, GenerationMode inMode)
		{
			ngrams = new Dictionary<string, double>();
			foreach (KeyValuePair<string, int> ngram in inNgrams)
				ngrams[ngram.Key] = ngram.Value;
			Mode = inMode;
		}

		/// <summary>
		/// Returns a random ngram that's currently loaded into this WeightedMarkovChain.
		/// </summary>
		/// <returns>A random ngram from this WeightedMarkovChain's cache of ngrams.</returns>
		public string RandomNgram()
		{
			// (Re)populate the weighted random picker if it's currently empty
			if (wrandom.Count == 0) {
				if (!StartOnUppercase)
					wrandom.SetContents(ngrams);
				else {
					// Keep only the n-grams that start with an uppercase letter
					ConcurrentDictionary<string, double> filteredNGrams = new ConcurrentDictionary<string, double>();
					Parallel.ForEach(ngrams, (KeyValuePair<string, double> pair) => {
						if (!char.IsUpper(pair.Key[0])) return;
						if (!filteredNGrams.TryAdd(pair.Key, pair.Value))
							throw new Exception("Error: Couldn't add to uppercase staging n-gram ConcurrentDictionary!");
					});
					if (filteredNGrams.Count == 0)
						throw new Exception($"Error: No valid starting ngrams were found (StartOnUppercase: {StartOnUppercase}).");
					wrandom.SetContents(filteredNGrams);
				}
			}
			return wrandom.Next();
		}

		/// <summary>
		/// Generates a new random string from the currently stored ngrams.
		/// </summary>
		/// <param name="length">
		/// The target length of the string to generate.
		/// Note that this is a target, not a fixed value - e.g. passing 2 when the n-gram order is 3 will
		/// result in a string of length 3. Also, depending on the current ngrams this markov chain contains,
		/// it may end up being cut short.
		/// </param>
		/// <returns>A new random string.</returns>
		public string Generate(int length)
		{
			return Generate(length, out float noop);
		}
		/// <summary>
		/// Generates a new random string from the currently stored ngrams, additionally reporting
		/// how much choice the chain had while generating it.
		/// </summary>
		/// <param name="length">The target length of the string to generate.</param>
		/// <param name="choicePointRatio">
		/// Roughly the average number of candidate n-grams that were available at each step of the generation.
		/// </param>
		/// <returns>A new random string.</returns>
		public string Generate(int length, out float choicePointRatio)
		{
			string result = RandomNgram();
			string lastNgram = result;
			ConcurrentBag<int> choiceCounts = new ConcurrentBag<int>();
			int i = 0;
			// Character level: stop once we have enough characters.
			// Word level: stop once we have enough words (spaces + 1).
			while ((Mode == GenerationMode.CharacterLevel
				? result.Length
				: result.CountCharInstances(" ".ToCharArray()) + 1) < length)
			{
				wrandom.ClearContents();
				// The substring that the next ngram in the chain needs to start with
				string nextStartsWith = Mode == GenerationMode.CharacterLevel
					? lastNgram.Substring(1)
					: string.Join(" ", lastNgram.Split(' ').Skip(1));

				// Get a list of possible n-grams we could choose from next
				ConcurrentDictionary<string, double> convNextNgrams = new ConcurrentDictionary<string, double>();
				Parallel.ForEach(ngrams, (KeyValuePair<string, double> ngramData) => {
					if (!ngramData.Key.StartsWithFast(nextStartsWith)) return;
					if (!convNextNgrams.TryAdd(ngramData.Key, ngramData.Value))
						throw new Exception("Error: Failed to add to staging ngram concurrent dictionary");
				});
				choiceCounts.Add(convNextNgrams.Count);

				// If there aren't any choices left, we can't exactly keep adding to the new string any more :-(
				if (convNextNgrams.Count == 0)
					break;

				wrandom.SetContents(convNextNgrams);
				// Pick a random n-gram from the list, weighted by the stored n-gram frequencies
				string nextNgram = wrandom.Next();

				// Add the last character (or word) from the n-gram to the string we're building
				if (Mode == GenerationMode.CharacterLevel)
					result += nextNgram[nextNgram.Length - 1];
				else
					result += ' ' + nextNgram.Substring(nextNgram.LastIndexOf(' ') + 1);

				lastNgram = nextNgram;
				i++;
			}
			wrandom.ClearContents();

			int totalChoices = choiceCounts.Sum();
			choicePointRatio = totalChoices > 0 ? totalChoices / (float)(i + 1) : 0;

			return result;
		}
	}
}
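
For context, here is a minimal usage sketch (not part of the file above). The CountNgrams helper, the example corpus, the order of 3, and the target length of 6 are all made up for illustration - this file doesn't define how n-grams are built, and the real project presumably does that elsewhere - but the Dictionary<string, int> the helper produces matches the second constructor above, and GenerationMode.CharacterLevel comes straight from the class. It also assumes the SBRL.Algorithms WeightedRandom<string> dependency is available to compile against.

using System;
using System.Collections.Generic;
using MarkovGrams;

static class WeightedMarkovChainExample
{
	// Hypothetical helper: count overlapping character-level n-grams of the given order.
	// Each n-gram's weight is simply how many times it occurs across the corpus.
	static Dictionary<string, int> CountNgrams(IEnumerable<string> words, int order)
	{
		Dictionary<string, int> counts = new Dictionary<string, int>();
		foreach (string word in words)
			for (int i = 0; i + order <= word.Length; i++)
			{
				string ngram = word.Substring(i, order);
				counts[ngram] = counts.TryGetValue(ngram, out int count) ? count + 1 : 1;
			}
		return counts;
	}

	static void Main()
	{
		string[] corpus = { "markov", "margin", "martin", "marble", "garble" };

		// Build order-3 character n-grams and feed them to the chain.
		Dictionary<string, int> ngrams = CountNgrams(corpus, 3);
		WeightedMarkovChain chain = new WeightedMarkovChain(ngrams, GenerationMode.CharacterLevel);

		// Ask for words of roughly 6 characters; a result may come out shorter
		// if the chain runs out of matching n-grams partway through.
		for (int i = 0; i < 5; i++)
			Console.WriteLine(chain.Generate(6));
	}
}

Because the chain is weighted, continuations that occurred more often in the corpus are proportionally more likely to be chosen at each step.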