Experiments with Markov chains, n-grams, and text generation.

NGrams.cs 3.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Linq;
  4. namespace MarkovGrams
  5. {
  6. public enum GenerationMode
  7. {
  8. CharacterLevel,
  9. WordLevel
  10. }
  11. /// <summary>
  12. /// A collection of methods to generate various different types of n-grams.
  13. /// </summary>
  14. public static class NGrams
  15. {
  16. public static bool Verbose { get; set; } = true;
  17. /// <summary>
  18. /// Generates a unique list of n-grams that the given list of words.
  19. /// </summary>
  20. /// <param name="words">The words to turn into n-grams.</param>
  21. /// <param name="order">The order of n-gram to generate..</param>
  22. /// <returns>A unique list of n-grams found in the given list of words.</returns>
  23. public static IEnumerable<string> GenerateFlat(IEnumerable<string> words, int order, GenerationMode mode, bool distinct = true)
  24. {
  25. List<string> results = new List<string>();
  26. foreach (string word in words)
  27. {
  28. results.AddRange(GenerateFlat(word, order, mode));
  29. }
  30. if (distinct) return results.Distinct();
  31. return results;
  32. }
  33. /// <summary>
  34. /// Generates a unique list of n-grams from the given string.
  35. /// </summary>
  36. /// <param name="str">The string to n-gram-ise.</param>
  37. /// <param name="order">The order of n-gram to generate.</param>
  38. /// <returns>A unique list of n-grams found in the specified string.</returns>
  39. private static IEnumerable<string> GenerateFlat(string str, int order, GenerationMode mode)
  40. {
  41. List<string> results = new List<string>();
  42. if (mode == GenerationMode.CharacterLevel) {
  43. for (int i = 0; i < str.Length - order; i++)
  44. results.Add(str.Substring(i, order));
  45. }
  46. else {
  47. string[] parts = str.Split(" ".ToCharArray());
  48. for (int i = 0; i < parts.Length; i++)
  49. results.Add(string.Join(" ", parts.Skip(i).Take(order)).Trim());
  50. }
  51. return results.Distinct();
  52. }
  53. /// <summary>
  54. /// Generates a dictionary of weighted n-grams from the given list of words.
  55. /// The key is the ngram itself, and the value is the linear weight of the ngram.
  56. /// </summary>
  57. /// <param name="words">The words to n-gram-ise.</param>
  58. /// <param name="order">The order of ngrams to generate.</param>
  59. /// <returns>The weighted dictionary of ngrams.</returns>
  60. public static Dictionary<string, int> GenerateWeighted(IEnumerable<string> words, int order, GenerationMode mode)
  61. {
  62. List<string> wordList = new List<string>(words);
  63. int wordCount = wordList.Count();
  64. Dictionary<string, int> results = new Dictionary<string, int>();
  65. int i = 0;
  66. foreach (string word in wordList) {
  67. GenerateWeighted(word, order, mode, ref results);
  68. i++;
  69. }
  70. return results;
  71. }
  72. /// <summary>
  73. /// Generates a dictionary of weighted n-grams from the specified string.
  74. /// </summary>
  75. /// <param name="str">The string to n-gram-ise.</param>
  76. /// <param name="order">The order of n-grams to generate.</param>
  77. /// <returns>The weighted dictionary of ngrams.</returns>
  78. private static void GenerateWeighted(string str, int order, GenerationMode mode, ref Dictionary<string, int> results)
  79. {
  80. if (mode == GenerationMode.CharacterLevel) {
  81. for (int i = 0; i < str.Length - order; i++) {
  82. string ngram = str.Substring(i, order);
  83. if (!results.ContainsKey(ngram))
  84. results[ngram] = 0;
  85. results[ngram]++;
  86. }
  87. }
  88. else {
  89. string[] parts = str.Split(" ".ToCharArray());
  90. for (int i = 0; i < parts.Length - order; i++) {
  91. string ngram = string.Join(" ", parts.Skip(i).Take(order)).Trim();
  92. if (ngram.Trim().Length == 0) continue;
  93. if (!results.ContainsKey(ngram))
  94. results[ngram] = 0;
  95. results[ngram]++;
  96. }
  97. }
  98. }
  99. }
  100. }