commit 647b4cc0b14a7b5b69dad2df0cef42f54007ab68 Author: Starbeamrainbowlabs Date: Wed Apr 26 21:35:00 2017 +0100 Initial commit. I think this si worth keeping in it's entirety! diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..08185c2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,434 @@ + +# Created by https://www.gitignore.io/api/visualstudio,monodevelop,csharp + +### Csharp ### +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore + +# User-specific files +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ + +# Visual Studio 2015 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUNIT +*.VisualState.xml +TestResult.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ +**/Properties/launchSettings.json + +*_i.c +*_p.c +*_i.h +*.ilk +*.meta +*.obj +*.pch +*.pdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# JustCode is a .NET coding add-in +.JustCode + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# TODO: Comment the next line if you want to checkin your web deploy settings +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# The packages folder can be ignored because of Package Restore +**/packages/* +# except build/, which is used as an MSBuild target. +!**/packages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/packages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Typescript v1 declaration files +typings/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) +*.vbw + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# JetBrains Rider +.idea/ +*.sln.iml + +# CodeRush +.cr/ + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +### MonoDevelop ### +#User Specific +*.usertasks + +#Mono Project Files +*.resources +test-results/ + +### VisualStudio ### +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore + +# User-specific files + +# User-specific files (MonoDevelop/Xamarin Studio) + +# Build results + +# Visual Studio 2015 cache/options directory +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# MSTest test Results + +# NUNIT + +# Build Results of an ATL Project + +# .NET Core + + +# Chutzpah Test files + +# Visual C++ cache files + +# Visual Studio profiler + +# TFS 2012 Local Workspace + +# Guidance Automation Toolkit + +# ReSharper is a .NET coding add-in + +# JustCode is a .NET coding add-in + +# TeamCity is a build add-in + +# DotCover is a Code Coverage Tool + +# Visual Studio code coverage results + +# NCrunch + +# MightyMoose + +# Web workbench (sass) + +# Installshield output folder + +# DocProject is a documentation generator add-in + +# Click-Once directory + +# Publish Web Output +# TODO: Comment the next line if you want to checkin your web deploy settings +# but database connection strings (with potential passwords) will be unencrypted + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted + +# NuGet Packages +# The packages folder can be ignored because of Package Restore +# except build/, which is used as an MSBuild target. +# Uncomment if necessary however generally it will be regenerated when needed +#!**/packages/repositories.config +# NuGet v3's project.json files produces more ignorable files + +# Microsoft Azure Build Output + +# Microsoft Azure Emulator + +# Windows Store app package directories and files + +# Visual Studio cache files +# files ending in .cache can be ignored +# but keep track of directories ending in .cache + +# Others + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) + +# SQL Server files + +# Business Intelligence projects + +# Microsoft Fakes + +# GhostDoc plugin setting file + +# Node.js Tools for Visual Studio + +# Typescript v1 declaration files + +# Visual Studio 6 build log + +# Visual Studio 6 workspace options file + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) + +# Visual Studio LightSwitch build output + +# Paket dependency manager + +# FAKE - F# Make + +# JetBrains Rider + +# CodeRush + +# Python Tools for Visual Studio (PTVS) + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Telerik's JustMock configuration file + +# BizTalk build output + +# End of https://www.gitignore.io/api/visualstudio,monodevelop,csharp diff --git a/MarkovGrams.sln b/MarkovGrams.sln new file mode 100644 index 0000000..7d617a4 --- /dev/null +++ b/MarkovGrams.sln @@ -0,0 +1,17 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MarkovGrams", "MarkovGrams\MarkovGrams.csproj", "{14743F58-9418-4147-9C2C-0626AD7185D3}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x86 = Debug|x86 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {14743F58-9418-4147-9C2C-0626AD7185D3}.Debug|x86.ActiveCfg = Debug|x86 + {14743F58-9418-4147-9C2C-0626AD7185D3}.Debug|x86.Build.0 = Debug|x86 + {14743F58-9418-4147-9C2C-0626AD7185D3}.Release|x86.ActiveCfg = Release|x86 + {14743F58-9418-4147-9C2C-0626AD7185D3}.Release|x86.Build.0 = Release|x86 + EndGlobalSection +EndGlobal diff --git a/MarkovGrams/MarkovGrams.csproj b/MarkovGrams/MarkovGrams.csproj new file mode 100644 index 0000000..bf5c49c --- /dev/null +++ b/MarkovGrams/MarkovGrams.csproj @@ -0,0 +1,41 @@ + + + + Debug + x86 + {14743F58-9418-4147-9C2C-0626AD7185D3} + Exe + MarkovGrams + MarkovGrams + v4.5 + + + true + full + false + bin\Debug + DEBUG; + prompt + 4 + true + x86 + + + true + bin\Release + prompt + 4 + true + x86 + + + + + + + + + + + + \ No newline at end of file diff --git a/MarkovGrams/NGrams.cs b/MarkovGrams/NGrams.cs new file mode 100644 index 0000000..5137b6b --- /dev/null +++ b/MarkovGrams/NGrams.cs @@ -0,0 +1,85 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace MarkovGrams +{ + /// + /// A collection of methods to generate various different types of n-grams. + /// + public static class NGrams + { + /// + /// Generates a unique list of n-grams that the given list of words. + /// + /// The words to turn into n-grams. + /// The order of n-gram to generate.. + /// A unique list of n-grams found in the given list of words. + public static IEnumerable GenerateFlat(IEnumerable words, int order) + { + List results = new List(); + foreach(string word in words) + { + results.AddRange(GenerateFlat(word, order)); + } + return results.Distinct(); + } + + /// + /// Generates a unique list of n-grams from the given string. + /// + /// The string to n-gram-ise. + /// The order of n-gram to generate. + /// A unique list of n-grams found in the specified string. + public static IEnumerable GenerateFlat(string str, int order) + { + List results = new List(); + for(int i = 0; i < str.Length - order; i++) + { + results.Add(str.Substring(i, order)); + } + return results.Distinct(); + } + + /// + /// Generates a dictionary of weighted n-grams from the given list of words. + /// The key is the ngram itself, and the value is the linear weight of the ngram. + /// + /// The words to n-gram-ise. + /// The order of ngrams to generate. + /// The weighted dictionary of ngrams. + public static Dictionary GenerateWeighted(IEnumerable words, int order) + { + Dictionary results = new Dictionary(); + foreach(string word in words) + { + Dictionary wordNgrams = GenerateWeighted(word, order); + foreach(KeyValuePair ngram in wordNgrams) + { + if(!results.ContainsKey(ngram.Key)) + results[ngram.Key] = 0; + results[ngram.Key] += ngram.Value; + } + } + return results; + } + /// + /// Generates a dictionary of weighted n-grams from the specified string. + /// + /// The string to n-gram-ise. + /// The order of n-grams to generate. + /// The weighted dictionary of ngrams. + public static Dictionary GenerateWeighted(string str, int order) + { + Dictionary results = new Dictionary(); + for(int i = 0; i < str.Length - order; i++) + { + string ngram = str.Substring(i, order); + if(!results.ContainsKey(ngram)) + results[ngram] = 0; + results[ngram]++; + } + return results; + } + } +} diff --git a/MarkovGrams/Program.cs b/MarkovGrams/Program.cs new file mode 100644 index 0000000..e7741ae --- /dev/null +++ b/MarkovGrams/Program.cs @@ -0,0 +1,30 @@ +using System; +using System.Collections.Generic; +using System.IO; + +namespace MarkovGrams +{ + class MainClass + { + public static void Main(string[] args) + { + if(args.Length != 3) + { + Console.WriteLine("Usage:"); + Console.WriteLine(" ./MarkovGrams.exe "); + return; + } + + string wordlistFilename = args[0]; + int order = int.Parse(args[1]); + int desiredStringLength = int.Parse(args[2]); + + IEnumerable words = File.ReadLines(wordlistFilename); + IEnumerable ngrams = NGrams.GenerateFlat(words, order); + + UnweightedMarkovChain chain = new UnweightedMarkovChain(ngrams); + + Console.WriteLine(chain.Generate(desiredStringLength)); + } + } +} diff --git a/MarkovGrams/Properties/AssemblyInfo.cs b/MarkovGrams/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..3025aa3 --- /dev/null +++ b/MarkovGrams/Properties/AssemblyInfo.cs @@ -0,0 +1,26 @@ +using System.Reflection; +using System.Runtime.CompilerServices; + +// Information about this assembly is defined by the following attributes. +// Change them to the values specific to your project. + +[assembly: AssemblyTitle("MarkovGrams")] +[assembly: AssemblyDescription("")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCompany("")] +[assembly: AssemblyProduct("")] +[assembly: AssemblyCopyright("sbrl")] +[assembly: AssemblyTrademark("")] +[assembly: AssemblyCulture("")] + +// The assembly version has the format "{Major}.{Minor}.{Build}.{Revision}". +// The form "{Major}.{Minor}.*" will automatically update the build and revision, +// and "{Major}.{Minor}.{Build}.*" will update just the revision. + +[assembly: AssemblyVersion("1.0.*")] + +// The following attributes are used to specify the signing key for the assembly, +// if desired. See the Mono documentation for more information about signing. + +//[assembly: AssemblyDelaySign(false)] +//[assembly: AssemblyKeyFile("")] diff --git a/MarkovGrams/UnweightedMarkovChain.cs b/MarkovGrams/UnweightedMarkovChain.cs new file mode 100644 index 0000000..bef2e1c --- /dev/null +++ b/MarkovGrams/UnweightedMarkovChain.cs @@ -0,0 +1,72 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace MarkovGrams +{ + /// + /// An unweighted character-based markov chain. + /// + public class UnweightedMarkovChain + { + /// + /// The random number generator + /// + Random rand = new Random(); + + /// + /// The ngrams that this markov chain currently contains. + /// + List ngrams; + + /// + /// Creates a new character-based markov chain. + /// + /// The ngrams to populate the new markov chain with. + public UnweightedMarkovChain(IEnumerable inNgrams) + { + ngrams = new List(inNgrams); + } + + /// + /// Returns a random ngram that's currently loaded into this UnweightedMarkovChain. + /// + /// A random ngram from this UnweightMarkovChain's cache of ngrams. + public string RandomNgram() + { + return ngrams[rand.Next(0, ngrams.Count)]; + } + + /// + /// Generates a new random string from the currently stored ngrams. + /// + /// + /// The length of ngram to generate. + /// Note that this is a target, not a fixed value - e.g. passing 2 when the n-gram order is 3 will + /// result in a string of length 3. Also, depending on the current ngrams this markov chain contains, + /// it may end up being cut short. + /// + /// A new random string. + public string Generate(int length) + { + string result = RandomNgram(); + string lastNgram = result; + while(result.Length < length) + { + // The substring that the next ngram in the chain needs to start with + string nextStartsWith = lastNgram.Substring(1); + // Get a list of possible n-grams we could choose from next + List nextNgrams = ngrams.FindAll(gram => gram.StartsWith(nextStartsWith)); + // If there aren't any choices left, we can't exactly keep adding to the new string any more :-( + if(nextNgrams.Count == 0) + break; + // Pick a random n-gram from the list + string nextNgram = nextNgrams.ElementAt(rand.Next(0, nextNgrams.Count)); + // Add the last character from the n-gram to the string we're building + result += nextNgram[nextNgram.Length - 1]; + } + + return result; + } + } +}