using System;
using System.Collections;
using System.Collections.Generic;
using System.Net;
using System.Text.RegularExpressions;
using SearchBox.Utilities;
using UnidecodeSharpFork;

namespace SearchBox
{
    /// <summary>
    /// Normalisation steps that <see cref="Tokenizer"/> applies to its input
    /// before splitting it into tokens. Values may be combined.
    /// </summary>
    [Flags]
    public enum TokenizerOptions
    {
        /// <summary>Apply no normalisation at all.</summary>
        None = 0,
        /// <summary>Convert the input to lowercase.</summary>
        Lowercase = 1,
        /// <summary>Pass the input through <c>Unidecode()</c>.</summary>
        Transliterate = 2,
        /// <summary>Replace the characters <c>[]{}|/\</c> via <c>ReplaceMultiple()</c>.</summary>
        HidePunctuation = 4,
        /// <summary>Decode HTML entities with <see cref="WebUtility.HtmlDecode(string)"/>.</summary>
        DecodeHtmlEntities = 8
    }

    /// <summary>
    /// Splits a string into tokens, pairing each token with the offset at
    /// which it starts in the normalised input.
    /// </summary>
    public class Tokenizer : IEnumerable<Tuple<int, string>>
    {
        // Separators: punctuation at the start of a line, a run of whitespace
        // together with any punctuation touching it, punctuation at the end of
        // a line, or a literal pipe. Multiline makes ^ and $ match per line.
        private static readonly Regex splitter = new Regex(
            @"((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|",
            RegexOptions.Compiled | RegexOptions.Multiline
        );

        // The input after all requested normalisation steps have been applied.
        private readonly string source;

        /// <summary>When true, every yielded token is also written to the console.</summary>
        public bool Verbose = false;

        /// <summary>
        /// Creates a tokenizer over <paramref name="inSource"/>, normalising it
        /// according to <paramref name="options"/>.
        /// </summary>
        /// <param name="inSource">The text to tokenize. Must not be null.</param>
        /// <param name="options">The normalisation steps to apply.</param>
        /// <exception cref="ArgumentNullException"><paramref name="inSource"/> is null.</exception>
        public Tokenizer(
            string inSource,
            TokenizerOptions options = TokenizerOptions.Transliterate | TokenizerOptions.Lowercase | TokenizerOptions.HidePunctuation)
        {
            if (inSource == null)
            {
                throw new ArgumentNullException("inSource");
            }

            // Decode entities first so that the decoded characters receive the
            // same normalisation as the rest of the text. Lowercasing first
            // would also corrupt case-sensitive entity names (&Dagger; and
            // &dagger; are different characters).
            if (options.HasFlag(TokenizerOptions.DecodeHtmlEntities))
            {
                inSource = WebUtility.HtmlDecode(inSource);
            }

            if (options.HasFlag(TokenizerOptions.Transliterate))
            {
                inSource = inSource.Unidecode();
            }

            // Invariant casing keeps tokens identical regardless of the
            // culture of the machine doing the tokenizing.
            if (options.HasFlag(TokenizerOptions.Lowercase))
            {
                inSource = inSource.ToLowerInvariant();
            }

            if (options.HasFlag(TokenizerOptions.HidePunctuation))
            {
                // NOTE(review): ReplaceMultiple is declared outside this file.
                // If it pairs the two arrays element by element, the
                // replacement array needs one entry per search character -
                // confirm against its definition.
                inSource = inSource.ReplaceMultiple(
                    @"[]{}|/\".ToCharArray(),
                    " ".ToCharArray()
                );
            }

            source = inSource;
        }

        /// <summary>
        /// Lazily enumerates the tokens found between separators.
        /// </summary>
        /// <returns>
        /// One tuple per token: the zero-based offset at which the token starts,
        /// and the token text. Offsets refer to the normalised string held by
        /// this instance, whose length can differ from the raw constructor input.
        /// </returns>
        public IEnumerable<Tuple<int, string>> IterateTokens()
        {
            // Separator positions are taken from the matches themselves.
            // Summing the lengths of Regex.Split() parts is unreliable here:
            // the nested capture groups add each separator to the result
            // twice, and a bare "|" separator is not added at all.
            MatchCollection separators = splitter.Matches(source);
            int nextStart = 0;

            // The final pass (i == separators.Count) handles the text that
            // follows the last separator.
            for (int i = 0; i <= separators.Count; i++)
            {
                bool hasSeparator = i < separators.Count;
                int candidateStart = nextStart;
                int candidateEnd = hasSeparator ? separators[i].Index : source.Length;
                string candidate = source.Substring(candidateStart, candidateEnd - candidateStart);

                if (hasSeparator)
                {
                    nextStart = candidateEnd + separators[i].Length;
                }

                // Skip empty gaps, and gaps that the splitter still matches
                // when they are examined on their own.
                if (string.IsNullOrWhiteSpace(candidate) || splitter.IsMatch(candidate))
                {
                    continue;
                }

                if (Verbose)
                {
                    Console.WriteLine("#{0} @ {1}: '{2}'", i, candidateStart, candidate);
                }

                yield return new Tuple<int, string>(candidateStart, candidate);
            }
        }

        /// <summary>Returns an enumerator over <see cref="IterateTokens"/>.</summary>
        public IEnumerator<Tuple<int, string>> GetEnumerator()
        {
            return IterateTokens().GetEnumerator();
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }
    }
}