2018-09-02 15:36:37 +00:00
|
|
|
|
using System;
|
|
|
|
|
using System.Collections;
|
|
|
|
|
using System.Collections.Generic;
|
|
|
|
|
using System.Net;
|
|
|
|
|
using System.Text.RegularExpressions;
|
2018-09-11 13:27:25 +00:00
|
|
|
|
using LibSearchBox.Utilities;
|
2018-09-02 15:36:37 +00:00
|
|
|
|
using UnidecodeSharpFork;
|
|
|
|
|
|
2018-09-11 13:27:25 +00:00
|
|
|
|
namespace LibSearchBox
|
2018-09-02 15:36:37 +00:00
|
|
|
|
{
|
|
|
|
|
/// <summary>
/// Bit flags selecting which normalisation steps <see cref="Tokenizer"/>
/// applies to its source text before splitting it into tokens.
/// Values may be combined with the bitwise OR operator.
/// </summary>
[Flags]
public enum TokenizerOptions
{
	/// <summary>No normalisation: the source text is tokenised as given.</summary>
	None = 0,
	/// <summary>Convert the source text to lowercase.</summary>
	Lowercase = 1 << 0,
	/// <summary>
	/// Run the source text through UnidecodeSharpFork's <c>Unidecode()</c>
	/// (presumably transliterating it to plain ASCII - confirm against that library).
	/// </summary>
	Transliterate = 1 << 1,
	/// <summary>
	/// Pass the characters <c>[ ] { } | / \</c> and a single space to
	/// <c>ReplaceMultiple()</c> (presumably swapping each one for a space -
	/// confirm against LibSearchBox.Utilities).
	/// </summary>
	HidePunctuation = 1 << 2,
	/// <summary>Decode HTML entities in the source text via <c>WebUtility.HtmlDecode</c>.</summary>
	DecodeHtmlEntities = 1 << 3
}
|
|
|
|
|
/// <summary>
/// Splits a string into tokens and exposes them as a sequence of
/// (offset, token) pairs. The text is normalised once, in the constructor,
/// according to the supplied <see cref="TokenizerOptions"/>; enumeration is lazy.
/// </summary>
public class Tokenizer : IEnumerable<Tuple<int, string>>
{
	/// <summary>
	/// Matches the separators between tokens: punctuation at the start of a line,
	/// whitespace together with any punctuation touching it, punctuation at the
	/// end of a line, or a literal pipe character.
	/// </summary>
	private static readonly Regex splitter = new Regex(
		@"((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|",
		RegexOptions.Compiled | RegexOptions.Multiline
	);

	/// <summary>The normalised text that tokens are extracted from.</summary>
	private readonly string source;

	/// <summary>When true, every token is also written to the console as it is produced.</summary>
	public bool Verbose = false;

	/// <summary>
	/// Creates a tokenizer over <paramref name="inSource"/>, normalising it first.
	/// </summary>
	/// <param name="inSource">The text to tokenise. Must not be null.</param>
	/// <param name="options">The normalisation steps to apply before tokenising.</param>
	/// <exception cref="ArgumentNullException"><paramref name="inSource"/> is null.</exception>
	public Tokenizer(string inSource, TokenizerOptions options = TokenizerOptions.Transliterate | TokenizerOptions.Lowercase | TokenizerOptions.HidePunctuation)
	{
		if (inSource == null)
			throw new ArgumentNullException("inSource");

		// Entities are decoded first so that the characters they expand to pass
		// through the remaining steps too. Decoding last would let uppercase,
		// non-ASCII or punctuation characters slip through untouched, and
		// lowercasing first could corrupt case-sensitive entity names.
		if (options.HasFlag(TokenizerOptions.DecodeHtmlEntities))
			inSource = WebUtility.HtmlDecode(inSource);

		if (options.HasFlag(TokenizerOptions.Transliterate))
			inSource = inSource.Unidecode();

		// Invariant casing keeps tokens identical regardless of the culture of
		// the machine doing the tokenising (e.g. the Turkish dotless i).
		if (options.HasFlag(TokenizerOptions.Lowercase))
			inSource = inSource.ToLowerInvariant();

		if (options.HasFlag(TokenizerOptions.HidePunctuation)) {
			inSource = inSource.ReplaceMultiple(
				@"[]{}|/\".ToCharArray(),
				" ".ToCharArray()
			);
		}

		source = inSource;
	}

	/// <summary>
	/// Lazily walks the normalised text and yields each token with its position.
	/// </summary>
	/// <returns>
	/// Pairs of (offset, token), where offset is the zero-based index of the
	/// token's first character within the normalised text. Note that this is
	/// the text after the constructor's normalisation steps, which is not
	/// necessarily the same length as the string originally passed in.
	/// </returns>
	public IEnumerable<Tuple<int, string>> IterateTokens()
	{
		int tokenNumber = 0;
		int cursor = 0; // Start of the text that follows the previous separator

		// Separators are walked by hand rather than via Regex.Split(): Split()
		// also returns the text of every capture group, which makes it
		// impossible to recover accurate offsets from the lengths of its parts.
		Match separator = splitter.Match(source);
		while (true) {
			int end = separator.Success ? separator.Index : source.Length;
			string candidate = source.Substring(cursor, end - cursor);

			// Skip blank stretches, and anything that still looks like a
			// separator when considered on its own.
			if (!string.IsNullOrWhiteSpace(candidate) && !splitter.IsMatch(candidate)) {
				if (Verbose) Console.WriteLine("#{0} @ {1}: '{2}'", tokenNumber, cursor, candidate);
				tokenNumber++;

				// FUTURE: We should swap this out for System.ValueTuple, as it's easier on the garbage collector.
				yield return new Tuple<int, string>(cursor, candidate);
			}

			if (!separator.Success)
				break;

			cursor = separator.Index + separator.Length;
			separator = separator.NextMatch();
		}
	}

	/// <summary>Returns an enumerator over the tokens produced by <see cref="IterateTokens"/>.</summary>
	public IEnumerator<Tuple<int, string>> GetEnumerator()
	{
		return IterateTokens().GetEnumerator();
	}

	/// <summary>Non-generic counterpart of <see cref="GetEnumerator"/>.</summary>
	IEnumerator IEnumerable.GetEnumerator()
	{
		return GetEnumerator();
	}
}
|
|
|
|
|
}
|