SearchBox/SearchBox/Tokenizer.cs

66 lines
1.8 KiB
C#

using System;
using System.Collections;
using System.Collections.Generic;
using System.Net;
using System.Text.RegularExpressions;
using LibSearchBox.Utilities;
using UnidecodeSharpFork;
namespace LibSearchBox
{
/// <summary>
/// Normalisation steps a <see cref="Tokenizer"/> can apply to its input before
/// splitting it into tokens. The values are bit flags and may be combined with
/// the bitwise OR operator.
/// </summary>
[Flags]
public enum TokenizerOptions
{
    /// <summary>No normalisation at all: the input is tokenised exactly as supplied.</summary>
    None = 0,
    /// <summary>Convert the input to lower case.</summary>
    Lowercase = 1,
    /// <summary>Run the input through the <c>Unidecode()</c> transliteration extension method.</summary>
    Transliterate = 2,
    /// <summary>
    /// Pass the characters <c>[ ] { } | / \</c> to the <c>ReplaceMultiple()</c> extension
    /// method together with a single space (presumably replacing each of them with a
    /// space - confirm against that helper).
    /// </summary>
    HidePunctuation = 4,
    /// <summary>Decode HTML entities in the input with <c>WebUtility.HtmlDecode</c>.</summary>
    DecodeHtmlEntities = 8
}
/// <summary>
/// Splits a piece of text into tokens and reports where each token starts.
/// The text is normalised once, in the constructor, according to the supplied
/// <see cref="TokenizerOptions"/>; enumerating the instance (or calling
/// <see cref="IterateTokens"/>) then yields <c>(offset, token)</c> pairs.
/// </summary>
public class Tokenizer : IEnumerable<(int, string)>
{
    /// <summary>
    /// Matches the separators between tokens: punctuation at the start of a line,
    /// a run of whitespace with any punctuation touching it, punctuation at the
    /// end of a line, or a literal pipe character.
    /// </summary>
    private static readonly Regex splitter = new Regex(
        @"((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|",
        RegexOptions.Compiled | RegexOptions.Multiline
    );

    /// <summary>The normalised text that tokens (and their offsets) are taken from.</summary>
    private readonly string source;

    /// <summary>When true, every token is also written to the console as it is produced.</summary>
    public bool Verbose = false;

    /// <summary>
    /// Creates a tokenizer over <paramref name="inSource"/>, normalising it first.
    /// </summary>
    /// <param name="inSource">The text to tokenise. Must not be null.</param>
    /// <param name="options">
    /// The normalisation steps to apply. Defaults to transliteration, lower-casing
    /// and punctuation hiding.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="inSource"/> is null.</exception>
    public Tokenizer(string inSource, TokenizerOptions options = TokenizerOptions.Transliterate | TokenizerOptions.Lowercase | TokenizerOptions.HidePunctuation)
    {
        if (inSource == null)
        {
            throw new ArgumentNullException(nameof(inSource));
        }

        // Entities are decoded first so that the characters they stand for go
        // through the same normalisation as everything else. Decoding last would
        // let decoded characters skip transliteration and punctuation hiding, and
        // lower-casing an entity name before decoding can change which character
        // it refers to (entity names are case-sensitive).
        if (options.HasFlag(TokenizerOptions.DecodeHtmlEntities))
        {
            inSource = WebUtility.HtmlDecode(inSource);
        }
        if (options.HasFlag(TokenizerOptions.Transliterate))
        {
            inSource = inSource.Unidecode();
        }
        if (options.HasFlag(TokenizerOptions.Lowercase))
        {
            // Invariant culture: tokens must not depend on the machine's locale
            // (e.g. the Turkish dotless i), or indexes and queries stop matching.
            inSource = inSource.ToLowerInvariant();
        }
        if (options.HasFlag(TokenizerOptions.HidePunctuation))
        {
            // Presumably swaps each listed character for a space; see ReplaceMultiple.
            inSource = inSource.ReplaceMultiple(
                @"[]{}|/\".ToCharArray(),
                " ".ToCharArray()
            );
        }

        source = inSource;
    }

    /// <summary>
    /// Lazily yields every token in the normalised text, in order of appearance.
    /// </summary>
    /// <returns>
    /// Pairs of (zero-based character offset at which the token starts, token text).
    /// Offsets refer to the normalised text, which may differ in length from the
    /// string originally passed to the constructor.
    /// </returns>
    public IEnumerable<(int, string)> IterateTokens()
    {
        int tokenNumber = 0;
        int cursor = 0; // Start of the text that follows the previous separator
        Match separator = splitter.Match(source);

        // Walk the separators directly rather than using Regex.Split(): Split()
        // also returns the text of every capture group (twice here, as the groups
        // are nested), which makes it impossible to recover accurate offsets by
        // summing part lengths.
        while (true)
        {
            int fragmentEnd = separator.Success ? separator.Index : source.Length;

            if (fragmentEnd > cursor)
            {
                string fragment = source.Substring(cursor, fragmentEnd - cursor);

                // A fragment that is blank, or that the splitter pattern matches when
                // tested on its own, is not reported as a token.
                if (!string.IsNullOrWhiteSpace(fragment) && !splitter.IsMatch(fragment))
                {
                    if (Verbose)
                    {
                        Console.WriteLine("#{0} @ {1}: '{2}'", tokenNumber, cursor, fragment);
                    }
                    tokenNumber++;
                    yield return (cursor, fragment);
                }
            }

            if (!separator.Success)
            {
                break; // That was the text after the final separator
            }

            cursor = separator.Index + separator.Length;
            separator = separator.NextMatch();
        }
    }

    /// <summary>Returns an enumerator over the tokens produced by <see cref="IterateTokens"/>.</summary>
    public IEnumerator<(int, string)> GetEnumerator()
    {
        return IterateTokens().GetEnumerator();
    }

    /// <summary>Non-generic counterpart of <see cref="GetEnumerator"/>.</summary>
    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }
}
}