// 66 lines · 1.8 KiB · C#
using System;
|
|
using System.Collections;
|
|
using System.Collections.Generic;
|
|
using System.Net;
|
|
using System.Text.RegularExpressions;
|
|
using LibSearchBox.Utilities;
|
|
using UnidecodeSharpFork;
|
|
|
|
namespace LibSearchBox
|
|
{
|
|
/// <summary>
/// Normalisation steps that a <see cref="Tokenizer"/> applies to its input
/// before splitting it into tokens. Values are bit flags and may be combined.
/// </summary>
[Flags]
public enum TokenizerOptions
{
    /// <summary>Apply no normalisation at all.</summary>
    None = 0,

    /// <summary>Convert the input to lower case.</summary>
    Lowercase = 1 << 0,

    /// <summary>Pass the input through <c>Unidecode()</c>.</summary>
    Transliterate = 1 << 1,

    /// <summary>Replace the characters <c>[ ] { } | / \</c> in the input.</summary>
    HidePunctuation = 1 << 2,

    /// <summary>Decode HTML entities in the input.</summary>
    DecodeHtmlEntities = 1 << 3
}
|
|
/// <summary>
/// Normalises a string according to a set of <see cref="TokenizerOptions"/>
/// and then enumerates the tokens found in it, each paired with its offset
/// in the normalised text.
/// </summary>
public class Tokenizer : IEnumerable<(int, string)>
{
    /// <summary>
    /// Matches a token separator: punctuation at the start of a line, a run of
    /// whitespace with any punctuation touching it, punctuation at the end of a
    /// line, or a single pipe character.
    /// </summary>
    private static readonly Regex splitter = new Regex(
        @"((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|",
        RegexOptions.Compiled | RegexOptions.Multiline
    );

    /// <summary>The normalised text that tokens are extracted from.</summary>
    private readonly string source;

    /// <summary>
    /// When true, <see cref="IterateTokens"/> writes one line to the console
    /// for every token it yields.
    /// </summary>
    public bool Verbose = false;

    /// <summary>
    /// Creates a tokenizer over <paramref name="inSource"/>, applying the
    /// normalisation steps selected by <paramref name="options"/>.
    /// </summary>
    /// <param name="inSource">The text to tokenize.</param>
    /// <param name="options">The normalisation steps to apply before splitting.</param>
    /// <exception cref="ArgumentNullException"><paramref name="inSource"/> is null.</exception>
    public Tokenizer(string inSource, TokenizerOptions options = TokenizerOptions.Transliterate | TokenizerOptions.Lowercase | TokenizerOptions.HidePunctuation)
    {
        if (inSource == null)
            throw new ArgumentNullException(nameof(inSource));

        // Entities are decoded first: entity names are case-sensitive, so lower-casing
        // beforehand could change what they decode to, and decoding afterwards would let
        // the decoded characters bypass every other normalisation step.
        if (options.HasFlag(TokenizerOptions.DecodeHtmlEntities))
            inSource = WebUtility.HtmlDecode(inSource);

        if (options.HasFlag(TokenizerOptions.Transliterate))
            inSource = inSource.Unidecode();

        // Invariant casing keeps the result independent of the current culture
        // (e.g. the Turkish dotless i).
        if (options.HasFlag(TokenizerOptions.Lowercase))
            inSource = inSource.ToLowerInvariant();

        // NOTE(review): presumably ReplaceMultiple swaps each listed character
        // for a space - confirm against LibSearchBox.Utilities.
        if (options.HasFlag(TokenizerOptions.HidePunctuation))
        {
            inSource = inSource.ReplaceMultiple(
                @"[]{}|/\".ToCharArray(),
                " ".ToCharArray()
            );
        }

        source = inSource;
    }

    /// <summary>
    /// Lazily walks the normalised text and yields every token in it.
    /// </summary>
    /// <returns>
    /// A sequence of (offset, token) pairs, where offset is the zero-based
    /// position of the token's first character in the normalised text.
    /// </returns>
    public IEnumerable<(int, string)> IterateTokens()
    {
        int pieceNumber = 0;
        int pieceStart = 0;
        Match separator = splitter.Match(source);

        // Walk the separator matches directly instead of using Regex.Split: Split also
        // returns the text of every capture group, which made offsets drift.
        while (true)
        {
            // The current piece runs up to the next separator, or to the end of the text.
            int pieceEnd = separator.Success ? separator.Index : source.Length;
            string piece = source.Substring(pieceStart, pieceEnd - pieceStart);

            // Pieces that are blank, or that match the separator pattern when looked at
            // in isolation, are dropped so the set of emitted tokens stays as it was.
            bool isToken = !string.IsNullOrWhiteSpace(piece) && !splitter.IsMatch(piece);
            if (isToken)
            {
                if (Verbose)
                    Console.WriteLine("#{0} @ {1}: '{2}'", pieceNumber, pieceStart, piece);

                yield return (pieceStart, piece);
            }

            if (!separator.Success)
                yield break;

            pieceNumber++;
            pieceStart = separator.Index + separator.Length;
            separator = separator.NextMatch();
        }
    }

    /// <summary>Returns an enumerator over the tokens produced by <see cref="IterateTokens"/>.</summary>
    public IEnumerator<(int, string)> GetEnumerator() => IterateTokens().GetEnumerator();

    /// <summary>Non-generic counterpart of <see cref="GetEnumerator"/>.</summary>
    IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
}
|
|
}
|