Add indexing.
This commit is contained in:
parent
0ef3b5958d
commit
e8069c93ff
14 changed files with 807 additions and 20 deletions
15
SearchBox-CLI/Program.cs
Normal file
15
SearchBox-CLI/Program.cs
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
using System;
|
||||||
|
using SearchBox;
|
||||||
|
|
||||||
|
namespace SearchBoxCLI
|
||||||
|
{
|
||||||
|
/// <summary>
/// Command-line front end: indexes whatever text arrives on standard input
/// and prints the resulting index to standard output.
/// </summary>
class MainClass
{
    /// <summary>
    /// Program entry point.
    /// </summary>
    /// <param name="args">Command-line arguments (not read by this method).</param>
    public static void Main(string[] args)
    {
        // Slurp all of stdin in one go, then hand it straight to the indexer.
        Index built = new Index(Console.In.ReadToEnd());

        // Console.WriteLine(object) renders the index via its ToString() override.
        Console.WriteLine(built);
    }
}
|
||||||
|
}
|
26
SearchBox-CLI/Properties/AssemblyInfo.cs
Normal file
26
SearchBox-CLI/Properties/AssemblyInfo.cs
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
using System.Reflection;
|
||||||
|
using System.Runtime.CompilerServices;
|
||||||
|
|
||||||
|
// Information about this assembly is defined by the following attributes.
|
||||||
|
// Change them to the values specific to your project.
|
||||||
|
|
||||||
|
[assembly: AssemblyTitle("SearchBox-CLI")]
|
||||||
|
[assembly: AssemblyDescription("")]
|
||||||
|
[assembly: AssemblyConfiguration("")]
|
||||||
|
[assembly: AssemblyCompany("")]
|
||||||
|
[assembly: AssemblyProduct("")]
|
||||||
|
[assembly: AssemblyCopyright("sbrl")]
|
||||||
|
[assembly: AssemblyTrademark("")]
|
||||||
|
[assembly: AssemblyCulture("")]
|
||||||
|
|
||||||
|
// The assembly version has the format "{Major}.{Minor}.{Build}.{Revision}".
|
||||||
|
// The form "{Major}.{Minor}.*" will automatically update the build and revision,
|
||||||
|
// and "{Major}.{Minor}.{Build}.*" will update just the revision.
|
||||||
|
|
||||||
|
[assembly: AssemblyVersion("1.0.*")]
|
||||||
|
|
||||||
|
// The following attributes are used to specify the signing key for the assembly,
|
||||||
|
// if desired. See the Mono documentation for more information about signing.
|
||||||
|
|
||||||
|
//[assembly: AssemblyDelaySign(false)]
|
||||||
|
//[assembly: AssemblyKeyFile("")]
|
45
SearchBox-CLI/SearchBox-CLI.csproj
Normal file
45
SearchBox-CLI/SearchBox-CLI.csproj
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
|
<PropertyGroup>
|
||||||
|
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
|
||||||
|
<Platform Condition=" '$(Platform)' == '' ">x86</Platform>
|
||||||
|
<ProjectGuid>{1E2F5559-C918-4A5D-9C24-117F204708A4}</ProjectGuid>
|
||||||
|
<OutputType>Exe</OutputType>
|
||||||
|
<RootNamespace>SearchBoxCLI</RootNamespace>
|
||||||
|
<AssemblyName>SearchBox-CLI</AssemblyName>
|
||||||
|
<TargetFrameworkVersion>v4.7</TargetFrameworkVersion>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|x86' ">
|
||||||
|
<DebugSymbols>true</DebugSymbols>
|
||||||
|
<DebugType>full</DebugType>
|
||||||
|
<Optimize>false</Optimize>
|
||||||
|
<OutputPath>bin\Debug</OutputPath>
|
||||||
|
<DefineConstants>DEBUG;</DefineConstants>
|
||||||
|
<ErrorReport>prompt</ErrorReport>
|
||||||
|
<WarningLevel>4</WarningLevel>
|
||||||
|
<ExternalConsole>true</ExternalConsole>
|
||||||
|
<PlatformTarget>x86</PlatformTarget>
|
||||||
|
</PropertyGroup>
|
||||||
|
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x86' ">
|
||||||
|
<Optimize>true</Optimize>
|
||||||
|
<OutputPath>bin\Release</OutputPath>
|
||||||
|
<ErrorReport>prompt</ErrorReport>
|
||||||
|
<WarningLevel>4</WarningLevel>
|
||||||
|
<ExternalConsole>true</ExternalConsole>
|
||||||
|
<PlatformTarget>x86</PlatformTarget>
|
||||||
|
</PropertyGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<Reference Include="System" />
|
||||||
|
</ItemGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<Compile Include="Program.cs" />
|
||||||
|
<Compile Include="Properties\AssemblyInfo.cs" />
|
||||||
|
</ItemGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<ProjectReference Include="..\SearchBox\SearchBox.csproj">
|
||||||
|
<Project>{5243F60A-F822-4C52-A333-E4089754EC6A}</Project>
|
||||||
|
<Name>SearchBox</Name>
|
||||||
|
</ProjectReference>
|
||||||
|
</ItemGroup>
|
||||||
|
<Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
|
||||||
|
</Project>
|
|
@ -1,7 +1,9 @@
|
||||||
|
|
||||||
Microsoft Visual Studio Solution File, Format Version 12.00
|
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||||
# Visual Studio 2012
|
# Visual Studio 2012
|
||||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SearchBox", "SearchBox\SearchBox.csproj", "{6E470E38-B239-477F-BF65-D8522292D8C5}"
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SearchBox", "SearchBox\SearchBox.csproj", "{5243F60A-F822-4C52-A333-E4089754EC6A}"
|
||||||
|
EndProject
|
||||||
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SearchBox-CLI", "SearchBox-CLI\SearchBox-CLI.csproj", "{1E2F5559-C918-4A5D-9C24-117F204708A4}"
|
||||||
EndProject
|
EndProject
|
||||||
Global
|
Global
|
||||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
@ -9,9 +11,13 @@ Global
|
||||||
Release|x86 = Release|x86
|
Release|x86 = Release|x86
|
||||||
EndGlobalSection
|
EndGlobalSection
|
||||||
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
{6E470E38-B239-477F-BF65-D8522292D8C5}.Debug|x86.ActiveCfg = Debug|x86
|
{5243F60A-F822-4C52-A333-E4089754EC6A}.Debug|x86.ActiveCfg = Debug|x86
|
||||||
{6E470E38-B239-477F-BF65-D8522292D8C5}.Debug|x86.Build.0 = Debug|x86
|
{5243F60A-F822-4C52-A333-E4089754EC6A}.Debug|x86.Build.0 = Debug|x86
|
||||||
{6E470E38-B239-477F-BF65-D8522292D8C5}.Release|x86.ActiveCfg = Release|x86
|
{5243F60A-F822-4C52-A333-E4089754EC6A}.Release|x86.ActiveCfg = Release|x86
|
||||||
{6E470E38-B239-477F-BF65-D8522292D8C5}.Release|x86.Build.0 = Release|x86
|
{5243F60A-F822-4C52-A333-E4089754EC6A}.Release|x86.Build.0 = Release|x86
|
||||||
|
{1E2F5559-C918-4A5D-9C24-117F204708A4}.Debug|x86.ActiveCfg = Debug|x86
|
||||||
|
{1E2F5559-C918-4A5D-9C24-117F204708A4}.Debug|x86.Build.0 = Debug|x86
|
||||||
|
{1E2F5559-C918-4A5D-9C24-117F204708A4}.Release|x86.ActiveCfg = Release|x86
|
||||||
|
{1E2F5559-C918-4A5D-9C24-117F204708A4}.Release|x86.Build.0 = Release|x86
|
||||||
EndGlobalSection
|
EndGlobalSection
|
||||||
EndGlobal
|
EndGlobal
|
||||||
|
|
311
SearchBox/EmbeddedFiles/Stopwords.txt
Normal file
311
SearchBox/EmbeddedFiles/Stopwords.txt
Normal file
|
@ -0,0 +1,311 @@
|
||||||
|
a
|
||||||
|
about
|
||||||
|
above
|
||||||
|
above
|
||||||
|
across
|
||||||
|
after
|
||||||
|
afterwards
|
||||||
|
again
|
||||||
|
against
|
||||||
|
all
|
||||||
|
almost
|
||||||
|
alone
|
||||||
|
along
|
||||||
|
already
|
||||||
|
also
|
||||||
|
although
|
||||||
|
always
|
||||||
|
am
|
||||||
|
among
|
||||||
|
amongst
|
||||||
|
amoungst
|
||||||
|
amount
|
||||||
|
an
|
||||||
|
and
|
||||||
|
another
|
||||||
|
any
|
||||||
|
anyhow
|
||||||
|
anyone
|
||||||
|
anything
|
||||||
|
anyway
|
||||||
|
anywhere
|
||||||
|
are
|
||||||
|
around
|
||||||
|
as
|
||||||
|
at
|
||||||
|
back
|
||||||
|
be
|
||||||
|
became
|
||||||
|
because
|
||||||
|
become
|
||||||
|
becomes
|
||||||
|
becoming
|
||||||
|
been
|
||||||
|
before
|
||||||
|
beforehand
|
||||||
|
behind
|
||||||
|
being
|
||||||
|
below
|
||||||
|
beside
|
||||||
|
besides
|
||||||
|
between
|
||||||
|
beyond
|
||||||
|
bill
|
||||||
|
both
|
||||||
|
bottom
|
||||||
|
but
|
||||||
|
by
|
||||||
|
call
|
||||||
|
can
|
||||||
|
cannot
|
||||||
|
cant
|
||||||
|
co
|
||||||
|
con
|
||||||
|
could
|
||||||
|
couldnt
|
||||||
|
cry
|
||||||
|
de
|
||||||
|
describe
|
||||||
|
detail
|
||||||
|
do
|
||||||
|
done
|
||||||
|
down
|
||||||
|
due
|
||||||
|
during
|
||||||
|
each
|
||||||
|
eg
|
||||||
|
eight
|
||||||
|
either
|
||||||
|
eleven
|
||||||
|
else
|
||||||
|
elsewhere
|
||||||
|
empty
|
||||||
|
enough
|
||||||
|
etc
|
||||||
|
even
|
||||||
|
ever
|
||||||
|
every
|
||||||
|
everyone
|
||||||
|
everything
|
||||||
|
everywhere
|
||||||
|
except
|
||||||
|
few
|
||||||
|
fill
|
||||||
|
find
|
||||||
|
fire
|
||||||
|
first
|
||||||
|
five
|
||||||
|
for
|
||||||
|
former
|
||||||
|
formerly
|
||||||
|
found
|
||||||
|
four
|
||||||
|
from
|
||||||
|
front
|
||||||
|
full
|
||||||
|
further
|
||||||
|
get
|
||||||
|
give
|
||||||
|
go
|
||||||
|
had
|
||||||
|
has
|
||||||
|
hasnt
|
||||||
|
have
|
||||||
|
he
|
||||||
|
hence
|
||||||
|
her
|
||||||
|
here
|
||||||
|
hereafter
|
||||||
|
hereby
|
||||||
|
herein
|
||||||
|
hereupon
|
||||||
|
hers
|
||||||
|
herself
|
||||||
|
him
|
||||||
|
himself
|
||||||
|
his
|
||||||
|
how
|
||||||
|
however
|
||||||
|
ie
|
||||||
|
if
|
||||||
|
in
|
||||||
|
inc
|
||||||
|
indeed
|
||||||
|
interest
|
||||||
|
into
|
||||||
|
is
|
||||||
|
it
|
||||||
|
its
|
||||||
|
itself
|
||||||
|
keep
|
||||||
|
last
|
||||||
|
latter
|
||||||
|
latterly
|
||||||
|
least
|
||||||
|
less
|
||||||
|
ltd
|
||||||
|
made
|
||||||
|
many
|
||||||
|
may
|
||||||
|
me
|
||||||
|
meanwhile
|
||||||
|
might
|
||||||
|
mine
|
||||||
|
more
|
||||||
|
moreover
|
||||||
|
most
|
||||||
|
mostly
|
||||||
|
move
|
||||||
|
much
|
||||||
|
must
|
||||||
|
my
|
||||||
|
myself
|
||||||
|
name
|
||||||
|
namely
|
||||||
|
neither
|
||||||
|
never
|
||||||
|
nevertheless
|
||||||
|
next
|
||||||
|
nine
|
||||||
|
no
|
||||||
|
none
|
||||||
|
nor
|
||||||
|
not
|
||||||
|
nothing
|
||||||
|
now
|
||||||
|
nowhere
|
||||||
|
of
|
||||||
|
off
|
||||||
|
often
|
||||||
|
on
|
||||||
|
once
|
||||||
|
one
|
||||||
|
only
|
||||||
|
onto
|
||||||
|
or
|
||||||
|
other
|
||||||
|
others
|
||||||
|
otherwise
|
||||||
|
our
|
||||||
|
ours
|
||||||
|
ourselves
|
||||||
|
out
|
||||||
|
over
|
||||||
|
own
|
||||||
|
part
|
||||||
|
per
|
||||||
|
perhaps
|
||||||
|
please
|
||||||
|
put
|
||||||
|
rather
|
||||||
|
re
|
||||||
|
same
|
||||||
|
see
|
||||||
|
seem
|
||||||
|
seemed
|
||||||
|
seeming
|
||||||
|
seems
|
||||||
|
serious
|
||||||
|
several
|
||||||
|
she
|
||||||
|
should
|
||||||
|
show
|
||||||
|
side
|
||||||
|
since
|
||||||
|
sincere
|
||||||
|
six
|
||||||
|
sixty
|
||||||
|
so
|
||||||
|
some
|
||||||
|
somehow
|
||||||
|
someone
|
||||||
|
something
|
||||||
|
sometime
|
||||||
|
sometimes
|
||||||
|
somewhere
|
||||||
|
still
|
||||||
|
such
|
||||||
|
system
|
||||||
|
take
|
||||||
|
ten
|
||||||
|
than
|
||||||
|
that
|
||||||
|
the
|
||||||
|
their
|
||||||
|
them
|
||||||
|
themselves
|
||||||
|
then
|
||||||
|
thence
|
||||||
|
there
|
||||||
|
thereafter
|
||||||
|
thereby
|
||||||
|
therefore
|
||||||
|
therein
|
||||||
|
thereupon
|
||||||
|
these
|
||||||
|
they
|
||||||
|
thick
|
||||||
|
thin
|
||||||
|
third
|
||||||
|
this
|
||||||
|
those
|
||||||
|
though
|
||||||
|
three
|
||||||
|
through
|
||||||
|
throughout
|
||||||
|
thru
|
||||||
|
thus
|
||||||
|
to
|
||||||
|
together
|
||||||
|
too
|
||||||
|
top
|
||||||
|
toward
|
||||||
|
towards
|
||||||
|
twelve
|
||||||
|
twenty
|
||||||
|
two
|
||||||
|
un
|
||||||
|
under
|
||||||
|
until
|
||||||
|
up
|
||||||
|
upon
|
||||||
|
us
|
||||||
|
very
|
||||||
|
via
|
||||||
|
was
|
||||||
|
we
|
||||||
|
well
|
||||||
|
were
|
||||||
|
what
|
||||||
|
whatever
|
||||||
|
when
|
||||||
|
whence
|
||||||
|
whenever
|
||||||
|
where
|
||||||
|
whereafter
|
||||||
|
whereas
|
||||||
|
whereby
|
||||||
|
wherein
|
||||||
|
whereupon
|
||||||
|
wherever
|
||||||
|
whether
|
||||||
|
which
|
||||||
|
while
|
||||||
|
whither
|
||||||
|
who
|
||||||
|
whoever
|
||||||
|
whole
|
||||||
|
whom
|
||||||
|
whose
|
||||||
|
why
|
||||||
|
will
|
||||||
|
with
|
||||||
|
within
|
||||||
|
without
|
||||||
|
would
|
||||||
|
yet
|
||||||
|
you
|
||||||
|
your
|
||||||
|
yours
|
||||||
|
yourself
|
||||||
|
yourselves
|
71
SearchBox/Index.cs
Normal file
71
SearchBox/Index.cs
Normal file
|
@ -0,0 +1,71 @@
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.IO;
|
||||||
|
using System.Text;
|
||||||
|
using SBRL.Utilities;
|
||||||
|
|
||||||
|
namespace SearchBox
|
||||||
|
{
|
||||||
|
/// <summary>
/// Bit flags that control how an <see cref="Index"/> is built.
/// </summary>
[Flags]
public enum IndexOptions
{
    /// <summary>Leave tokens that appear in the stopword list out of the index.</summary>
    ExcludeStopwords = 1 << 0
}
|
||||||
|
|
||||||
|
/// <summary>
/// An inverted index over a single piece of text: every token produced by the
/// <see cref="Tokenizer"/> is mapped to the list of offsets the tokenizer reported for it.
/// </summary>
public class Index
{
    /// <summary>Token → offsets at which that token was reported.</summary>
    private readonly Dictionary<string, List<int>> index = new Dictionary<string, List<int>>();

    /// <summary>
    /// Stopword filter. Remains null when <see cref="IndexOptions.ExcludeStopwords"/>
    /// was not requested, in which case no token is filtered out.
    /// </summary>
    private readonly StopwordTester stopwordTester;

    /// <summary>
    /// Builds an index of the given text.
    /// </summary>
    /// <param name="inSource">The text to tokenize and index.</param>
    /// <param name="stopwords">
    /// The words to leave out of the index. Only enumerated when
    /// <paramref name="options"/> contains <see cref="IndexOptions.ExcludeStopwords"/>.
    /// </param>
    /// <param name="options">Flags controlling how the index is built.</param>
    public Index(string inSource, IEnumerable<string> stopwords, IndexOptions options)
    {
        if (options.HasFlag(IndexOptions.ExcludeStopwords))
            stopwordTester = new StopwordTester(stopwords);

        // Tokenize the input and file it in our index
        Tokenizer tokenizer = new Tokenizer(inSource);
        foreach (Tuple<int, string> token in tokenizer) {
            // The tester only exists when stopword exclusion is enabled; without
            // this null check, building an index with that flag cleared crashed.
            if (stopwordTester != null && stopwordTester.IsStopword(token.Item2)) continue;
            insert(token.Item2, token.Item1);
        }
    }

    /// <summary>
    /// Builds an index of the given text using the stopword list embedded in this assembly.
    /// </summary>
    /// <param name="inSource">The text to tokenize and index.</param>
    /// <param name="options">Flags controlling how the index is built.</param>
    public Index(string inSource, IndexOptions options)
        : this(inSource, EmbeddedFiles.EnumerateLines("SearchBox.EmbeddedFiles.Stopwords.txt"), options)
    {
    }

    /// <summary>
    /// Builds an index of the given text, excluding the embedded stopwords.
    /// </summary>
    /// <param name="inSource">The text to tokenize and index.</param>
    public Index(string inSource) : this(inSource, IndexOptions.ExcludeStopwords)
    {
    }

    /// <summary>
    /// Gets the offsets recorded for the given token.
    /// </summary>
    /// <param name="key">The token to look up, exactly as the tokenizer emitted it.</param>
    /// <exception cref="KeyNotFoundException">The token is not present in the index.</exception>
    public List<int> this[string key] {
        get {
            return index[key];
        }
    }

    /// <summary>
    /// Records one occurrence of a token.
    /// </summary>
    /// <param name="token">The token to file.</param>
    /// <param name="offset">The offset the tokenizer reported for this occurrence.</param>
    protected void insert(string token, int offset)
    {
        // Single lookup: fetch the existing list, or create and register a new one.
        List<int> offsets;
        if (!index.TryGetValue(token, out offsets)) {
            offsets = new List<int>();
            index.Add(token, offsets);
        }
        offsets.Add(offset);
    }

    /// <summary>
    /// Renders the index as text: a header line followed by one tab-indented
    /// "token: offset, offset, ..." line per token.
    /// </summary>
    public override string ToString()
    {
        StringBuilder result = new StringBuilder("Index: \n");
        foreach (KeyValuePair<string, List<int>> item in index)
            result.AppendLine($"\t{item.Key}: {string.Join(", ", item.Value)}");
        return result.ToString();
    }

    // --------------------------------------------------------------------------------------

    /// <summary>
    /// Builds an index from the entire contents of a file on disk.
    /// </summary>
    /// <param name="filename">Path of the file to read and index.</param>
    /// <returns>The index of the file's text, built with the default options.</returns>
    public static Index FromFile(string filename)
    {
        return new Index(File.ReadAllText(filename));
    }
}
|
||||||
|
}
|
|
@ -1,12 +0,0 @@
|
||||||
using System;
|
|
||||||
|
|
||||||
namespace SearchBox
|
|
||||||
{
|
|
||||||
class MainClass
|
|
||||||
{
|
|
||||||
public static void Main(string[] args)
|
|
||||||
{
|
|
||||||
Console.WriteLine("Hello World!");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
10
SearchBox/SearchBox.cs
Normal file
10
SearchBox/SearchBox.cs
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
using System;
|
||||||
|
namespace SearchBox
|
||||||
|
{
|
||||||
|
/// <summary>
/// The SearchBox type. At present it declares nothing beyond a parameterless constructor.
/// </summary>
public class SearchBox
{
    /// <summary>
    /// Creates a new <see cref="SearchBox"/> instance.
    /// </summary>
    public SearchBox() { }
}
|
||||||
|
}
|
|
@ -3,8 +3,8 @@
|
||||||
<PropertyGroup>
|
<PropertyGroup>
|
||||||
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
|
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
|
||||||
<Platform Condition=" '$(Platform)' == '' ">x86</Platform>
|
<Platform Condition=" '$(Platform)' == '' ">x86</Platform>
|
||||||
<ProjectGuid>{6E470E38-B239-477F-BF65-D8522292D8C5}</ProjectGuid>
|
<ProjectGuid>{5243F60A-F822-4C52-A333-E4089754EC6A}</ProjectGuid>
|
||||||
<OutputType>Exe</OutputType>
|
<OutputType>Library</OutputType>
|
||||||
<RootNamespace>SearchBox</RootNamespace>
|
<RootNamespace>SearchBox</RootNamespace>
|
||||||
<AssemblyName>SearchBox</AssemblyName>
|
<AssemblyName>SearchBox</AssemblyName>
|
||||||
<TargetFrameworkVersion>v4.7</TargetFrameworkVersion>
|
<TargetFrameworkVersion>v4.7</TargetFrameworkVersion>
|
||||||
|
@ -30,10 +30,28 @@
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<Reference Include="System" />
|
<Reference Include="System" />
|
||||||
|
<Reference Include="UnidecodeSharpFork">
|
||||||
|
<HintPath>..\packages\UnidecodeSharpFork.1.0.0\lib\UnidecodeSharpFork.dll</HintPath>
|
||||||
|
</Reference>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<Compile Include="Program.cs" />
|
|
||||||
<Compile Include="Properties\AssemblyInfo.cs" />
|
<Compile Include="Properties\AssemblyInfo.cs" />
|
||||||
|
<Compile Include="Utilities\EmbeddedFiles.cs" />
|
||||||
|
<Compile Include="SearchBox.cs" />
|
||||||
|
<Compile Include="Tokenizer.cs" />
|
||||||
|
<Compile Include="Index.cs" />
|
||||||
|
<Compile Include="Utilities\StringPlus.cs" />
|
||||||
|
<Compile Include="StopwordTester.cs" />
|
||||||
|
</ItemGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<Folder Include="EmbeddedFiles\" />
|
||||||
|
<Folder Include="Utilities\" />
|
||||||
|
</ItemGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<None Include="packages.config" />
|
||||||
|
</ItemGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<EmbeddedResource Include="EmbeddedFiles\Stopwords.txt" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
|
<Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
|
||||||
</Project>
|
</Project>
|
21
SearchBox/StopwordTester.cs
Normal file
21
SearchBox/StopwordTester.cs
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
|
||||||
|
namespace SearchBox
|
||||||
|
{
|
||||||
|
/// <summary>
/// Answers "is this word a stopword?" against a fixed set of words.
/// </summary>
public class StopwordTester
{
    /// <summary>The stopwords, held in a hash set so each membership test is a single lookup.</summary>
    private readonly HashSet<string> words = new HashSet<string>();

    /// <summary>
    /// Creates a tester from a list of stopwords.
    /// </summary>
    /// <param name="inWords">
    /// The stopwords. Surrounding whitespace is trimmed from each entry, and
    /// null, empty or whitespace-only entries are ignored; duplicates are harmless.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="inWords"/> is null.</exception>
    public StopwordTester(IEnumerable<string> inWords)
    {
        if (inWords == null)
            throw new ArgumentNullException(nameof(inWords));

        foreach (string word in inWords) {
            // Word lists usually come from a text file, where blank lines and
            // stray trailing spaces would otherwise become entries that never match.
            if (string.IsNullOrWhiteSpace(word)) continue;
            words.Add(word.Trim());
        }
    }

    /// <summary>
    /// Tests whether a word is in the stopword set. The comparison is exact (case-sensitive).
    /// </summary>
    /// <param name="word">The word to test.</param>
    /// <returns>true if <paramref name="word"/> is a stopword; otherwise false.</returns>
    public bool IsStopword(string word)
    {
        return words.Contains(word);
    }
}
|
||||||
|
}
|
65
SearchBox/Tokenizer.cs
Normal file
65
SearchBox/Tokenizer.cs
Normal file
|
@ -0,0 +1,65 @@
|
||||||
|
using System;
|
||||||
|
using System.Collections;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Net;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
|
using SearchBox.Utilities;
|
||||||
|
using UnidecodeSharpFork;
|
||||||
|
|
||||||
|
namespace SearchBox
|
||||||
|
{
|
||||||
|
/// <summary>
/// Bit flags selecting which normalisation passes a <see cref="Tokenizer"/>
/// applies to its input before splitting it into tokens.
/// </summary>
[Flags]
public enum TokenizerOptions
{
    /// <summary>Convert the text to lower case.</summary>
    Lowercase = 1 << 0,

    /// <summary>Run the text through Unidecode transliteration.</summary>
    Transliterate = 1 << 1,

    /// <summary>Replace the characters <c>[ ] { } | / \</c> with spaces.</summary>
    HidePunctuation = 1 << 2,

    /// <summary>Decode HTML entities in the text.</summary>
    DecodeHtmlEntities = 1 << 3
}
|
||||||
|
/// <summary>
/// Normalises a piece of text and splits it into tokens. Enumerating the tokenizer
/// yields (offset, token) pairs, where the offset is the index of the token's first
/// character within the normalised text.
/// </summary>
public class Tokenizer : IEnumerable<Tuple<int, string>>
{
    /// <summary>
    /// Matches the separators between tokens: punctuation at the start of a line,
    /// whitespace with any punctuation touching it, punctuation at the end of a
    /// line, or a literal pipe character.
    /// </summary>
    private static readonly Regex splitter = new Regex(
        @"((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|",
        RegexOptions.Compiled | RegexOptions.Multiline
    );

    /// <summary>The text to tokenize, after the requested normalisation passes.</summary>
    private readonly string source;

    /// <summary>When true, every token is also written to the console as it is produced.</summary>
    public bool Verbose = false;

    /// <summary>
    /// Creates a tokenizer over the given text.
    /// </summary>
    /// <param name="inSource">The text to tokenize.</param>
    /// <param name="options">The normalisation passes to apply before splitting.</param>
    /// <exception cref="ArgumentNullException"><paramref name="inSource"/> is null.</exception>
    public Tokenizer(string inSource, TokenizerOptions options = TokenizerOptions.Transliterate | TokenizerOptions.Lowercase | TokenizerOptions.HidePunctuation)
    {
        if (inSource == null)
            throw new ArgumentNullException(nameof(inSource));

        // Entities are decoded first so the characters they expand to pass through
        // the same normalisation as the rest of the text. Entity names are also
        // case-sensitive, so they must be resolved before lowercasing.
        if (options.HasFlag(TokenizerOptions.DecodeHtmlEntities)) inSource = WebUtility.HtmlDecode(inSource);
        if (options.HasFlag(TokenizerOptions.Transliterate)) inSource = inSource.Unidecode();
        // Invariant casing keeps tokens identical regardless of the machine's culture.
        if (options.HasFlag(TokenizerOptions.Lowercase)) inSource = inSource.ToLowerInvariant();
        if (options.HasFlag(TokenizerOptions.HidePunctuation)) inSource = inSource.ReplaceMultiple(
            @"[]{}|/\".ToCharArray(),
            " ".ToCharArray()
        );

        source = inSource;
    }

    /// <summary>
    /// Lazily splits the normalised text into tokens.
    /// </summary>
    /// <returns>
    /// A sequence of (offset, token) pairs in document order. Offsets index into the
    /// normalised text, which may differ in length from the text originally supplied.
    /// </returns>
    public IEnumerable<Tuple<int, string>> IterateTokens()
    {
        // Walk the separator matches and take the text between them. Regex.Split is
        // avoided because it also returns the captured separators (more than once
        // when groups are nested), which makes running offsets impossible to keep right.
        MatchCollection separators = splitter.Matches(source);
        int candidateStart = 0;

        // The extra final pass (i == separators.Count) handles the text after the last separator.
        for (int i = 0; i <= separators.Count; i++) {
            int candidateEnd = i < separators.Count ? separators[i].Index : source.Length;
            int offset = candidateStart;
            string candidate = source.Substring(offset, candidateEnd - offset);

            if (i < separators.Count)
                candidateStart = separators[i].Index + separators[i].Length;

            // Skip empty gaps, and fragments that are themselves separator material
            // when looked at in isolation.
            if (string.IsNullOrWhiteSpace(candidate) || splitter.IsMatch(candidate)) continue;

            if (Verbose) Console.WriteLine("#{0} @ {1}: '{2}'", i, offset, candidate);

            yield return new Tuple<int, string>(offset, candidate);
        }
    }

    /// <summary>
    /// Returns an enumerator over the (offset, token) pairs of the text.
    /// </summary>
    public IEnumerator<Tuple<int, string>> GetEnumerator()
    {
        return IterateTokens().GetEnumerator();
    }

    /// <summary>
    /// Non-generic counterpart of <see cref="GetEnumerator"/>.
    /// </summary>
    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }
}
|
||||||
|
}
|
189
SearchBox/Utilities/EmbeddedFiles.cs
Normal file
189
SearchBox/Utilities/EmbeddedFiles.cs
Normal file
|
@ -0,0 +1,189 @@
|
||||||
|
using System;
using System.Collections.Generic;
using System.IO;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Threading.Tasks;
|
||||||
|
|
||||||
|
namespace SBRL.Utilities
|
||||||
|
{
|
||||||
|
/// <summary>
|
||||||
|
/// A collection of static methods for manipulating embedded resources.
|
||||||
|
/// </summary>
|
||||||
|
/// <description>
|
||||||
|
/// v0.5, by Starbeamrainbowlabs <feedback@starbeamrainbowlabs.com>
|
||||||
|
/// Last updated 8th August 2016.
|
||||||
|
/// Licensed under MPL-2.0.
|
||||||
|
///
|
||||||
|
/// Changelog:
|
||||||
|
/// v0.1 (25th July 2016):
|
||||||
|
/// - Initial release.
|
||||||
|
/// v0.2 (8th August 2016):
|
||||||
|
/// - Changed namespace.
|
||||||
|
/// v0.3 (21st January 2017):
|
||||||
|
/// - Added GetRawReader().
|
||||||
|
/// v0.4 (8th April 2017):
|
||||||
|
/// - Removed unnecessary using statement.
|
||||||
|
/// v0.5 (3rd September 2017):
|
||||||
|
/// - Changed namespace
|
||||||
|
/// </description>
|
||||||
|
/// <summary>
/// A collection of static methods for manipulating embedded resources.
/// </summary>
/// <remarks>
/// v0.5, by Starbeamrainbowlabs &lt;feedback@starbeamrainbowlabs.com&gt;
/// Last updated 3rd September 2017.
/// Licensed under MPL-2.0.
///
/// Changelog:
/// v0.1 (25th July 2016):
///     - Initial release.
/// v0.2 (8th August 2016):
///     - Changed namespace.
/// v0.3 (21st January 2017):
///     - Added GetRawReader().
/// v0.4 (8th April 2017):
///     - Removed unnecessary using statement.
/// v0.5 (3rd September 2017):
///     - Changed namespace
///
/// Implementation note: every public member resolves "the calling assembly" exactly
/// once, synchronously, at its own entry point, and passes it to a private helper.
/// Assembly.GetCallingAssembly() inspects the call stack, so evaluating it inside an
/// async method or an iterator (whose body runs from a compiler-generated MoveNext),
/// or inside a helper called by another helper, can return the wrong assembly.
/// The entry points are marked NoInlining for the same reason.
/// </remarks>
public static class EmbeddedFiles
{
    /// <summary>
    /// An array of the filenames of all the resources embedded in the calling assembly.
    /// </summary>
    /// <value>The resource list.</value>
    public static string[] ResourceList {
        [MethodImpl(MethodImplOptions.NoInlining)]
        get {
            return Assembly.GetCallingAssembly().GetManifestResourceNames();
        }
    }

    /// <summary>
    /// Builds a human-readable list of the resources embedded in the calling assembly.
    /// </summary>
    /// <returns>A header line followed by one line per embedded resource.</returns>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static string GetResourceListText()
    {
        return BuildResourceListText(Assembly.GetCallingAssembly());
    }

    /// <summary>
    /// Writes a list of embedded resources to the Console's standard output.
    /// </summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static void WriteResourceList()
    {
        Console.WriteLine(BuildResourceListText(Assembly.GetCallingAssembly()));
    }

    /// <summary>
    /// Gets a StreamReader attached to the specified embedded resource.
    /// </summary>
    /// <param name="filename">The filename of the embedded resource to get a StreamReader of.</param>
    /// <returns>A StreamReader attached to the specified embedded resource. The caller owns it and must dispose it.</returns>
    /// <exception cref="FileNotFoundException">The calling assembly has no such embedded resource.</exception>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static StreamReader GetReader(string filename)
    {
        return new StreamReader(OpenResource(Assembly.GetCallingAssembly(), filename));
    }

    /// <summary>
    /// Gets a raw Stream that's attached to the specified embedded resource.
    /// Useful when you want to copy an embedded resource to some other stream.
    /// </summary>
    /// <param name="filename">The path to the embedded resource.</param>
    /// <returns>
    /// A raw Stream object attached to the specified file, or null if the calling
    /// assembly has no such embedded resource. The caller owns the stream and must dispose it.
    /// </returns>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static Stream GetRawReader(string filename)
    {
        return Assembly.GetCallingAssembly().GetManifestResourceStream(filename);
    }

    /// <summary>
    /// Gets the specified embedded resource's content as a byte array.
    /// </summary>
    /// <param name="filename">The filename of the embedded resource to get content of.</param>
    /// <returns>The specified embedded resource's content as a byte array.</returns>
    /// <exception cref="FileNotFoundException">The calling assembly has no such embedded resource.</exception>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static byte[] ReadAllBytes(string filename)
    {
        // Copied synchronously rather than blocking on the async variant's Task.
        using (Stream resourceStream = OpenResource(Assembly.GetCallingAssembly(), filename))
        using (MemoryStream temp = new MemoryStream())
        {
            resourceStream.CopyTo(temp);
            return temp.ToArray();
        }
    }

    /// <summary>
    /// Gets the specified embedded resource's content as a byte array asynchronously.
    /// </summary>
    /// <param name="filename">The filename of the embedded resource to get content of.</param>
    /// <returns>The specified embedded resource's content as a byte array.</returns>
    /// <exception cref="FileNotFoundException">Reported through the returned task when the calling assembly has no such embedded resource.</exception>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static Task<byte[]> ReadAllBytesAsync(string filename)
    {
        return ReadAllBytesFromAsync(Assembly.GetCallingAssembly(), filename);
    }

    /// <summary>
    /// Gets all the text stored in the specified embedded resource.
    /// </summary>
    /// <param name="filename">The filename to fetch the content of.</param>
    /// <returns>All the text stored in the specified embedded resource.</returns>
    /// <exception cref="FileNotFoundException">The calling assembly has no such embedded resource.</exception>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static string ReadAllText(string filename)
    {
        using (StreamReader resourceReader = new StreamReader(OpenResource(Assembly.GetCallingAssembly(), filename)))
        {
            return resourceReader.ReadToEnd();
        }
    }

    /// <summary>
    /// Gets all the text stored in the specified embedded resource asynchronously.
    /// </summary>
    /// <param name="filename">The filename to fetch the content of.</param>
    /// <returns>All the text stored in the specified embedded resource.</returns>
    /// <exception cref="FileNotFoundException">Reported through the returned task when the calling assembly has no such embedded resource.</exception>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static Task<string> ReadAllTextAsync(string filename)
    {
        return ReadAllTextFromAsync(Assembly.GetCallingAssembly(), filename);
    }

    /// <summary>
    /// Enumerates the lines of text in the specified embedded resource.
    /// </summary>
    /// <param name="filename">The filename of the embedded resource to enumerate.</param>
    /// <returns>
    /// A lazy sequence of the resource's lines. The resource is opened when enumeration
    /// starts and closed when it finishes or the enumerator is disposed.
    /// </returns>
    /// <exception cref="FileNotFoundException">Thrown on first enumeration when the calling assembly has no such embedded resource.</exception>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static IEnumerable<string> EnumerateLines(string filename)
    {
        return EnumerateLinesFrom(Assembly.GetCallingAssembly(), filename);
    }

    /// <summary>
    /// Enumerates the lines of text in the specified embedded resource asynchronously.
    /// Each successive call returns a task that, when complete, returns the next line of text stored
    /// in the embedded resource.
    /// </summary>
    /// <param name="filename">The filename of the embedded resource to enumerate.</param>
    /// <returns>
    /// A lazy sequence of line-reading tasks. Await each task before requesting the
    /// next one: the underlying reader does not support overlapping reads.
    /// </returns>
    /// <exception cref="FileNotFoundException">Thrown on first enumeration when the calling assembly has no such embedded resource.</exception>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static IEnumerable<Task<string>> EnumerateLinesAsync(string filename)
    {
        return EnumerateLineTasksFrom(Assembly.GetCallingAssembly(), filename);
    }

    /// <summary>
    /// Gets all the lines of text in the specified embedded resource.
    /// You might find EnumerateLines(string filename) more useful depending on your situation.
    /// </summary>
    /// <param name="filename">The filename to obtain the lines of text from.</param>
    /// <returns>A list of lines in the specified embedded resource.</returns>
    /// <exception cref="FileNotFoundException">The calling assembly has no such embedded resource.</exception>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static List<string> GetAllLines(string filename)
    {
        // Read synchronously rather than blocking on the async variant's Task.
        return new List<string>(EnumerateLinesFrom(Assembly.GetCallingAssembly(), filename));
    }

    /// <summary>
    /// Gets all the lines of text in the specified embedded resource asynchronously.
    /// </summary>
    /// <param name="filename">The filename to obtain the lines of text from.</param>
    /// <returns>A list of lines in the specified embedded resource.</returns>
    /// <exception cref="FileNotFoundException">Reported through the returned task when the calling assembly has no such embedded resource.</exception>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static Task<List<string>> GetAllLinesAsync(string filename)
    {
        return GetAllLinesFromAsync(Assembly.GetCallingAssembly(), filename);
    }

    // ----- Private helpers: all take the already-resolved assembly explicitly -----

    /// <summary>Opens an embedded resource, failing with a descriptive error if it does not exist.</summary>
    private static Stream OpenResource(Assembly assembly, string filename)
    {
        Stream resource = assembly.GetManifestResourceStream(filename);
        if (resource == null)
            throw new FileNotFoundException(
                string.Format(
                    "No embedded resource named '{0}' was found in assembly '{1}'.",
                    filename, assembly.GetName().Name
                ),
                filename
            );
        return resource;
    }

    /// <summary>Formats the list of resources embedded in the given assembly.</summary>
    private static string BuildResourceListText(Assembly assembly)
    {
        using (StringWriter result = new StringWriter())
        {
            result.WriteLine("Files embedded in {0}:", assembly.GetName().Name);
            foreach (string filename in assembly.GetManifestResourceNames())
                result.WriteLine(" - {0}", filename);
            return result.ToString();
        }
    }

    /// <summary>Asynchronously copies a resource of the given assembly into a byte array.</summary>
    private static async Task<byte[]> ReadAllBytesFromAsync(Assembly assembly, string filename)
    {
        using (Stream resourceStream = OpenResource(assembly, filename))
        using (MemoryStream temp = new MemoryStream())
        {
            await resourceStream.CopyToAsync(temp).ConfigureAwait(false);
            return temp.ToArray();
        }
    }

    /// <summary>Asynchronously reads a resource of the given assembly as text.</summary>
    private static async Task<string> ReadAllTextFromAsync(Assembly assembly, string filename)
    {
        using (StreamReader resourceReader = new StreamReader(OpenResource(assembly, filename)))
        {
            return await resourceReader.ReadToEndAsync().ConfigureAwait(false);
        }
    }

    /// <summary>Lazily yields the lines of a resource of the given assembly.</summary>
    private static IEnumerable<string> EnumerateLinesFrom(Assembly assembly, string filename)
    {
        using (StreamReader resourceReader = new StreamReader(OpenResource(assembly, filename)))
        {
            string nextLine;
            while ((nextLine = resourceReader.ReadLine()) != null)
            {
                yield return nextLine;
            }
        }
    }

    /// <summary>Lazily yields one line-reading task per remaining line of a resource of the given assembly.</summary>
    private static IEnumerable<Task<string>> EnumerateLineTasksFrom(Assembly assembly, string filename)
    {
        using (StreamReader resourceReader = new StreamReader(OpenResource(assembly, filename)))
        {
            while (!resourceReader.EndOfStream)
            {
                yield return resourceReader.ReadLineAsync();
            }
        }
    }

    /// <summary>Asynchronously reads every line of a resource of the given assembly.</summary>
    private static async Task<List<string>> GetAllLinesFromAsync(Assembly assembly, string filename)
    {
        List<string> lines = new List<string>();
        using (StreamReader resourceReader = new StreamReader(OpenResource(assembly, filename)))
        {
            string nextLine;
            while ((nextLine = await resourceReader.ReadLineAsync().ConfigureAwait(false)) != null)
            {
                lines.Add(nextLine);
            }
        }
        return lines;
    }
}
|
||||||
|
}
|
18
SearchBox/Utilities/StringPlus.cs
Normal file
18
SearchBox/Utilities/StringPlus.cs
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
using System;
|
||||||
|
|
||||||
|
namespace SearchBox.Utilities
|
||||||
|
{
|
||||||
|
/// <summary>
/// Extra helper methods for strings.
/// </summary>
public static class StringPlus
{
    /// <summary>
    /// Replaces several characters in a string, one after another.
    /// </summary>
    /// <param name="str">The string to operate on.</param>
    /// <param name="find">The characters to look for, handled in array order.</param>
    /// <param name="replace">
    /// The replacement characters; <c>find[i]</c> becomes <c>replace[i]</c>. When
    /// <paramref name="find"/> is the longer array, the last replacement character is
    /// reused for the remaining entries.
    /// </param>
    /// <returns>The string with every replacement applied in sequence.</returns>
    public static string ReplaceMultiple(this string str, char[] find, char[] replace)
    {
        string result = str;
        int position = 0;
        while (position < find.Length)
        {
            // Pick the paired replacement, falling back to the final one once
            // the replacement array has been exhausted.
            char substitute;
            if (position < replace.Length)
                substitute = replace[position];
            else
                substitute = replace[replace.Length - 1];

            result = result.Replace(find[position], substitute);
            position++;
        }
        return result;
    }
}
|
||||||
|
}
|
4
SearchBox/packages.config
Normal file
4
SearchBox/packages.config
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<packages>
|
||||||
|
<package id="UnidecodeSharpFork" version="1.0.0" targetFramework="net47" />
|
||||||
|
</packages>
|
Loading…
Reference in a new issue