Compare commits

...

6 Commits

18 changed files with 1772 additions and 0 deletions

509
.gitignore vendored Normal file
View File

@ -0,0 +1,509 @@
# Created by https://www.gitignore.io/api/monodevelop,visualstudio,git,csharp
### Csharp ###
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUNIT
*.VisualState.xml
TestResult.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# JetBrains Rider
.idea/
*.sln.iml
# CodeRush
.cr/
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/
# Local History for Visual Studio
.localhistory/
### Git ###
*.orig
### MonoDevelop ###
#User Specific
*.usertasks
#Mono Project Files
*.resources
test-results/
### VisualStudio ###
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# User-specific files
# User-specific files (MonoDevelop/Xamarin Studio)
# Build results
# Visual Studio 2015/2017 cache/options directory
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
# MSTest test Results
# NUNIT
# Build Results of an ATL Project
# Benchmark Results
# .NET Core
# StyleCop
# Files built by Visual Studio
# Chutzpah Test files
# Visual C++ cache files
# Visual Studio profiler
# Visual Studio Trace Files
# TFS 2012 Local Workspace
# Guidance Automation Toolkit
# ReSharper is a .NET coding add-in
# JustCode is a .NET coding add-in
# TeamCity is a build add-in
# DotCover is a Code Coverage Tool
# AxoCover is a Code Coverage Tool
# Visual Studio code coverage results
# NCrunch
# MightyMoose
# Web workbench (sass)
# Installshield output folder
# DocProject is a documentation generator add-in
# Click-Once directory
# Publish Web Output
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
# NuGet Packages
# The packages folder can be ignored because of Package Restore
# except build/, which is used as an MSBuild target.
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
# Microsoft Azure Build Output
# Microsoft Azure Emulator
# Windows Store app package directories and files
# Visual Studio cache files
# files ending in .cache can be ignored
# but keep track of directories ending in .cache
# Others
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
# SQL Server files
# Business Intelligence projects
# Microsoft Fakes
# GhostDoc plugin setting file
# Node.js Tools for Visual Studio
# Visual Studio 6 build log
# Visual Studio 6 workspace options file
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
# Visual Studio LightSwitch build output
# Paket dependency manager
# FAKE - F# Make
# JetBrains Rider
# CodeRush
# Python Tools for Visual Studio (PTVS)
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
# Telerik's JustMock configuration file
# BizTalk build output
# OpenCover UI analysis results
# Azure Stream Analytics local run output
# MSBuild Binary and Structured Log
# NVidia Nsight GPU debugger configuration file
# MFractors (Xamarin productivity tool) working folder
# Local History for Visual Studio
# End of https://www.gitignore.io/api/monodevelop,visualstudio,git,csharp

15
SearchBox-CLI/Program.cs Normal file
View File

@ -0,0 +1,15 @@
using System;
using SearchBox;
namespace SearchBoxCLI
{
/// <summary>
/// Command-line entry point: indexes whatever arrives on standard input
/// and prints the resulting index to standard output.
/// </summary>
class MainClass
{
    /// <summary>
    /// Reads standard input to the end, builds an <see cref="Index"/> from it
    /// and writes the index's string form to the console.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    public static void Main(string[] args)
    {
        Console.WriteLine(new Index(Console.In.ReadToEnd()));
    }
}
}

View File

@ -0,0 +1,26 @@
using System.Reflection;
using System.Runtime.CompilerServices;
// Information about this assembly is defined by the following attributes.
// Change them to the values specific to your project.
[assembly: AssemblyTitle("SearchBox-CLI")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("")]
[assembly: AssemblyCopyright("sbrl")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]
// The assembly version has the format "{Major}.{Minor}.{Build}.{Revision}".
// The form "{Major}.{Minor}.*" will automatically update the build and revision,
// and "{Major}.{Minor}.{Build}.*" will update just the revision.
[assembly: AssemblyVersion("1.0.*")]
// The following attributes are used to specify the signing key for the assembly,
// if desired. See the Mono documentation for more information about signing.
//[assembly: AssemblyDelaySign(false)]
//[assembly: AssemblyKeyFile("")]

View File

@ -0,0 +1,45 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">x86</Platform>
<ProjectGuid>{1E2F5559-C918-4A5D-9C24-117F204708A4}</ProjectGuid>
<OutputType>Exe</OutputType>
<RootNamespace>SearchBoxCLI</RootNamespace>
<AssemblyName>SearchBox-CLI</AssemblyName>
<TargetFrameworkVersion>v4.7</TargetFrameworkVersion>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|x86' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug</OutputPath>
<DefineConstants>DEBUG;</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<ExternalConsole>true</ExternalConsole>
<PlatformTarget>x86</PlatformTarget>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x86' ">
<Optimize>true</Optimize>
<OutputPath>bin\Release</OutputPath>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<ExternalConsole>true</ExternalConsole>
<PlatformTarget>x86</PlatformTarget>
</PropertyGroup>
<ItemGroup>
<Reference Include="System" />
</ItemGroup>
<ItemGroup>
<Compile Include="Program.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\SearchBox\SearchBox.csproj">
<Project>{5243F60A-F822-4C52-A333-E4089754EC6A}</Project>
<Name>SearchBox</Name>
</ProjectReference>
</ItemGroup>
<Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
</Project>

23
SearchBox.sln Normal file
View File

@ -0,0 +1,23 @@
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2012
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SearchBox", "SearchBox\SearchBox.csproj", "{5243F60A-F822-4C52-A333-E4089754EC6A}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SearchBox-CLI", "SearchBox-CLI\SearchBox-CLI.csproj", "{1E2F5559-C918-4A5D-9C24-117F204708A4}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x86 = Debug|x86
Release|x86 = Release|x86
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{5243F60A-F822-4C52-A333-E4089754EC6A}.Debug|x86.ActiveCfg = Debug|x86
{5243F60A-F822-4C52-A333-E4089754EC6A}.Debug|x86.Build.0 = Debug|x86
{5243F60A-F822-4C52-A333-E4089754EC6A}.Release|x86.ActiveCfg = Release|x86
{5243F60A-F822-4C52-A333-E4089754EC6A}.Release|x86.Build.0 = Release|x86
{1E2F5559-C918-4A5D-9C24-117F204708A4}.Debug|x86.ActiveCfg = Debug|x86
{1E2F5559-C918-4A5D-9C24-117F204708A4}.Debug|x86.Build.0 = Debug|x86
{1E2F5559-C918-4A5D-9C24-117F204708A4}.Release|x86.ActiveCfg = Release|x86
{1E2F5559-C918-4A5D-9C24-117F204708A4}.Release|x86.Build.0 = Release|x86
EndGlobalSection
EndGlobal

View File

@ -0,0 +1,311 @@
a
about
above
above
across
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
am
among
amongst
amoungst
amount
an
and
another
any
anyhow
anyone
anything
anyway
anywhere
are
around
as
at
back
be
became
because
become
becomes
becoming
been
before
beforehand
behind
being
below
beside
besides
between
beyond
bill
both
bottom
but
by
call
can
cannot
cant
co
con
could
couldnt
cry
de
describe
detail
do
done
down
due
during
each
eg
eight
either
eleven
else
elsewhere
empty
enough
etc
even
ever
every
everyone
everything
everywhere
except
few
fill
find
fire
first
five
for
former
formerly
found
four
from
front
full
further
get
give
go
had
has
hasnt
have
he
hence
her
here
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
ie
if
in
inc
indeed
interest
into
is
it
its
itself
keep
last
latter
latterly
least
less
ltd
made
many
may
me
meanwhile
might
mine
more
moreover
most
mostly
move
much
must
my
myself
name
namely
neither
never
nevertheless
next
nine
no
none
nor
not
nothing
now
nowhere
of
off
often
on
once
one
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
own
part
per
perhaps
please
put
rather
re
same
see
seem
seemed
seeming
seems
serious
several
she
should
show
side
since
sincere
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
such
system
take
ten
than
that
the
their
them
themselves
then
thence
there
thereafter
thereby
therefore
therein
thereupon
these
they
thick
thin
third
this
those
though
three
through
throughout
thru
thus
to
together
too
top
toward
towards
twelve
twenty
two
un
under
until
up
upon
us
very
via
was
we
well
were
what
whatever
when
whence
whenever
where
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
whoever
whole
whom
whose
why
will
with
within
without
would
yet
you
your
yours
yourself
yourselves

49
SearchBox/IdMapper.cs Normal file
View File

@ -0,0 +1,49 @@
using System;
using Stackoverflow.Utilities;
namespace SearchBox
{
/// <summary>
/// Thrown when an id is looked up that has no page name associated with it.
/// </summary>
public class IdNotFoundException : Exception
{
    /// <summary>Creates the exception with the given explanatory message.</summary>
    /// <param name="message">Text describing which lookup failed.</param>
    public IdNotFoundException(string message)
        : base(message)
    {
    }
}
/// <summary>
/// Hands out sequential integer ids for page names and maps between the two
/// in both directions. Page names are Unicode-normalized (form C) before every
/// lookup, so differently-composed spellings of the same name share one id.
/// </summary>
public class IdMapper
{
    // The id that will be given to the next previously-unseen page name.
    private int nextId = 0;

    /// <summary>The underlying two-way id / page name table.</summary>
    public BiDictionary<int, string> map = new BiDictionary<int, string>();

    public IdMapper()
    {
    }

    /// <summary>
    /// Gets the id for a page name, allocating a fresh one if the name is new.
    /// </summary>
    /// <param name="pageName">The page name to look up.</param>
    /// <returns>The id associated with the (normalized) page name.</returns>
    public int GetId(string pageName)
    {
        pageName = NormalizePageName(pageName);

        int result;
        if (!map.TryGetBySecond(pageName, out result)) {
            map.Add(result = nextId++, pageName);
        }
        return result;
    }

    /// <summary>
    /// Gets the page name registered for an id.
    /// </summary>
    /// <param name="id">The id to look up.</param>
    /// <returns>The page name stored for that id.</returns>
    /// <exception cref="IdNotFoundException">The id is not in the map.</exception>
    public string GetPageName(int id)
    {
        string result;
        if (!map.TryGetByFirst(id, out result))
            throw new IdNotFoundException($"Error: Couldn't find {id} in the ID map.");
        return result;
    }

    /// <summary>
    /// Re-associates the id of <paramref name="oldPageName"/> with <paramref name="newPageName"/>.
    /// </summary>
    /// <param name="oldPageName">The name currently in the map.</param>
    /// <param name="newPageName">The name that should own the id afterwards.</param>
    /// <exception cref="ArgumentException">
    /// The old name is not in the map, or the new name already belongs to another id.
    /// </exception>
    public void MovePageName(string oldPageName, string newPageName)
    {
        // Names are stored normalized by GetId, so lookups must normalize too.
        oldPageName = NormalizePageName(oldPageName);
        newPageName = NormalizePageName(newPageName);

        int id = map.GetBySecond(oldPageName);
        if (oldPageName == newPageName)
            return;

        // Check for a clash *before* removing anything: otherwise a failed Add
        // would leave the old mapping deleted and the id orphaned.
        int clashingId;
        if (map.TryGetBySecond(newPageName, out clashingId))
            throw new ArgumentException($"Error: Can't move to '{newPageName}' because that page name already has id {clashingId}.");

        map.RemoveBySecond(oldPageName);
        map.Add(id, newPageName);
    }

    /// <summary>
    /// Removes a page name (and its id) from the map.
    /// </summary>
    /// <param name="pageName">The page name to remove.</param>
    /// <exception cref="ArgumentException">The page name is not in the map.</exception>
    public void DeletePageName(string pageName)
    {
        map.RemoveBySecond(NormalizePageName(pageName));
    }

    // Applies the same Unicode normalization to every name that touches the map.
    private static string NormalizePageName(string pageName)
    {
        return pageName.Normalize(System.Text.NormalizationForm.FormC);
    }
}
}

94
SearchBox/Index.cs Normal file
View File

@ -0,0 +1,94 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Text;
using SBRL.Utilities;
namespace SearchBox
{
/// <summary>
/// Bit flags controlling how an index is built.
/// </summary>
[Flags]
public enum IndexOptions
{
    /// <summary>No options set.</summary>
    None = 0,
    /// <summary>Leave tokens that are stopwords out of the index.</summary>
    ExcludeStopwords = 1
}
/// <summary>
/// An index of a single piece of text: for every token, the list of integer
/// offsets the tokenizer reported for it. Enumerating the index yields one
/// token / offset-list pair per distinct token.
/// </summary>
public class Index : IEnumerable<KeyValuePair<string, List<int>>>
{
    private Dictionary<string, List<int>> index = new Dictionary<string, List<int>>();

    // Only created when stopword exclusion is requested; null otherwise.
    private StopwordTester stopwordTester;

    /// <summary>
    /// Tokenizes <paramref name="inSource"/> and records every token's offsets.
    /// </summary>
    /// <param name="inSource">The text to index.</param>
    /// <param name="stopwords">Words to skip when <see cref="IndexOptions.ExcludeStopwords"/> is set.</param>
    /// <param name="options">Flags controlling how the index is built.</param>
    public Index(string inSource, IEnumerable<string> stopwords, IndexOptions options)
    {
        if (options.HasFlag(IndexOptions.ExcludeStopwords))
            stopwordTester = new StopwordTester(stopwords);

        // Tokenize the input and file it in our index
        Tokenizer tokenizer = new Tokenizer(inSource);
        foreach (Tuple<int, string> token in tokenizer) {
            // Without ExcludeStopwords there is no tester, so nothing is filtered.
            if (stopwordTester != null && stopwordTester.IsStopword(token.Item2)) continue;
            insert(token.Item2, token.Item1);
        }
    }

    /// <summary>
    /// Builds an index using the stopword list embedded in the assembly.
    /// </summary>
    /// <param name="inSource">The text to index.</param>
    /// <param name="options">Flags controlling how the index is built.</param>
    public Index(string inSource, IndexOptions options)
        : this(inSource, EmbeddedFiles.EnumerateLines("SearchBox.EmbeddedFiles.Stopwords.txt"), options)
    {
    }

    /// <summary>
    /// Builds an index with stopword exclusion turned on.
    /// </summary>
    /// <param name="inSource">The text to index.</param>
    public Index(string inSource) : this(inSource, IndexOptions.ExcludeStopwords)
    {
    }

    /// <summary>
    /// Gets the offsets recorded for a token.
    /// </summary>
    /// <param name="key">The token to look up.</param>
    /// <exception cref="KeyNotFoundException">The token is not in the index.</exception>
    public List<int> this[string key] {
        get {
            return index[key];
        }
    }

    /// <summary>
    /// Records one occurrence of a token, creating its offset list on first use.
    /// </summary>
    /// <param name="token">The token that was seen.</param>
    /// <param name="offset">The offset reported for this occurrence.</param>
    protected void insert(string token, int offset)
    {
        List<int> offsets;
        if (!index.TryGetValue(token, out offsets)) {
            offsets = new List<int>();
            index.Add(token, offsets);
        }
        offsets.Add(offset);
    }

    /// <summary>Gets every distinct token in the index.</summary>
    public IEnumerable<string> Tokens()
    {
        return index.Keys;
    }

    /// <summary>Lazily yields each token together with its offset list.</summary>
    public IEnumerable<KeyValuePair<string, List<int>>> IterateItems()
    {
        foreach(KeyValuePair<string, List<int>> item in index)
            yield return item;
    }

    public IEnumerator<KeyValuePair<string, List<int>>> GetEnumerator()
    {
        return IterateItems().GetEnumerator();
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }

    /// <summary>
    /// Renders the index as text: a header line followed by one tab-indented
    /// "token: offset, offset, ..." line per token.
    /// </summary>
    public override string ToString()
    {
        StringBuilder result = new StringBuilder("Index: \n");
        foreach (KeyValuePair<string, List<int>> item in index)
            result.AppendLine($"\t{item.Key}: {string.Join(", ", item.Value)}");
        return result.ToString();
    }

    // --------------------------------------------------------------------------------------

    /// <summary>
    /// Builds an index (with default options) from the full text of a file.
    /// </summary>
    /// <param name="filename">Path of the file to read.</param>
    public static Index FromFile(string filename)
    {
        return new Index(File.ReadAllText(filename));
    }
}
}

View File

@ -0,0 +1,41 @@
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
namespace SearchBox
{
/// <summary>
/// Combines the indexes of many pages: for each token, the pages that contain
/// it (by page id) along with that page's offset list for the token.
/// </summary>
public class InvertedIndex
{
    private ConcurrentDictionary<string, ConcurrentDictionary<int, List<int>>> invertedIndex = new ConcurrentDictionary<string, ConcurrentDictionary<int, List<int>>>();

    public InvertedIndex()
    {
    }

    /// <summary>
    /// Adds every token of a page's index under the given page id.
    /// </summary>
    /// <param name="pageId">The id of the page the index belongs to.</param>
    /// <param name="newIndex">The page's index.</param>
    /// <returns>
    /// true if every token was added; false as soon as a token already has an
    /// entry for <paramref name="pageId"/> (tokens processed before that point stay added).
    /// </returns>
    public bool AddIndex(int pageId, Index newIndex)
    {
        foreach (KeyValuePair<string, List<int>> token in newIndex)
        {
            // GetOrAdd is a single get-or-create call, so a concurrent writer
            // creating the same token entry can't make this report a bogus failure.
            ConcurrentDictionary<int, List<int>> pages = invertedIndex.GetOrAdd(
                token.Key,
                key => new ConcurrentDictionary<int, List<int>>()
            );
            if (!pages.TryAdd(pageId, token.Value))
                return false;
        }
        return true;
    }

    /// <summary>
    /// Removes a page's entries for every token of the given index.
    /// Tokens that have no entry for the page are skipped.
    /// </summary>
    /// <param name="pageId">The id of the page to remove.</param>
    /// <param name="newIndex">The index whose tokens should be removed for that page.</param>
    /// <returns>true if all present entries were removed; false if a removal failed.</returns>
    public bool RemoveIndex(int pageId, Index newIndex)
    {
        foreach (string token in newIndex.Tokens())
        {
            ConcurrentDictionary<int, List<int>> pages;
            if (!invertedIndex.TryGetValue(token, out pages) || !pages.ContainsKey(pageId)) continue;
            if (!pages.TryRemove(pageId, out List<int> removed))
                return false;
        }
        // Reaching here means nothing failed, so report success.
        return true;
    }
}
}

View File

@ -0,0 +1,26 @@
using System.Reflection;
using System.Runtime.CompilerServices;
// Information about this assembly is defined by the following attributes.
// Change them to the values specific to your project.
[assembly: AssemblyTitle("SearchBox")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("")]
[assembly: AssemblyCopyright("sbrl")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]
// The assembly version has the format "{Major}.{Minor}.{Build}.{Revision}".
// The form "{Major}.{Minor}.*" will automatically update the build and revision,
// and "{Major}.{Minor}.{Build}.*" will update just the revision.
[assembly: AssemblyVersion("1.0.*")]
// The following attributes are used to specify the signing key for the assembly,
// if desired. See the Mono documentation for more information about signing.
//[assembly: AssemblyDelaySign(false)]
//[assembly: AssemblyKeyFile("")]

10
SearchBox/SearchBox.cs Normal file
View File

@ -0,0 +1,10 @@
using System;
namespace SearchBox
{
/// <summary>
/// Placeholder type: it currently declares no members of its own and is
/// constructed with the default public parameterless constructor.
/// </summary>
public class SearchBox
{
}
}

View File

@ -0,0 +1,60 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">x86</Platform>
<ProjectGuid>{5243F60A-F822-4C52-A333-E4089754EC6A}</ProjectGuid>
<OutputType>Library</OutputType>
<RootNamespace>SearchBox</RootNamespace>
<AssemblyName>SearchBox</AssemblyName>
<TargetFrameworkVersion>v4.7</TargetFrameworkVersion>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|x86' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug</OutputPath>
<DefineConstants>DEBUG;</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<ExternalConsole>true</ExternalConsole>
<PlatformTarget>x86</PlatformTarget>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x86' ">
<Optimize>true</Optimize>
<OutputPath>bin\Release</OutputPath>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<ExternalConsole>true</ExternalConsole>
<PlatformTarget>x86</PlatformTarget>
</PropertyGroup>
<ItemGroup>
<Reference Include="System" />
<Reference Include="UnidecodeSharpFork">
<HintPath>..\packages\UnidecodeSharpFork.1.0.0\lib\UnidecodeSharpFork.dll</HintPath>
</Reference>
</ItemGroup>
<ItemGroup>
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="Utilities\EmbeddedFiles.cs" />
<Compile Include="SearchBox.cs" />
<Compile Include="Tokenizer.cs" />
<Compile Include="Index.cs" />
<Compile Include="Utilities\StringPlus.cs" />
<Compile Include="StopwordTester.cs" />
<Compile Include="InvertedIndex.cs" />
<Compile Include="IdMapper.cs" />
<Compile Include="Utilities\BiDictionary.cs" />
</ItemGroup>
<ItemGroup>
<Folder Include="EmbeddedFiles\" />
<Folder Include="Utilities\" />
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="EmbeddedFiles\Stopwords.txt" />
</ItemGroup>
<Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
</Project>

View File

@ -0,0 +1,21 @@
using System;
using System.Collections.Generic;
namespace SearchBox
{
/// <summary>
/// Holds a set of stopwords and answers whether a given word is one of them.
/// Matching is exact (the set uses the default string comparer).
/// </summary>
public class StopwordTester
{
    private HashSet<string> stopwords = new HashSet<string>();

    /// <summary>
    /// Creates a tester from a sequence of stopwords; repeated words are stored once.
    /// </summary>
    /// <param name="inWords">The words to treat as stopwords.</param>
    public StopwordTester(IEnumerable<string> inWords)
    {
        foreach (string candidate in inWords)
        {
            stopwords.Add(candidate);
        }
    }

    /// <summary>
    /// Checks whether a word is in the stopword set.
    /// </summary>
    /// <param name="word">The word to test.</param>
    /// <returns>true if the word is a stopword, false otherwise.</returns>
    public bool IsStopword(string word) => stopwords.Contains(word);
}
}

66
SearchBox/Tokenizer.cs Normal file
View File

@ -0,0 +1,66 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.Net;
using System.Text.RegularExpressions;
using SearchBox.Utilities;
using UnidecodeSharpFork;
namespace SearchBox
{
/// <summary>
/// Bit flags selecting which clean-up steps the tokenizer applies to its input.
/// </summary>
[Flags]
public enum TokenizerOptions
{
    /// <summary>No clean-up steps.</summary>
    None = 0,
    /// <summary>Convert the input to lowercase.</summary>
    Lowercase = 1,
    /// <summary>Transliterate the input with Unidecode.</summary>
    Transliterate = 2,
    /// <summary>Replace a fixed set of bracket / slash / pipe characters with spaces.</summary>
    HidePunctuation = 4,
    /// <summary>Decode HTML entities in the input.</summary>
    DecodeHtmlEntities = 8
}
/// <summary>
/// Splits a piece of text into tokens, optionally cleaning the text up first.
/// Enumerating the tokenizer yields (offset, token) tuples, where offset is the
/// position of the token's first character in the cleaned-up text.
/// </summary>
public class Tokenizer : IEnumerable<Tuple<int, string>>
{
    // Separators: punctuation at the start/end of a line, whitespace with any
    // punctuation touching it, or a literal pipe.
    private static readonly Regex splitter = new Regex(
        @"((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))|\|",
        RegexOptions.Compiled | RegexOptions.Multiline
    );

    private string source;

    /// <summary>When true, every emitted token is also written to the console.</summary>
    public bool Verbose = false;

    /// <summary>
    /// Prepares text for tokenizing by applying the selected clean-up steps.
    /// </summary>
    /// <param name="inSource">The text to tokenize.</param>
    /// <param name="options">Which clean-up steps to apply.</param>
    public Tokenizer(string inSource, TokenizerOptions options = TokenizerOptions.Transliterate | TokenizerOptions.Lowercase | TokenizerOptions.HidePunctuation)
    {
        // Entities are decoded first so the characters they stand for also go
        // through transliteration, lowercasing and punctuation hiding (and so
        // case-sensitive entity names aren't mangled by lowercasing).
        if (options.HasFlag(TokenizerOptions.DecodeHtmlEntities)) inSource = WebUtility.HtmlDecode(inSource);
        if (options.HasFlag(TokenizerOptions.Transliterate)) inSource = inSource.Unidecode();
        // Invariant casing keeps tokens identical regardless of the machine's culture.
        if (options.HasFlag(TokenizerOptions.Lowercase)) inSource = inSource.ToLowerInvariant();
        if (options.HasFlag(TokenizerOptions.HidePunctuation)) inSource = inSource.ReplaceMultiple(
            @"[]{}|/\".ToCharArray(),
            " ".ToCharArray()
        );

        source = inSource;
    }

    /// <summary>
    /// Lazily yields each token with the offset of its first character.
    /// </summary>
    /// <remarks>
    /// Tokens are the stretches of text between separator matches. A stretch is
    /// dropped if it is empty / whitespace, or if it would itself match the
    /// separator pattern when looked at on its own.
    /// </remarks>
    public IEnumerable<Tuple<int, string>> IterateTokens()
    {
        // Walk the separator matches directly rather than using Regex.Split:
        // Split also returns the text of every capture group, which makes it
        // impossible to recover true positions from the lengths of its parts.
        int tokenStart = 0;
        int tokenNumber = 0;
        Match separator = splitter.Match(source);
        while (true) {
            int tokenEnd = separator.Success ? separator.Index : source.Length;
            string candidate = source.Substring(tokenStart, tokenEnd - tokenStart);

            if (!string.IsNullOrWhiteSpace(candidate) && !splitter.IsMatch(candidate)) {
                if(Verbose) Console.WriteLine("#{0} @ {1}: '{2}'", tokenNumber, tokenStart, candidate);
                // FUTURE: We should swap this out for System.ValueTuple, as it's easier on the garbage collector.
                yield return new Tuple<int, string>(tokenStart, candidate);
                tokenNumber++;
            }

            if (!separator.Success) break;
            tokenStart = separator.Index + separator.Length;
            separator = separator.NextMatch();
        }
    }

    public IEnumerator<Tuple<int, string>> GetEnumerator() {
        return IterateTokens().GetEnumerator();
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }
}
}

View File

@ -0,0 +1,265 @@
using System;
using System.Collections;
using System.Collections.Generic;
namespace Stackoverflow.Utilities
{
/// <summary>
/// A one-to-one dictionary: each TFirst is paired with exactly one TSecond and
/// vice versa, so it can be searched from either side with a unique answer.
/// Enumeration goes through a struct enumerator.
/// </summary>
/// <remarks>From https://stackoverflow.com/a/35949314/1460422</remarks>
/// <typeparam name="TFirst">The type of the "key"</typeparam>
/// <typeparam name="TSecond">The type of the "value"</typeparam>
public class BiDictionary<TFirst, TSecond> : IEnumerable<BiDictionary<TFirst, TSecond>.Pair>
{
    // Two mirrored tables; every mutation below updates both.
    private Dictionary<TFirst, TSecond> _firstToSecond = new Dictionary<TFirst, TSecond>();
    private Dictionary<TSecond, TFirst> _secondToFirst = new Dictionary<TSecond, TFirst>();

    /// <summary>One stored association, as produced by enumeration.</summary>
    public struct Pair
    {
        public TFirst First;
        public TSecond Second;
    }

    /// <summary>
    /// Enumerates the stored pairs by wrapping the first-to-second table's enumerator.
    /// </summary>
    public struct Enumerator : IEnumerator<Pair>, IEnumerator
    {
        // Deliberately not readonly: MoveNext mutates the wrapped struct in place.
        private Dictionary<TFirst, TSecond>.Enumerator _dictEnumerator;

        public Enumerator(Dictionary<TFirst, TSecond>.Enumerator dictEnumerator)
        {
            _dictEnumerator = dictEnumerator;
        }

        public Pair Current {
            get {
                KeyValuePair<TFirst, TSecond> entry = _dictEnumerator.Current;
                return new Pair { First = entry.Key, Second = entry.Value };
            }
        }

        object IEnumerator.Current => Current;

        public void Dispose() => _dictEnumerator.Dispose();

        public bool MoveNext() => _dictEnumerator.MoveNext();

        /// <summary>Not supported; always throws.</summary>
        public void Reset() => throw new NotSupportedException();
    }

    #region Exception throwing methods

    /// <summary>
    /// Adds the pair to the dictionary.
    /// Throws an exception if either element is already present.
    /// </summary>
    /// <param name="first">the "key" half of the pair</param>
    /// <param name="second">the "value" half of the pair</param>
    public void Add(TFirst first, TSecond second)
    {
        if (!TryAdd(first, second))
            throw new ArgumentException("Duplicate first or second");
    }

    /// <summary>
    /// Finds the TSecond paired with first.
    /// Throws an exception if first is not in the dictionary.
    /// </summary>
    /// <param name="first">the key to search for</param>
    /// <returns>the value corresponding to first</returns>
    public TSecond GetByFirst(TFirst first)
    {
        if (TryGetByFirst(first, out TSecond second))
            return second;
        throw new ArgumentException("first");
    }

    /// <summary>
    /// Finds the TFirst paired with second.
    /// Throws an exception if second is not in the dictionary.
    /// </summary>
    /// <param name="second">the key to search for</param>
    /// <returns>the value corresponding to second</returns>
    public TFirst GetBySecond(TSecond second)
    {
        if (TryGetBySecond(second, out TFirst first))
            return first;
        throw new ArgumentException("second");
    }

    /// <summary>
    /// Removes the record containing first.
    /// Throws an exception if first is not in the dictionary.
    /// </summary>
    /// <param name="first">the key of the record to delete</param>
    public void RemoveByFirst(TFirst first)
    {
        if (!TryRemoveByFirst(first))
            throw new ArgumentException("first");
    }

    /// <summary>
    /// Removes the record containing second.
    /// Throws an exception if second is not in the dictionary.
    /// </summary>
    /// <param name="second">the key of the record to delete</param>
    public void RemoveBySecond(TSecond second)
    {
        if (!TryRemoveBySecond(second))
            throw new ArgumentException("second");
    }

    #endregion

    #region Try methods

    /// <summary>
    /// Tries to add the pair to the dictionary.
    /// </summary>
    /// <param name="first">the "key" half of the pair</param>
    /// <param name="second">the "value" half of the pair</param>
    /// <returns>true if the pair was added, false if either element is already present</returns>
    public bool TryAdd(TFirst first, TSecond second)
    {
        bool alreadyPresent = _firstToSecond.ContainsKey(first) || _secondToFirst.ContainsKey(second);
        if (alreadyPresent)
            return false;

        _firstToSecond.Add(first, second);
        _secondToFirst.Add(second, first);
        return true;
    }

    /// <summary>
    /// Looks up the TSecond paired with first.
    /// </summary>
    /// <param name="first">the key to search for</param>
    /// <param name="second">the corresponding value</param>
    /// <returns>true if first is in the dictionary, false otherwise</returns>
    public bool TryGetByFirst(TFirst first, out TSecond second)
        => _firstToSecond.TryGetValue(first, out second);

    /// <summary>
    /// Looks up the TFirst paired with second.
    /// </summary>
    /// <param name="second">the key to search for</param>
    /// <param name="first">the corresponding value</param>
    /// <returns>true if second is in the dictionary, false otherwise</returns>
    public bool TryGetBySecond(TSecond second, out TFirst first)
        => _secondToFirst.TryGetValue(second, out first);

    /// <summary>
    /// Removes the record containing first, if there is one.
    /// </summary>
    /// <param name="first">the key of the record to delete</param>
    /// <returns>true if a record was removed, false if first was not present</returns>
    public bool TryRemoveByFirst(TFirst first)
    {
        if (_firstToSecond.TryGetValue(first, out TSecond second))
        {
            _firstToSecond.Remove(first);
            _secondToFirst.Remove(second);
            return true;
        }
        return false;
    }

    /// <summary>
    /// Removes the record containing second, if there is one.
    /// </summary>
    /// <param name="second">the key of the record to delete</param>
    /// <returns>true if a record was removed, false if second was not present</returns>
    public bool TryRemoveBySecond(TSecond second)
    {
        if (_secondToFirst.TryGetValue(second, out TFirst first))
        {
            _secondToFirst.Remove(second);
            _firstToSecond.Remove(first);
            return true;
        }
        return false;
    }

    #endregion

    /// <summary>
    /// The number of pairs stored in the dictionary
    /// </summary>
    public int Count => _firstToSecond.Count;

    /// <summary>
    /// Removes all items from the dictionary.
    /// </summary>
    public void Clear()
    {
        _firstToSecond.Clear();
        _secondToFirst.Clear();
    }

    /// <summary>Gets a struct enumerator over the stored pairs.</summary>
    public Enumerator GetEnumerator() => new Enumerator(_firstToSecond.GetEnumerator());

    IEnumerator<Pair> IEnumerable<Pair>.GetEnumerator() => GetEnumerator();

    IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
}
}

View File

@ -0,0 +1,189 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Threading.Tasks;
namespace SBRL.Utilities
{
/// <summary>
/// A collection of static methods for manipulating embedded resources.
/// </summary>
/// <remarks>
/// v0.5, by Starbeamrainbowlabs &lt;feedback@starbeamrainbowlabs.com&gt;
/// Last updated 3rd September 2017.
/// Licensed under MPL-2.0.
///
/// Changelog:
/// v0.1 (25th July 2016):
/// - Initial release.
/// v0.2 (8th August 2016):
/// - Changed namespace.
/// v0.3 (21st January 2017):
/// - Added GetRawReader().
/// v0.4 (8th April 2017):
/// - Removed unnecessary using statement.
/// v0.5 (3rd September 2017):
/// - Changed namespace
///
/// Every public member resolves resources against the assembly that called it.
/// The calling assembly is captured directly in the public entry point (which is
/// marked NoInlining, as the Assembly.GetCallingAssembly() documentation recommends)
/// and handed to private helpers. It must not be looked up inside an async method or
/// an iterator: that code runs in a compiler-generated state machine whose MoveNext
/// is invoked by whoever drives it, not by the caller of the public method.
/// </remarks>
public static class EmbeddedFiles
{
    /// <summary>
    /// An array of the filenames of all the resources embedded in the calling assembly.
    /// </summary>
    /// <value>The resource list.</value>
    public static string[] ResourceList {
        [MethodImpl(MethodImplOptions.NoInlining)]
        get {
            return Assembly.GetCallingAssembly().GetManifestResourceNames();
        }
    }

    /// <summary>
    /// Builds a human-readable list of the resources embedded in the calling assembly.
    /// </summary>
    /// <returns>A header line naming the assembly, followed by one line per embedded resource.</returns>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static string GetResourceListText()
    {
        return FormatResourceList(Assembly.GetCallingAssembly());
    }

    /// <summary>
    /// Writes a list of embedded resources to the Console's standard output.
    /// </summary>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static void WriteResourceList()
    {
        Console.WriteLine(FormatResourceList(Assembly.GetCallingAssembly()));
    }

    /// <summary>
    /// Gets a StreamReader attached to the specified embedded resource.
    /// </summary>
    /// <param name="filename">The filename of the embedded resource to get a StreamReader of.</param>
    /// <returns>A StreamReader attached to the specified embedded resource. The caller owns it and must dispose it.</returns>
    /// <exception cref="FileNotFoundException">No resource with that name is embedded in the calling assembly.</exception>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static StreamReader GetReader(string filename)
    {
        return new StreamReader(OpenResource(Assembly.GetCallingAssembly(), filename));
    }

    /// <summary>
    /// Gets a raw Stream that's attached to the specified embedded resource.
    /// Useful when you want to copy an embedded resource to some other stream.
    /// </summary>
    /// <param name="filename">The path to the embedded resource.</param>
    /// <returns>
    /// A raw Stream object attached to the specified file, or null if the calling
    /// assembly has no resource with that name. The caller owns the stream and must dispose it.
    /// </returns>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static Stream GetRawReader(string filename)
    {
        // Deliberately returns the raw result (null when missing) so existing null checks keep working.
        return Assembly.GetCallingAssembly().GetManifestResourceStream(filename);
    }

    /// <summary>
    /// Gets the specified embedded resource's content as a byte array.
    /// </summary>
    /// <param name="filename">The filename of the embedded resource to get content of.</param>
    /// <returns>The specified embedded resource's content as a byte array.</returns>
    /// <exception cref="FileNotFoundException">No resource with that name is embedded in the calling assembly.</exception>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static byte[] ReadAllBytes(string filename)
    {
        // Copy synchronously instead of blocking on the async variant's Result.
        using (Stream resourceStream = OpenResource(Assembly.GetCallingAssembly(), filename))
        using (MemoryStream temp = new MemoryStream())
        {
            resourceStream.CopyTo(temp);
            return temp.ToArray();
        }
    }

    /// <summary>
    /// Gets the specified embedded resource's content as a byte array asynchronously.
    /// </summary>
    /// <param name="filename">The filename of the embedded resource to get content of.</param>
    /// <returns>
    /// A task producing the specified embedded resource's content as a byte array.
    /// The task faults with a FileNotFoundException if the resource does not exist.
    /// </returns>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static Task<byte[]> ReadAllBytesAsync(string filename)
    {
        return ReadBytesFromAsync(Assembly.GetCallingAssembly(), filename);
    }

    /// <summary>
    /// Gets all the text stored in the specified embedded resource.
    /// </summary>
    /// <param name="filename">The filename to fetch the content of.</param>
    /// <returns>All the text stored in the specified embedded resource.</returns>
    /// <exception cref="FileNotFoundException">No resource with that name is embedded in the calling assembly.</exception>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static string ReadAllText(string filename)
    {
        using (StreamReader resourceReader = new StreamReader(OpenResource(Assembly.GetCallingAssembly(), filename)))
        {
            return resourceReader.ReadToEnd();
        }
    }

    /// <summary>
    /// Gets all the text stored in the specified embedded resource asynchronously.
    /// </summary>
    /// <param name="filename">The filename to fetch the content of.</param>
    /// <returns>
    /// A task producing all the text stored in the specified embedded resource.
    /// The task faults with a FileNotFoundException if the resource does not exist.
    /// </returns>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static Task<string> ReadAllTextAsync(string filename)
    {
        return ReadTextFromAsync(Assembly.GetCallingAssembly(), filename);
    }

    /// <summary>
    /// Enumerates the lines of text in the specified embedded resource.
    /// </summary>
    /// <param name="filename">The filename of the embedded resource to enumerate.</param>
    /// <returns>
    /// A lazy sequence of the lines in the specified embedded resource. The resource is
    /// opened when enumeration starts, so a missing resource is reported (as a
    /// FileNotFoundException) on the first MoveNext rather than by this call.
    /// </returns>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static IEnumerable<string> EnumerateLines(string filename)
    {
        return EnumerateLinesFrom(Assembly.GetCallingAssembly(), filename);
    }

    /// <summary>
    /// Enumerates the lines of text in the specified embedded resource asynchronously.
    /// Each successive call returns a task that, when complete, returns the next line of text stored
    /// in the embedded resource.
    /// </summary>
    /// <remarks>
    /// Await each returned task before advancing the enumeration: the same reader is asked
    /// whether it has reached the end of the stream before every line is requested.
    /// </remarks>
    /// <param name="filename">The filename of the embedded resource to enumerate.</param>
    /// <returns>A lazy sequence of tasks, one per line of the specified embedded resource.</returns>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static IEnumerable<Task<string>> EnumerateLinesAsync(string filename)
    {
        return EnumerateLineTasksFrom(Assembly.GetCallingAssembly(), filename);
    }

    /// <summary>
    /// Gets all the lines of text in the specified embedded resource.
    /// You might find EnumerateLines(string filename) more useful depending on your situation.
    /// </summary>
    /// <param name="filename">The filename to obtain the lines of text from.</param>
    /// <returns>A list of lines in the specified embedded resource.</returns>
    /// <exception cref="FileNotFoundException">No resource with that name is embedded in the calling assembly.</exception>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static List<string> GetAllLines(string filename)
    {
        // Read synchronously instead of blocking on the async variant's Result.
        return new List<string>(EnumerateLinesFrom(Assembly.GetCallingAssembly(), filename));
    }

    /// <summary>
    /// Gets all the lines of text in the specified embedded resource asynchronously.
    /// </summary>
    /// <param name="filename">The filename to obtain the lines of text from.</param>
    /// <returns>
    /// A task producing a list of lines in the specified embedded resource.
    /// The task faults with a FileNotFoundException if the resource does not exist.
    /// </returns>
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static Task<List<string>> GetAllLinesAsync(string filename)
    {
        return ReadLinesFromAsync(Assembly.GetCallingAssembly(), filename);
    }

    // Formats the header line plus one " - name" line per resource embedded in `source`.
    private static string FormatResourceList(Assembly source)
    {
        using (StringWriter result = new StringWriter())
        {
            result.WriteLine("Files embedded in {0}:", source.GetName().Name);
            foreach (string filename in source.GetManifestResourceNames())
            {
                result.WriteLine(" - {0}", filename);
            }
            return result.ToString();
        }
    }

    // Opens the named resource in `source`, turning the "null means missing" convention
    // of GetManifestResourceStream into a descriptive FileNotFoundException.
    private static Stream OpenResource(Assembly source, string filename)
    {
        Stream resourceStream = source.GetManifestResourceStream(filename);
        if (resourceStream == null)
        {
            throw new FileNotFoundException(
                string.Format(
                    "No embedded resource named '{0}' was found in assembly '{1}'.",
                    filename,
                    source.GetName().Name
                ),
                filename
            );
        }
        return resourceStream;
    }

    // Asynchronously copies the named resource of `source` into a byte array.
    private static async Task<byte[]> ReadBytesFromAsync(Assembly source, string filename)
    {
        using (Stream resourceStream = OpenResource(source, filename))
        using (MemoryStream temp = new MemoryStream())
        {
            await resourceStream.CopyToAsync(temp).ConfigureAwait(false);
            return temp.ToArray();
        }
    }

    // Asynchronously reads the named resource of `source` as a single string.
    private static async Task<string> ReadTextFromAsync(Assembly source, string filename)
    {
        using (StreamReader resourceReader = new StreamReader(OpenResource(source, filename)))
        {
            return await resourceReader.ReadToEndAsync().ConfigureAwait(false);
        }
    }

    // Lazily yields the lines of the named resource of `source`; the reader is
    // disposed when enumeration finishes or the enumerator is disposed.
    private static IEnumerable<string> EnumerateLinesFrom(Assembly source, string filename)
    {
        using (StreamReader resourceReader = new StreamReader(OpenResource(source, filename)))
        {
            string nextLine;
            while ((nextLine = resourceReader.ReadLine()) != null)
            {
                yield return nextLine;
            }
        }
    }

    // Lazily yields one pending ReadLineAsync task per line of the named resource of `source`.
    private static IEnumerable<Task<string>> EnumerateLineTasksFrom(Assembly source, string filename)
    {
        using (StreamReader resourceReader = new StreamReader(OpenResource(source, filename)))
        {
            while (!resourceReader.EndOfStream)
            {
                yield return resourceReader.ReadLineAsync();
            }
        }
    }

    // Asynchronously collects every line of the named resource of `source` into a list.
    private static async Task<List<string>> ReadLinesFromAsync(Assembly source, string filename)
    {
        List<string> lines = new List<string>();
        using (StreamReader resourceReader = new StreamReader(OpenResource(source, filename)))
        {
            string nextLine;
            while ((nextLine = await resourceReader.ReadLineAsync().ConfigureAwait(false)) != null)
            {
                lines.Add(nextLine);
            }
        }
        return lines;
    }
}
}

View File

@ -0,0 +1,18 @@
using System;
namespace SearchBox.Utilities
{
/// <summary>
/// Extra extension methods for strings.
/// </summary>
public static class StringPlus
{
    /// <summary>
    /// Replaces several characters in one call. The replacements are applied one after
    /// another, in array order, each to the result of the previous one: the character
    /// find[i] is replaced by replace[i]. If <paramref name="replace"/> is shorter than
    /// <paramref name="find"/>, its last element is used for every remaining character.
    /// </summary>
    /// <param name="str">The string to operate on.</param>
    /// <param name="find">The characters to search for. If empty, str is returned unchanged.</param>
    /// <param name="replace">The replacement characters; needs at least one element when find is non-empty.</param>
    /// <returns>The string with all the replacements applied.</returns>
    /// <exception cref="ArgumentNullException">find is null, or find is non-empty and str or replace is null.</exception>
    /// <exception cref="ArgumentException">find is non-empty but replace is empty.</exception>
    public static string ReplaceMultiple(this string str, char[] find, char[] replace)
    {
        if (find == null)
        {
            throw new ArgumentNullException("find");
        }
        if (find.Length == 0)
        {
            // Nothing to look for: hand the input back untouched (even if it is null).
            return str;
        }
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }
        if (replace == null)
        {
            throw new ArgumentNullException("replace");
        }
        if (replace.Length == 0)
        {
            // Without this guard the fallback lookup below would index replace[-1].
            throw new ArgumentException("At least one replacement character is required.", "replace");
        }

        // Used for every find[i] that has no counterpart in replace.
        char fallback = replace[replace.Length - 1];
        for (int i = 0; i < find.Length; i++) {
            str = str.Replace(
                find[i],
                i < replace.Length ? replace[i] : fallback
            );
        }
        return str;
    }
}
}

View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="UnidecodeSharpFork" version="1.0.0" targetFramework="net47" />
</packages>