2018-09-02 15:36:37 +00:00
using System ;
2018-09-11 13:27:25 +00:00
using System.Collections.Concurrent ;
using System.Collections.Generic ;
2018-09-18 18:49:03 +00:00
using System.Linq ;
2018-09-22 16:12:46 +00:00
using System.Net ;
2018-09-25 19:25:05 +00:00
using System.Text.RegularExpressions ;
2018-09-18 18:49:03 +00:00
using System.Threading.Tasks ;
2018-09-11 13:47:24 +00:00
using Newtonsoft.Json ;
2018-09-22 17:48:33 +00:00
using UnidecodeSharpFork ;
2018-09-11 13:27:25 +00:00
namespace LibSearchBox
2018-09-02 15:36:37 +00:00
{
2018-09-22 16:12:46 +00:00
public class QuerySettings
{
	/// <summary>
	/// The weight added to a result's rank when a query term is found in the
	/// document's title.
	/// </summary>
	public int WeightTitleMatch = 100;
	/// <summary>
	/// The weight added to a result's rank when a query term is found in one
	/// of the document's tags.
	/// </summary>
	public int WeightTagMatch = 10;

	// NOTE: the explicit empty parameterless constructor was removed - the
	// compiler generates an identical one automatically.
}
public class ContextSettings
{
	/// <summary>
	/// How many characters of surrounding text to include on each side of a
	/// matched term.
	/// </summary>
	public int ContextCharacters = 75;
	/// <summary>
	/// The upper bound on the length of the generated context string.
	/// </summary>
	public int MaxLength = 250;
	/// <summary>
	/// The string placed between adjacent snippets in the generated context.
	/// </summary>
	public string Separator = " … ";
	/// <summary>
	/// If true, the generated context is emitted as HTML (with query terms
	/// highlighted); if false, as plain text.
	/// </summary>
	public bool Html = false;
}
2018-09-11 13:27:25 +00:00
/// <summary>
/// Thrown when a SearchBox operation fails - e.g. replacing or removing a
/// document's entries in the inverted index.
/// </summary>
public class SearchBoxException : Exception
{
	public SearchBoxException(string message) : base(message) { }
	// Added per standard .NET exception design (CA1032): lets callers attach
	// the causing exception without losing it.
	public SearchBoxException(string message, Exception innerException) : base(message, innerException) { }
}
2018-09-11 15:01:32 +00:00
/// <summary>
/// Maintains an inverted index over a collection of documents, answers
/// ranked queries against it, and generates context strings (snippets with
/// highlighted terms) for search results.
/// </summary>
[JsonObject(MemberSerialization.OptIn)]
public class SearchBox
{
	private IdMap idMap = new IdMap();

	/// <summary>
	/// The page id ↔ page name mapping, exposed for serialisation.
	/// Returns null when empty so Json.NET omits the property entirely;
	/// the setter likewise ignores null on deserialisation.
	/// </summary>
	[JsonProperty("ids")]
	public Dictionary<int, string> IdMap {
		get {
			Dictionary<int, string> result = idMap.MapOut;
			if (result.Count == 0) return null;
			return result;
		}
		set {
			if (value == null) return;
			idMap.Import(value);
		}
	}

	/// <summary>Per-document metadata (title and tags), keyed by page id.</summary>
	[JsonProperty]
	public ConcurrentDictionary<int, DocumentMeta> metaTable = new ConcurrentDictionary<int, DocumentMeta>();

	/// <summary>The inverted index mapping tokens to page ids and token offsets.</summary>
	[JsonProperty]
	public InvertedIndex index = new InvertedIndex();

	/// <summary>Whether to write diagnostic warnings to standard error.</summary>
	public bool Verbose { get; set; } = false;

	public SearchBox()
	{
	}

	#region Index Management

	/// <summary>
	/// Adds a document to the search index, creating (or replacing) its
	/// metadata entry and indexing its content.
	/// </summary>
	/// <param name="title">The document's title - also used to allocate its page id.</param>
	/// <param name="tags">The tags attached to the document.</param>
	/// <param name="content">The document's content to index.</param>
	public void AddDocument(string title, IEnumerable<string> tags, string content)
	{
		DocumentMeta info = new DocumentMeta(title, tags);
		int id = idMap.GetId(info.Title);
		metaTable.AddOrUpdate(id, info, (key, oldValue) => info);

		Index upsideIndex = new Index(content);
		index.AddIndex(id, upsideIndex);
	}

	/// <summary>
	/// Updates an already-indexed document's tags and content.
	/// </summary>
	/// <param name="title">The title of the document to update.</param>
	/// <param name="newTags">The replacement set of tags.</param>
	/// <param name="oldContent">The content as previously indexed - needed to compute what to remove.</param>
	/// <param name="newContent">The new content to index.</param>
	/// <exception cref="SearchBoxException">Thrown if the inverted index update fails.</exception>
	public void UpdateDocument(string title, IEnumerable<string> newTags, string oldContent, string newContent)
	{
		int id = idMap.GetId(title);
		DocumentMeta info = metaTable[id];
		info.ReplaceTags(newTags);
		Index oldIndex = new Index(oldContent), newIndex = new Index(newContent);
		if (!index.ReplaceIndex(id, oldIndex, newIndex))
			throw new SearchBoxException($"Error: Failed to replace index for document with title {title}.");
	}

	/// <summary>
	/// Removes a document from the id map, the metadata table, and the
	/// inverted index.
	/// </summary>
	/// <param name="title">The title of the document to remove.</param>
	/// <exception cref="SearchBoxException">Thrown if removal from the inverted index fails.</exception>
	public void RemoveDocument(string title)
	{
		int id = idMap.DeletePageName(title);
		metaTable.TryRemove(id, out DocumentMeta noop);
		if (!index.RemoveById(id))
			throw new SearchBoxException($"Failed to remove page with title '{title}' from inverted index.");
	}

	#endregion

	#region Query

	/// <summary>
	/// Searches the index for pages matching the given query and ranks them.
	/// A page's rank is the total number of token occurrences, plus
	/// <see cref="QuerySettings.WeightTitleMatch"/> per token found in the
	/// title and <see cref="QuerySettings.WeightTagMatch"/> per token found
	/// in a tag.
	/// </summary>
	/// <param name="query">The query string to search for.</param>
	/// <param name="settings">The weightings to apply while ranking.</param>
	/// <returns>The matching pages, sorted by descending rank.</returns>
	public List<SearchResult> Query(string query, QuerySettings settings)
	{
		// pageId => token -> count
		ConcurrentDictionary<int, ConcurrentDictionary<string, int>> matchingPages = new ConcurrentDictionary<int, ConcurrentDictionary<string, int>>();

		Tokenizer tokenizer = new Tokenizer(query);
		foreach ((int, string) token in tokenizer.IterateTokens())
		{
			ConcurrentDictionary<int, List<int>> tokenResults = index.Query(token.Item2);
			Parallel.ForEach(tokenResults, (KeyValuePair<int, List<int>> pageTokenDef) => {
				// pageTokenDef: pageId => List of token offsets
				ConcurrentDictionary<string, int> pageData = matchingPages.GetOrAdd(pageTokenDef.Key, new ConcurrentDictionary<string, int>());
				// NOTE(review): TryAdd also fails if the *query* contains the
				// same token twice (IterateTokens would yield it again) - the
				// throw below assumes tokens are unique per query. Verify
				// against the Tokenizer implementation.
				if (!pageData.TryAdd(token.Item2, pageTokenDef.Value.Count))
					throw new SearchBoxException("Error: Failed to add token count to page data in search " +
						"results - the key already exists (are there duplicate tokens for this page id " +
						"in the inverted index?)");
			});
		}

		ConcurrentBag<SearchResult> resultsRaw = new ConcurrentBag<SearchResult>();
		Parallel.ForEach(matchingPages, (KeyValuePair<int, ConcurrentDictionary<string, int>> pageDef) =>
		{
			int rank = pageDef.Value.Values.Sum(); // FUTURE: Linq is slow. Is there a faster way of doing this?

			if (!metaTable.TryGetValue(pageDef.Key, out DocumentMeta metaInfo))
			{
				// Rank the page on token counts alone if its metadata is missing.
				if (Verbose) Console.Error.WriteLine($"Warning: Failed to fetch meta info for page id {pageDef.Key}");
			}
			else
			{
				foreach ((int, string) token in tokenizer.IterateTokens())
				{
					if (metaInfo.SearchableTitle.Contains(token.Item2))
						rank += settings.WeightTitleMatch; // FUTURE: Alter the amount added by the length of the word matched
					foreach (string nextTag in metaInfo.SearchableTags)
						if (nextTag.Contains(token.Item2))
							rank += settings.WeightTagMatch;
				}
			}

			List<SearchOffset> offsets = getPageOffsets(pageDef.Key, tokenizer);

			resultsRaw.Add(new SearchResult(
				idMap.GetPageName(pageDef.Key),
				rank,
				offsets
			));
		});

		List<SearchResult> results = new List<SearchResult>(resultsRaw);
		// Sort descending by rank. CompareTo is used instead of rounding the
		// difference to an int, which would treat ranks differing by less
		// than 0.5 as equal and could misorder results.
		results.Sort((SearchResult a, SearchResult b) => b.Rank.CompareTo(a.Rank));
		return results;
	}

	/// <summary>
	/// Collects the offsets at which each of the query's tokens occurs in
	/// the specified page, sorted by ascending offset.
	/// </summary>
	/// <param name="pageId">The id of the page to inspect.</param>
	/// <param name="tokenizer">The tokenizer holding the query's tokens.</param>
	/// <returns>The sorted list of (term, offset) pairs; empty if nothing matches.</returns>
	private List<SearchOffset> getPageOffsets(int pageId, Tokenizer tokenizer)
	{
		List<SearchOffset> offsets = new List<SearchOffset>();
		foreach ((int, string) token in tokenizer.IterateTokens())
		{
			ConcurrentDictionary<int, List<int>> tokenQuery = index.Query(token.Item2);
			// Don't bother if this page doesn't contain this token.
			// TryGetValue avoids the ContainsKey + indexer double lookup.
			if (tokenQuery == null || !tokenQuery.TryGetValue(pageId, out List<int> pageOffsets))
				continue;

			offsets.AddRange(pageOffsets.Select((int offset) => new SearchOffset(token.Item2, offset)));
		}
		offsets.Sort((SearchOffset x, SearchOffset y) => x.Offset.CompareTo(y.Offset));
		return offsets;
	}

	/// <summary>
	/// Generates a context string for a page: snippets of the source text
	/// surrounding each query-term match, joined by the configured separator.
	/// Overlapping snippets are merged, and generation stops once the
	/// configured maximum length is reached.
	/// </summary>
	/// <param name="pageName">The name of the page the source text belongs to.</param>
	/// <param name="source">The full text of the page.</param>
	/// <param name="query">The search query whose matches should be surfaced.</param>
	/// <param name="settings">Controls snippet sizing, the separator, and HTML output.</param>
	/// <returns>The generated context string.</returns>
	public string GenerateContext(string pageName, string source, string query, ContextSettings settings)
	{
		int pageId = idMap.GetId(pageName);
		Tokenizer tokenizer = new Tokenizer(query);
		List<SearchOffset> offsets = getPageOffsets(pageId, tokenizer);

		int currentLength = 0;
		List<(int, int)> snippets = new List<(int, int)>(); // from, to
		for (int i = 0; i < offsets.Count; i++)
		{
			// Don't go over the maximum length
			// FUTURE: Would it be faster to keep track of this as we go? It's probably not worth it though, as we're not going to be generate *that* many at once - we'll have to see.
			if (currentLength > settings.MaxLength)
				break;

			// Generate the next snippet, clamped to the bounds of the source text
			(int, int) nextSnippet = (
				Math.Max(0, offsets[i].Offset - settings.ContextCharacters),
				Math.Min(source.Length, offsets[i].Offset + offsets[i].Term.Length + settings.ContextCharacters)
			);

			// If the next snippet overlaps with the previous one, then combine the 2
			if (snippets.Count > 0 && snippets.Last().Item2 > nextSnippet.Item1) {
				// BUG: This *might* exceed the MaxLength a bit
				// Pop the last snippet from the list
				(int, int) lastSnippet = snippets[snippets.Count - 1]; snippets.RemoveAt(snippets.Count - 1);
				currentLength += nextSnippet.Item2 - lastSnippet.Item2;
				lastSnippet.Item2 = nextSnippet.Item2; // Extend it to cover the new match
				snippets.Add(lastSnippet); // Push it back on again
				continue;
			}

			// No overlap! Add it to the list
			snippets.Add(nextSnippet);
			currentLength += nextSnippet.Item2 - nextSnippet.Item1;
		}

		List<string> snippetsText = new List<string>(
			snippets.Select(((int, int) snippet) => {
				string result = source.Substring(snippet.Item1, snippet.Item2 - snippet.Item1);
				if (settings.Html) {
					result = WebUtility.HtmlEncode(result);
					foreach ((int, string) nextToken in tokenizer.IterateTokens()) {
						// "$0" substitutes the matched text itself, preserving the
						// source document's casing (interpolating the token here
						// would rewrite e.g. "Cat" as "cat", since the match is
						// case-insensitive) and avoiding misinterpretation of any
						// '$' characters in the token.
						result = Regex.Replace(
							result,
							Regex.Escape(nextToken.Item2),
							"<strong class='query-term'>$0</strong>",
							RegexOptions.IgnoreCase // Also ignores accents, apparently
						);
					}
				}
				return result;
			})
			.Where((string snippet) => !string.IsNullOrWhiteSpace(snippet))
		);

		// Add the separator at the beginning and end if we aren't at the bounds of the source document
		if (snippets.Count > 0 && snippets.First().Item1 > 0)
			snippetsText.Insert(0, "");
		if (snippets.Count > 0 && snippets.Last().Item2 < source.Length)
			snippetsText.Add("");

		return string.Join(
			settings.Separator,
			snippetsText
		);
	}

	#endregion
}
}