
Finish the initial HTML implementation.

Author: Starbeamrainbowlabs (2019-07-28 17:24:21 +01:00)
parent 59a0289b3a
commit 14fca32a5e
Signed by: sbrl
GPG key ID: 1BE5172E637709C2
12 changed files with 746 additions and 14 deletions

FeedBuilder.cs

@@ -1,9 +1,13 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Xml;
using Fizzler.Systems.HtmlAgilityPack;
using HtmlAgilityPack;
using Microsoft.SyndicationFeed;
using Microsoft.SyndicationFeed.Atom;
namespace PolyFeed
@@ -22,9 +26,68 @@ namespace PolyFeed
public async Task AddSource(FeedSource source) {
WebResponse response = await WebRequest.Create(source.Url).GetResponseAsync();
using StreamReader reader = new StreamReader(response.GetResponseStream());
// Write the header
await feed.WriteGenerator("Polyfeed", "https://gitlab.com/sbrl/PolyFeed.git", Program.getProgramVersion());
await feed.WriteId(source.Url);
string lastModified = response.Headers.Get("last-modified");
if (string.IsNullOrWhiteSpace(lastModified))
await feed.WriteUpdated(DateTimeOffset.Now);
else
await feed.WriteUpdated(DateTimeOffset.Parse(lastModified));
string contentType = response.Headers.Get("content-type");
switch (source.SourceType) {
case SourceType.HTML:
await AddSourceHtml(source, response);
break;
default:
throw new NotImplementedException($"Error: The source type {source.SourceType} hasn't been implemented yet.");
}
}
private async Task AddSourceHtml(FeedSource source, WebResponse response) {
HtmlDocument html = new HtmlDocument();
using (StreamReader reader = new StreamReader(response.GetResponseStream()))
html.LoadHtml(await reader.ReadToEndAsync());
HtmlNode document = html.DocumentNode;
await feed.WriteTitle(ReferenceSubstitutor.Replace(source.Title, document));
await feed.WriteSubtitle(ReferenceSubstitutor.Replace(source.Subtitle, document));
foreach (HtmlNode nextNode in document.QuerySelectorAll(source.EntrySelector)) {
HtmlNode urlNode = nextNode.QuerySelector(source.EntryUrlSelector);
string url = source.EntryUrlAttribute == string.Empty ?
urlNode.InnerText : urlNode.Attributes[source.EntryUrlAttribute].DeEntitizeValue;
SyndicationItem nextItem = new SyndicationItem() {
Id = url,
Title = ReferenceSubstitutor.Replace(source.EntryTitle, nextNode),
Description = ReferenceSubstitutor.Replace(source.EntryContent, nextNode)
};
if (source.EntryPublishedSelector != string.Empty) {
HtmlNode publishedNode = nextNode.QuerySelector(source.EntryPublishedSelector);
nextItem.Published = DateTime.Parse(
source.EntryPublishedAttribute == string.Empty
? publishedNode.InnerText
: publishedNode.Attributes[source.EntryPublishedAttribute].DeEntitizeValue
);
}
if (source.EntryLastUpdatedSelector != string.Empty) {
HtmlNode lastUpdatedNode = nextNode.QuerySelector(source.EntryLastUpdatedSelector);
nextItem.LastUpdated = DateTime.Parse(
source.EntryLastUpdatedAttribute == string.Empty
? lastUpdatedNode.InnerText
: lastUpdatedNode.Attributes[source.EntryLastUpdatedAttribute].DeEntitizeValue
);
}
}
}
}
}
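
To make the flow concrete, here is a minimal sketch of a call site. FeedBuilder's constructor and output plumbing are not part of this diff, so the instantiation below is an assumption; only AddSource and the FeedSource properties appear in this commit.

// Hypothetical call site (assumes a parameterless constructor, which this
// diff doesn't show). Must run inside an async method.
FeedBuilder builder = new FeedBuilder();
await builder.AddSource(new FeedSource() {
    Url = "https://example.com/blog/",
    SourceType = SourceType.HTML,
    Title = "Example Blog",
    EntrySelector = "main > article",
    EntryUrlSelector = "h2 a",
    EntryUrlAttribute = "href"
});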

FeedSource.cs

@@ -1,4 +1,5 @@
using System;
namespace PolyFeed
{
public enum SourceType { HTML, XML, JSON };
@@ -10,31 +11,43 @@ namespace PolyFeed
/// </summary>
/// <value>The URL.</value>
public string Url { get; set; }
/// <summary>
/// The title of the feed.
/// </summary>
public string Title { get; set; }
/// <summary>
/// The type of source document to expect.
/// </summary>
public SourceType SourceType { get; set; }
/// <summary>
/// The title of the feed.
/// Supports the same {} syntax as <see cref="EntryTitle" />.
/// </summary>
public string Title { get; set; }
/// <summary>
/// The subtitle of the feed.
/// Supports the same {} syntax as <see cref="EntryTitle" />.
/// </summary>
/// <value>The subtitle.</value>
public string Subtitle { get; set; }
#region Entries
/// <summary>
/// A selector that matches against an element that contains the URL that an
/// entry should link to.
/// Relative to the element selected by <see cref="EntrySelector" />.
/// </summary>
public string UrlSelector { get; set; }
public string EntryUrlSelector { get; set; }
/// <summary>
/// The name of the attribute on the element selected by <see cref="UrlSelector" />.
/// The name of the attribute on the element selected by <see cref="EntryUrlSelector" />.
/// Set to an empty string to select the content of the element instead of the
/// content of an attribute.
/// </summary>
public string UrlElementAttribute { get; set; } = "";
public string EntryUrlAttribute { get; set; } = "";
/// <summary>
/// The selector that specifies the location in the object model of nodes that should
/// be added to the feed.
/// The selector that specifies the location of nodes in the object model that
/// should be added to the feed.
/// The format varies depending on the <see cref="SourceType" />.
/// - HTML: CSS selector (e.g. main > article)
/// - XML: XPath (e.g. //element_name)
@@ -52,5 +65,29 @@ namespace PolyFeed
/// Same as <see cref="EntryTitle" />, but for the body of an entry. HTML is allowed.
/// </summary>
public string EntryContent { get; set; }
/// <summary>
/// The selector for the node that contains the date published for an entry.
/// </summary>
public string EntryPublishedSelector { get; set; }
/// <summary>
/// The name of the attribute that contains the date published for an entry.
/// Set to <see cref="string.Empty" /> to use the content of the node itself.
/// </summary>
public string EntryPublishedAttribute { get; set; }
/// <summary>
/// Same as <see cref="EntryPublishedSelector" />, but for the last updated.
/// If not specified, the last updated will be omitted.
/// </summary>
public string EntryLastUpdatedSelector { get; set; }
/// <summary>
/// Same as <see cref="EntryPublishedAttribute" />.
/// </summary>
public string EntryLastUpdatedAttribute { get; set; }
#endregion
}
}
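
For illustration, a fully-populated HTML source definition might look like the sketch below. All selector values are invented for the example; the {} syntax is the substitution syntax referenced in the doc comments above.

// A hypothetical HTML source definition exercising every Entry* property.
FeedSource source = new FeedSource() {
    Url = "https://example.com/blog/",
    SourceType = SourceType.HTML,
    Title = "Example Blog: {header h1}",   // {} substitutes the matched node
    Subtitle = "{header p.tagline}",
    EntrySelector = "main > article",      // one feed entry per <article>
    EntryUrlSelector = "h2 a",
    EntryUrlAttribute = "href",            // "" would use the link text instead
    EntryTitle = "{h2 a}",
    EntryContent = "{div.post-body}",
    EntryPublishedSelector = "time.published",
    EntryPublishedAttribute = "datetime",
    EntryLastUpdatedSelector = "time.updated",
    EntryLastUpdatedAttribute = "datetime"
};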

PolyFeed.csproj

@@ -34,7 +34,7 @@
<HintPath>..\packages\Fizzler.1.2.0\lib\netstandard2.0\Fizzler.dll</HintPath>
</Reference>
<Reference Include="HtmlAgilityPack">
<HintPath>..\packages\HtmlAgilityPack.1.11.9\lib\Net45\HtmlAgilityPack.dll</HintPath>
<HintPath>..\packages\HtmlAgilityPack.1.11.12\lib\Net45\HtmlAgilityPack.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Win32.Primitives">
<HintPath>..\packages\Microsoft.Win32.Primitives.4.3.0\lib\net46\Microsoft.Win32.Primitives.dll</HintPath>
@@ -136,10 +136,20 @@
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="FeedBuilder.cs" />
<Compile Include="FeedSource.cs" />
<Compile Include="Salamander.Core\Lexer.cs" />
<Compile Include="Salamander.Core\LexerRule.cs" />
<Compile Include="Salamander.Core\LexerToken.cs" />
<Compile Include="Salamander.Core\Ansi.cs" />
<Compile Include="SubstitutionLexer.cs" />
<Compile Include="Salamander.Core\LexerPool.cs" />
<Compile Include="ReferenceSubstitutor.cs" />
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
<ItemGroup>
<Folder Include="Salamander.Core\" />
</ItemGroup>
<Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
<Import Project="..\packages\NETStandard.Library.2.0.3\build\netstandard2.0\NETStandard.Library.targets" Condition="Exists('..\packages\NETStandard.Library.2.0.3\build\netstandard2.0\NETStandard.Library.targets')" />
</Project>

Program.cs

@@ -3,7 +3,7 @@ using System.Collections.Generic;
using System.IO;
using System.Reflection;
namespace ProjectNamespace
namespace PolyFeed
{
internal class Settings
{
@@ -67,7 +67,7 @@ namespace ProjectNamespace
#region Helper Methods
private static string getProgramVersion()
public static string getProgramVersion()
{
Version version = Assembly.GetExecutingAssembly().GetName().Version;
return $"{version.Major}.{version.Minor}";

ReferenceSubstitutor.cs

@@ -0,0 +1,44 @@
using System;
using System.Text;
using Fizzler.Systems.HtmlAgilityPack;
using HtmlAgilityPack;
using Salamander.Core.Lexer;
namespace PolyFeed
{
internal static class ReferenceSubstitutor {
private static LexerPool<SubstitutionLexer, SubstitutionToken> lexerPool = new LexerPool<SubstitutionLexer, SubstitutionToken>();
public static string Replace(string inputString, HtmlNode rootElement)
{
StringBuilder result = new StringBuilder();
SubstitutionLexer lexer = lexerPool.AcquireLexer();
lexer.Initialise(inputString);
foreach (LexerToken<SubstitutionToken> nextToken in lexer.TokenStream())
{
switch (nextToken.Type) {
case SubstitutionToken.BraceOpen:
lexer.SaveRuleStates();
lexer.EnableRule(SubstitutionToken.Identifier);
lexer.DisableRule(SubstitutionToken.Text);
break;
case SubstitutionToken.BraceClose:
lexer.RestoreRuleStates();
break;
case SubstitutionToken.Text:
result.Append(nextToken.Value);
break;
case SubstitutionToken.Identifier:
result.Append(rootElement.QuerySelector(nextToken.Value));
break;
}
}
lexerPool.ReleaseLexer(lexer);
return result.ToString();
}
}
}
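
A short sketch of the substitutor in action, assuming the {} syntax described in FeedSource.cs: text outside braces passes through unchanged, while each braced CSS selector is resolved with QuerySelector against the given root node. The HTML here is invented for the example.

// Minimal usage sketch.
HtmlDocument html = new HtmlDocument();
html.LoadHtml("<html><body><h1>Hello world</h1></body></html>");

// "Title: " is emitted as Text; "h1" is lexed as an Identifier between the
// braces and looked up in the document.
string result = ReferenceSubstitutor.Replace("Title: {h1}", html.DocumentNode);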

Salamander.Core\Ansi.cs

@@ -0,0 +1,49 @@
using System;
namespace Salamander.Core.Helpers
{
public static class Ansi
{
/// <summary>
/// Whether we should *actually* emit ANSI escape codes or not.
/// Useful when we want to output to a log file, for example.
/// </summary>
public static bool Enabled { get; set; } = true;
// Solution on how to output ANSI escape codes in C# from here:
// https://www.jerriepelser.com/blog/using-ansi-color-codes-in-net-console-apps
public static string Reset => Enabled ? "\u001b[0m" : "";
public static string HiCol => Enabled ? "\u001b[1m" : "";
public static string Underline => Enabled ? "\u001b[4m" : "";
public static string Inverse => Enabled ? "\u001b[7m" : "";
public static string FBlack => Enabled ? "\u001b[30m" : "";
public static string FRed => Enabled ? "\u001b[31m" : "";
public static string FGreen => Enabled ? "\u001b[32m" : "";
public static string FYellow => Enabled ? "\u001b[33m" : "";
public static string FBlue => Enabled ? "\u001b[34m" : "";
public static string FMagenta => Enabled ? "\u001b[35m" : "";
public static string FCyan => Enabled ? "\u001b[36m" : "";
public static string FWhite => Enabled ? "\u001b[37m" : "";
public static string BBlack => Enabled ? "\u001b[40m" : "";
public static string BRed => Enabled ? "\u001b[41m" : "";
public static string BGreen => Enabled ? "\u001b[42m" : "";
public static string BYellow => Enabled ? "\u001b[43m" : "";
public static string BBlue => Enabled ? "\u001b[44m" : "";
public static string BMagenta => Enabled ? "\u001b[45m" : "";
public static string BCyan => Enabled ? "\u001b[46m" : "";
public static string BWhite => Enabled ? "\u001b[47m" : "";
// Thanks to http://ascii-table.com/ansi-escape-sequences.php for the following ANSI escape sequences
public static string Up(int lines = 1) => Enabled ? $"\u001b[{lines}A" : "";
public static string Down(int lines = 1) => Enabled ? $"\u001b[{lines}B" : "";
public static string Right(int lines = 1) => Enabled ? $"\u001b[{lines}C" : "";
public static string Left(int lines = 1) => Enabled ? $"\u001b[{lines}D" : "";
//public static string JumpTo(Vector2 pos) => Enabled ? $"\u001b[{pos.Y};{pos.X}H" : "";
public static string CursorPosSave => Enabled ? $"\u001b[s" : "";
public static string CursorPosRestore => Enabled ? $"\u001b[u" : "";
}
}
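
Usage is a matter of interpolating the properties into console output. A small sketch, with the Enabled toggle driven by whether stdout is a real terminal:

// Disable escape codes when output is piped to a file or another program.
Ansi.Enabled = !Console.IsOutputRedirected;

Console.WriteLine($"{Ansi.FGreen}[ok]{Ansi.Reset} Feed generated");
Console.WriteLine($"{Ansi.FRed}{Ansi.HiCol}[error]{Ansi.Reset} Download failed");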

Salamander.Core\Lexer.cs

@@ -0,0 +1,328 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using Salamander.Core.Helpers;
namespace Salamander.Core.Lexer
{
public class Lexer<TokenType>
{
/// <summary>
/// The rules that should be used during the lexing process.
/// </summary>
public List<LexerRule<TokenType>> Rules { get; private set; } = new List<LexerRule<TokenType>>();
/// <summary>
/// Tokens in this list will be matched against, but not emitted by the lexer
/// into the main token stream.
/// Useful for catching and disposing of sequences of characters that you don't
/// want escaping into your parser and breaking it.
/// </summary>
public List<TokenType> IgnoreTokens { get; private set; } = new List<TokenType>();
/// <summary>
/// Whether the lexer should be verbose and log a bunch of debugging information
/// to the console.
/// </summary>
public bool Verbose { get; set; } = false;
/// <summary>
/// The number of the line that is currently being scanned.
/// </summary>
public int CurrentLineNumber { get; private set; } = 0;
/// <summary>
/// The number of characters on the current line that have been scanned.
/// </summary>
/// <value>The current line position.</value>
public int CurrentLinePos { get; private set; } = 0;
/// <summary>
/// The total number of characters currently scanned by this lexer instance.
/// Only updated every newline!
/// </summary>
public int TotalCharsScanned { get; private set; } = 0;
/// <summary>
/// The internal stream that we should read from when lexing.
/// </summary>
private StreamReader textStream;
/// <summary>
/// A stack of rule states.
/// Whether rules are enabled or disabled can be recursively saved and restored -
/// this <see cref="Stack{T}" /> is how the lexer saves this information.
/// </summary>
private Stack<Dictionary<LexerRule<TokenType>, bool>> EnabledStateStack = new Stack<Dictionary<LexerRule<TokenType>, bool>>();
/// <summary>
/// Creates a new <see cref="Lexer{TokenType}" />, optionally containing the given
/// <see cref="LexerRule{TokenType}" /> instances.
/// </summary>
/// <param name="initialRules">The rules to add to the new <see cref="Lexer{TokenType}" />.</param>
public Lexer(params LexerRule<TokenType>[] initialRules)
{
AddRules(initialRules);
}
/// <summary>
/// Adds a single lexing rule to the <see cref="Lexer{TokenType}" />.
/// </summary>
/// <param name="newRule">The rule to add.</param>
public void AddRule(LexerRule<TokenType> newRule)
=> Rules.Add(newRule);
/// <summary>
/// Adds a bunch of lexing rules to the <see cref="Lexer{TokenType}" />.
/// </summary>
/// <param name="newRules">The rules to add.</param>
public void AddRules(IEnumerable<LexerRule<TokenType>> newRules)
=> Rules.AddRange(newRules);
/// <summary>
/// Reinitialises the parser with a new input stream.
/// </summary>
/// <remarks>
/// Child classes should override this method to do their own state initialisation,
/// as lexers MAY be re-used on multiple input streams.
/// Implementors must be careful not to forget to call this base method though.
/// </remarks>
/// <param name="reader">The <see cref="StreamReader"/> to use as the new input stream..</param>
public virtual void Initialise(StreamReader reader)
{
// Reset the counters
CurrentLineNumber = 0;
CurrentLinePos = 0;
TotalCharsScanned = 0;
// Reset the state stack
EnabledStateStack.Clear();
// Re-enable all rules
EnableAllRules();
textStream = reader;
}
public void Initialise(string input)
{
MemoryStream stream = new MemoryStream(Encoding.UTF8.GetBytes(input));
Initialise(new StreamReader(stream));
}
/// <summary>
/// Performs the lexing process itself in an incremental manner.
/// Note that a single Lexer may only perform one lexing pass at a time - even
/// if it's over the same document multiple times.
/// </summary>
/// <returns>A stream of lexical tokens.</returns>
public IEnumerable<LexerToken<TokenType>> TokenStream()
{
string nextLine;
List<LexerToken<TokenType>> matches = new List<LexerToken<TokenType>>();
while ((nextLine = textStream.ReadLine()) != null)
{
CurrentLinePos = 0;
while (CurrentLinePos < nextLine.Length)
{
matches.Clear();
foreach (LexerRule<TokenType> rule in Rules)
{
if (!rule.Enabled) continue;
Match nextMatch = rule.RegEx.Match(nextLine, CurrentLinePos);
if (!nextMatch.Success) continue;
matches.Add(
new LexerToken<TokenType>(rule, nextMatch)
{
LineNumber = CurrentLineNumber,
ColumnNumber = nextMatch.Index
}
);
}
if (matches.Count == 0)
{
string unknownTokenContent = nextLine.Substring(CurrentLinePos);
if (Verbose) Console.WriteLine($"{Ansi.FRed}[Unknown Token: No matches found for this line]{Ansi.Reset} {0}", unknownTokenContent);
yield return new LexerToken<TokenType>(unknownTokenContent)
{
LineNumber = CurrentLineNumber,
ColumnNumber = CurrentLinePos
};
break;
}
matches.Sort((LexerToken<TokenType> a, LexerToken<TokenType> b) => {
// Sort by match start position first
int result = a.ColumnNumber - b.ColumnNumber;
// If they both start at the same position, then go with highest priority one
if (result == 0)
result = b.Rule.Priority - a.Rule.Priority;
// Failing that, try the longest one
if (result == 0)
result = b.RegexMatch.Length - a.RegexMatch.Length;
return result;
});
LexerToken<TokenType> selectedToken = matches[0];
int selectedTokenOffset = nextLine.IndexOf(selectedToken.RegexMatch.Value, CurrentLinePos) - CurrentLinePos;
if (selectedTokenOffset > 0)
{
string extraTokenContent = nextLine.Substring(CurrentLinePos, selectedTokenOffset);
int unmatchedLinePos = CurrentLinePos;
CurrentLinePos += selectedTokenOffset;
if (Verbose) Console.WriteLine($"{Ansi.FRed}[Unmatched content]{Ansi.Reset} '{extraTokenContent}'");
// Return an unknown token, but only if we're not meant to be ignoring them
if (!IgnoreTokens.Contains((TokenType)Enum.ToObject(typeof(TokenType), 0)))
{
yield return new LexerToken<TokenType>(extraTokenContent)
{
LineNumber = CurrentLineNumber,
ColumnNumber = unmatchedLinePos
};
}
}
CurrentLinePos += selectedToken.RegexMatch.Length;
if (Verbose) Console.WriteLine($"{(IgnoreTokens.Contains(selectedToken.Type) ? Ansi.FBlack : Ansi.FGreen)}{selectedToken}{Ansi.Reset}");
// Yield the token, but only if we aren't supposed to be ignoring it
if (IgnoreTokens.Contains(selectedToken.Type))
continue;
yield return selectedToken;
}
if (Verbose) Console.WriteLine($"{Ansi.FBlue}[Lexer]{Ansi.Reset} Next line");
CurrentLineNumber++;
TotalCharsScanned += CurrentLinePos;
}
}
#region Rule Management
/// <summary>
/// Enables all <see cref="LexerRule{TokenType}" />s currently registered against
/// this Lexer.
/// </summary>
public void EnableAllRules() => EnableRulesByPrefix("");
/// <summary>
/// Disables all <see cref="LexerRule{TokenType}" />s currently registered against
/// this Lexer.
/// </summary>
public void DisableAllRules() => DisableRulesByPrefix("");
/// <summary>
/// Enables the rule that matches against the given <see cref="TokenType" />.
/// </summary>
/// <param name="type">The token type to use to find the rule to enable.</param>
public void EnableRule(TokenType type) => SetRule(type, true);
/// <summary>
/// Disables the rule that matches against the given <see cref="TokenType" />.
/// </summary>
/// <param name="type">The token type to use to find the rule to disable.</param>
public void DisableRule(TokenType type) => SetRule(type, false);
/// <summary>
/// Sets the enabled status of the rule that matches against the given
/// <see cref="TokenType" /> to the given state.
/// </summary>
/// <param name="type">The <see cref="TokenType" /> to use to find the rule to
/// sets the enabled state of.</param>
/// <param name="state">Whether to enable or disable the rule. <see langword="true"/> = enable it, <see langword="false"/> = disable it.</param>
public void SetRule(TokenType type, bool state)
{
foreach (LexerRule<TokenType> rule in Rules)
{
// We have to do a string comparison here because of the generic type we're using in multiple nested
// classes
if (Enum.GetName(rule.Type.GetType(), rule.Type) == Enum.GetName(type.GetType(), type))
{
rule.Enabled = state;
return;
}
}
}
/// <summary>
/// Toggles the enabled status of multiple rules by finding rules that generate
/// tokens whose name begins with a specific substring.
/// </summary>
/// <param name="tokenTypePrefix">The prefix to use when finding rules to toggle.</param>
public void ToggleRulesByPrefix(string tokenTypePrefix)
{
foreach (LexerRule<TokenType> rule in Rules)
{
// We have to do a string comparison here because of the generic type we're using in multiple nested
// classes
if (Enum.GetName(rule.Type.GetType(), rule.Type).StartsWith(tokenTypePrefix, StringComparison.CurrentCulture))
rule.Enabled = !rule.Enabled;
}
}
/// <summary>
/// Enables multiple rules by finding rules that generate
/// tokens whose name begins with a specific substring.
/// </summary>
/// <param name="tokenTypePrefix">The prefix to use when finding rules to enable.</param>
public void EnableRulesByPrefix(string tokenTypePrefix)
=> SetRulesByPrefix(tokenTypePrefix, true);
/// <summary>
/// Disables multiple rules by finding rules that generate
/// tokens whose name begins with a specific substring.
/// </summary>
/// <param name="tokenTypePrefix">The prefix to use when finding rules to disable.</param>
public void DisableRulesByPrefix(string tokenTypePrefix)
=> SetRulesByPrefix(tokenTypePrefix, false);
/// <summary>
/// Set the enabled status of multiple rules by finding rules that generate
/// tokens whose name begins with a specific substring.
/// </summary>
/// <param name="tokenTypePrefix">The prefix to use when finding rules to set the
/// status of.</param>
public void SetRulesByPrefix(string tokenTypePrefix, bool state)
{
foreach (LexerRule<TokenType> rule in Rules)
{
// We have to do a string comparison here because of the generic type we're using in multiple nested
// classes
if (Enum.GetName(rule.Type.GetType(), rule.Type).StartsWith(tokenTypePrefix, StringComparison.CurrentCulture))
{
//if(Verbose) Console.WriteLine($"{Ansi.FBlue}[Lexer/Rules] {Ansi.FCyan}Setting {rule.Type} to {state}");
rule.Enabled = state;
}
}
}
/// <summary>
/// Saves the current rule states (i.e. whether they are enabled or not) as a snapshot to an
/// internal stack.
/// </summary>
public void SaveRuleStates()
{
Dictionary<LexerRule<TokenType>, bool> states = new Dictionary<LexerRule<TokenType>, bool>();
foreach (LexerRule<TokenType> nextRule in Rules)
states[nextRule] = nextRule.Enabled;
EnabledStateStack.Push(states);
}
/// <summary>
/// Restores the top-most rule states snapshot from the internal stack.
/// </summary>
/// <exception cref="InvalidOperationException">Thrown if there aren't any states left on the stack to restore.</exception>
public void RestoreRuleStates()
{
if (EnabledStateStack.Count < 1)
throw new InvalidOperationException("Error: Can't restore the lexer rule states when no states have been saved!");
Dictionary<LexerRule<TokenType>, bool> states = EnabledStateStack.Pop();
foreach (KeyValuePair<LexerRule<TokenType>, bool> nextRulePair in states)
nextRulePair.Key.Enabled = nextRulePair.Value;
}
#endregion
}
}
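
A minimal end-to-end sketch of the lexer. The MiniToken enum is invented for the example; note that enum value 0 is what unmatched content is reported as, which is why it pairs with IgnoreTokens here.

internal enum MiniToken { Unknown = 0, Word, Number }

Lexer<MiniToken> lexer = new Lexer<MiniToken>(
    new LexerRule<MiniToken>(MiniToken.Word, @"[a-zA-Z]+"),
    new LexerRule<MiniToken>(MiniToken.Number, @"[0-9]+")
);
lexer.IgnoreTokens.Add(MiniToken.Unknown);  // swallow unmatched gaps such as spaces
lexer.Initialise("route 66");

foreach (LexerToken<MiniToken> token in lexer.TokenStream())
    Console.WriteLine(token);
// [LexerToken @ 0:0 Type=Word, Value=route]
// [LexerToken @ 0:6 Type=Number, Value=66]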

Salamander.Core\LexerPool.cs

@@ -0,0 +1,34 @@
using System;
using System.Collections.Generic;
namespace Salamander.Core.Lexer
{
/// <summary>
/// Represents a pool of reusable <see cref="Lexer{TokenType}"/>s.
/// Useful to avoid memory churn when lexing lots of different input streams.
/// </summary>
public class LexerPool<T, E> where T : Lexer<E>, new()
{
private List<T> freeLexers = new List<T>();
public LexerPool()
{
}
public T AcquireLexer()
{
if (freeLexers.Count > 0)
{
T lexer = freeLexers[0];
freeLexers.Remove(lexer);
return lexer;
}
return new T();
}
public void ReleaseLexer(T lexer)
{
freeLexers.Add(lexer);
}
}
}
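
In PolyFeed this pool backs ReferenceSubstitutor; the general pattern is acquire, initialise, lex, release:

// Reuse lexers across calls instead of allocating a new one each time.
LexerPool<SubstitutionLexer, SubstitutionToken> pool =
    new LexerPool<SubstitutionLexer, SubstitutionToken>();

SubstitutionLexer lexer = pool.AcquireLexer();  // recycled if one is free
lexer.Initialise("{title}");
// ... consume lexer.TokenStream() here ...
pool.ReleaseLexer(lexer);                       // hand it back for the next caller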

Salamander.Core\LexerRule.cs

@@ -0,0 +1,52 @@
using System;
using System.Text.RegularExpressions;
namespace Salamander.Core.Lexer
{
public class LexerRule<TokenType>
{
/// <summary>
/// The token type that a match against this rule should generate.
/// </summary>
public readonly TokenType Type;
/// <summary>
/// The regular expression to use to find matches.
/// </summary>
public readonly Regex RegEx;
/// <summary>
/// The priority of this rule.
/// </summary>
/// <remarks>
/// If there are multiple matches, then the one with the highest priority will be matched
/// against first.
/// Failing that, the longest match will be taken first.
/// Note that even if a match has a higher priority, a match from a lower priority rule
/// will be used instead if it occurs earlier in the source, as this will result in fewer
/// unmatched characters.
/// </remarks>
public int Priority { get; set; } = 0;
/// <summary>
/// Whether this rule is currently enabled or not. This can be changed on-the-fly whilst lexing.
/// Sometimes useful when handling more complicated logic.
/// Be careful though, as if you start needing this, perhaps you should evaluate whether
/// utilising the fuller capabilities of the parser would be more appropriate instead.
/// </summary>
public bool Enabled { get; set; } = true;
public LexerRule(TokenType inName, string inRegEx, RegexOptions inRegexOptions = RegexOptions.None, int inPriority = 0)
{
if (!typeof(TokenType).IsEnum)
throw new ArgumentException($"Error: inName must be an enum - {typeof(TokenType)} passed");
Type = inName;
RegEx = new Regex(inRegEx, inRegexOptions | RegexOptions.Compiled);
Priority = inPriority;
}
public bool Toggle()
{
Enabled = !Enabled;
return Enabled;
}
}
}
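
As a sketch of the priority behaviour described in the remarks above (the CToken enum is invented for the example):

internal enum CToken { Unknown = 0, Keyword, Identifier }

// Both patterns can match "if" at the same position; the Keyword rule wins
// the tie because its priority is higher. If the priorities were equal, the
// longer match would win instead.
LexerRule<CToken> keyword = new LexerRule<CToken>(
    CToken.Keyword, @"if|else|while", RegexOptions.None, inPriority: 1);
LexerRule<CToken> identifier = new LexerRule<CToken>(
    CToken.Identifier, @"[a-zA-Z_][a-zA-Z0-9_]*");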

Salamander.Core\LexerToken.cs

@@ -0,0 +1,76 @@
using System;
using System.Text.RegularExpressions;
namespace Salamander.Core.Lexer
{
public class LexerToken<TokenType>
{
private int _lineNumber = -1, _columnNumber = -1;
public int LineNumber {
get => _lineNumber;
set {
if (_lineNumber != -1)
throw new InvalidOperationException("Can't overwrite existing line number data");
if (value < 0)
throw new ArgumentException("Error: Negative line numbers don't make sense.");
_lineNumber = value;
}
}
public int ColumnNumber {
get => _columnNumber;
set {
if(_columnNumber != -1)
throw new InvalidOperationException("Can't overwrite existing column number data");
if(value < 0)
throw new ArgumentException("Error: Negative column numbers don't make sense.");
_columnNumber = value;
}
}
public readonly bool IsNullMatch = false;
public readonly LexerRule<TokenType> Rule = null;
public readonly Match RegexMatch;
public TokenType Type {
get {
try
{
return Rule.Type;
}
catch (NullReferenceException)
{
return default(TokenType);
}
}
}
private string nullValueData;
public string Value {
get {
return IsNullMatch ? nullValueData : RegexMatch.Value;
}
}
public LexerToken(LexerRule<TokenType> inRule, Match inMatch)
{
Rule = inRule;
RegexMatch = inMatch;
}
public LexerToken(string unknownData)
{
IsNullMatch = true;
nullValueData = unknownData;
}
#region Overrides
public override string ToString()
{
return $"[LexerToken @ {LineNumber}:{ColumnNumber} Type={Type}, Value={Value}]";
}
#endregion
}
}

SubstitutionLexer.cs

@@ -0,0 +1,39 @@
using System;
using System.Collections.Generic;
using System.IO;
using Salamander.Core.Lexer;
namespace PolyFeed
{
internal enum SubstitutionToken
{
Unknown = 0,
Text,
BraceOpen,
BraceClose,
Identifier
}
internal class SubstitutionLexer : Lexer<SubstitutionToken>
{
public SubstitutionLexer()
{
AddRules(new List<LexerRule<SubstitutionToken>>() {
new LexerRule<SubstitutionToken>(SubstitutionToken.Text, @"[^{}]+"),
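// Deliberately the same pattern as Text: only one of the two rules is
// enabled at any given time (see Initialise below).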
new LexerRule<SubstitutionToken>(SubstitutionToken.Identifier, @"[^{}]+"),
new LexerRule<SubstitutionToken>(SubstitutionToken.BraceOpen, @"\{"),
new LexerRule<SubstitutionToken>(SubstitutionToken.BraceClose, @"\}"),
});
}
public override void Initialise(StreamReader reader)
{
base.Initialise(reader);
DisableRule(SubstitutionToken.Identifier);
}
}
}
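
Putting the rule toggling together, here is a sketch of how a caller (such as ReferenceSubstitutor) would drive this lexer over the input "Post: {h1}":

SubstitutionLexer lexer = new SubstitutionLexer();
lexer.Initialise("Post: {h1}");

foreach (LexerToken<SubstitutionToken> token in lexer.TokenStream())
{
    // Mirror ReferenceSubstitutor: between the braces, lex as Identifier
    // instead of Text, then restore the previous rule states afterwards.
    if (token.Type == SubstitutionToken.BraceOpen) {
        lexer.SaveRuleStates();
        lexer.EnableRule(SubstitutionToken.Identifier);
        lexer.DisableRule(SubstitutionToken.Text);
    }
    if (token.Type == SubstitutionToken.BraceClose)
        lexer.RestoreRuleStates();

    Console.WriteLine(token);
}
// Yields: Text "Post: ", BraceOpen "{", Identifier "h1", BraceClose "}"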

packages.config

@@ -2,7 +2,7 @@
<packages>
<package id="Fizzler" version="1.2.0" targetFramework="net47" />
<package id="Fizzler.Systems.HtmlAgilityPack" version="1.2.0" targetFramework="net47" />
<package id="HtmlAgilityPack" version="1.11.9" targetFramework="net47" />
<package id="HtmlAgilityPack" version="1.11.12" targetFramework="net47" />
<package id="Microsoft.NETCore.Platforms" version="2.2.2" targetFramework="net47" />
<package id="Microsoft.SyndicationFeed.ReaderWriter" version="1.0.2" targetFramework="net47" />
<package id="Microsoft.Win32.Primitives" version="4.3.0" targetFramework="net47" />