mirror of
https://github.com/sbrl/PolyFeed.git
synced 2024-11-22 06:23:02 +00:00
Finish the initial HTML implementation.
This commit is contained in:
parent
59a0289b3a
commit
14fca32a5e
12 changed files with 746 additions and 14 deletions
|
@ -1,9 +1,13 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Net;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using System.Xml;
|
||||
using Fizzler.Systems.HtmlAgilityPack;
|
||||
using HtmlAgilityPack;
|
||||
using Microsoft.SyndicationFeed;
|
||||
using Microsoft.SyndicationFeed.Atom;
|
||||
|
||||
namespace PolyFeed
|
||||
|
@ -22,9 +26,68 @@ namespace PolyFeed
|
|||
/// <summary>
/// Downloads the given source, writes the feed-level header (generator, id and
/// updated timestamp) and then dispatches the response to the handler for the
/// source's <see cref="SourceType" />.
/// </summary>
/// <param name="source">The source definition to fetch and convert.</param>
/// <exception cref="NotImplementedException">
/// Thrown if no handler exists yet for the source's type.
/// </exception>
public async Task AddSource(FeedSource source) {
    // Own the response for the whole method so that it is released on every
    // path, including the NotImplementedException below.
    using (WebResponse response = await WebRequest.Create(source.Url).GetResponseAsync()) {
        // Write the header
        await feed.WriteGenerator("Polyfeed", "https://gitlab.com/sbrl/PolyFeed.git", Program.getProgramVersion());
        await feed.WriteId(source.Url);

        // HTTP dates are machine-readable, so they are parsed with the invariant
        // culture. A missing or malformed header falls back to the current time
        // rather than aborting the whole feed.
        string lastModified = response.Headers.Get("last-modified");
        DateTimeOffset updated = DateTimeOffset.Now;
        DateTimeOffset parsedLastModified;
        if (!string.IsNullOrWhiteSpace(lastModified) &&
            DateTimeOffset.TryParse(
                lastModified,
                System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.None,
                out parsedLastModified
            )) {
            updated = parsedLastModified;
        }
        await feed.WriteUpdated(updated);

        switch (source.SourceType) {
            case SourceType.HTML:
                await AddSourceHtml(source, response);
                break;
            default:
                throw new NotImplementedException($"Error: The source type {source.SourceType} hasn't been implemented yet.");
        }
    }
}
|
||||
|
||||
/// <summary>
/// Parses the response body as HTML, writes the feed title and subtitle, and
/// builds a <see cref="SyndicationItem" /> for every node matched by the
/// source's entry selector.
/// </summary>
/// <param name="source">The source definition holding the selectors and templates.</param>
/// <param name="response">The HTTP response whose body contains the HTML document.</param>
private async Task AddSourceHtml(FeedSource source, WebResponse response) {
    HtmlDocument html = new HtmlDocument();
    using (StreamReader reader = new StreamReader(response.GetResponseStream())) {
        html.LoadHtml(await reader.ReadToEndAsync());
    }

    HtmlNode document = html.DocumentNode;

    await feed.WriteTitle(ReferenceSubstitutor.Replace(source.Title, document));
    await feed.WriteSubtitle(ReferenceSubstitutor.Replace(source.Subtitle, document));

    foreach (HtmlNode nextNode in document.QuerySelectorAll(source.EntrySelector)) {
        HtmlNode urlNode = nextNode.QuerySelector(source.EntryUrlSelector);
        string url = ReadNodeValue(urlNode, source.EntryUrlAttribute);

        SyndicationItem nextItem = new SyndicationItem() {
            Id = url,
            Title = ReferenceSubstitutor.Replace(source.EntryTitle, nextNode),
            Description = ReferenceSubstitutor.Replace(source.EntryContent, nextNode)
        };

        // The optional selectors have no initialiser on FeedSource, so "not
        // configured" may be null as well as the empty string.
        if (!string.IsNullOrEmpty(source.EntryPublishedSelector)) {
            HtmlNode publishedNode = nextNode.QuerySelector(source.EntryPublishedSelector);
            nextItem.Published = DateTime.Parse(
                ReadNodeValue(publishedNode, source.EntryPublishedAttribute)
            );
        }

        // Previously this block re-tested the *published* selector and overwrote
        // Published; the last-updated value belongs in LastUpdated and is gated
        // on its own selector.
        if (!string.IsNullOrEmpty(source.EntryLastUpdatedSelector)) {
            HtmlNode lastUpdatedNode = nextNode.QuerySelector(source.EntryLastUpdatedSelector);
            nextItem.LastUpdated = DateTime.Parse(
                ReadNodeValue(lastUpdatedNode, source.EntryLastUpdatedAttribute)
            );
        }

        // NOTE(review): nextItem is built but never handed to the feed writer in
        // this method - confirm whether writing the entry is still to be added.
    }
}

/// <summary>
/// Reads a value from a node: its text content when no attribute name is given,
/// otherwise the de-entitized value of the named attribute.
/// </summary>
/// <param name="node">The node to read from.</param>
/// <param name="attributeName">The attribute to read, or null / empty to read the node's text.</param>
/// <returns>The extracted value.</returns>
private static string ReadNodeValue(HtmlNode node, string attributeName) {
    return string.IsNullOrEmpty(attributeName)
        ? node.InnerText
        : node.Attributes[attributeName].DeEntitizeValue;
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
using System;
|
||||
|
||||
namespace PolyFeed
|
||||
{
|
||||
/// <summary>
/// The kinds of source document a feed can be generated from.
/// </summary>
public enum SourceType
{
    HTML,
    XML,
    JSON
}
|
||||
|
@ -10,31 +11,43 @@ namespace PolyFeed
|
|||
/// </summary>
|
||||
/// <value>The URL.</value>
|
||||
public string Url { get; set; }
|
||||
/// <summary>
|
||||
/// The title of the feed.
|
||||
/// </summary>
|
||||
public string Title { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// The type of source document to expect.
|
||||
/// </summary>
|
||||
public SourceType SourceType { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// The title of the feed.
|
||||
/// Supports the same {} syntax as <see cref="EntryTitle" />.
|
||||
/// </summary>
|
||||
public string Title { get; set; }
|
||||
/// <summary>
|
||||
/// The subtitle of the feed.
|
||||
/// Supports the same {} syntax as <see cref="EntryTitle" />.
|
||||
/// </summary>
|
||||
/// <value>The subtitle.</value>
|
||||
public string Subtitle { get; set; }
|
||||
|
||||
|
||||
#region Entries
|
||||
|
||||
/// <summary>
|
||||
/// A selector that matches against an element that contains the URL that an
|
||||
/// entry should link to.
|
||||
/// Relative to the element selected by <see cref="EntrySelector" />.
|
||||
/// </summary>
|
||||
public string UrlSelector { get; set; }
|
||||
public string EntryUrlSelector { get; set; }
|
||||
/// <summary>
|
||||
/// The name of the attribute on the element selected by <see cref="UrlSelector" />.
|
||||
/// The name of the attribute on the element selected by <see cref="EntryUrlSelector" />.
|
||||
/// Set to an empty string to select the content of the element instead of the
|
||||
/// content of an attribute.
|
||||
/// </summary>
|
||||
public string UrlElementAttribute { get; set; } = "";
|
||||
public string EntryUrlAttribute { get; set; } = "";
|
||||
|
||||
/// <summary>
|
||||
/// The selector that specifies the location in the object model of nodes that should
|
||||
/// be added to the feed.
|
||||
/// The selector that specifies the location of nodes in the object model that
|
||||
/// should be added to the feed.
|
||||
/// The format varies depending on the <see cref="SourceType" />.
|
||||
/// - HTML: CSS selector (e.g. main > article)
|
||||
/// - XML: XPath (e.g. //element_name)
|
||||
|
@ -52,5 +65,29 @@ namespace PolyFeed
|
|||
/// Same as <see cref="EntryTitle" />, but for the body of an entry. HTML is allowed.
|
||||
/// </summary>
|
||||
public string EntryContent { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// The selector for the node that contains the date published for an entry.
|
||||
/// </summary>
|
||||
public string EntryPublishedSelector { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// The name of the attribute that contains the date published for an entry.
|
||||
/// Set to <see cref="string.Empty" /> to use the content of the node itself.
|
||||
/// </summary>
|
||||
public string EntryPublishedAttribute { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Same as <see cref="EntryPublishedSelector" />, but for the last updated.
|
||||
/// If not specified, the last updated will be omitted.
|
||||
/// </summary>
|
||||
public string EntryLastUpdatedSelector { get; set; }
|
||||
/// <summary>
|
||||
/// Same as <see cref="EntryPublishedAttribute" />.
|
||||
/// </summary>
|
||||
public string EntryLastUpdatedAttribute { get; set; }
|
||||
|
||||
#endregion
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,7 +34,7 @@
|
|||
<HintPath>..\packages\Fizzler.1.2.0\lib\netstandard2.0\Fizzler.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="HtmlAgilityPack">
|
||||
<HintPath>..\packages\HtmlAgilityPack.1.11.9\lib\Net45\HtmlAgilityPack.dll</HintPath>
|
||||
<HintPath>..\packages\HtmlAgilityPack.1.11.12\lib\Net45\HtmlAgilityPack.dll</HintPath>
|
||||
</Reference>
|
||||
<Reference Include="Microsoft.Win32.Primitives">
|
||||
<HintPath>..\packages\Microsoft.Win32.Primitives.4.3.0\lib\net46\Microsoft.Win32.Primitives.dll</HintPath>
|
||||
|
@ -136,10 +136,20 @@
|
|||
<Compile Include="Properties\AssemblyInfo.cs" />
|
||||
<Compile Include="FeedBuilder.cs" />
|
||||
<Compile Include="FeedSource.cs" />
|
||||
<Compile Include="Salamander.Core\Lexer.cs" />
|
||||
<Compile Include="Salamander.Core\LexerRule.cs" />
|
||||
<Compile Include="Salamander.Core\LexerToken.cs" />
|
||||
<Compile Include="Salamander.Core\Ansi.cs" />
|
||||
<Compile Include="SubstitutionLexer.cs" />
|
||||
<Compile Include="Salamander.Core\LexerPool.cs" />
|
||||
<Compile Include="ReferenceSubstitutor.cs" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="packages.config" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Folder Include="Salamander.Core\" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
|
||||
<Import Project="..\packages\NETStandard.Library.2.0.3\build\netstandard2.0\NETStandard.Library.targets" Condition="Exists('..\packages\NETStandard.Library.2.0.3\build\netstandard2.0\NETStandard.Library.targets')" />
|
||||
</Project>
|
|
@ -3,7 +3,7 @@ using System.Collections.Generic;
|
|||
using System.IO;
|
||||
using System.Reflection;
|
||||
|
||||
namespace ProjectNamespace
|
||||
namespace PolyFeed
|
||||
{
|
||||
internal class Settings
|
||||
{
|
||||
|
@ -67,7 +67,7 @@ namespace ProjectNamespace
|
|||
|
||||
#region Helper Methods
|
||||
|
||||
private static string getProgramVersion()
|
||||
public static string getProgramVersion()
|
||||
{
|
||||
Version version = Assembly.GetExecutingAssembly().GetName().Version;
|
||||
return $"{version.Major}.{version.Minor}";
|
||||
|
|
44
PolyFeed/ReferenceSubstitutor.cs
Normal file
44
PolyFeed/ReferenceSubstitutor.cs
Normal file
|
@ -0,0 +1,44 @@
|
|||
using System;
|
||||
using System.Text;
|
||||
using Fizzler.Systems.HtmlAgilityPack;
|
||||
using HtmlAgilityPack;
|
||||
using Salamander.Core.Lexer;
|
||||
|
||||
namespace PolyFeed
|
||||
{
|
||||
/// <summary>
/// Expands <c>{selector}</c> references inside a template string by looking the
/// selector up beneath a given HTML node.
/// </summary>
internal static class ReferenceSubstitutor {
    // Lexers are pooled and re-used between calls.
    private static readonly LexerPool<SubstitutionLexer, SubstitutionToken> lexerPool = new LexerPool<SubstitutionLexer, SubstitutionToken>();

    /// <summary>
    /// Replaces every <c>{selector}</c> reference in the input with the text of
    /// the first node matched by that selector. Text outside braces is copied
    /// through unchanged.
    /// </summary>
    /// <param name="inputString">The template string to expand.</param>
    /// <param name="rootElement">The node that selectors are resolved relative to.</param>
    /// <returns>The expanded string.</returns>
    public static string Replace(string inputString, HtmlNode rootElement)
    {
        StringBuilder result = new StringBuilder();
        SubstitutionLexer lexer = lexerPool.AcquireLexer();
        try
        {
            lexer.Initialise(inputString);

            foreach (LexerToken<SubstitutionToken> nextToken in lexer.TokenStream())
            {
                switch (nextToken.Type) {
                    case SubstitutionToken.BraceOpen:
                        // Inside braces the content is an identifier, not literal text.
                        lexer.SaveRuleStates();
                        lexer.EnableRule(SubstitutionToken.Identifier);
                        lexer.DisableRule(SubstitutionToken.Text);
                        break;
                    case SubstitutionToken.BraceClose:
                        lexer.RestoreRuleStates();
                        break;

                    case SubstitutionToken.Text:
                        result.Append(nextToken.Value);
                        break;

                    case SubstitutionToken.Identifier:
                        // Appending the node object itself would emit its ToString()
                        // rather than its content, so append the text instead.
                        // A selector that matches nothing contributes nothing.
                        // NOTE(review): InnerText is assumed to be the intended
                        // value (rather than InnerHtml) - confirm.
                        HtmlNode targetNode = rootElement.QuerySelector(nextToken.Value);
                        if (targetNode != null)
                            result.Append(targetNode.InnerText);
                        break;
                }
            }
        }
        finally
        {
            // Always hand the lexer back, even if a selector or the lexer throws.
            lexerPool.ReleaseLexer(lexer);
        }

        return result.ToString();
    }
}
|
||||
}
|
49
PolyFeed/Salamander.Core/Ansi.cs
Normal file
49
PolyFeed/Salamander.Core/Ansi.cs
Normal file
|
@ -0,0 +1,49 @@
|
|||
using System;
|
||||
|
||||
namespace Salamander.Core.Helpers
|
||||
{
|
||||
public static class Ansi
{
    /// <summary>
    /// Master switch for escape code output. While this is false every member
    /// of this class yields an empty string, which is handy when output is
    /// going to a log file rather than a terminal.
    /// </summary>
    public static bool Enabled { get; set; } = true;

    /// <summary>
    /// Passes the given escape sequence through when output is enabled, and
    /// swaps it for an empty string otherwise.
    /// </summary>
    private static string Gate(string sequence)
    {
        if (Enabled)
            return sequence;
        return "";
    }

    // Solution on how to output ANSI escape codes in C# from here:
    // https://www.jerriepelser.com/blog/using-ansi-color-codes-in-net-console-apps

    // ----- Text attributes -----
    public static string Reset => Gate("\u001b[0m");
    public static string HiCol => Gate("\u001b[1m");
    public static string Underline => Gate("\u001b[4m");
    public static string Inverse => Gate("\u001b[7m");

    // ----- Foreground colours -----
    public static string FBlack => Gate("\u001b[30m");
    public static string FRed => Gate("\u001b[31m");
    public static string FGreen => Gate("\u001b[32m");
    public static string FYellow => Gate("\u001b[33m");
    public static string FBlue => Gate("\u001b[34m");
    public static string FMagenta => Gate("\u001b[35m");
    public static string FCyan => Gate("\u001b[36m");
    public static string FWhite => Gate("\u001b[37m");

    // ----- Background colours -----
    public static string BBlack => Gate("\u001b[40m");
    public static string BRed => Gate("\u001b[41m");
    public static string BGreen => Gate("\u001b[42m");
    public static string BYellow => Gate("\u001b[43m");
    public static string BBlue => Gate("\u001b[44m");
    public static string BMagenta => Gate("\u001b[45m");
    public static string BCyan => Gate("\u001b[46m");
    public static string BWhite => Gate("\u001b[47m");

    // ----- Cursor movement -----
    // Thanks to http://ascii-table.com/ansi-escape-sequences.php for the following ANSI escape sequences
    public static string Up(int lines = 1) => Gate($"\u001b[{lines}A");
    public static string Down(int lines = 1) => Gate($"\u001b[{lines}B");
    public static string Right(int lines = 1) => Gate($"\u001b[{lines}C");
    public static string Left(int lines = 1) => Gate($"\u001b[{lines}D");

    //public static string JumpTo(Vector2 pos) => $"\u001b[{pos.Y};{pos.X}H" : "";

    // ----- Cursor position save / restore -----
    public static string CursorPosSave => Gate("\u001b[s");
    public static string CursorPosRestore => Gate("\u001b[u");
}
|
||||
}
|
328
PolyFeed/Salamander.Core/Lexer.cs
Normal file
328
PolyFeed/Salamander.Core/Lexer.cs
Normal file
|
@ -0,0 +1,328 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
using Salamander.Core.Helpers;
|
||||
|
||||
namespace Salamander.Core.Lexer
|
||||
{
|
||||
/// <summary>
/// A line-oriented, regular-expression driven lexer. Rules can be enabled and
/// disabled while lexing is in progress, and their enabled states can be saved
/// to and restored from an internal stack.
/// </summary>
/// <typeparam name="TokenType">The enum that identifies the kinds of token produced.</typeparam>
public class Lexer<TokenType>
{
    /// <summary>
    /// The rules that should be used during the lexing process.
    /// </summary>
    public List<LexerRule<TokenType>> Rules { get; private set; } = new List<LexerRule<TokenType>>();
    /// <summary>
    /// Tokens in this list will be matched against, but not emitted by the lexer
    /// into the main token stream.
    /// Useful for catching and disposing of sequences of characters you don't want escaping
    /// or breaking your parser.
    /// </summary>
    public List<TokenType> IgnoreTokens { get; private set; } = new List<TokenType>();

    /// <summary>
    /// Whether the lexer should be verbose and log a bunch of debugging information
    /// to the console.
    /// </summary>
    public bool Verbose { get; set; } = false;

    /// <summary>
    /// The number of the line that is currently being scanned.
    /// </summary>
    public int CurrentLineNumber { get; private set; } = 0;
    /// <summary>
    /// The number of characters on the current line that have been scanned.
    /// </summary>
    /// <value>The current line position.</value>
    public int CurrentLinePos { get; private set; } = 0;
    /// <summary>
    /// The total number of characters currently scanned by this lexer instance.
    /// Only updated every newline!
    /// </summary>
    public int TotalCharsScanned { get; private set; } = 0;

    /// <summary>
    /// The internal stream that we should read from when lexing.
    /// </summary>
    private StreamReader textStream;

    /// <summary>
    /// A stack of rule states.
    /// Whether rules are enabled or disabled can be recursively saved and restored -
    /// this <see cref="Stack{T}" /> is how the lexer saves this information.
    /// </summary>
    private Stack<Dictionary<LexerRule<TokenType>, bool>> EnabledStateStack = new Stack<Dictionary<LexerRule<TokenType>, bool>>();

    /// <summary>
    /// Creates a new <see cref="Lexer{TokenType}" />, optionally containing the given
    /// <see cref="LexerRule{TokenType}" /> instances.
    /// </summary>
    /// <param name="initialRules">The rules to add to the new <see cref="Lexer{TokenType}" />.</param>
    public Lexer(params LexerRule<TokenType>[] initialRules)
    {
        AddRules(initialRules);
    }

    /// <summary>
    /// Adds a single lexing rule to the <see cref="Lexer{TokenType}" />.
    /// </summary>
    /// <param name="newRule">The rule to add.</param>
    public void AddRule(LexerRule<TokenType> newRule)
        => Rules.Add(newRule);
    /// <summary>
    /// Adds a bunch of lexing rules to the <see cref="Lexer{TokenType}" />.
    /// </summary>
    /// <param name="newRules">The rules to add.</param>
    public void AddRules(IEnumerable<LexerRule<TokenType>> newRules)
        => Rules.AddRange(newRules);

    /// <summary>
    /// Reinitialises the lexer with a new input stream.
    /// </summary>
    /// <remarks>
    /// Child classes should override this method to do their own state initialisation,
    /// as lexers MAY be re-used on multiple input streams.
    /// Implementors must be careful not to forget to call this base method though.
    /// </remarks>
    /// <param name="reader">The <see cref="StreamReader"/> to use as the new input stream.</param>
    public virtual void Initialise(StreamReader reader)
    {
        // Reset the counters
        CurrentLineNumber = 0;
        CurrentLinePos = 0;
        TotalCharsScanned = 0;

        // Reset the state stack
        EnabledStateStack.Clear();

        // Re-enable all rules
        EnableAllRules();

        textStream = reader;
    }
    /// <summary>
    /// Reinitialises the lexer to read from the given string, which is encoded
    /// as UTF-8 and wrapped in an in-memory stream.
    /// </summary>
    /// <param name="input">The text to lex.</param>
    public void Initialise(string input)
    {
        MemoryStream stream = new MemoryStream(Encoding.UTF8.GetBytes(input));
        Initialise(new StreamReader(stream));
    }

    /// <summary>
    /// Performs the lexing process itself in an incremental manner.
    /// Note that a single Lexer may only do a single lex at a time - even if it's the
    /// same document multiple times over.
    /// </summary>
    /// <remarks>
    /// Rules are re-evaluated for every token, so enabling or disabling a rule
    /// between two iterations affects the very next token produced.
    /// </remarks>
    /// <returns>A stream of lexical tokens.</returns>
    public IEnumerable<LexerToken<TokenType>> TokenStream()
    {
        string nextLine;
        List<LexerToken<TokenType>> matches = new List<LexerToken<TokenType>>();
        while ((nextLine = textStream.ReadLine()) != null)
        {
            CurrentLinePos = 0;

            while (CurrentLinePos < nextLine.Length)
            {
                // Collect the next match (if any) of every enabled rule
                matches.Clear();
                foreach (LexerRule<TokenType> rule in Rules)
                {
                    if (!rule.Enabled) continue;

                    Match nextMatch = rule.RegEx.Match(nextLine, CurrentLinePos);
                    if (!nextMatch.Success) continue;

                    matches.Add(
                        new LexerToken<TokenType>(rule, nextMatch)
                        {
                            LineNumber = CurrentLineNumber,
                            ColumnNumber = nextMatch.Index
                        }
                    );
                }

                // Nothing matches the rest of this line: emit it as a single
                // unknown token and move on to the next line.
                if (matches.Count == 0)
                {
                    string unknownTokenContent = nextLine.Substring(CurrentLinePos);
                    if (Verbose) Console.WriteLine($"{Ansi.FRed}[Unknown Token: No matches found for this line]{Ansi.Reset} {unknownTokenContent}");
                    yield return new LexerToken<TokenType>(unknownTokenContent)
                    {
                        LineNumber = CurrentLineNumber,
                        ColumnNumber = CurrentLinePos
                    };
                    break;
                }

                matches.Sort((LexerToken<TokenType> a, LexerToken<TokenType> b) => {
                    // Earliest match position first
                    int result = a.ColumnNumber - b.ColumnNumber;
                    // If they both start at the same position, then go with highest priority one
                    if (result == 0)
                        result = b.Rule.Priority - a.Rule.Priority;
                    // Failing that, try the longest one
                    if (result == 0)
                        result = b.RegexMatch.Length - a.RegexMatch.Length;

                    return result;
                });
                LexerToken<TokenType> selectedToken = matches[0];
                // The match already records where it starts, so use that rather
                // than searching the line for the matched text again.
                int selectedTokenOffset = selectedToken.RegexMatch.Index - CurrentLinePos;

                // Anything between the current position and the start of the
                // selected match was not claimed by any rule.
                if (selectedTokenOffset > 0)
                {
                    string extraTokenContent = nextLine.Substring(CurrentLinePos, selectedTokenOffset);
                    int unmatchedLinePos = CurrentLinePos;
                    CurrentLinePos += selectedTokenOffset;
                    if (Verbose) Console.WriteLine($"{Ansi.FRed}[Unmatched content]{Ansi.Reset} '{extraTokenContent}'");
                    // Return an unknown token, but only if we're not meant to be ignoring them.
                    // The enum member with the value 0 is treated as the "unknown" token type.
                    if (!IgnoreTokens.Contains((TokenType)Enum.ToObject(typeof(TokenType), 0)))
                    {
                        yield return new LexerToken<TokenType>(extraTokenContent)
                        {
                            LineNumber = CurrentLineNumber,
                            ColumnNumber = unmatchedLinePos
                        };
                    }
                }

                CurrentLinePos += selectedToken.RegexMatch.Length;
                if (Verbose) Console.WriteLine($"{(IgnoreTokens.Contains(selectedToken.Type) ? Ansi.FBlack : Ansi.FGreen)}{selectedToken}{Ansi.Reset}");

                // Yield the token, but only if we aren't supposed to be ignoring it
                if (IgnoreTokens.Contains(selectedToken.Type))
                    continue;
                yield return selectedToken;
            }

            if (Verbose) Console.WriteLine($"{Ansi.FBlue}[Lexer]{Ansi.Reset} Next line");
            CurrentLineNumber++;
            TotalCharsScanned += CurrentLinePos;
        }
    }


    #region Rule Management

    /// <summary>
    /// Enables all <see cref="LexerRule{TokenType}" />s currently registered against
    /// this Lexer.
    /// </summary>
    public void EnableAllRules() => EnableRulesByPrefix("");
    /// <summary>
    /// Disables all <see cref="LexerRule{TokenType}" />s currently registered against
    /// this Lexer.
    /// </summary>
    public void DisableAllRules() => DisableRulesByPrefix("");

    /// <summary>
    /// Enables the rule that matches against the given <see cref="TokenType" />.
    /// </summary>
    /// <param name="type">The token type to use to find the rule to enable.</param>
    public void EnableRule(TokenType type) => SetRule(type, true);
    /// <summary>
    /// Disables the rule that matches against the given <see cref="TokenType" />.
    /// </summary>
    /// <param name="type">The token type to use to find the rule to disable.</param>
    public void DisableRule(TokenType type) => SetRule(type, false);

    /// <summary>
    /// Sets the enabled status of the rule that matches against the given
    /// <see cref="TokenType" /> to the given state. Only the first rule found
    /// for that token type is changed.
    /// </summary>
    /// <param name="type">The <see cref="TokenType" /> to use to find the rule to
    /// set the enabled state of.</param>
    /// <param name="state">Whether to enable or disable the rule. <see langword="true"/> = enable it, <see langword="false"/> = disable it.</param>
    public void SetRule(TokenType type, bool state)
    {
        foreach (LexerRule<TokenType> rule in Rules)
        {
            // We have to do a string comparison here because of the generic type we're using in multiple nested
            // classes
            if (Enum.GetName(rule.Type.GetType(), rule.Type) == Enum.GetName(type.GetType(), type))
            {
                rule.Enabled = state;
                return;
            }
        }
    }

    /// <summary>
    /// Toggles the enabled status of multiple rules by finding rules that generate
    /// tokens whose name begins with a specific substring.
    /// </summary>
    /// <param name="tokenTypePrefix">The prefix to use when finding rules to toggle.</param>
    public void ToggleRulesByPrefix(string tokenTypePrefix)
    {
        foreach (LexerRule<TokenType> rule in Rules)
        {
            // We have to do a string comparison here because of the generic type we're using in multiple nested
            // classes. Enum member names are identifiers, so compare them ordinally.
            if (Enum.GetName(rule.Type.GetType(), rule.Type).StartsWith(tokenTypePrefix, StringComparison.Ordinal))
                rule.Enabled = !rule.Enabled;
        }
    }
    /// <summary>
    /// Enables multiple rules by finding rules that generate
    /// tokens whose name begins with a specific substring.
    /// </summary>
    /// <param name="tokenTypePrefix">The prefix to use when finding rules to enable.</param>
    public void EnableRulesByPrefix(string tokenTypePrefix)
        => SetRulesByPrefix(tokenTypePrefix, true);
    /// <summary>
    /// Disables multiple rules by finding rules that generate
    /// tokens whose name begins with a specific substring.
    /// </summary>
    /// <param name="tokenTypePrefix">The prefix to use when finding rules to disable.</param>
    public void DisableRulesByPrefix(string tokenTypePrefix)
        => SetRulesByPrefix(tokenTypePrefix, false);

    /// <summary>
    /// Set the enabled status of multiple rules by finding rules that generate
    /// tokens whose name begins with a specific substring.
    /// </summary>
    /// <param name="tokenTypePrefix">The prefix to use when finding rules to set the
    /// status of.</param>
    /// <param name="state">The enabled state to apply to every matching rule.</param>
    public void SetRulesByPrefix(string tokenTypePrefix, bool state)
    {
        foreach (LexerRule<TokenType> rule in Rules)
        {
            // We have to do a string comparison here because of the generic type we're using in multiple nested
            // classes. Enum member names are identifiers, so compare them ordinally.
            if (Enum.GetName(rule.Type.GetType(), rule.Type).StartsWith(tokenTypePrefix, StringComparison.Ordinal))
            {
                //if(Verbose) Console.WriteLine($"{Ansi.FBlue}[Lexer/Rules] {Ansi.FCyan}Setting {rule.Type} to {state}");
                rule.Enabled = state;
            }
        }
    }

    /// <summary>
    /// Saves the current rule states (i.e. whether they are enabled or not) as a snapshot to an
    /// internal stack.
    /// </summary>
    public void SaveRuleStates()
    {
        Dictionary<LexerRule<TokenType>, bool> states = new Dictionary<LexerRule<TokenType>, bool>();
        foreach (LexerRule<TokenType> nextRule in Rules)
            states[nextRule] = nextRule.Enabled;

        EnabledStateStack.Push(states);
    }
    /// <summary>
    /// Restores the top-most rule states snapshot from the internal stack.
    /// </summary>
    /// <exception cref="InvalidOperationException">Thrown if there aren't any states left on the stack to restore.</exception>
    public void RestoreRuleStates()
    {
        if (EnabledStateStack.Count < 1)
            throw new InvalidOperationException("Error: Can't restore the lexer rule states when no states have been saved!");

        Dictionary<LexerRule<TokenType>, bool> states = EnabledStateStack.Pop();
        foreach (KeyValuePair<LexerRule<TokenType>, bool> nextRulePair in states)
            nextRulePair.Key.Enabled = nextRulePair.Value;
    }


    #endregion

}
|
||||
}
|
34
PolyFeed/Salamander.Core/LexerPool.cs
Normal file
34
PolyFeed/Salamander.Core/LexerPool.cs
Normal file
|
@ -0,0 +1,34 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace Salamander.Core.Lexer
|
||||
{
|
||||
/// <summary>
|
||||
/// Represents a pool of reusable <see cref="Lexer{TokenType}"/>s.
|
||||
/// Useful to avoid memory churn when lexing lots of different input streams.
|
||||
/// </summary>
|
||||
public class LexerPool<T, E> where T : Lexer<E>, new()
{
    // Lexers that have been released and are waiting to be handed out again.
    private readonly List<T> freeLexers = new List<T>();

    /// <summary>
    /// Creates a new, empty pool.
    /// </summary>
    public LexerPool()
    {
    }

    /// <summary>
    /// Hands out a lexer, re-using a previously released one where possible
    /// and creating a new one otherwise.
    /// </summary>
    /// <returns>A lexer for the caller to use. Pass it back via <see cref="ReleaseLexer" /> when finished.</returns>
    public T AcquireLexer()
    {
        if (freeLexers.Count == 0)
            return new T();

        // Take from the end of the list: that avoids searching for the item
        // and shifting every remaining element down by one.
        int lastIndex = freeLexers.Count - 1;
        T lexer = freeLexers[lastIndex];
        freeLexers.RemoveAt(lastIndex);
        return lexer;
    }

    /// <summary>
    /// Returns a lexer to the pool so that a later <see cref="AcquireLexer" /> call can re-use it.
    /// </summary>
    /// <param name="lexer">The lexer to return to the pool.</param>
    /// <exception cref="ArgumentNullException">Thrown if <paramref name="lexer" /> is null.</exception>
    public void ReleaseLexer(T lexer)
    {
        // A null entry would otherwise be handed out by a later AcquireLexer() call.
        if (lexer == null)
            throw new ArgumentNullException(nameof(lexer));

        freeLexers.Add(lexer);
    }
}
|
||||
}
|
52
PolyFeed/Salamander.Core/LexerRule.cs
Normal file
52
PolyFeed/Salamander.Core/LexerRule.cs
Normal file
|
@ -0,0 +1,52 @@
|
|||
using System;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace Salamander.Core.Lexer
|
||||
{
|
||||
public class LexerRule<TokenType>
{
    /// <summary>
    /// The kind of token produced when this rule matches.
    /// </summary>
    public readonly TokenType Type;
    /// <summary>
    /// The compiled regular expression this rule searches with.
    /// </summary>
    public readonly Regex RegEx;
    /// <summary>
    /// How strongly this rule is preferred over competing rules.
    /// </summary>
    /// <remarks>
    /// When several rules match, the one with the highest priority wins;
    /// ties are broken in favour of the longest match.
    /// A lower-priority match that begins earlier in the source still beats
    /// a higher-priority one, since that leaves fewer unmatched characters.
    /// </remarks>
    public int Priority { get; set; }
    /// <summary>
    /// Whether this rule currently takes part in lexing. May be flipped
    /// on-the-fly while lexing is under way.
    /// If you find yourself leaning on this heavily, consider whether the
    /// fuller capabilities of the parser would be a better fit.
    /// </summary>
    public bool Enabled { get; set; } = true;

    /// <summary>
    /// Creates a new rule.
    /// </summary>
    /// <param name="inName">The token type a match should generate.</param>
    /// <param name="inRegEx">The regular expression pattern to match with.</param>
    /// <param name="inRegexOptions">Extra regex options; <see cref="RegexOptions.Compiled" /> is always added.</param>
    /// <param name="inPriority">The priority of the rule.</param>
    /// <exception cref="ArgumentException">Thrown if TokenType is not an enum.</exception>
    public LexerRule(TokenType inName, string inRegEx, RegexOptions inRegexOptions = RegexOptions.None, int inPriority = 0)
    {
        bool tokenTypeIsEnum = typeof(TokenType).IsEnum;
        if (tokenTypeIsEnum == false)
        {
            throw new ArgumentException($"Error: inName must be an enum - {typeof(TokenType)} passed");
        }

        RegexOptions effectiveOptions = inRegexOptions | RegexOptions.Compiled;

        Type = inName;
        Priority = inPriority;
        RegEx = new Regex(inRegEx, effectiveOptions);
    }

    /// <summary>
    /// Flips the enabled state of this rule.
    /// </summary>
    /// <returns>The enabled state after flipping.</returns>
    public bool Toggle() => Enabled = !Enabled;
}
|
||||
}
|
76
PolyFeed/Salamander.Core/LexerToken.cs
Normal file
76
PolyFeed/Salamander.Core/LexerToken.cs
Normal file
|
@ -0,0 +1,76 @@
|
|||
using System;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace Salamander.Core.Lexer
|
||||
{
|
||||
/// <summary>
/// A single token produced by the lexer: either a match against a rule, or a
/// "null match" holding text that no rule claimed.
/// </summary>
/// <typeparam name="TokenType">The enum that identifies the kinds of token.</typeparam>
public class LexerToken<TokenType>
{
    // -1 marks "not set yet"; each position may only be assigned once.
    private int _lineNumber = -1, _columnNumber = -1;

    /// <summary>
    /// The line this token was found on. May only be set once.
    /// </summary>
    /// <exception cref="InvalidOperationException">Thrown when set a second time.</exception>
    /// <exception cref="ArgumentException">Thrown when set to a negative value.</exception>
    public int LineNumber {
        get => _lineNumber;
        set {
            if (_lineNumber != -1)
                throw new InvalidOperationException("Can't overwrite existing line number data");
            if (value < 0)
                throw new ArgumentException("Error: Negative line numbers don't make sense.");

            _lineNumber = value;
        }
    }
    /// <summary>
    /// The column this token starts at. May only be set once.
    /// </summary>
    /// <exception cref="InvalidOperationException">Thrown when set a second time.</exception>
    /// <exception cref="ArgumentException">Thrown when set to a negative value.</exception>
    public int ColumnNumber {
        get => _columnNumber;
        set {
            if (_columnNumber != -1)
                throw new InvalidOperationException("Can't overwrite existing column number data");
            if (value < 0)
                throw new ArgumentException("Error: Negative column numbers don't make sense.");

            _columnNumber = value;
        }
    }

    /// <summary>
    /// True when this token holds unmatched text rather than a rule match.
    /// </summary>
    public readonly bool IsNullMatch = false;
    /// <summary>
    /// The rule that produced this token, or null for a null match.
    /// </summary>
    public readonly LexerRule<TokenType> Rule = null;
    /// <summary>
    /// The regular expression match behind this token. Not set for a null match.
    /// </summary>
    public readonly Match RegexMatch;

    /// <summary>
    /// The type of this token: the type of the rule that produced it, or the
    /// default value of <typeparamref name="TokenType" /> when there is no rule.
    /// </summary>
    public TokenType Type {
        get {
            // An explicit null check gives the same result as catching the
            // NullReferenceException, without throwing on every unknown token.
            return Rule != null ? Rule.Type : default(TokenType);
        }
    }
    // The raw text held by a null match.
    private string nullValueData;
    /// <summary>
    /// The text of this token: the unmatched text for a null match, otherwise
    /// the text matched by the regular expression.
    /// </summary>
    public string Value {
        get {
            return IsNullMatch ? nullValueData : RegexMatch.Value;
        }
    }

    /// <summary>
    /// Creates a token from a rule and the match it produced.
    /// </summary>
    /// <param name="inRule">The rule that matched.</param>
    /// <param name="inMatch">The match itself.</param>
    public LexerToken(LexerRule<TokenType> inRule, Match inMatch)
    {
        Rule = inRule;
        RegexMatch = inMatch;
    }
    /// <summary>
    /// Creates a null-match token holding text that no rule matched.
    /// </summary>
    /// <param name="unknownData">The unmatched text.</param>
    public LexerToken(string unknownData)
    {
        IsNullMatch = true;
        nullValueData = unknownData;
    }


    #region Overrides

    /// <summary>
    /// Formats the token's position, type and value for diagnostic output.
    /// </summary>
    public override string ToString()
    {
        return $"[LexerToken @ {LineNumber}:{ColumnNumber} Type={Type}, Value={Value}]";
    }

    #endregion
}
|
||||
}
|
39
PolyFeed/SubstitutionLexer.cs
Normal file
39
PolyFeed/SubstitutionLexer.cs
Normal file
|
@ -0,0 +1,39 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using Salamander.Core.Lexer;
|
||||
|
||||
namespace PolyFeed
|
||||
{
|
||||
/// <summary>
/// The kinds of token that can appear in a substitution template.
/// </summary>
internal enum SubstitutionToken
{
    Unknown = 0,

    Text = 1,

    BraceOpen = 2,
    BraceClose = 3,
    Identifier = 4
}
|
||||
|
||||
/// <summary>
/// Lexer for substitution templates: literal text interspersed with
/// brace-delimited identifiers.
/// </summary>
internal class SubstitutionLexer : Lexer<SubstitutionToken>
{
    /// <summary>
    /// Creates the lexer and registers its four rules.
    /// </summary>
    public SubstitutionLexer()
    {
        // Text and Identifier share one pattern; which of the two is active
        // at any moment is controlled by enabling / disabling the rules.
        AddRule(new LexerRule<SubstitutionToken>(SubstitutionToken.Text, @"[^{}]+"));
        AddRule(new LexerRule<SubstitutionToken>(SubstitutionToken.Identifier, @"[^{}]+"));
        AddRule(new LexerRule<SubstitutionToken>(SubstitutionToken.BraceOpen, @"\{"));
        AddRule(new LexerRule<SubstitutionToken>(SubstitutionToken.BraceClose, @"\}"));
    }

    /// <summary>
    /// Resets the lexer for a new input stream, then switches the identifier
    /// rule off so that lexing begins with the text rule active.
    /// </summary>
    /// <param name="reader">The new input stream.</param>
    public override void Initialise(StreamReader reader)
    {
        base.Initialise(reader);

        DisableRule(SubstitutionToken.Identifier);
    }
}
|
||||
}
|
|
@ -2,7 +2,7 @@
|
|||
<packages>
|
||||
<package id="Fizzler" version="1.2.0" targetFramework="net47" />
|
||||
<package id="Fizzler.Systems.HtmlAgilityPack" version="1.2.0" targetFramework="net47" />
|
||||
<package id="HtmlAgilityPack" version="1.11.9" targetFramework="net47" />
|
||||
<package id="HtmlAgilityPack" version="1.11.12" targetFramework="net47" />
|
||||
<package id="Microsoft.NETCore.Platforms" version="2.2.2" targetFramework="net47" />
|
||||
<package id="Microsoft.SyndicationFeed.ReaderWriter" version="1.0.2" targetFramework="net47" />
|
||||
<package id="Microsoft.Win32.Primitives" version="4.3.0" targetFramework="net47" />
|
||||
|
|
Loading…
Reference in a new issue