mirror of https://github.com/sbrl/PolyFeed.git
329 lines
12 KiB
C#
329 lines
12 KiB
C#
using System;
|
|
using System.Collections.Generic;
|
|
using System.IO;
|
|
using System.Text;
|
|
using System.Text.RegularExpressions;
|
|
using Salamander.Core.Helpers;
|
|
|
|
namespace Salamander.Core.Lexer
|
|
{
|
|
public class Lexer<TokenType>
|
|
{
|
|
/// <summary>
/// The lexing rules that are tried against the input while lexing.
/// </summary>
public List<LexerRule<TokenType>> Rules { get; } = new List<LexerRule<TokenType>>();

/// <summary>
/// Token types listed here are still matched and consumed, but the lexer keeps
/// them out of the token stream it hands back.
/// Handy for swallowing character sequences that you don't want leaking into,
/// or tripping up, your parser.
/// </summary>
public List<TokenType> IgnoreTokens { get; } = new List<TokenType>();

/// <summary>
/// When <see langword="true"/>, the lexer writes debugging information to the
/// console as it works. Defaults to <see langword="false"/>.
/// </summary>
public bool Verbose { get; set; }

/// <summary>
/// The number of the line that is currently being scanned, counting from zero.
/// </summary>
public int CurrentLineNumber { get; private set; }

/// <summary>
/// How many characters of the current line have been scanned so far.
/// </summary>
/// <value>The current line position.</value>
public int CurrentLinePos { get; private set; }

/// <summary>
/// The running total of characters scanned by this lexer instance.
/// This figure is only brought up to date once a whole line has been finished!
/// </summary>
public int TotalCharsScanned { get; private set; }
|
|
|
|
/// <summary>
/// The reader that lines of input are pulled from while lexing.
/// Assigned by <see cref="Initialise(StreamReader)" />.
/// </summary>
private StreamReader textStream;

/// <summary>
/// Snapshots of the rules' enabled/disabled flags.
/// <see cref="SaveRuleStates" /> pushes a snapshot onto this <see cref="Stack{T}" /> and
/// <see cref="RestoreRuleStates" /> pops the newest one back off, so saves and
/// restores can be nested.
/// </summary>
private readonly Stack<Dictionary<LexerRule<TokenType>, bool>> EnabledStateStack
    = new Stack<Dictionary<LexerRule<TokenType>, bool>>();
|
|
|
|
/// <summary>
/// Constructs a <see cref="Lexer{TokenType}" /> and registers whichever
/// <see cref="LexerRule{TokenType}" /> instances are supplied, if any.
/// </summary>
/// <param name="initialRules">Zero or more rules to register with the new <see cref="Lexer{TokenType}" />.</param>
public Lexer(params LexerRule<TokenType>[] initialRules)
{
    Rules.AddRange(initialRules);
}
|
|
|
|
/// <summary>
/// Registers one more lexing rule with this <see cref="Lexer{TokenType}" />.
/// </summary>
/// <param name="newRule">The rule to append to <see cref="Rules" />.</param>
public void AddRule(LexerRule<TokenType> newRule)
{
    Rules.Add(newRule);
}
|
|
/// <summary>
/// Registers several lexing rules with this <see cref="Lexer{TokenType}" /> in one go.
/// </summary>
/// <param name="newRules">The rules to append to <see cref="Rules" />, in order.</param>
public void AddRules(IEnumerable<LexerRule<TokenType>> newRules)
{
    Rules.AddRange(newRules);
}
|
|
|
|
/// <summary>
/// Resets this lexer and points it at a new input stream.
/// </summary>
/// <remarks>
/// Lexers MAY be re-used across multiple input streams, so child classes that
/// keep state of their own should override this method to reset it.
/// Overrides must remember to call this base implementation as well.
/// </remarks>
/// <param name="reader">The <see cref="StreamReader"/> to read input from.</param>
public virtual void Initialise(StreamReader reader)
{
    // Start all the position counters from scratch
    TotalCharsScanned = 0;
    CurrentLinePos = 0;
    CurrentLineNumber = 0;

    // Discard any snapshots saved against an earlier input
    EnabledStateStack.Clear();

    // Switch every rule back on (an empty prefix matches every rule)
    SetRulesByPrefix("", true);

    textStream = reader;
}
|
|
/// <summary>
/// Resets this lexer and points it at the given string.
/// The text is encoded as UTF-8 into an in-memory stream, which is then wrapped
/// in a <see cref="StreamReader" /> and passed to <see cref="Initialise(StreamReader)" />.
/// </summary>
/// <param name="input">The text to lex.</param>
public void Initialise(string input)
{
    byte[] encodedInput = Encoding.UTF8.GetBytes(input);
    StreamReader inputReader = new StreamReader(new MemoryStream(encodedInput));
    Initialise(inputReader);
}
|
|
|
|
/// <summary>
/// Performs the lexing process itself in an incremental manner, reading the
/// input one line at a time and yielding tokens as they are found.
/// Note that a single Lexer may only do a single lex at a time - even if it's the
/// same document multiple times over.
/// </summary>
/// <remarks>
/// On every pass each enabled rule is matched against the rest of the current
/// line. The winning match is the one that starts earliest; ties are broken by
/// the highest rule priority and then by the longest match. Text sitting in
/// front of the winning match, and the remainder of a line that no rule matches,
/// is returned as a content-only token.
/// </remarks>
/// <returns>A stream of lexical tokens.</returns>
/// <exception cref="InvalidOperationException">Thrown on first enumeration if the lexer hasn't been given an input via Initialise().</exception>
public IEnumerable<LexerToken<TokenType>> TokenStream()
{
    // Fail with a meaningful message instead of a NullReferenceException
    if (textStream == null)
        throw new InvalidOperationException("Error: The lexer must be initialised with an input before tokens can be read from it!");

    string nextLine;
    List<LexerToken<TokenType>> matches = new List<LexerToken<TokenType>>();
    while ((nextLine = textStream.ReadLine()) != null)
    {
        CurrentLinePos = 0;

        while (CurrentLinePos < nextLine.Length)
        {
            // Candidates are collected afresh on every pass, so any change to
            // the rules' enabled flags made between yields is picked up here.
            matches.Clear();
            foreach (LexerRule<TokenType> rule in Rules)
            {
                if (!rule.Enabled) continue;

                Match nextMatch = rule.RegEx.Match(nextLine, CurrentLinePos);
                if (!nextMatch.Success) continue;

                matches.Add(
                    new LexerToken<TokenType>(rule, nextMatch)
                    {
                        LineNumber = CurrentLineNumber,
                        ColumnNumber = nextMatch.Index
                    }
                );
            }

            if (matches.Count == 0)
            {
                // Nothing matches: the remainder of the line becomes a single
                // content-only token. Mark it as consumed before yielding so
                // that TotalCharsScanned accounts for it once the line ends.
                int unknownTokenColumn = CurrentLinePos;
                string unknownTokenContent = nextLine.Substring(unknownTokenColumn);
                CurrentLinePos = nextLine.Length;
                if (Verbose) Console.WriteLine($"{Ansi.FRed}[Unknown Token: No matches found for this line]{Ansi.Reset} {unknownTokenContent}");
                yield return new LexerToken<TokenType>(unknownTokenContent)
                {
                    LineNumber = CurrentLineNumber,
                    ColumnNumber = unknownTokenColumn
                };
                break;
            }

            matches.Sort((LexerToken<TokenType> a, LexerToken<TokenType> b) => {
                // The match that starts earliest on the line wins
                int result = a.ColumnNumber - b.ColumnNumber;
                // If they both start at the same position, then go with highest priority one
                if (result == 0)
                    result = b.Rule.Priority - a.Rule.Priority;
                // Failing that, try the longest one
                if (result == 0)
                    result = b.RegexMatch.Length - a.RegexMatch.Length;

                return result;
            });
            LexerToken<TokenType> selectedToken = matches[0];

            // Use the position the regex engine reported. Searching the line for
            // the matched text instead can land on an earlier occurrence that
            // the regex itself rejected (anchors, lookarounds, word boundaries).
            int selectedTokenOffset = selectedToken.RegexMatch.Index - CurrentLinePos;

            if (selectedTokenOffset > 0)
            {
                string extraTokenContent = nextLine.Substring(CurrentLinePos, selectedTokenOffset);
                int unmatchedLinePos = CurrentLinePos;
                CurrentLinePos += selectedTokenOffset;
                if (Verbose) Console.WriteLine($"{Ansi.FRed}[Unmatched content]{Ansi.Reset} '{extraTokenContent}'");
                // Return an unknown token, but only if we're not meant to be ignoring them.
                // The zero value of TokenType is what is looked up here - presumably the
                // type that content-only tokens are given; confirm against LexerToken.
                if (!IgnoreTokens.Contains(default(TokenType)))
                {
                    yield return new LexerToken<TokenType>(extraTokenContent)
                    {
                        LineNumber = CurrentLineNumber,
                        ColumnNumber = unmatchedLinePos
                    };
                }
            }

            // NOTE(review): a zero-length match doesn't move CurrentLinePos, so a rule
            // that keeps matching the empty string here would keep this loop spinning.
            CurrentLinePos += selectedToken.RegexMatch.Length;

            bool isIgnored = IgnoreTokens.Contains(selectedToken.Type);
            if (Verbose) Console.WriteLine($"{(isIgnored ? Ansi.FBlack : Ansi.FGreen)}{selectedToken}{Ansi.Reset}");

            // Yield the token, but only if we aren't supposed to be ignoring it
            if (isIgnored)
                continue;
            yield return selectedToken;
        }

        if (Verbose) Console.WriteLine($"{Ansi.FBlue}[Lexer]{Ansi.Reset} Next line");
        CurrentLineNumber++;
        TotalCharsScanned += CurrentLinePos;
    }
}
|
|
|
|
|
|
#region Rule Management
|
|
|
|
/// <summary>
/// Switches on every <see cref="LexerRule{TokenType}" /> registered against this
/// Lexer, by enabling all rules that match the empty prefix.
/// </summary>
public void EnableAllRules()
{
    EnableRulesByPrefix("");
}
|
|
/// <summary>
/// Switches off every <see cref="LexerRule{TokenType}" /> registered against this
/// Lexer, by disabling all rules that match the empty prefix.
/// </summary>
public void DisableAllRules()
{
    DisableRulesByPrefix("");
}
|
|
|
|
/// <summary>
/// Switches on the rule that produces tokens of the given <see cref="TokenType" />.
/// </summary>
/// <param name="type">The token type whose rule should be enabled.</param>
public void EnableRule(TokenType type)
{
    SetRule(type, true);
}
|
|
/// <summary>
/// Switches off the rule that produces tokens of the given <see cref="TokenType" />.
/// </summary>
/// <param name="type">The token type whose rule should be disabled.</param>
public void DisableRule(TokenType type)
{
    SetRule(type, false);
}
|
|
|
|
/// <summary>
/// Sets the enabled status of the rule that matches against the given
/// <see cref="TokenType" /> to the given state.
/// Only the first registered rule with that type is changed; if no rule has
/// that type, nothing happens.
/// </summary>
/// <param name="type">The <see cref="TokenType" /> to use to find the rule to
/// set the enabled state of.</param>
/// <param name="state">Whether to enable or disable the rule. <see langword="true"/> = enable it, <see langword="false"/> = disable it.</param>
public void SetRule(TokenType type, bool state)
{
    foreach (LexerRule<TokenType> rule in Rules)
    {
        // The == operator isn't available on an unconstrained generic type, so
        // compare by value through object.Equals. Comparing the enum *names*
        // instead would make every pair of undefined values look equal, since
        // Enum.GetName returns null for each of them.
        if (!object.Equals(rule.Type, type))
            continue;

        rule.Enabled = state;
        return;
    }
}
|
|
|
|
/// <summary>
/// Toggles the enabled status of multiple rules by finding rules that generate
/// tokens whose name begins with a specific substring.
/// The match is an exact, case-sensitive (ordinal) prefix comparison, so it
/// behaves the same regardless of the current culture.
/// </summary>
/// <param name="tokenTypePrefix">The prefix to use when finding rules to toggle.
/// An empty string matches every rule.</param>
public void ToggleRulesByPrefix(string tokenTypePrefix)
{
    foreach (LexerRule<TokenType> rule in Rules)
    {
        // Enum.GetName returns null for a value with no named constant; treat
        // that as an empty name so only the empty prefix can match it.
        string typeName = Enum.GetName(rule.Type.GetType(), rule.Type) ?? string.Empty;

        // Token type names are identifiers, not linguistic text, so compare ordinally
        if (typeName.StartsWith(tokenTypePrefix, StringComparison.Ordinal))
            rule.Enabled = !rule.Enabled;
    }
}
|
|
/// <summary>
/// Switches on every rule that generates tokens whose name starts with the
/// given substring.
/// </summary>
/// <param name="tokenTypePrefix">The prefix that selects which rules to enable.</param>
public void EnableRulesByPrefix(string tokenTypePrefix)
{
    SetRulesByPrefix(tokenTypePrefix, true);
}
|
|
/// <summary>
/// Switches off every rule that generates tokens whose name starts with the
/// given substring.
/// </summary>
/// <param name="tokenTypePrefix">The prefix that selects which rules to disable.</param>
public void DisableRulesByPrefix(string tokenTypePrefix)
{
    SetRulesByPrefix(tokenTypePrefix, false);
}
|
|
|
|
/// <summary>
/// Set the enabled status of multiple rules by finding rules that generate
/// tokens whose name begins with a specific substring.
/// The match is an exact, case-sensitive (ordinal) prefix comparison, so it
/// behaves the same regardless of the current culture.
/// </summary>
/// <param name="tokenTypePrefix">The prefix to use when finding rules to set the
/// status of. An empty string matches every rule.</param>
/// <param name="state">Whether to enable or disable the matching rules. <see langword="true"/> = enable them, <see langword="false"/> = disable them.</param>
public void SetRulesByPrefix(string tokenTypePrefix, bool state)
{
    foreach (LexerRule<TokenType> rule in Rules)
    {
        // Enum.GetName returns null for a value with no named constant; treat
        // that as an empty name so the empty prefix still reaches such a rule.
        string typeName = Enum.GetName(rule.Type.GetType(), rule.Type) ?? string.Empty;

        // Token type names are identifiers, not linguistic text, so compare ordinally
        if (typeName.StartsWith(tokenTypePrefix, StringComparison.Ordinal))
            rule.Enabled = state;
    }
}
|
|
|
|
/// <summary>
/// Takes a snapshot of which rules are currently enabled and which are not,
/// and pushes it onto the internal stack.
/// </summary>
public void SaveRuleStates()
{
    Dictionary<LexerRule<TokenType>, bool> snapshot = new Dictionary<LexerRule<TokenType>, bool>();

    // Record each rule's flag against the rule object itself
    foreach (LexerRule<TokenType> rule in Rules)
    {
        snapshot[rule] = rule.Enabled;
    }

    EnabledStateStack.Push(snapshot);
}
|
|
/// <summary>
/// Pops the most recently saved snapshot off the internal stack and puts every
/// rule recorded in it back to the enabled state it had when the snapshot was taken.
/// </summary>
/// <exception cref="InvalidOperationException">Thrown if the stack holds no snapshots to restore.</exception>
public void RestoreRuleStates()
{
    if (EnabledStateStack.Count == 0)
    {
        throw new InvalidOperationException("Error: Can't restore the lexer rule states when no states have been saved!");
    }

    Dictionary<LexerRule<TokenType>, bool> snapshot = EnabledStateStack.Pop();
    foreach (KeyValuePair<LexerRule<TokenType>, bool> savedState in snapshot)
    {
        LexerRule<TokenType> rule = savedState.Key;
        rule.Enabled = savedState.Value;
    }
}
|
|
|
|
|
|
#endregion
|
|
|
|
}
|
|
}
|