问题
When tokenizing in superpower, how to match a string only if it is the first thing in a line (note: this is a different question than this one) ?
For example, assume I have a language with only the following 4 characters (' ', ':', 'X', 'Y'), each of which is a token. There is also a 'Header' token to capture cases of the following regex pattern /^[XY]+:/ (any number of Xs and Ys followed by a colon, only if they start the line).
Here is a quick class for testing (the 4th test-case fails):
using System;
using Superpower;
using Superpower.Parsers;
using Superpower.Tokenizers;
public enum Tokens { Space, Colon, Header, X, Y }
public class XYTokenizer
{
static void Main(string[] args)
{
Test("X", Tokens.X);
Test("XY", Tokens.X, Tokens.Y);
Test("X Y:", Tokens.X, Tokens.Space, Tokens.Y, Tokens.Colon);
Test("X: X", Tokens.Header, Tokens.Space, Tokens.X);
}
public static readonly Tokenizer<Tokens> tokenizer = new TokenizerBuilder<Tokens>()
.Match(Character.EqualTo('X'), Tokens.X)
.Match(Character.EqualTo('Y'), Tokens.Y)
.Match(Character.EqualTo(':'), Tokens.Colon)
.Match(Character.EqualTo(' '), Tokens.Space)
.Build();
static void Test(string input, params Tokens[] expected)
{
var tokens = tokenizer.Tokenize(input);
var i = 0;
foreach (var t in tokens)
{
if (t.Kind != expected[i])
{
Console.WriteLine("tokens[" + i + "] was Tokens." + t.Kind
+ " not Tokens." + expected[i] + " for '" + input + "'");
return;
}
i++;
}
Console.WriteLine("OK");
}
}
回答1:
I came up with a custom Tokenizer
based on the example found here. I added comments throughout the code so you can follow what's happening.
public class MyTokenizer : Tokenizer<Tokens>
{
protected override IEnumerable<Result<Tokens>> Tokenize(TextSpan input)
{
Result<char> next = input.ConsumeChar();
bool checkForHeader = true;
while (next.HasValue)
{
// need to check for a header when starting a new line
if (checkForHeader)
{
var headerStartLocation = next.Location;
var tokenQueue = new List<Result<Tokens>>();
while (next.HasValue && (next.Value == 'X' || next.Value == 'Y'))
{
tokenQueue.Add(Result.Value(next.Value == 'X' ? Tokens.X : Tokens.Y, next.Location, next.Remainder));
next = next.Remainder.ConsumeChar();
}
// only if we had at least one X or one Y
if (tokenQueue.Any())
{
if (next.HasValue && next.Value == ':')
{
// this is a header token; we have to return a Result of the start location
// along with the remainder at this location
yield return Result.Value(Tokens.Header, headerStartLocation, next.Remainder);
next = next.Remainder.ConsumeChar();
}
else
{
// this isn't a header; we have to return all the tokens we parsed up to this point
foreach (Result<Tokens> tokenResult in tokenQueue)
{
yield return tokenResult;
}
}
}
if (!next.HasValue)
yield break;
}
checkForHeader = false;
if (next.Value == '\r')
{
// skip over the carriage return
next = next.Remainder.ConsumeChar();
continue;
}
if (next.Value == '\n')
{
// line break; check for a header token here
next = next.Remainder.ConsumeChar();
checkForHeader = true;
continue;
}
if (next.Value == 'A')
{
var abcStart = next.Location;
next = next.Remainder.ConsumeChar();
if (next.HasValue && next.Value == 'B')
{
next = next.Remainder.ConsumeChar();
if (next.HasValue && next.Value == 'C')
{
yield return Result.Value(Tokens.ABC, abcStart, next.Remainder);
next = next.Remainder.ConsumeChar();
}
else
{
yield return Result.Empty<Tokens>(next.Location, $"unrecognized `AB{next.Value}`");
}
}
else
{
yield return Result.Empty<Tokens>(next.Location, $"unrecognized `A{next.Value}`");
}
}
else if (next.Value == 'X')
{
yield return Result.Value(Tokens.X, next.Location, next.Remainder);
next = next.Remainder.ConsumeChar();
}
else if (next.Value == 'Y')
{
yield return Result.Value(Tokens.Y, next.Location, next.Remainder);
next = next.Remainder.ConsumeChar();
}
else if (next.Value == ':')
{
yield return Result.Value(Tokens.Colon, next.Location, next.Remainder);
next = next.Remainder.ConsumeChar();
}
else if (next.Value == ' ')
{
yield return Result.Value(Tokens.Space, next.Location, next.Remainder);
next = next.Remainder.ConsumeChar();
}
else
{
yield return Result.Empty<Tokens>(next.Location, $"unrecognized `{next.Value}`");
next = next.Remainder.ConsumeChar(); // Skip the character anyway
}
}
}
}
And you can call it like this:
var tokens = new MyTokenizer().Tokenize(input);
来源:https://stackoverflow.com/questions/53029386/superpower-match-a-string-with-tokenizer-only-if-it-begins-a-line