I have a huge file that I must parse line by line. Speed is of the essence.
Example of a line:
Token-1 Here-is-the-Next-Token La
This begs another question - How big? Give us a clue like # of lines or # or Mb (Gb)? Then we will know if it fits in memory, needs to be disk based etc.
At first pass I would use my WordList(S: String; AList: TStringlist);
then you can access each token as Alist[n]... or sort them or whatever.
Here is a lame ass implementation of a very simple lexer. This might give you an idea.
Note the limitations of this example - no buffering involved, no Unicode (this is an excerpt from a Delphi 7 project). You would probably need those in a serious implementation.
{ Implements a simpe lexer class. }
unit Simplelexer;
interface
uses Classes, Sysutils, Types, dialogs;
type
ESimpleLexerFinished = class(Exception) end;
TProcTableProc = procedure of object;
// A very simple lexer that can handle numbers, words, symbols - no comment handling
TSimpleLexer = class(TObject)
private
FLineNo: Integer;
Run: Integer;
fOffset: Integer;
fRunOffset: Integer; // helper for fOffset
fTokenPos: Integer;
pSource: PChar;
fProcTable: array[#0..#255] of TProcTableProc;
fUseSimpleStrings: Boolean;
fIgnoreSpaces: Boolean;
procedure MakeMethodTables;
procedure IdentProc;
procedure NewLineProc;
procedure NullProc;
procedure NumberProc;
procedure SpaceProc;
procedure SymbolProc;
procedure UnknownProc;
public
constructor Create;
destructor Destroy; override;
procedure Feed(const S: string);
procedure Next;
function GetToken: string;
function GetLineNo: Integer;
function GetOffset: Integer;
property IgnoreSpaces: boolean read fIgnoreSpaces write fIgnoreSpaces;
property UseSimpleStrings: boolean read fUseSimpleStrings write fUseSimpleStrings;
end;
implementation
{ TSimpleLexer }
constructor TSimpleLexer.Create;
begin
makeMethodTables;
fUseSimpleStrings := false;
fIgnoreSpaces := false;
end;
destructor TSimpleLexer.Destroy;
begin
inherited;
end;
procedure TSimpleLexer.Feed(const S: string);
begin
Run := 0;
FLineNo := 1;
FOffset := 1;
pSource := PChar(S);
end;
procedure TSimpleLexer.Next;
begin
fTokenPos := Run;
foffset := Run - frunOffset + 1;
fProcTable[pSource[Run]];
end;
function TSimpleLexer.GetToken: string;
begin
SetString(Result, (pSource + fTokenPos), Run - fTokenPos);
end;
function TSimpleLexer.GetLineNo: Integer;
begin
Result := FLineNo;
end;
function TSimpleLexer.GetOffset: Integer;
begin
Result := foffset;
end;
procedure TSimpleLexer.MakeMethodTables;
var
I: Char;
begin
for I := #0 to #255 do
case I of
'@', '&', '}', '{', ':', ',', ']', '[', '*',
'^', ')', '(', ';', '/', '=', '-', '+', '#', '>', '<', '$',
'.', '"', #39:
fProcTable[I] := SymbolProc;
#13, #10: fProcTable[I] := NewLineProc;
'A'..'Z', 'a'..'z', '_': fProcTable[I] := IdentProc;
#0: fProcTable[I] := NullProc;
'0'..'9': fProcTable[I] := NumberProc;
#1..#9, #11, #12, #14..#32: fProcTable[I] := SpaceProc;
else
fProcTable[I] := UnknownProc;
end;
end;
procedure TSimpleLexer.UnknownProc;
begin
inc(run);
end;
procedure TSimpleLexer.SymbolProc;
begin
if fUseSimpleStrings then
begin
if pSource[run] = '"' then
begin
Inc(run);
while pSource[run] <> '"' do
begin
Inc(run);
if pSource[run] = #0 then
begin
NullProc;
end;
end;
end;
Inc(run);
end
else
inc(run);
end;
procedure TSimpleLexer.IdentProc;
begin
while pSource[Run] in ['_', 'A'..'Z', 'a'..'z', '0'..'9'] do
Inc(run);
end;
procedure TSimpleLexer.NumberProc;
begin
while pSource[run] in ['0'..'9'] do
inc(run);
end;
procedure TSimpleLexer.SpaceProc;
begin
while pSource[run] in [#1..#9, #11, #12, #14..#32] do
inc(run);
if fIgnoreSpaces then Next;
end;
procedure TSimpleLexer.NewLineProc;
begin
inc(FLineNo);
inc(run);
case pSource[run - 1] of
#13:
if pSource[run] = #10 then inc(run);
end;
foffset := 1;
fRunOffset := run;
end;
procedure TSimpleLexer.NullProc;
begin
raise ESimpleLexerFinished.Create('');
end;
end.
I think the biggest bottleneck is always going to be getting the file into memory. Once you have it in memory (obviously not all of it at once, but I would work with buffers if I were you), the actual parsing should be insignificant.
Speed will always be relative to what you are doing once it is parsed. A lexical parser by far is the fastest method of converting to tokens from a text stream regardless of size. TParser in the classes unit is a great place to start.
Personally its been a while since I needed to write a parser, but another more dated yet tried and true method would be to use LEX/YACC to build a grammar then have it convert the grammar into code you can use to perform your processing. DYacc is a Delphi version...not sure if it still compiles or not, but worth a look if you want to do things old school. The dragon book here would be of big help, if you can find a copy.
I made a lexical analyser based on a state engine (DFA). It works with a table and is pretty fast. But there are possible faster options.
It also depends on the language. A simple language can possibly have a smart algorithm.
The table is an array of records each containing 2 chars and 1 integer. For each token the lexer walks through the table, startting at position 0:
state := 0;
result := tkNoToken;
while (result = tkNoToken) do begin
if table[state].c1 > table[state].c2 then
result := table[state].value
else if (table[state].c1 <= c) and (c <= table[state].c2) then begin
c := GetNextChar();
state := table[state].value;
end else
Inc(state);
end;
It is simple and works like a charm.
Here's a sample lexer that should be pretty efficient, but it assumes that all source data is in a single string. Reworking it to handle buffers is moderately tricky due to very long tokens.
type
TLexer = class
private
FData: string;
FTokenStart: PChar;
FCurrPos: PChar;
function GetCurrentToken: string;
public
constructor Create(const AData: string);
function GetNextToken: Boolean;
property CurrentToken: string read GetCurrentToken;
end;
{ TLexer }
constructor TLexer.Create(const AData: string);
begin
FData := AData;
FCurrPos := PChar(FData);
end;
function TLexer.GetCurrentToken: string;
begin
SetString(Result, FTokenStart, FCurrPos - FTokenStart);
end;
function TLexer.GetNextToken: Boolean;
var
cp: PChar;
begin
cp := FCurrPos; // copy to local to permit register allocation
// skip whitespace; this test could be converted to an unsigned int
// subtraction and compare for only a single branch
while (cp^ > #0) and (cp^ <= #32) do
Inc(cp);
// using null terminater for end of file
Result := cp^ <> #0;
if Result then
begin
FTokenStart := cp;
Inc(cp);
while cp^ > #32 do
Inc(cp);
end;
FCurrPos := cp;
end;