How do i implement If statement in Flex/bison

问题

I dont get the error, please can you help me out, here is the .l and .y file.thanks.

%{
#include "ifanw.tab.h"
extern int yylval;
%}
%%
"="      { return EQ; }
"!="     { return NE; }
"<"      { return LT; }
"<="     { return LE; }
">"      { return GT; }
">="     { return GE; }
"+"      { return PLUS; }
"-"      { return MINUS; }
"*"      { return MULT; }
"/"      { return DIVIDE; }
")"      { return RPAREN; }
"("      { return LPAREN; }
":="     { return ASSIGN; }
";"      { return SEMICOLON; }
"IF"     { return IF; }
"THEN"   { return THEN; }
"ELSE"   { return ELSE; }
"FI"     { return FI; }
"WHILE"  { return WHILE; }
"DO"     { return DO; }
"OD"     { return OD; }
"PRINT"  { return PRINT; }
[0-9]+   { yylval = atoi(yytext); return NUMBER; }
[a-z]    { yylval = yytext[0] - 'a'; return NAME; }   
\        { ; }
\n       { nextline(); }
\t       { ; }
"//".*\n { nextline(); }
.        { yyerror("illegal token"); }
%%

Yacc-file

%start ROOT

%token EQ
%token NE
%token LT
%token LE
%token GT
%token GE
%token PLUS
%token MINUS
%token MULT
%token DIVIDE
%token RPAREN
%token LPAREN
%token ASSIGN
%token SEMICOLON
%token IF
%token THEN
%token ELSE
%token FI
%token WHILE
%token DO
%token OD
%token PRINT
%token NUMBER
%token NAME

%%

ROOT:
   stmtseq { execute($1); } 
   ;

statement:
     designator ASSIGN expression { $$ = assignment($1, $3); } 
   | PRINT expression { $$ = print($2); } 
   | IF expression THEN stmtseq ELSE stmtseq FI
    { $$ = ifstmt($2, $4, $6); }
   | IF expression THEN stmtseq FI
    { $$ = ifstmt($2, $4, empty()); }
   | WHILE expression DO stmtseq OD { $$ = whilestmt($2, $4); }   
   ;

stmtseq:
     stmtseq SEMICOLON statement { $$ = seq($1, $3); }
   | statement { $$ = $1; }
   ;

expression:
 expr2 { $$ = $1; } 
   | expr2 EQ expr2 { $$ = eq($1, $3); }
   | expr2 NE expr2 { $$ = ne($1, $3); }
   | expr2 LT expr2 { $$ = le($1, $3); }
   | expr2 LE expr2 { $$ = le($1, $3); }
   | expr2 GT expr2 { $$ = gt($1, $3); }
   | expr2 GE expr2 { $$ = gt($1, $3); }
   ;

expr2:
     expr3 { $$ == $1; }
   | expr2 PLUS expr3 { $$ = plus($1, $3); }
   | expr2 MINUS expr3 { $$ = minus($1, $3); }
   ;

expr3:
     expr4 { $$ = $1; }
   | expr3 MULT expr4 { $$ = mult($1, $3); }
   | expr3 DIVIDE expr4 { $$ = divide ($1, $3); }
   ;

expr4:
     PLUS expr4 { $$ = $2; }
   | MINUS expr4 { $$ = neg($2); }
   | LPAREN expression RPAREN { $$ = $2; }
   | NUMBER { $$ = number($1); }
   | designator { $$ = $1; }
   ;

designator:
     NAME { $$ = name($1); }
  ;
%%

I have another question, is there a possibility to implement a JMP instruction with flex/bison like in Assembler to go to a label like my example, thanks for your help.

:L1
IF FLAG AND X"0001"
    EVT 23;
ELSE
    WAIT 500 ms;
    JMP L1;
END IF;

回答1:

EDIT: I put the old answer to the end

Here is the promised more detailed example:

usually I begin with an example file of the desired language:

# example.toy
begin # example of the simple toy language
    x = 23;
    while x > 0 do begin
        x = x - 1;
        print(x*x);
    end;
end;

The next step is to create a lexer+parser combination where the previous file passes.

Here comes the lexer (generate the source with flex -o lexer.c lexer.l). Also note that the lexer source depends on the parser sources (because of the TOKEN_* constants), so bison must be run before compiling the lexer source:

%option noyywrap

%{
#include "parser.h"
#include <stdlib.h>
%}

%%

"while" return TOKEN_WHILE;
"begin" return TOKEN_BEGIN;
"end"   return TOKEN_END;
"do"    return TOKEN_DO;
[a-zA-Z_][a-zA-Z0-9_]* {yylval.name = strdup(yytext); return TOKEN_ID;}
[-]?[0-9]+    {yylval.val = atoi(yytext); return TOKEN_NUMBER;}
[()=;]  {return *yytext;}
[*/+-<>] {yylval.op = *yytext; return TOKEN_OPERATOR;}
[ \t\n] {/* suppress the output of the whitespaces from the input file to stdout */}
#.* {/* one-line comment */}

and the parser (compile with bison -d -o parser.c parser.y, the -d tells bison to create the parser.h header file with some stuff the lexer needs)

%error-verbose /* instruct bison to generate verbose error messages*/
%{
/* enable debugging of the parser: when yydebug is set to 1 before the
 * yyparse call the parser prints a lot of messages about what it does */
#define YYDEBUG 1
%}

%union {
    int val;
    char op;
    char* name;
}

%token TOKEN_BEGIN TOKEN_END TOKEN_WHILE TOKEN_DO TOKEN_ID TOKEN_NUMBER TOKEN_OPERATOR
%start program

%{
/* Forward declarations */
void yyerror(const char* const message);


%}

%%

program: statement';';

block: TOKEN_BEGIN statements TOKEN_END;

statements:
    | statements statement ';'
    | statements block';';

statement: 
      assignment
    | whileStmt
    | block
    | call;

assignment: TOKEN_ID '=' expression;

expression: TOKEN_ID
    | TOKEN_NUMBER
    | expression TOKEN_OPERATOR expression;

whileStmt: TOKEN_WHILE expression TOKEN_DO statement;

call: TOKEN_ID '(' expression ')';

%%

#include <stdlib.h>

void yyerror(const char* const message)
{
    fprintf(stderr, "Parse error:%s\n", message);
    exit(1);
}

int main()
{
    yydebug = 0;
    yyparse();
}

After gcc parser.c lexer.c -o toylang-noop the call of toylang-noop < example.toy must run without any error. So now the parser itself works and can parse the example script.

The next step is to create a so called abstract syntax tree of the grammar. At this point I start with the augmenting of the parser by defining different types to the tokens and rules, as well as inserting rules to each parsing step.

%error-verbose /* instruct bison to generate verbose error messages*/
%{
#include "astgen.h"
#define YYDEBUG 1

/* Since the parser must return the AST, it must get a parameter where
 * the AST can be stored. The type of the parameter will be void*. */
#define YYPARSE_PARAM astDest
%}

%union {
    int val;
    char op;
    char* name;
    struct AstElement* ast; /* this is the new member to store AST elements */
}

%token TOKEN_BEGIN TOKEN_END TOKEN_WHILE TOKEN_DO
%token<name> TOKEN_ID
%token<val> TOKEN_NUMBER
%token<op> TOKEN_OPERATOR
%type<ast> program block statements statement assignment expression whileStmt call
%start program

%{
/* Forward declarations */
void yyerror(const char* const message);


%}

%%

program: statement';' { (*(struct AstElement**)astDest) = $1; };

block: TOKEN_BEGIN statements TOKEN_END{ $$ = $2; };

statements: {$$=0;}
    | statements statement ';' {$$=makeStatement($1, $2);}
    | statements block';' {$$=makeStatement($1, $2);};

statement: 
      assignment {$$=$1;}
    | whileStmt {$$=$1;}
    | block {$$=$1;}
    | call {$$=$1;}

assignment: TOKEN_ID '=' expression {$$=makeAssignment($1, $3);}

expression: TOKEN_ID {$$=makeExpByName($1);}
    | TOKEN_NUMBER {$$=makeExpByNum($1);}
    | expression TOKEN_OPERATOR expression {$$=makeExp($1, $3, $2);}

whileStmt: TOKEN_WHILE expression TOKEN_DO statement{$$=makeWhile($2, $4);};

call: TOKEN_ID '(' expression ')' {$$=makeCall($1, $3);};

%%

#include "astexec.h"
#include <stdlib.h>

void yyerror(const char* const message)
{
    fprintf(stderr, "Parse error:%s\n", message);
    exit(1);
}

int main()
{
    yydebug = 0;
    struct AstElement *a;
    yyparse(&a);
}

As you can see, the main part when generating the AST is to create the nodes of the AST when a certain rule of the parser was passed. Since bison maintains a stack of the current parsing process itself, is is only needed to assign the current parsing status to the elements of the stack (these are the $$=foo(bar) lines)

The target is the following structure in memory:

ekStatements
  .count = 2
  .statements
    ekAssignment
      .name = "x"
      .right
        ekNumber
          .val = 23
    ekWhile
      .cond
        ekBinExpression
        .left
          ekId
            .name = "x"
        .right
          ekNumber
            .val=0
        .op = '>'
      .statements
        ekAssignment
          .name = "x"
          .right
            ekBinExpression
              .left
                ekId
                  .name = "x"
              .right
                ekNumber
                  .val = 1
              .op = '-'
        ekCall
          .name = "print"
          .param
            ekBinExpression
              .left
                ekId
                  .name = "x"
              .right
                ekId
                  .name = "x"
              .op = '*'

To get this graph, there is the generating code needed, astgen.h:

#ifndef ASTGEN_H
#define ASTGEN_H

struct AstElement
{
    enum {ekId, ekNumber, ekBinExpression, ekAssignment, ekWhile, ekCall, ekStatements, ekLastElement} kind;
    union
    {
        int val;
        char* name;
        struct
        {
            struct AstElement *left, *right;
            char op;
        }expression;
        struct
        {
            char*name;
            struct AstElement* right;
        }assignment;
        struct
        {
            int count;
            struct AstElement** statements;
        }statements;
        struct
        {
            struct AstElement* cond;
            struct AstElement* statements;
        } whileStmt;
        struct
        {
            char* name;
            struct AstElement* param;
        }call;
    } data;
};

struct AstElement* makeAssignment(char*name, struct AstElement* val);
struct AstElement* makeExpByNum(int val);
struct AstElement* makeExpByName(char*name);
struct AstElement* makeExp(struct AstElement* left, struct AstElement* right, char op);
struct AstElement* makeStatement(struct AstElement* dest, struct AstElement* toAppend);
struct AstElement* makeWhile(struct AstElement* cond, struct AstElement* exec);
struct AstElement* makeCall(char* name, struct AstElement* param);
#endif

astgen.c:

#include "astgen.h"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

static void* checkAlloc(size_t sz)
{
    void* result = calloc(sz, 1);
    if(!result)
    {
        fprintf(stderr, "alloc failed\n");
        exit(1);
    }
}

struct AstElement* makeAssignment( char*name, struct AstElement* val)
{
    struct AstElement* result = checkAlloc(sizeof(*result));
    result->kind = ekAssignment;
    result->data.assignment.name = name;
    result->data.assignment.right = val;
    return result;
}

struct AstElement* makeExpByNum(int val)
{
    struct AstElement* result = checkAlloc(sizeof(*result));
    result->kind = ekNumber;
    result->data.val = val;
    return result;
}

struct AstElement* makeExpByName(char*name)
{
    struct AstElement* result = checkAlloc(sizeof(*result));
    result->kind = ekId;
    result->data.name = name;
    return result;
}

struct AstElement* makeExp(struct AstElement* left, struct AstElement* right, char op)
{
    struct AstElement* result = checkAlloc(sizeof(*result));
    result->kind = ekBinExpression;
    result->data.expression.left = left;
    result->data.expression.right = right;
    result->data.expression.op = op;
    return result;
}

struct AstElement* makeStatement(struct AstElement* result, struct AstElement* toAppend)
{
    if(!result)
    {
        result = checkAlloc(sizeof(*result));
        result->kind = ekStatements;
        result->data.statements.count = 0;
        result->data.statements.statements = 0;
    }
    assert(ekStatements == result->kind);
    result->data.statements.count++;
    result->data.statements.statements = realloc(result->data.statements.statements, result->data.statements.count*sizeof(*result->data.statements.statements));
    result->data.statements.statements[result->data.statements.count-1] = toAppend;
    return result;
}

struct AstElement* makeWhile(struct AstElement* cond, struct AstElement* exec)
{
    struct AstElement* result = checkAlloc(sizeof(*result));
    result->kind = ekWhile;
    result->data.whileStmt.cond = cond;
    result->data.whileStmt.statements = exec;
    return result;
}

struct AstElement* makeCall(char* name, struct AstElement* param)
{
    struct AstElement* result = checkAlloc(sizeof(*result));
    result->kind = ekCall;
    result->data.call.name = name;
    result->data.call.param = param;
    return result;
}

You can see here that the generating of the AST elements is a rather monotone job. After the step is done, the program still does nothing, but the AST can be viewed in a debugger.

The next step is to write the interpreter. This is astexec.h:

#ifndef ASTEXEC_H
#define ASTEXEC_H

struct AstElement;
struct ExecEnviron;

/* creates the execution engine */
struct ExecEnviron* createEnv();

/* removes the ExecEnviron */
void freeEnv(struct ExecEnviron* e);

/* executes an AST */
void execAst(struct ExecEnviron* e, struct AstElement* a);

#endif

Well, this looks friendly. The Interpreter itself is simple, despite it's length. The most functions deals with only a particular kind of AstElement. The correct function is selected by the dispatchExpression and dispatchStatement functions. The dispatch functions looks for the target function in the valExecs and runExecs arrays.

astexec.c:

#include "astexec.h"
#include "astgen.h"
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>

struct ExecEnviron
{
    int x; /* The value of the x variable, a real language would have some name->value lookup table instead */
};

static int execTermExpression(struct ExecEnviron* e, struct AstElement* a);
static int execBinExp(struct ExecEnviron* e, struct AstElement* a);
static void execAssign(struct ExecEnviron* e, struct AstElement* a);
static void execWhile(struct ExecEnviron* e, struct AstElement* a);
static void execCall(struct ExecEnviron* e, struct AstElement* a);
static void execStmt(struct ExecEnviron* e, struct AstElement* a);

/* Lookup Array for AST elements which yields values */
static int(*valExecs[])(struct ExecEnviron* e, struct AstElement* a) =
{
    execTermExpression,
    execTermExpression,
    execBinExp,
    NULL,
    NULL,
    NULL,
    NULL
};

/* lookup array for non-value AST elements */
static void(*runExecs[])(struct ExecEnviron* e, struct AstElement* a) =
{
    NULL, /* ID and numbers are canonical and */
    NULL, /* don't need to be executed */
    NULL, /* a binary expression is not executed */
    execAssign,
    execWhile,
    execCall,
    execStmt,
};

/* Dispatches any value expression */
static int dispatchExpression(struct ExecEnviron* e, struct AstElement* a)
{
    assert(a);
    assert(valExecs[a->kind]);
    return valExecs[a->kind](e, a);
}

static void dispatchStatement(struct ExecEnviron* e, struct AstElement* a)
{
    assert(a);
    assert(runExecs[a->kind]);
    runExecs[a->kind](e, a);
}

static void onlyName(const char* name, const char* reference, const char* kind)
{
    if(strcmp(reference, name))
    {
        fprintf(stderr,
            "This language knows only the %s '%s', not '%s'\n",
            kind, reference, name);
        exit(1);
    }
}

static void onlyX(const char* name)
{
    onlyName(name, "x", "variable");
}

static void onlyPrint(const char* name)
{
    onlyName(name, "print", "function");
}

static int execTermExpression(struct ExecEnviron* e, struct AstElement* a)
{
    /* This function looks ugly because it handles two different kinds of
     * AstElement. I would refactor it to an execNameExp and execVal
     * function to get rid of this two if statements. */
    assert(a);
    if(ekNumber == a->kind)
    {
        return a->data.val;
    }
    else
    {
        if(ekId == a->kind)
        {
            onlyX(a->data.name);
            assert(e);
            return e->x;
        }
    }
    fprintf(stderr, "OOPS: tried to get the value of a non-expression(%d)\n", a->kind);
    exit(1);
}

static int execBinExp(struct ExecEnviron* e, struct AstElement* a)
{
    assert(ekBinExpression == a->kind);
    const int left = dispatchExpression(e, a->data.expression.left);
    const int right = dispatchExpression(e, a->data.expression.right);
    switch(a->data.expression.op)
    {
        case '+':
            return left + right;
        case '-':
            return left - right;
        case '*':
            return left * right;
        case '<':
            return left < right;
        case '>':
            return left > right;
        default:
            fprintf(stderr,  "OOPS: Unknown operator:%c\n", a->data.expression.op);
            exit(1);
    }
    /* no return here, since every switch case returns some value (or bails out) */
}

static void execAssign(struct ExecEnviron* e, struct AstElement* a)
{
    assert(a);
    assert(ekAssignment == a->kind);
    onlyX(a->data.assignment.name);
    assert(e);
    struct AstElement* r = a->data.assignment.right;
    e->x = dispatchExpression(e, r);
}

static void execWhile(struct ExecEnviron* e, struct AstElement* a)
{
    assert(a);
    assert(ekWhile == a->kind);
    struct AstElement* const c = a->data.whileStmt.cond;
    struct AstElement* const s = a->data.whileStmt.statements;
    assert(c);
    assert(s);
    while(dispatchExpression(e, c))
    {
        dispatchStatement(e, s);
    }
}

static void execCall(struct ExecEnviron* e, struct AstElement* a)
{
    assert(a);
    assert(ekCall == a->kind);
    onlyPrint(a->data.call.name);
    printf("%d\n", dispatchExpression(e, a->data.call.param));
}

static void execStmt(struct ExecEnviron* e, struct AstElement* a)
{
    assert(a);
    assert(ekStatements == a->kind);
    int i;
    for(i=0; i<a->data.statements.count; i++)
    {
        dispatchStatement(e, a->data.statements.statements[i]);
    }
}

void execAst(struct ExecEnviron* e, struct AstElement* a)
{
    dispatchStatement(e, a);
}

struct ExecEnviron* createEnv()
{
    assert(ekLastElement == (sizeof(valExecs)/sizeof(*valExecs)));
    assert(ekLastElement == (sizeof(runExecs)/sizeof(*runExecs)));
    return calloc(1, sizeof(struct ExecEnviron));
}

void freeEnv(struct ExecEnviron* e)
{
    free(e);
}

Now the interpreter is complete, and the example can be run, after the main function is updated:

#include <assert.h>

int main()
{
    yydebug = 0;
    struct AstElement *a = 0;
    yyparse(&a);
    /* Q&D WARNING: in production code this assert must be replaced by
     * real error handling. */
    assert(a);
    struct ExecEnviron* e = createEnv();
    execAst(e, a);
    freeEnv(e);
    /* TODO: destroy the AST */
}

Now the interpreter for this language works. Note that there are some limitations within this interpreter:

it has only one variable and one function
- and only one parameter to a function
only the type int for values
it is difficult to add goto support, since for each AST element the interpreter calls an interpreting function. Goto can be implemented within one block by hacking something into the execStmt function, but to jump between different blocks or levels the execution machinery must be changed dramatically (this is because one can't jump between different stack frames in the interpreter). For example the AST can be transformed into byte code and this byte code is interpreted by a vm.
some other which I would need to lookup :)

You need to define the grammar for your language. Some thing like this (both lexer and parser are incomplete):

/* foo.y */
%token ID IF ELSE OR AND /* First list all terminal symbols of the language */
%%

statements: /* allow empty statements */ | stm | statements ';' stm;

stm: ifStatement
   | NAME
   | NAME expList
   | label;

expList: expression | expList expression;

label: ':' NAME { /* code to store the label */ };

ifStatement: IF expression statements
           | IF expression statements ELSE statements;

expression: ID                          { /* Code to handle the found ID */ }
          | expression AND expression   { /* Code to con cat two expression with and */ }
          | expression OR expression
          | '(' expression ')';

Then you compile this file with bison -d foo.y -o foo.c. The -d switch instruct bison to generate a header with all the tokens the parser uses. Now you create your lexer

/* bar.l */
%{
#include "foo.h"
%}

%%

IF   return IF;
ELSE return ELSE;
OR   return OR;
AND  return AND;
[A-Z]+  { /*store yylval somewhere to access it in the parser*/ return ID; }

After this you have your lexer and parser done, and "only" need to write the semantic actions for your language.

来源：https://stackoverflow.com/questions/2644597/how-do-i-implement-if-statement-in-flex-bison

标签

c++

yacc

bison

lexical-analysis