问题
I'm writing a DSL and using a Boost Spirit lexer to tokenize my input. In my grammar, I want a rule similar to this (where tok is the lexer):
header_block =
tok.name >> ':' >> tok.stringval > ';' >>
tok.description >> ':' >> tok.stringval > ';'
;
Rather than specifying reserved words for the language (e.g. "name", "description") and dealing with synchronizing these between the lexer and grammar, I want to just tokenize everything that matches [a-zA-Z_]\w* as a single token type (e.g. tok.symbol), and let the grammar sort it out. If I weren't using a lexer, I might do something like this:
stringval = lexeme['"' >> *(char_ - '"') >> '"'];
header_block =
lit("name") >> ':' >> stringval > ';' >>
lit("description") >> ':' >> stringval > ';'
;
With a lexer in the mix, I can compile the following rule, but of course it matches more than I want — it doesn't care about the particular symbol values "name" and "description":
header_block =
tok.symbol >> ':' >> tok.stringval > ';' >>
tok.symbol >> ':' >> tok.stringval > ';'
;
What I'm looking for is something like this:
header_block =
specific_symbol_matcher("name") >> ':' >> tok.stringval > ';' >>
specific_symbol_matcher("description") >> ':' >> tok.stringval > ';'
;
Does Qi provide anything I can use instead of my specific_symbol_matcher hand-waving there? I'd rather not write my own matcher if I can get close using stuff that's provided. If I must write my own matcher, can anyone suggest how to do that?
回答1:
If the token exposes a std::string, you should just be able to do:
statement =
( tok.keyword [ qi::_pass = (_1 == "if") ] >> if_stmt )
| ( tok.keyword [ qi::_pass = (_1 == "while") ] >> while_stmt );
If I understood you right, this is, more or less, what you were asking.
While you are at it, do look at qi::symbols<> and an especially nifty application of it, known as the Nabialek Trick.
Bonus material
In case you're just struggling to make an existing grammar work with a lexer, here's what I just did with the calc_utree_ast.cpp example to make it work with a lexer.
It shows
- how you can directly consume the exposed attributes
- how you can still parse based on char-literals, as long as these char literals are registered as (anonymous) tokens
- how the (simple) expression grammar was minimally changed
- how the skipping behaviour was moved into the lexer
///////////////////////////////////////////////////////////////////////////////
//
// Plain calculator example demonstrating the grammar. The parser is a
// syntax checker only and does not do any semantic evaluation.
//
// [ JDG May 10, 2002 ] spirit1
// [ JDG March 4, 2007 ] spirit2
// [ HK November 30, 2010 ] spirit2/utree
// [ SH July 17, 2012 ] use a lexer
//
///////////////////////////////////////////////////////////////////////////////
#define BOOST_SPIRIT_DEBUG
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/support_utree.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_function.hpp>
#include <iostream>
#include <string>
namespace lex = boost::spirit::lex;
namespace qi = boost::spirit::qi;
namespace spirit = boost::spirit;
namespace phx = boost::phoenix;
// Iterator over the raw input text; main() feeds the lexer one std::string line at a time.
typedef std::string::const_iterator BaseIteratorT;
// Token type produced by the lexer. The mpl::vector argument names char and uint32_t
// (presumably the set of attribute types a token may carry -- confirm against the
// Spirit.Lex token documentation).
typedef lex::lexertl::token<BaseIteratorT, boost::mpl::vector<char, uint32_t> > TokenT;
// Lexer engine instantiated for TokenT. NOTE(review): the actor_ variant is presumably
// needed because the `space` token definition attaches a semantic action -- confirm.
typedef lex::lexertl::actor_lexer<TokenT> LexerT;
// Token definitions for the calculator lexer.
//
// Registers one named token for unsigned integers, anonymous single-character
// tokens for the operators and parentheses, and a whitespace token that is
// matched but dropped before it reaches the parser.
//
// LexerT_ is the underlying lexer engine (main() instantiates it with LexerT).
template <typename LexerT_>
struct Tokens: public lex::lexer<LexerT_> {
    Tokens() {
        // Token patterns (regular expressions).
        uint_ = "[0-9]+";
        // Must be a character class: the bare pattern " \t\r\n" is the regex
        // "space, then tab, then CR, then LF" and would only match that exact
        // four-character sequence, so ordinary blanks would never be skipped.
        space = "[ \t\r\n]+";

        // Register the tokens with the lexer.
        this->self += uint_;
        this->self += '+';
        this->self += '-';
        this->self += '*';
        this->self += '/';
        this->self += '(';
        this->self += ')';

        // Whitespace is recognised but flagged pass_ignore, so no token is
        // emitted for it; this replaces a skipper in the grammar.
        using lex::_pass;
        using lex::pass_flags;
        this->self += space [ _pass = pass_flags::pass_ignore ];
    }

    lex::token_def<uint32_t> uint_;   // unsigned integer literal, exposed as uint32_t
    lex::token_def<lex::omit> space;  // whitespace run, carries no attribute
};
namespace client
{
namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
namespace spirit = boost::spirit;
struct expr
{
template <typename T1, typename T2 = void>
struct result { typedef void type; };
expr(char op) : op(op) {}
void operator()(spirit::utree& expr, spirit::utree const& rhs) const
{
spirit::utree lhs;
lhs.swap(expr);
expr.push_back(spirit::utf8_symbol_range_type(&op, &op+1));
expr.push_back(lhs);
expr.push_back(rhs);
}
char const op;
};
// One phoenix::function wrapper per binary operator, each bound to its operator
// character; the grammar's semantic actions call these as plus(_val, _1) etc.
boost::phoenix::function<expr> const plus(expr('+'));
boost::phoenix::function<expr> const minus(expr('-'));
boost::phoenix::function<expr> const times(expr('*'));
boost::phoenix::function<expr> const divide(expr('/'));
struct negate_expr
{
template <typename T1, typename T2 = void>
struct result { typedef void type; };
void operator()(spirit::utree& expr, spirit::utree const& rhs) const
{
char const op = '-';
expr.clear();
expr.push_back(spirit::utf8_symbol_range_type(&op, &op+1));
expr.push_back(rhs);
}
};
boost::phoenix::function<negate_expr> neg;
///////////////////////////////////////////////////////////////////////////////
// Our calculator grammar
///////////////////////////////////////////////////////////////////////////////
// Calculator grammar operating on lexer tokens. Its synthesized attribute is a
// spirit::utree that the semantic actions below build up while parsing.
// Iterator is the iterator type the grammar is instantiated with (main() passes
// the lexer's iterator_type).
template <typename Iterator>
struct calculator : qi::grammar<Iterator, spirit::utree()>
{
// toks provides the lexer's token definitions; only toks.uint_ is referenced here,
// the operators and parentheses are matched through plain character literals.
template <typename Tokens>
calculator(Tokens const& toks) : calculator::base_type(expression)
{
using qi::_val;
using qi::_1;
// expression: a term followed by any number of "+ term" / "- term" tails.
// The first term seeds _val; each tail is folded in through plus / minus.
expression =
term [_val = _1]
>> *( ('+' >> term [plus(_val, _1)])
| ('-' >> term [minus(_val, _1)])
)
;
// term: a factor followed by any number of "* factor" / "/ factor" tails,
// folded into _val through times / divide.
term =
factor [_val = _1]
>> *( ('*' >> factor [times(_val, _1)])
| ('/' >> factor [divide(_val, _1)])
)
;
// factor: an unsigned integer token, a parenthesised expression, or a factor
// with a leading sign (a leading '-' goes through neg, a leading '+' is passed through).
factor =
toks.uint_ [_val = _1]
| '(' >> expression [_val = _1] >> ')'
| ('-' >> factor [neg(_val, _1)])
| ('+' >> factor [_val = _1])
;
// Register the rules for debug output (BOOST_SPIRIT_DEBUG is defined at the top of the file).
BOOST_SPIRIT_DEBUG_NODE(expression);
BOOST_SPIRIT_DEBUG_NODE(term);
BOOST_SPIRIT_DEBUG_NODE(factor);
}
// All three rules expose a utree attribute.
qi::rule<Iterator, spirit::utree()> expression, term, factor;
};
}
///////////////////////////////////////////////////////////////////////////////
// Main program
///////////////////////////////////////////////////////////////////////////////
int main()
{
std::cout << "/////////////////////////////////////////////////////////\n\n";
std::cout << "Expression parser...\n\n";
std::cout << "/////////////////////////////////////////////////////////\n\n";
std::cout << "Type an expression...or [q or Q] to quit\n\n";
using boost::spirit::utree;
typedef std::string::const_iterator iterator_type;
typedef Tokens<LexerT>::iterator_type IteratorT;
typedef client::calculator<IteratorT> calculator;
Tokens<LexerT> l;
calculator calc(l); // Our grammar
std::string str;
while (std::getline(std::cin, str))
{
if (str.empty() || str[0] == 'q' || str[0] == 'Q')
break;
std::string::const_iterator iter = str.begin();
std::string::const_iterator end = str.end();
utree ut;
bool r = lex::tokenize_and_parse(iter, end, l, calc, ut);
if (r && iter == end)
{
std::cout << "-------------------------\n";
std::cout << "Parsing succeeded: " << ut << "\n";
std::cout << "-------------------------\n";
}
else
{
std::string rest(iter, end);
std::cout << "-------------------------\n";
std::cout << "Parsing failed\n";
std::cout << "stopped at: \"" << rest << "\"\n";
std::cout << "-------------------------\n";
}
}
std::cout << "Bye... :-) \n\n";
return 0;
}
For the input
8*12312*(4+5)
It prints (without debug info)
Parsing succeeded: ( * ( * 8 12312 ) ( + 4 5 ) )
来源:https://stackoverflow.com/questions/11512282/is-there-a-way-to-match-the-content-of-a-spiritlex-string-token-as-a-literal-i