问题
I have a cpp file with a huge class implementation. Now I have to modify the source file itself.
For this, is there a library/API/tool that will tokenize this file for me and give me one token each time I request it?
My requirement is as below.
OpenCPPFile()
While (!EOF)
token = GetNextToken();
process something based on this token
EndWhile
I am happy now
Regards, AJ
回答1:
Boost.Wave offers a standard C++ lexer, among many other tools such as a standard-conforming preprocessor, all of which are built on top of Boost.Spirit. Check the following sample in the Boost directory:
C:\boost\libs\wave\samples\lexed_tokens
For example, suppose you have the following file, called main.cpp:
int main()
{
double PI = 3.14, r = 10;
double area = PI * r*r;
}
You run the lexer, which I named cpp_lex, on that file (assuming both are in the same directory):
c:\cpp_lex main.cpp
You get:
INT (#334) at main.cpp ( 1/ 1): >int<
SPACE (#392) at main.cpp ( 1/ 4): > <
IDENTIFIER (#380) at main.cpp ( 1/ 5): >main<
LEFTPAREN (#277) at main.cpp ( 1/ 9): >(<
RIGHTPAREN (#294) at main.cpp ( 1/10): >)<
NEWLINE (#394) at main.cpp ( 1/11): >\n<
LEFTBRACE (#274) at main.cpp ( 2/ 1): >{<
NEWLINE (#394) at main.cpp ( 2/ 2): >\n<
SPACE (#392) at main.cpp ( 3/ 1): >\t<
DOUBLE (#321) at main.cpp ( 3/ 2): >double<
SPACE (#392) at main.cpp ( 3/ 8): > <
IDENTIFIER (#380) at main.cpp ( 3/ 9): >PI<
SPACE (#392) at main.cpp ( 3/11): > <
ASSIGN (#258) at main.cpp ( 3/12): >=<
SPACE (#392) at main.cpp ( 3/13): > <
FLOATLIT (#386) at main.cpp ( 3/14): >3.14<
COMMA (#264) at main.cpp ( 3/18): >,<
SPACE (#392) at main.cpp ( 3/19): > <
IDENTIFIER (#380) at main.cpp ( 3/20): >r<
SPACE (#392) at main.cpp ( 3/21): > <
ASSIGN (#258) at main.cpp ( 3/22): >=<
SPACE (#392) at main.cpp ( 3/23): > <
INTLIT (#384) at main.cpp ( 3/24): >10<
SEMICOLON (#297) at main.cpp ( 3/26): >;<
NEWLINE (#394) at main.cpp ( 3/27): >\n<
SPACE (#392) at main.cpp ( 4/ 1): >\t<
DOUBLE (#321) at main.cpp ( 4/ 2): >double<
SPACE (#392) at main.cpp ( 4/ 8): > <
IDENTIFIER (#380) at main.cpp ( 4/ 9): >area<
SPACE (#392) at main.cpp ( 4/13): > <
ASSIGN (#258) at main.cpp ( 4/14): >=<
SPACE (#392) at main.cpp ( 4/15): > <
IDENTIFIER (#380) at main.cpp ( 4/16): >PI<
SPACE (#392) at main.cpp ( 4/18): > <
STAR (#302) at main.cpp ( 4/19): >*<
SPACE (#392) at main.cpp ( 4/20): > <
IDENTIFIER (#380) at main.cpp ( 4/21): >r<
STAR (#302) at main.cpp ( 4/22): >*<
IDENTIFIER (#380) at main.cpp ( 4/23): >r<
SEMICOLON (#297) at main.cpp ( 4/24): >;<
NEWLINE (#394) at main.cpp ( 4/25): >\n<
RIGHTBRACE (#293) at main.cpp ( 5/ 1): >}<
EOF (#401) at main.cpp ( 5/ 2): ><
Here is the code; for more information, check the Boost.Wave manual:
/*=============================================================================
Boost.Wave: A Standard compliant C++ preprocessor library
http://www.boost.org/
Copyright (c) 2001-2010 Hartmut Kaiser. Distributed under the Boost
Software License, Version 1.0. (See accompanying file
LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
=============================================================================*/
#include <iostream>
#include <iomanip>
#include <fstream>
#include <string>
#include <vector>
///////////////////////////////////////////////////////////////////////////////
// Include Wave itself
#include <boost/wave.hpp>
///////////////////////////////////////////////////////////////////////////////
// Include the lexer stuff
#include <boost/wave/cpplexer/cpp_lex_token.hpp> // token class
#include <boost/wave/cpplexer/cpp_lex_iterator.hpp> // lexer class
///////////////////////////////////////////////////////////////////////////////
//
// Special output operator for a lex_token.
//
// Note: this doesn't compile if BOOST_SPIRIT_DEBUG is defined.
//
///////////////////////////////////////////////////////////////////////////////
// Writes a one-line, human-readable description of a lexed token to `stream`:
// the token name (left-aligned in a 16-character field), its base id, an
// optional extended-token-type tag, the token position as file (line/column),
// and finally the token text enclosed in '>' and '<'.
//
// Carriage return, newline and tab characters inside the token text are
// written as the two-character escapes \r, \n and \t so that every token
// occupies exactly one output line.
//
// The formatting flags of `stream` are restored before returning, so the
// left/right adjustment used here does not leak into the caller's later
// output on the same stream.
//
// Returns `stream` to allow chaining.
template <typename PositionT>
inline std::ostream &
operator<< (std::ostream &stream,
    boost::wave::cpplexer::lex_token<PositionT> const &t)
{
    using namespace std;
    using namespace boost::wave;

    // `left` and `right` are sticky manipulators; remember the caller's
    // flags so they can be put back once the token has been written.
    std::ios_base::fmtflags const saved_flags = stream.flags();

    token_id id = token_id(t);
    stream << setw(16)
        << left << boost::wave::get_token_name(id) << " ("
        << "#" << setw(3) << BASEID_FROM_TOKEN(id);

    if (ExtTokenTypeMask & id) {
        // this is an extended token id: name the variant when it is one of
        // the three recognised here
        if (AltTokenType == (id & ExtTokenOnlyMask)) {
            stream << ", AltTokenType";
        }
        else if (TriGraphTokenType == (id & ExtTokenOnlyMask)) {
            stream << ", TriGraphTokenType";
        }
        else if (AltExtTokenType == (id & ExtTokenOnlyMask)) {
            stream << ", AltExtTokenType";
        }
    }

    stream
        << ") at " << t.get_position().get_file() << " ("
        << setw(3) << right << t.get_position().get_line() << "/"
        << setw(2) << right << t.get_position().get_column()
        << "): >";

    typedef typename boost::wave::cpplexer::lex_token<PositionT>::string_type
        string_type;

    // Emit the token text character by character, escaping the whitespace
    // characters that would otherwise break the one-token-per-line layout.
    string_type const& value = t.get_value();
    for (std::size_t i = 0; i < value.size(); ++i) {
        switch (value[i]) {
        case '\r': stream << "\\r"; break;
        case '\n': stream << "\\n"; break;
        case '\t': stream << "\\t"; break;
        default:
            stream << value[i];
            break;
        }
    }
    stream << "<";

    stream.flags(saved_flags);
    return stream;
}
///////////////////////////////////////////////////////////////////////////////
// main entry point
int main(int argc, char *argv[])
{
if (2 != argc) {
std::cerr << "Usage: lexed_tokens infile" << std::endl;
return -1;
}
// current file position is saved for exception handling
boost::wave::util::file_position_type current_position;
try {
// Open and read in the specified input file.
std::ifstream instream(argv[1]);
std::string instr;
if (!instream.is_open()) {
std::cerr << "Could not open input file: " << argv[1] << std::endl;
return -2;
}
instream.unsetf(std::ios::skipws);
instr = std::string(std::istreambuf_iterator<char>(instream.rdbuf()),
std::istreambuf_iterator<char>());
// tokenize the input data into C++ tokens using the C++ lexer
typedef boost::wave::cpplexer::lex_token<> token_type;
typedef boost::wave::cpplexer::lex_iterator<token_type> lexer_type;
typedef token_type::position_type position_type;
position_type pos(argv[1]);
lexer_type it = lexer_type(instr.begin(), instr.end(), pos,
boost::wave::language_support(
boost::wave::support_cpp|boost::wave::support_option_long_long));
lexer_type end = lexer_type();
while (it != end) {
current_position = (*it).get_position(); // for error reporting
std::cout << *it << std::endl; // dump the tokenf info
++it;
}
}
catch (boost::wave::cpplexer::lexing_exception const& e) {
// some lexing error
std::cerr
<< e.file_name() << "(" << e.line_no() << "): "
<< e.description() << std::endl;
return 2;
}
catch (std::exception const& e) {
// use last recognized token to retrieve the error position
std::cerr
<< current_position.get_file()
<< "(" << current_position.get_line() << "): "
<< "exception caught: " << e.what()
<< std::endl;
return 3;
}
catch (...) {
// use last recognized token to retrieve the error position
std::cerr
<< current_position.get_file()
<< "(" << current_position.get_line() << "): "
<< "unexpected exception caught." << std::endl;
return 4;
}
return 0;
}
回答2:
Have a look at liblex from Clang/LLVM too. It only supports forward lexing, but that should be enough.
来源:https://stackoverflow.com/questions/2666310/is-there-a-tokenizer-for-a-cpp-file