I want to parse a file that looks like this (FASTA-like text format):
>InfoHeader
\"Some text sequence that h
Previous: Step 2. Faster with mmap
Next: Step 4: Dropping the position iterator
Let's avoid allocations! If we move the file mapping into the FastaReader class, we can directly point to data in the mapping instead of copying strings all the time.
Using boost::string_ref as e.g. described here: C++: Fast way to read mapped file into a matrix you can do
Live On Coliru
#define BOOST_SPIRIT_USE_PHOENIX_V3
#include
#include
#include
namespace io = boost::iostreams;
namespace fs = boost::filesystem;
class FastaReader {
public:
typedef std::pair Entry;
typedef std::vector Data;
private:
Data fV;
fs::path file;
public:
FastaReader(const fs::path & f);
~FastaReader();
const fs::path & getFile() const;
const Data::const_iterator begin() const;
const Data::const_iterator end() const;
private:
io::mapped_file_source mmap;
void parse();
};
#include
#include
#include
#include
#include
#include
#include
#include
#include
//#include "fastaReader.hpp"
#include
using namespace std;
namespace fs = boost::filesystem;
namespace qi = boost::spirit::qi;
namespace pt = boost::posix_time;
namespace io = boost::iostreams;
namespace boost { namespace spirit { namespace traits {
template
struct assign_to_attribute_from_iterators {
static void call(It f, It l, boost::string_ref& attr) { attr = boost::string_ref { f.base(), size_t(std::distance(f.base(),l.base())) }; }
};
} } }
template
struct FastaGrammar : qi::grammar {
FastaGrammar() : FastaGrammar::base_type(fasta) {
using namespace qi;
using boost::phoenix::construct;
using boost::phoenix::begin;
using boost::phoenix::size;
entry = ('>' >> raw[ *~char_('\n') ] >> '\n' >> raw[ *~char_('>') ]);
fasta = *entry >> *eol >> eoi ;
BOOST_SPIRIT_DEBUG_NODES((fasta)(entry));
}
private:
qi::rule fasta;
qi::rule entry;
};
FastaReader::FastaReader(const fs::path & f) : file(f), mmap(file.c_str()) {
parse();
}
FastaReader::~FastaReader() {}
const fs::path & FastaReader::getFile() const {
return this->file;
}
const FastaReader::Data::const_iterator FastaReader::begin() const {
return this->fV.cbegin();
}
const FastaReader::Data::const_iterator FastaReader::end() const {
return this->fV.cend();
}
void FastaReader::parse() {
if (this->file.empty()) throw std::runtime_error("FastaReader: No file specified.");
if (! fs::is_regular_file(this->file)) throw std::runtime_error(string("FastaReader: File not found: ") + this->file.string());
typedef char const* iterator_type;
typedef boost::spirit::classic::position_iterator2 pos_iterator_type;
typedef FastaGrammar fastaGr;
static const fastaGr fG{};
try {
std::cerr << "Measuring: Parsing." << std::endl;
const pt::ptime startMeasurement = pt::microsec_clock::universal_time();
pos_iterator_type first(iterator_type{mmap.data()}, iterator_type{mmap.end()}, file.string());
qi::phrase_parse(first, {}, fG, boost::spirit::ascii::space, this->fV);
const pt::ptime endMeasurement = pt::microsec_clock::universal_time();
pt::time_duration duration (endMeasurement - startMeasurement);
std::cerr << duration << std::endl;
} catch (std::exception const& e) {
cerr << "error message: " << e.what() << endl;
}
}
int main() {
FastaReader reader("input.txt");
for (auto& e : reader) std::cout << '>' << e.first << '\n' << e.second << "\n\n";
}
This is indeed already 4.8x faster:
$ ./test3 | head -n4
Measuring: Parsing.
00:00:04.577123
>gi|31563518|ref|NP_852610.1| microtubule-associated proteins 1A/1B light chain 3A isoform b [Homo sapiens]
MKMRFFSSPCGKAAVDPADRCKEVQQIRDQHPSKIPVIIERYKGEKQLPVLDKTKFLVPDHVNMSELVKI
IRRRLQLNPTQAFFLLVNQHSMVSVSTPIADIYEQEKDEDGFLYMVYASQETFGFIRENE
Next: Step 4: Dropping the position iterator