I want to parse a file that looks like this (FASTA-like text format):
>InfoHeader
\"Some text sequence that h
Previous: Step 1. Cleaning up + Profiling
Next: Step 3: MOAR FASTER WITH ZERO-COPY
mmap
Live On Coliru
#include <string>
#include <utility>
#include <vector>

#include <boost/filesystem/path.hpp>
namespace fs = boost::filesystem;
// Loads a FASTA-like text file and exposes the parsed records read-only.
//
// Parsing happens once, from the constructor; begin()/end() then iterate over
// the stored records.
class FastaReader {
public:
    // One record as a pair of text fields. The commented-out loop in main()
    // prints `first` after a '>' and `second` on the following line(s).
    // NOTE(review): the template arguments were missing in the source and are
    // reconstructed from that usage and from the grammar — confirm.
    using Entry = std::pair<std::string, std::string>;
    using Data  = std::vector<Entry>;

private:
    Data fV;        // records filled in by parse()
    fs::path file;  // path given to the constructor

public:
    // Stores `f` and parses it right away (see parse() for the failure modes).
    FastaReader(const fs::path & f);
    ~FastaReader();

    // Path this reader was constructed with.
    const fs::path & getFile() const;

    // Read-only iteration over the parsed records. The top-level `const` on
    // the return type is kept so the out-of-class definitions still match.
    const Data::const_iterator begin() const;
    const Data::const_iterator end() const;

private:
    // Reads `file` and fills `fV`.
    void parse();
};
#include
#include
#include
#include
#include
#include
#include
#include
//#include "fastaReader.hpp"
#include
using namespace std;
namespace fs = boost::filesystem;
namespace qi = boost::spirit::qi;
namespace pt = boost::posix_time;
namespace io = boost::iostreams;
// Spirit.Qi grammar that matches an entire FASTA-like input and synthesizes a
// FastaReader::Data.
//
// A record is '>' followed by the rest of that line, then everything up to
// (not including) the next '>'. After the last record only line breaks may
// remain before end of input.
//
// NOTE(review): the template parameter list and the grammar/rule signatures
// were missing in the source. They are reconstructed here without a skipper,
// so whitespace inside a record is kept verbatim — confirm against the
// original.
template <typename Iterator>
struct FastaGrammar : qi::grammar<Iterator, FastaReader::Data()> {
    qi::rule<Iterator, FastaReader::Data()> fasta;

    FastaGrammar() : FastaGrammar::base_type(fasta) {
        using namespace qi;

        fasta = *(   '>' >> *~char_('\n') >> '\n'   // header line
                  >> *~char_('>')                   // body, up to the next header
                 )
              >> *eol
              >> eoi                                // the whole input must be consumed
              ;

        BOOST_SPIRIT_DEBUG_NODES((fasta));
    }
};
// Remembers the path, then parses the file immediately; anything parse()
// throws propagates out of the constructor.
FastaReader::FastaReader(const fs::path & f)
    : file(f)
{
    this->parse();
}
// Nothing to release by hand: the members clean up after themselves.
FastaReader::~FastaReader() = default;
const fs::path & FastaReader::getFile() const {
return this->file;
}
const FastaReader::Data::const_iterator FastaReader::begin() const {
return this->fV.cbegin();
}
const FastaReader::Data::const_iterator FastaReader::end() const {
return this->fV.cend();
}
void FastaReader::parse() {
if (this->file.empty()) throw std::runtime_error("FastaReader: No file specified.");
if (! fs::is_regular_file(this->file)) throw std::runtime_error(string("FastaReader: File not found: ") + this->file.string());
typedef char const* iterator_type;
typedef boost::spirit::classic::position_iterator2 pos_iterator_type;
typedef FastaGrammar fastaGr;
io::mapped_file_source mmap(file.c_str());
static const fastaGr fG{};
try {
std::cerr << "Measuring: Parsing." << std::endl;
const pt::ptime startMeasurement = pt::microsec_clock::universal_time();
pos_iterator_type first(iterator_type{mmap.data()}, iterator_type{mmap.end()}, file.string());
qi::phrase_parse(first, {}, fG, boost::spirit::ascii::space, this->fV);
const pt::ptime endMeasurement = pt::microsec_clock::universal_time();
pt::time_duration duration (endMeasurement - startMeasurement);
std::cerr << duration << std::endl;
} catch (std::exception const& e) {
cerr << "error message: " << e.what() << endl;
}
}
// Parses "input.txt" from the current directory.
//
// The FastaReader constructor throws std::runtime_error when the file is
// missing, so the failure is reported and turned into a non-zero exit code
// instead of escaping main() and ending in std::terminate.
int main() {
    try {
        FastaReader reader("input.txt");
        //for (auto& e : reader) std::cout << '>' << e.first << '\n' << e.second << "\n\n";
    } catch (std::exception const& e) {
        std::cerr << "error message: " << e.what() << std::endl;
        return 1;
    }
    return 0;
}
Indeed, on my system it's roughly 3x faster (input is 229 MiB):
$ ./mapped_file_source
Measuring: Parsing.
00:00:07.385787
Next: Step 3: MOAR FASTER WITH ZERO-COPY