I want to parse a file that looks like this (FASTA-like text format):
>InfoHeader
\"Some text sequence that h
Next: Step 2. Faster with mmap
You should avoid having many rules: each rule introduces type erasure.
If your input is sane, you can do without the skipper (anyway, line ends were significant, so it made no sense to skip them).
Use fusion adaptation instead of a helper to construct new pairs:
This is not optimal, yet, but a lot cleaner:
$ ./test1
Measuring: Parsing.
00:00:22.681605
Slightly more efficient by reducing moving parts and indirections:
Live On Coliru
#include <string>
#include <utility>
#include <vector>

#include <boost/filesystem.hpp>
namespace fs = boost::filesystem;
/// Reads a FASTA-like text file and exposes its records as (header, body) pairs.
///
/// The constructor stores the path and parses the file immediately; afterwards
/// the parsed records can be walked with begin()/end().
class FastaReader {
public:
    /// One record: `first` is the header line (without the leading '>'),
    /// `second` is the text that follows it up to the next '>'.
    typedef std::pair<std::string, std::string> Entry;
    /// All records of one file.
    typedef std::vector<Entry> Data;

private:
    Data fV;        ///< Records filled in by parse().
    fs::path file;  ///< Path handed to the constructor.

public:
    /// Stores @p f and parses it right away (see parse() for the errors raised).
    FastaReader(const fs::path & f);
    ~FastaReader();

    /// @return the path this reader was constructed with.
    const fs::path & getFile() const;
    /// @return iterator to the first parsed record.
    const Data::const_iterator begin() const;
    /// @return past-the-end iterator of the parsed records.
    const Data::const_iterator end() const;

private:
    /// Parses `file` into `fV`.
    void parse();
};
#include
#include
#include
#include
#include
#include
#include
#include
//#include "fastaReader.hpp"
using namespace std;
namespace fs = boost::filesystem;
namespace qi = boost::spirit::qi;
namespace pt = boost::posix_time;
/// Spirit Qi grammar for a FASTA-like file: zero or more records, each a
/// '>'-prefixed header line followed by everything up to the next '>'.
///
/// No skipper is declared, because line ends are significant in this format.
///
/// @tparam Iterator  iterator type the grammar is instantiated for.
template <typename Iterator>
struct FastaGrammar : qi::grammar<Iterator, FastaReader::Data()> {
    /// Start rule; synthesizes the complete list of records.
    qi::rule<Iterator, FastaReader::Data()> fasta;

    FastaGrammar() : FastaGrammar::base_type(fasta) {
        using namespace qi;

        fasta = *('>' >> *~char_('\n') >> '\n'   // header: text after '>' up to the line end
                      >> *~char_('>'))           // body: everything up to the next '>'
              >> *eol
              >> eoi                             // the whole input must be consumed
              ;

        BOOST_SPIRIT_DEBUG_NODES((fasta));
    }
};
/// Remembers the path @p f and parses the file immediately.
FastaReader::FastaReader(const fs::path & f)
    : file(f)
{
    this->parse();
}
/// Nothing to release by hand; the members clean up after themselves.
FastaReader::~FastaReader() = default;
/// @return the path this reader was constructed with.
const fs::path & FastaReader::getFile() const
{
    return file;
}
/// @return iterator to the first parsed record.
const FastaReader::Data::const_iterator FastaReader::begin() const
{
    // fV is const inside a const member function, so begin() yields a const_iterator.
    return fV.begin();
}
/// @return past-the-end iterator of the parsed records.
const FastaReader::Data::const_iterator FastaReader::end() const
{
    // fV is const inside a const member function, so end() yields a const_iterator.
    return fV.end();
}
/// Parses `file` into `fV` and prints the elapsed parsing time to stderr.
///
/// Exceptions derived from std::exception that are raised while parsing are
/// caught and reported on stderr; a failed match is reported there as well.
///
/// @throws std::runtime_error if no file was specified, if the path is not a
///         regular file, or if the file cannot be opened.
void FastaReader::parse() {
    if (this->file.empty()) throw std::runtime_error("FastaReader: No file specified.");
    if (! fs::is_regular_file(this->file)) throw std::runtime_error(string("FastaReader: File not found: ") + this->file.string());

    typedef boost::spirit::istream_iterator iterator_type;
    typedef boost::spirit::classic::position_iterator2<iterator_type> pos_iterator_type;
    typedef FastaGrammar<pos_iterator_type> fastaGr;

    fs::ifstream fin(this->file);
    if (!fin) {
        throw std::runtime_error(string("FastaReader: Access denied: ") + this->file.string());
    }

    // The grammar holds no per-parse state, so one shared instance is enough.
    static const fastaGr fG{};
    try {
        std::cerr << "Measuring: Parsing." << std::endl;
        const pt::ptime startMeasurement = pt::microsec_clock::universal_time();

        // noskipws: the grammar needs to see every character, including line ends.
        pos_iterator_type first(iterator_type{fin >> std::noskipws}, {}, file.string());
        const bool matched = qi::phrase_parse(first, {}, fG, boost::spirit::ascii::space, this->fV);

        const pt::ptime endMeasurement = pt::microsec_clock::universal_time();
        pt::time_duration duration (endMeasurement - startMeasurement);
        std::cerr << duration << std::endl;

        // The grammar ends in eoi, so a false result means the input as a whole
        // did not match; say so instead of failing silently.
        if (!matched) {
            std::cerr << "FastaReader: Parse failed: " << this->file.string() << std::endl;
        }
    } catch (std::exception const& e) {
        cerr << "error message: " << e.what() << endl;
    }
}
int main() {
std::ios::sync_with_stdio(false);
FastaReader reader("input.txt");
//for (auto& e : reader) std::cout << '>' << e.first << '\n' << e.second << "\n\n";
}
This is still slow. Let's see what takes so long:
That's pretty, but hardly tells us what we need to know. This however does: top-N time consumers are
So most time is spent in istream iteration and the multi-pass adaptor. You could argue that the multi-pass adaptor could be optimized by flushing it once in a while (each line?), but really, we would prefer not to be tied to the whole stream and to operate on the (stream) buffer instead.
So, I thought: let's use a mapped file instead:
Next: Step 2. Faster with mmap