Performance issue with parser written with Boost::spirit

前端 未结 4 770
后悔当初
后悔当初 2021-02-10 14:22

I want to parse a file that looks like this (FASTA-like text format):

    >InfoHeader
    \"Some text sequence that h         


        
4条回答
  •  南笙
    南笙 (楼主)
    2021-02-10 14:26

    Previous: Step 2. Faster with mmap
    Next: Step 4: Dropping the position iterator

    Step 3: MOAR FASTER WITH ZERO-COPY

    Let's avoid allocations! If we move the file mapping into the FastaReader class, we can directly point to data in the mapping instead of copying strings all the time.

    Using boost::string_ref, as described e.g. in "C++: Fast way to read mapped file into a matrix", you can do the following:

    Live On Coliru

    #define BOOST_SPIRIT_USE_PHOENIX_V3
    #include 
    #include 
    #include 
    namespace io = boost::iostreams;
    namespace fs = boost::filesystem;
    
    class FastaReader {
    
    public:
        typedef std::pair Entry;
        typedef std::vector Data;
    
    private:
        Data fV;
        fs::path file;  
    
    public:
        FastaReader(const fs::path & f);
        ~FastaReader();
    
        const fs::path & getFile() const;
        const Data::const_iterator begin() const;
        const Data::const_iterator end() const;   
    
    private:
        io::mapped_file_source mmap;
        void parse();
    
    };
    
    #include 
    #include 
    #include 
    #include 
    #include 
    
    #include 
    #include 
    #include 
    #include 
    //#include "fastaReader.hpp"
    
    #include 
    
    using namespace std;
    
    namespace fs = boost::filesystem;
    namespace qi = boost::spirit::qi;
    namespace pt = boost::posix_time;
    namespace io = boost::iostreams;
    
    namespace boost { namespace spirit { namespace traits {
        template 
        struct assign_to_attribute_from_iterators {
            static void call(It f, It l, boost::string_ref& attr) { attr = boost::string_ref { f.base(), size_t(std::distance(f.base(),l.base())) }; }
        };
    } } }
    
    /// Qi grammar for a FASTA-like file; synthesizes a FastaReader::Data.
    ///
    /// @tparam Iterator  input iterator type the grammar is parsed with
    /// @tparam Skipper   skip parser type used between entries
    template <typename Iterator, typename Skipper>
    struct FastaGrammar : qi::grammar<Iterator, FastaReader::Data(), Skipper> {
    
        FastaGrammar() : FastaGrammar::base_type(fasta) {
            using namespace qi;
    
            // '>' header-line '\n' body, where body runs up to the next '>'.
            // raw[] exposes the matched iterator range instead of the characters.
            entry = ('>' >> raw[ *~char_('\n') ] >> '\n' >> raw[ *~char_('>') ]);
            // Any number of entries, optional trailing line ends, then end of input.
            fasta = *entry >> *eol >> eoi ;
    
            BOOST_SPIRIT_DEBUG_NODES((fasta)(entry));
        }
      private:
        qi::rule<Iterator, FastaReader::Data(), Skipper> fasta;
        // Declared without a skipper so whitespace inside an entry is not skipped.
        qi::rule<Iterator, FastaReader::Entry()> entry;
    };
    
    // Stores the path, opens the mapping from it, then parses right away.
    // NOTE(review): the mapping is opened in the initializer list, i.e. before the
    // path checks at the top of parse() run — confirm that ordering is intended.
    FastaReader::FastaReader(const fs::path & f)
        : file(f)
        , mmap(file.c_str())
    {
        this->parse();
    }
    
    // Nothing to release by hand; the members clean up after themselves.
    FastaReader::~FastaReader() = default;
    
    // Returns the path that was passed to the constructor.
    const fs::path & FastaReader::getFile() const {
        return file;
    }
    
    
    // Start of the parsed entries (read-only).
    const FastaReader::Data::const_iterator FastaReader::begin() const {
        // fV is const inside a const member, so begin() already yields a const_iterator.
        return fV.begin();
    }
    
    
    // One past the last parsed entry (read-only).
    const FastaReader::Data::const_iterator FastaReader::end() const {
        // fV is const inside a const member, so end() already yields a const_iterator.
        return fV.end();
    }
    
    void FastaReader::parse() {
        if (this->file.empty())                throw std::runtime_error("FastaReader: No file specified.");
        if (! fs::is_regular_file(this->file)) throw std::runtime_error(string("FastaReader: File not found: ") + this->file.string());
    
        typedef char const*                                               iterator_type;
        typedef boost::spirit::classic::position_iterator2 pos_iterator_type;
        typedef FastaGrammar                           fastaGr;
    
        static const fastaGr fG{};
        try {
            std::cerr << "Measuring: Parsing." << std::endl;
            const pt::ptime startMeasurement = pt::microsec_clock::universal_time();
    
            pos_iterator_type first(iterator_type{mmap.data()}, iterator_type{mmap.end()}, file.string());
            qi::phrase_parse(first, {}, fG, boost::spirit::ascii::space, this->fV);
    
            const pt::ptime endMeasurement = pt::microsec_clock::universal_time();
            pt::time_duration duration (endMeasurement - startMeasurement);
            std::cerr << duration <<  std::endl;
        } catch (std::exception const& e) {
            cerr << "error message: " << e.what() << endl;
        }   
    }
    
    // Reads "input.txt" and echoes every entry to stdout in FASTA form.
    // Returns 1 if the reader cannot be constructed (e.g. the file is missing).
    int main() {
        try {
            FastaReader reader("input.txt");
    
            for (const auto& entry : reader) std::cout << '>' << entry.first << '\n' << entry.second << "\n\n";
        } catch (std::exception const& e) {
            // The constructor throws on an unusable path; report it instead of terminating.
            std::cerr << "error message: " << e.what() << std::endl;
            return 1;
        }
    }
    

    This is indeed already 4.8x faster:

    $ ./test3 | head -n4
    Measuring: Parsing.
    00:00:04.577123
    >gi|31563518|ref|NP_852610.1| microtubule-associated proteins 1A/1B light chain 3A isoform b [Homo sapiens]
    MKMRFFSSPCGKAAVDPADRCKEVQQIRDQHPSKIPVIIERYKGEKQLPVLDKTKFLVPDHVNMSELVKI
    IRRRLQLNPTQAFFLLVNQHSMVSVSTPIADIYEQEKDEDGFLYMVYASQETFGFIRENE
    

    Next: Step 4: Dropping the position iterator

提交回复
热议问题