Performance issue with parser written with Boost::spirit

前端 未结 4 755
后悔当初
后悔当初 2021-02-10 14:22

I want to parse a file that looks like this (FASTA-like text format):

    >InfoHeader
    \"Some text sequence that h         


        
4条回答
  •  再見小時候
    2021-02-10 14:39

    Previous: Step 3: MOAR FASTER WITH ZERO-COPY
    Return to Step 1. Cleaning up + Profiling

    Step 4: Dropping the position iterator

    Since you're not using it, we can drop the stateful iterator, which is likely to inhibit quite a lot of optimizations (and was indirectly visible in the profiler output)

    Live On Coliru

    #define BOOST_SPIRIT_USE_PHOENIX_V3
    #include 
    #include 
    #include 
    namespace io = boost::iostreams;
    namespace fs = boost::filesystem;
    
    
    class FastaReader {
    
    public:
        typedef std::pair Entry;
        typedef std::vector Data;
    
    private:
        Data fV;
        fs::path file;  
    
    public:
        FastaReader(const fs::path & f);
        ~FastaReader();
    
        const fs::path & getFile() const;
        const Data::const_iterator begin() const;
        const Data::const_iterator end() const;   
    
    private:
        io::mapped_file_source mmap;
        void parse();
    
    };
    
    #include 
    #include 
    #include 
    #include 
    #include 
    
    #include 
    #include 
    #include 
    //#include "fastaReader.hpp"
    
    #include 
    
    using namespace std;
    
    namespace fs = boost::filesystem;
    namespace qi = boost::spirit::qi;
    namespace pt = boost::posix_time;
    namespace io = boost::iostreams;
    
    namespace boost { namespace spirit { namespace traits {
        template 
        struct assign_to_attribute_from_iterators {
            static void call(It f, It l, boost::string_ref& attr) { attr = boost::string_ref { f, size_t(std::distance(f,l)) }; }
        };
    } } }
    
    template 
    struct FastaGrammar : qi::grammar {
    
        FastaGrammar() : FastaGrammar::base_type(fasta) {
            using namespace qi;
            using boost::phoenix::construct;
            using boost::phoenix::begin;
            using boost::phoenix::size;
    
            entry = ('>' >> raw[ *~char_('\n') ] >> '\n' >> raw[ *~char_('>') ]);
            fasta = *entry >> *eol >> eoi ;
    
            BOOST_SPIRIT_DEBUG_NODES((fasta)(entry));
        }
      private:
        qi::rule  fasta;
        qi::rule entry;
    };
    
    FastaReader::FastaReader(const fs::path & f) : file(f), mmap(file.c_str()) {
        parse();
    }
    
    FastaReader::~FastaReader() {}
    
    const fs::path & FastaReader::getFile() const {
        return this->file;
    }
    
    
    const FastaReader::Data::const_iterator FastaReader::begin() const {
        return this->fV.cbegin();
    }
    
    
    const FastaReader::Data::const_iterator FastaReader::end() const {
        return this->fV.cend();
    }
    
    void FastaReader::parse() {
        if (this->file.empty())                throw std::runtime_error("FastaReader: No file specified.");
        if (! fs::is_regular_file(this->file)) throw std::runtime_error(string("FastaReader: File not found: ") + this->file.string());
    
        typedef char const*                  iterator_type;
        typedef FastaGrammar  fastaGr;
    
        static const fastaGr fG{};
        try {
            std::cerr << "Measuring: Parsing." << std::endl;
            const pt::ptime startMeasurement = pt::microsec_clock::universal_time();
    
            iterator_type first(mmap.data()), last(mmap.end());
            qi::phrase_parse(first, last, fG, boost::spirit::ascii::space, this->fV);
    
            const pt::ptime endMeasurement = pt::microsec_clock::universal_time();
            pt::time_duration duration (endMeasurement - startMeasurement);
            std::cerr << duration <<  std::endl;
        } catch (std::exception const& e) {
            cerr << "error message: " << e.what() << endl;
        }   
    }
    
    int main() {
        FastaReader reader("input.txt");
    
        for (auto& e : reader) std::cout << '>' << e.first << '\n' << e.second << "\n\n";
    }
    

    Now it's 74.8x faster.

    $ time ./test | head -n4
    Measuring: Parsing.
    00:00:00.194432
    

提交回复
热议问题