Performance issue with parser written with Boost::spirit

前端 未结 4 756
后悔当初
后悔当初 2021-02-10 14:22

I want to parse a file that looks like this (FASTA-like text format):

    >InfoHeader
    \"Some text sequence that h         


        
相关标签:
4条回答
  • 2021-02-10 14:26

    Previous: Step 2. Faster with mmap
    Next: Step 4: Dropping the position iterator

    Step 3: MOAR FASTER WITH ZERO-COPY

    Let's avoid allocations! If we move the file mapping into the FastaReader class, we can directly point to data in the mapping instead of copying strings all the time.

    Using boost::string_ref, as described e.g. in "C++: Fast way to read mapped file into a matrix", you can do:

    Live On Coliru

    #define BOOST_SPIRIT_USE_PHOENIX_V3
    #include <boost/filesystem/path.hpp>
    #include <boost/utility/string_ref.hpp>
    #include <boost/iostreams/device/mapped_file.hpp>
    namespace io = boost::iostreams;
    namespace fs = boost::filesystem;
    
    class FastaReader {
    
    public:
        typedef std::pair<boost::string_ref, boost::string_ref> Entry;
        typedef std::vector<Entry> Data;
    
    private:
        Data fV;
        fs::path file;  
    
    public:
        FastaReader(const fs::path & f);
        ~FastaReader();
    
        const fs::path & getFile() const;
        const Data::const_iterator begin() const;
        const Data::const_iterator end() const;   
    
    private:
        io::mapped_file_source mmap;
        void parse();
    
    };
    
    #include <iomanip>
    #include <boost/date_time/posix_time/posix_time.hpp>
    #include <boost/filesystem/fstream.hpp>
    #include <boost/filesystem/operations.hpp>
    #include <boost/filesystem/path.hpp>
    
    #include <boost/spirit/include/classic_position_iterator.hpp>
    #include <boost/spirit/include/qi.hpp>
    #include <boost/spirit/include/phoenix.hpp>
    #include <boost/fusion/adapted/std_pair.hpp>
    //#include "fastaReader.hpp"
    
    #include <boost/iostreams/device/mapped_file.hpp>
    
    using namespace std;
    
    namespace fs = boost::filesystem;
    namespace qi = boost::spirit::qi;
    namespace pt = boost::posix_time;
    namespace io = boost::iostreams;
    
    namespace boost { namespace spirit { namespace traits {
        /// Customization point: lets Qi assign a matched iterator range to a
        /// boost::string_ref attribute. The iterators are unwrapped with base() to get
        /// at the underlying pointers the string_ref is built from.
        template <typename It>
        struct assign_to_attribute_from_iterators<boost::string_ref, It, void> {
            static void call(It first, It last, boost::string_ref& attr) {
                const auto length = static_cast<std::size_t>(std::distance(first.base(), last.base()));
                attr = boost::string_ref(first.base(), length);
            }
        };
    } } }
    
    /// Qi grammar for FASTA-like input: a sequence of entries, each a '>' header line
    /// followed by body text running up to the next '>' (or the end of input).
    ///
    /// Both fields are wrapped in raw[], so each attribute is produced from the matched
    /// iterator range instead of being collected character by character.
    template <typename Iterator>
    struct FastaGrammar : qi::grammar<Iterator, FastaReader::Data()> {

        FastaGrammar() : FastaGrammar::base_type(fasta) {
            using namespace qi;

            // header: text after '>' up to the newline; body: text up to the next '>'
            entry = ('>' >> raw[ *~char_('\n') ] >> '\n' >> raw[ *~char_('>') ]);
            // any number of entries, optional trailing line breaks, then end of input
            fasta = *entry >> *eol >> eoi ;

            BOOST_SPIRIT_DEBUG_NODES((fasta)(entry));
        }
      private:
        qi::rule<Iterator, FastaReader::Data()>  fasta;
        qi::rule<Iterator, FastaReader::Entry()> entry;
    };
    
    /// Stores the path, maps the file, then parses it.
    ///
    /// The mapping is opened in the body rather than in the member-initializer list so
    /// that parse() gets to raise its own "No file specified" / "File not found" errors.
    /// Building the mapping eagerly would fail inside Boost.Iostreams first for a missing
    /// file, and those checks would never be reached.
    FastaReader::FastaReader(const fs::path & f) : file(f) {
        if (fs::is_regular_file(file)) {
            mmap.open(file.c_str());
        }
        parse();
    }
    
    /// Nothing to release by hand; the members clean up after themselves.
    FastaReader::~FastaReader() = default;
    
    /// Returns the path this reader was constructed with.
    const fs::path & FastaReader::getFile() const {
        return file;
    }
    
    
    /// Read-only iterator to the first parsed entry.
    const FastaReader::Data::const_iterator FastaReader::begin() const {
        const Data & entries = fV;
        return entries.begin();
    }
    
    
    /// Read-only past-the-end iterator over the parsed entries.
    const FastaReader::Data::const_iterator FastaReader::end() const {
        const Data & entries = fV;
        return entries.end();
    }
    
    /// Parses the memory-mapped file into fV and prints the elapsed time to stderr.
    ///
    /// Throws std::runtime_error when the path is empty or is not a regular file.
    /// std::exception-derived errors raised while parsing are caught and logged to
    /// stderr. A grammar mismatch is now reported on stderr as well instead of being
    /// dropped silently; fV may still hold entries stored before the mismatch.
    void FastaReader::parse() {
        if (this->file.empty())                throw std::runtime_error("FastaReader: No file specified.");
        if (! fs::is_regular_file(this->file)) throw std::runtime_error(string("FastaReader: File not found: ") + this->file.string());

        typedef char const*                                               iterator_type;
        typedef boost::spirit::classic::position_iterator2<iterator_type> pos_iterator_type;
        typedef FastaGrammar<pos_iterator_type>                           fastaGr;

        static const fastaGr fG{};
        try {
            std::cerr << "Measuring: Parsing." << std::endl;
            const pt::ptime startMeasurement = pt::microsec_clock::universal_time();

            pos_iterator_type first(iterator_type{mmap.data()}, iterator_type{mmap.end()}, file.string());
            // phrase_parse signals a grammar mismatch through its return value, not by throwing.
            const bool matched = qi::phrase_parse<pos_iterator_type>(first, {}, fG, boost::spirit::ascii::space, this->fV);

            const pt::ptime endMeasurement = pt::microsec_clock::universal_time();
            pt::time_duration duration (endMeasurement - startMeasurement);
            std::cerr << duration <<  std::endl;

            if (!matched) {
                std::cerr << "FastaReader: Parse failed: " << this->file.string() << std::endl;
            }
        } catch (std::exception const& e) {
            cerr << "error message: " << e.what() << endl;
        }
    }
    
    /// Parses input.txt and echoes every entry back in FASTA form.
    int main() {
        FastaReader reader("input.txt");

        for (const auto& entry : reader) {
            std::cout << '>' << entry.first << '\n' << entry.second << "\n\n";
        }
        return 0;
    }
    

    This is indeed already 4.8x faster:

    $ ./test3 | head -n4
    Measuring: Parsing.
    00:00:04.577123
    >gi|31563518|ref|NP_852610.1| microtubule-associated proteins 1A/1B light chain 3A isoform b [Homo sapiens]
    MKMRFFSSPCGKAAVDPADRCKEVQQIRDQHPSKIPVIIERYKGEKQLPVLDKTKFLVPDHVNMSELVKI
    IRRRLQLNPTQAFFLLVNQHSMVSVSTPIADIYEQEKDEDGFLYMVYASQETFGFIRENE
    

    Next: Step 4: Dropping the position iterator

    0 讨论(0)
  • 2021-02-10 14:38

    Next: Step 2. Faster with mmap

    Step 1. Cleaning up + Profiling

    You should avoid the many rules; they introduce type erasure.

    If your input is sane, you can do without the skipper (anyway, line ends were significant, so it made no sense to skip them).

    Use fusion adaptation instead of a helper to construct new pairs:

    This is not optimal, yet, but a lot cleaner:

    $ ./test1
    Measuring: Parsing.
    00:00:22.681605
    

    Slightly more efficient by reducing moving parts and indirections:

    Live On Coliru

    #include <boost/filesystem/path.hpp>
    
    namespace fs = boost::filesystem;
    
    /// Reads a FASTA-like file and exposes its entries as (header, body) string pairs.
    class FastaReader {
    public:
        using Entry = std::pair<std::string, std::string>;
        using Data  = std::vector<Entry>;

        FastaReader(const fs::path & f);
        ~FastaReader();

        /// Path this reader was constructed with.
        const fs::path & getFile() const;
        /// Iterators over the parsed entries.
        const Data::const_iterator begin() const;
        const Data::const_iterator end() const;

    private:
        void parse();

        Data fV;
        fs::path file;
    };
    
    #include <iomanip>
    #include <boost/date_time/posix_time/posix_time.hpp>
    #include <boost/filesystem/fstream.hpp>
    #include <boost/filesystem/operations.hpp>
    #include <boost/filesystem/path.hpp>
    
    #include <boost/spirit/include/classic_position_iterator.hpp>
    #include <boost/spirit/include/qi.hpp>
    #include <boost/fusion/adapted/std_pair.hpp>
    //#include "fastaReader.hpp"
    
    using namespace std;
    
    namespace fs = boost::filesystem;
    namespace qi = boost::spirit::qi;
    namespace pt = boost::posix_time;
    
    /// Single-rule Qi grammar for FASTA-like input. Each entry is a '>' header line
    /// followed by body text that runs up to the next '>'; the two character sequences
    /// of an entry map onto FastaReader::Entry.
    template <typename Iterator>
    struct FastaGrammar : qi::grammar<Iterator, FastaReader::Data()> {
        FastaGrammar() : FastaGrammar::base_type(fasta) {
            using namespace qi;

            // zero or more entries, optional trailing line breaks, then end of input
            fasta = *('>' >> *~char_('\n') >> '\n' >> *~char_('>')) >> *eol >> eoi;

            BOOST_SPIRIT_DEBUG_NODES((fasta));
        }

        qi::rule<Iterator, FastaReader::Data()> fasta;
    };
    
    
    /// Stores the path and parses the file right away.
    FastaReader::FastaReader(const fs::path & f)
        : file(f)
    {
        this->parse();
    }
    
    /// Nothing to release by hand; the members clean up after themselves.
    FastaReader::~FastaReader() = default;
    
    /// Returns the path this reader was constructed with.
    const fs::path & FastaReader::getFile() const {
        return file;
    }
    
    /// Read-only iterator to the first parsed entry.
    const FastaReader::Data::const_iterator FastaReader::begin() const {
        const Data & entries = fV;
        return entries.begin();
    }
    
    /// Read-only past-the-end iterator over the parsed entries.
    const FastaReader::Data::const_iterator FastaReader::end() const {
        const Data & entries = fV;
        return entries.end();
    }
    
    void FastaReader::parse() {
        if (this->file.empty())                throw std::runtime_error("FastaReader: No file specified.");
        if (! fs::is_regular_file(this->file)) throw std::runtime_error(string("FastaReader: File not found: ") + this->file.string());
    
        typedef boost::spirit::istream_iterator                           iterator_type;
        typedef boost::spirit::classic::position_iterator2<iterator_type> pos_iterator_type;
        typedef FastaGrammar<pos_iterator_type>                           fastaGr;
    
        fs::ifstream fin(this->file);
        if (!fin) {
            throw std::runtime_error(string("FastaReader: Access denied: ") + this->file.string());
        }
    
        static const fastaGr fG{};
        try {
            std::cerr << "Measuring: Parsing." << std::endl;
            const pt::ptime startMeasurement = pt::microsec_clock::universal_time();
    
            pos_iterator_type first(iterator_type{fin >> std::noskipws}, {}, file.string());
            qi::phrase_parse<pos_iterator_type>(first, {}, fG, boost::spirit::ascii::space, this->fV);
    
            const pt::ptime endMeasurement = pt::microsec_clock::universal_time();
            pt::time_duration duration (endMeasurement - startMeasurement);
            std::cerr << duration <<  std::endl;
        } catch (std::exception const& e) {
            cerr << "error message: " << e.what() << endl;
        }   
    }
    
    /// Parses input.txt; only the timing printed by the reader is of interest here.
    int main() {
        std::ios_base::sync_with_stdio(false);

        FastaReader reader("input.txt");

        // Output deliberately disabled for the timing run:
        //for (auto& e : reader) std::cout << '>' << e.first << '\n' << e.second << "\n\n";
        return 0;
    }
    

    This is still slow. Let's see what takes so long:

    enter image description here

    That's pretty, but hardly tells us what we need to know. This however does: top-N time consumers are

    enter image description here

    So most time is spent in istream iteration and the multi-pass adaptor. You could argue that the multipass adaptor could be optimized for by flushing it once in a while (each line?) but really, we would prefer not to be tied to the whole stream and operate on the (stream) buffer instead.

    So, I thought: let's use a mapped file instead:

    Next: Step 2. Faster with mmap

    0 讨论(0)
  • 2021-02-10 14:39

    Previous: Step 3: MOAR FASTER WITH ZERO-COPY
    Return to Step 1. Cleaning up + Profiling

    Step 4: Dropping the position iterator

    Since you're not using it, we can drop the stateful iterator, which is likely to inhibit quite a lot of optimizations (and was indirectly visible in the profiler output).

    Live On Coliru

    #define BOOST_SPIRIT_USE_PHOENIX_V3
    #include <boost/filesystem/path.hpp>
    #include <boost/utility/string_ref.hpp>
    #include <boost/iostreams/device/mapped_file.hpp>
    namespace io = boost::iostreams;
    namespace fs = boost::filesystem;
    
    
    class FastaReader {
    
    public:
        typedef std::pair<boost::string_ref, boost::string_ref> Entry;
        typedef std::vector<Entry> Data;
    
    private:
        Data fV;
        fs::path file;  
    
    public:
        FastaReader(const fs::path & f);
        ~FastaReader();
    
        const fs::path & getFile() const;
        const Data::const_iterator begin() const;
        const Data::const_iterator end() const;   
    
    private:
        io::mapped_file_source mmap;
        void parse();
    
    };
    
    #include <iomanip>
    #include <boost/date_time/posix_time/posix_time.hpp>
    #include <boost/filesystem/fstream.hpp>
    #include <boost/filesystem/operations.hpp>
    #include <boost/filesystem/path.hpp>
    
    #include <boost/spirit/include/qi.hpp>
    #include <boost/spirit/include/phoenix.hpp>
    #include <boost/fusion/adapted/std_pair.hpp>
    //#include "fastaReader.hpp"
    
    #include <boost/iostreams/device/mapped_file.hpp>
    
    using namespace std;
    
    namespace fs = boost::filesystem;
    namespace qi = boost::spirit::qi;
    namespace pt = boost::posix_time;
    namespace io = boost::iostreams;
    
    namespace boost { namespace spirit { namespace traits {
        /// Customization point: lets Qi assign a matched iterator range to a
        /// boost::string_ref attribute. The string_ref is constructed from `first` and
        /// the distance to `last`, with no unwrapping step.
        template <typename It>
        struct assign_to_attribute_from_iterators<boost::string_ref, It, void> {
            static void call(It first, It last, boost::string_ref& attr) {
                const auto length = static_cast<std::size_t>(std::distance(first, last));
                attr = boost::string_ref(first, length);
            }
        };
    } } }
    
    /// Qi grammar for FASTA-like input: a sequence of entries, each a '>' header line
    /// followed by body text running up to the next '>' (or the end of input).
    ///
    /// Both fields are wrapped in raw[], so each attribute is produced from the matched
    /// iterator range instead of being collected character by character.
    template <typename Iterator>
    struct FastaGrammar : qi::grammar<Iterator, FastaReader::Data()> {

        FastaGrammar() : FastaGrammar::base_type(fasta) {
            using namespace qi;

            // header: text after '>' up to the newline; body: text up to the next '>'
            entry = ('>' >> raw[ *~char_('\n') ] >> '\n' >> raw[ *~char_('>') ]);
            // any number of entries, optional trailing line breaks, then end of input
            fasta = *entry >> *eol >> eoi ;

            BOOST_SPIRIT_DEBUG_NODES((fasta)(entry));
        }
      private:
        qi::rule<Iterator, FastaReader::Data()>  fasta;
        qi::rule<Iterator, FastaReader::Entry()> entry;
    };
    
    /// Stores the path, maps the file, then parses it.
    ///
    /// The mapping is opened in the body rather than in the member-initializer list so
    /// that parse() gets to raise its own "No file specified" / "File not found" errors.
    /// Building the mapping eagerly would fail inside Boost.Iostreams first for a missing
    /// file, and those checks would never be reached.
    FastaReader::FastaReader(const fs::path & f) : file(f) {
        if (fs::is_regular_file(file)) {
            mmap.open(file.c_str());
        }
        parse();
    }
    
    /// Nothing to release by hand; the members clean up after themselves.
    FastaReader::~FastaReader() = default;
    
    /// Returns the path this reader was constructed with.
    const fs::path & FastaReader::getFile() const {
        return file;
    }
    
    
    /// Read-only iterator to the first parsed entry.
    const FastaReader::Data::const_iterator FastaReader::begin() const {
        const Data & entries = fV;
        return entries.begin();
    }
    
    
    /// Read-only past-the-end iterator over the parsed entries.
    const FastaReader::Data::const_iterator FastaReader::end() const {
        const Data & entries = fV;
        return entries.end();
    }
    
    /// Parses the memory-mapped file into fV using plain char pointers as iterators and
    /// prints the elapsed time to stderr.
    ///
    /// Throws std::runtime_error when the path is empty or is not a regular file.
    /// std::exception-derived errors raised while parsing are caught and logged to
    /// stderr. A grammar mismatch is now reported on stderr as well instead of being
    /// dropped silently; fV may still hold entries stored before the mismatch.
    void FastaReader::parse() {
        if (this->file.empty())                throw std::runtime_error("FastaReader: No file specified.");
        if (! fs::is_regular_file(this->file)) throw std::runtime_error(string("FastaReader: File not found: ") + this->file.string());

        typedef char const*                  iterator_type;
        typedef FastaGrammar<iterator_type>  fastaGr;

        static const fastaGr fG{};
        try {
            std::cerr << "Measuring: Parsing." << std::endl;
            const pt::ptime startMeasurement = pt::microsec_clock::universal_time();

            iterator_type first(mmap.data()), last(mmap.end());
            // phrase_parse signals a grammar mismatch through its return value, not by throwing.
            const bool matched = qi::phrase_parse(first, last, fG, boost::spirit::ascii::space, this->fV);

            const pt::ptime endMeasurement = pt::microsec_clock::universal_time();
            pt::time_duration duration (endMeasurement - startMeasurement);
            std::cerr << duration <<  std::endl;

            if (!matched) {
                std::cerr << "FastaReader: Parse failed: " << this->file.string() << std::endl;
            }
        } catch (std::exception const& e) {
            cerr << "error message: " << e.what() << endl;
        }
    }
    
    /// Parses input.txt and echoes every entry back in FASTA form.
    int main() {
        FastaReader reader("input.txt");

        for (const auto& entry : reader) {
            std::cout << '>' << entry.first << '\n' << entry.second << "\n\n";
        }
        return 0;
    }
    

    Now it's 74.8x faster.

    $ time ./test | head -n4
    Measuring: Parsing.
    00:00:00.194432
    
    0 讨论(0)
  • 2021-02-10 14:51

    Previous: Step 1. Cleaning up + Profiling
    Next: Step 3: MOAR FASTER WITH ZERO-COPY

    Step 2. Faster with mmap

    Live On Coliru

    #include <boost/filesystem/path.hpp>
    
    namespace fs = boost::filesystem;
    
    
    /// Reads a FASTA-like file and exposes its entries as (header, body) string pairs.
    class FastaReader {
    public:
        using Entry = std::pair<std::string, std::string>;
        using Data  = std::vector<Entry>;

        FastaReader(const fs::path & f);
        ~FastaReader();

        /// Path this reader was constructed with.
        const fs::path & getFile() const;
        /// Iterators over the parsed entries.
        const Data::const_iterator begin() const;
        const Data::const_iterator end() const;

    private:
        void parse();

        Data fV;
        fs::path file;
    };
    
    #include <iomanip>
    #include <boost/date_time/posix_time/posix_time.hpp>
    #include <boost/filesystem/fstream.hpp>
    #include <boost/filesystem/operations.hpp>
    #include <boost/filesystem/path.hpp>
    
    #include <boost/spirit/include/classic_position_iterator.hpp>
    #include <boost/spirit/include/qi.hpp>
    #include <boost/fusion/adapted/std_pair.hpp>
    //#include "fastaReader.hpp"
    
    #include <boost/iostreams/device/mapped_file.hpp>
    
    using namespace std;
    
    namespace fs = boost::filesystem;
    namespace qi = boost::spirit::qi;
    namespace pt = boost::posix_time;
    namespace io = boost::iostreams;
    
    /// Single-rule Qi grammar for FASTA-like input. Each entry is a '>' header line
    /// followed by body text that runs up to the next '>'; the two character sequences
    /// of an entry map onto FastaReader::Entry.
    template <typename Iterator>
    struct FastaGrammar : qi::grammar<Iterator, FastaReader::Data()> {
        FastaGrammar() : FastaGrammar::base_type(fasta) {
            using namespace qi;

            // zero or more entries, optional trailing line breaks, then end of input
            fasta = *('>' >> *~char_('\n') >> '\n' >> *~char_('>')) >> *eol >> eoi;

            BOOST_SPIRIT_DEBUG_NODES((fasta));
        }

        qi::rule<Iterator, FastaReader::Data()> fasta;
    };
    
    
    /// Stores the path and parses the file right away.
    FastaReader::FastaReader(const fs::path & f)
        : file(f)
    {
        this->parse();
    }
    
    /// Nothing to release by hand; the members clean up after themselves.
    FastaReader::~FastaReader() = default;
    
    /// Returns the path this reader was constructed with.
    const fs::path & FastaReader::getFile() const {
        return file;
    }
    
    
    /// Read-only iterator to the first parsed entry.
    const FastaReader::Data::const_iterator FastaReader::begin() const {
        const Data & entries = fV;
        return entries.begin();
    }
    
    
    /// Read-only past-the-end iterator over the parsed entries.
    const FastaReader::Data::const_iterator FastaReader::end() const {
        const Data & entries = fV;
        return entries.end();
    }
    
    /// Maps the file into memory, parses it into fV and prints the elapsed time to stderr.
    ///
    /// Throws std::runtime_error when the path is empty or is not a regular file.
    /// std::exception-derived errors raised while parsing are caught and logged to
    /// stderr. A grammar mismatch is now reported on stderr as well instead of being
    /// dropped silently; fV may still hold entries stored before the mismatch.
    void FastaReader::parse() {
        if (this->file.empty())                throw std::runtime_error("FastaReader: No file specified.");
        if (! fs::is_regular_file(this->file)) throw std::runtime_error(string("FastaReader: File not found: ") + this->file.string());

        typedef char const*                                               iterator_type;
        typedef boost::spirit::classic::position_iterator2<iterator_type> pos_iterator_type;
        typedef FastaGrammar<pos_iterator_type>                           fastaGr;

        // Local mapping: entries are std::string here, so nothing refers to it after parsing.
        io::mapped_file_source mmap(file.c_str());

        static const fastaGr fG{};
        try {
            std::cerr << "Measuring: Parsing." << std::endl;
            const pt::ptime startMeasurement = pt::microsec_clock::universal_time();

            pos_iterator_type first(iterator_type{mmap.data()}, iterator_type{mmap.end()}, file.string());
            // phrase_parse signals a grammar mismatch through its return value, not by throwing.
            const bool matched = qi::phrase_parse<pos_iterator_type>(first, {}, fG, boost::spirit::ascii::space, this->fV);

            const pt::ptime endMeasurement = pt::microsec_clock::universal_time();
            pt::time_duration duration (endMeasurement - startMeasurement);
            std::cerr << duration <<  std::endl;

            if (!matched) {
                std::cerr << "FastaReader: Parse failed: " << this->file.string() << std::endl;
            }
        } catch (std::exception const& e) {
            cerr << "error message: " << e.what() << endl;
        }
    }
    
    /// Parses input.txt; only the timing printed by the reader is of interest here.
    int main() {
        FastaReader reader("input.txt");

        // Output deliberately disabled for the timing run:
        //for (auto& e : reader) std::cout << '>' << e.first << '\n' << e.second << "\n\n";
        return 0;
    }
    

    Indeed on my system it's roughly 3x faster (input is 229 MiB):

    $ ./mapped_file_source
    Measuring: Parsing.
    00:00:07.385787
    

    Next: Step 3: MOAR FASTER WITH ZERO-COPY

    0 讨论(0)
提交回复
热议问题