How to read a .gz file line-by-line in C++?

后端 未结 7 467
一整个雨季
一整个雨季 2020-12-29 23:08

I have a 3-terabyte .gz file and want to read its uncompressed content line by line in a C++ program. As the file is quite huge, I want to avoid loading it completely into memory.

相关标签:
7条回答
  • 2020-12-29 23:19

    Using zlib, I'm doing something along these lines:

    /// Reads the next line from an open zlib gzFile.
    ///
    /// The buffer starts at 256 bytes and doubles whenever gzgets() fills it
    /// without reaching a newline, so lines of arbitrary length are supported
    /// without loading the whole file.
    ///
    /// @param f  gzip stream to read from.
    /// @return   The line's characters without the trailing "\n" or "\r\n".
    ///           An empty vector is returned both for an empty line and when
    ///           gzgets() reports end-of-file or an error before any data
    ///           was read.
    std::vector< char > readline( gzFile f ) {
        std::vector< char > v( 256 );
        std::size_t pos = 0;  // offset in v where the next gzgets() chunk is stored
        for ( ;; ) {
            // gzgets() takes its capacity as an int; make the conversion explicit.
            const int room = static_cast< int >( v.size() - pos );
            if ( gzgets( f, &v[ pos ], room ) == 0 ) {
                // end-of-file or error
                int err;
                const char *msg = gzerror( f, &err );
                (void) msg;  // kept for whoever fills in the handler below
                if ( err != Z_OK ) {
                    // handle error (not propagated: the return type carries no status)
                }
                break;
            }
            const std::size_t read = strlen( &v[ pos ] );
            if ( read == 0 ) {
                // The chunk starts with a NUL byte, so there is no last
                // character to inspect. Stop here instead of indexing
                // v[ pos - 1 ], which underflows when pos is 0.
                break;
            }
            if ( v[ pos + read - 1 ] == '\n' ) {
                // Complete line: drop "\n", and a preceding '\r' if present
                // (it may be the last character of the previous chunk).
                if ( pos + read >= 2 && v[ pos + read - 2 ] == '\r' ) {
                    pos = pos + read - 2;
                } else {
                    pos = pos + read - 1;
                }
                break;
            }
            if ( pos + read < v.size() - 1 ) {
                // Buffer not filled and no newline: last line of the file.
                pos = read + pos;
                break;
            }
            // Buffer filled mid-line: continue at the terminating NUL and grow.
            pos = v.size() - 1;
            v.resize( v.size() * 2 );
        }
        v.resize( pos );
        return v;
    }
    

    EDIT: Removed two mis-copied * in the example above. EDIT: Corrected out of bounds read on v[pos + read - 2]

    0 讨论(0)
  • 2020-12-29 23:19

    Here is some code with which you can read normal and zipped files line by line:

    // Usage: print every line of `file`, whether plain or gzip-compressed.
    char line[0x10000];
    FILE *infile=open_file(file);
    bool gzipped=endsWith(file, ".gz");
    if(gzipped) 
        init_gzip_stream(infile,&line[0]);
    while (readLine(infile,line,gzipped)) {
        if(line[0]==0)continue;// skip gzip new_block
        // File content must never be used as the format string: a '%' in the
        // data would make printf read arguments that were not passed.
        printf("%s", line);
    }
    
    
    #include <zlib.h>
    // Size in bytes of the compressed-input buffer.
    #define CHUNK 0x100
    // Size in bytes of the decompressed-output buffer. Parenthesized so the
    // macro expands as a single value inside larger expressions
    // (e.g. `x % OUT_CHUNK`, `x / OUT_CHUNK`).
    #define OUT_CHUNK (CHUNK*100)
    unsigned char gzip_in[CHUNK];      // compressed bytes read from the file
    unsigned char gzip_out[OUT_CHUNK]; // inflated bytes
    ///* These are parameters to inflateInit2. See http://zlib.net/manual.html for the exact meanings. */
    // 15 is the largest window size accepted by inflateInit2.
    #define windowBits 15
    // Added to windowBits: enables automatic zlib/gzip header detection.
    #define ENABLE_ZLIB_GZIP 32
    // Single shared inflate state, zero-initialized until init_gzip_stream() runs.
    z_stream strm = {0};
    /// Initializes the global inflate state `strm` for zlib/gzip input.
    ///
    /// @param file  Unused; kept for interface compatibility.
    /// @param out   Unused; kept for interface compatibility.
    /// @return      A copy of the global `strm` after initialization.
    ///              NOTE(review): the copy shares zlib's internal state with
    ///              the global; presumably callers should keep using the
    ///              global `strm` rather than the returned value - confirm.
    ///
    /// A failing inflateInit2() is reported on stderr instead of being
    /// silently ignored; the function still returns so existing callers
    /// keep working.
    z_stream init_gzip_stream(FILE* file,char* out){// unsigned     
        (void)file;
        (void)out;
        strm.zalloc = Z_NULL;
        strm.zfree = Z_NULL;
        strm.opaque = Z_NULL;
        strm.next_in = gzip_in;
        strm.avail_in = 0;
        strm.next_out = gzip_out;
        const int rc = inflateInit2(&strm, windowBits | ENABLE_ZLIB_GZIP);
        if (rc != Z_OK) {
            fprintf(stderr, "init_gzip_stream: inflateInit2 failed (%d): %s\n",
                    rc, strm.msg ? strm.msg : "no message");
        }
        return strm;
    }
    
    // Inflates the bytes_read compressed bytes most recently read from `file`.
    //
    // file:       input file; only consulted via feof() to detect the end.
    // strm:       inflate state, received BY VALUE - the assignments to
    //             avail_in/avail_out below modify this function's copy only.
    // bytes_read: number of compressed bytes available to inflate.
    // Returns false (after calling inflateEnd) when feof(file) is set,
    // true otherwise.
    //
    // NOTE(review): the return value of inflate() is discarded, so corrupt
    // input is not detected here.
    // NOTE(review): when the loop repeats, avail_out is reset to OUT_CHUNK but
    // next_out is not reset in this function - verify the output cannot run
    // past the end of the output buffer.
    // NOTE(review): false is returned at end-of-file even though inflate() has
    // just run on the final bytes; callers that stop on false presumably never
    // see that last output - confirm.
    bool inflate_gzip(FILE* file, z_stream strm,size_t bytes_read){
                strm.avail_in = (int)bytes_read;
                do {
                    // Offer a full output buffer's worth of space on each pass.
                    strm.avail_out = OUT_CHUNK;
                    inflate (& strm, Z_NO_FLUSH);
    //              printf ("%s",gzip_out);
                // avail_out == 0 means inflate used all the offered space,
                // so more output may be pending.
                }while (strm.avail_out == 0);
                if (feof (file)) {
                    inflateEnd (& strm);
                    return false;
                }
        return true;// all OK
    }
    
    
    // State carried between readLine() calls on the gzip path.
    char* first_line=reinterpret_cast<char*>(&gzip_out[0]); // start of the inflate output buffer
    char* current_line=first_line;  // line handed out by the current call
    char* next_line=first_line;     // start of the following line; NULL when the block ended mid-line
    char hangover[1000];            // partial line waiting for the next block

    /// Reads the next line of `infile` into `line`.
    ///
    /// @param infile     Open input file.
    /// @param line       Destination buffer; must hold at least `line_size` bytes.
    /// @param gzipped    false: plain fgets() (the newline is kept).
    ///                   true: the file is inflated block by block and the
    ///                   newline is stripped.
    /// @param line_size  Capacity of `line` in bytes. Defaults to 0x10000, the
    ///                   size of the buffer used by the existing caller, so
    ///                   three-argument calls keep working.
    /// @return           false at end of input (or when `line` is unusable),
    ///                   true otherwise. On the gzip path a true return with
    ///                   line[0]==0 means "no complete line yet, call again".
    ///
    /// Partial lines longer than sizeof(hangover)-1 and lines longer than
    /// line_size-1 are truncated rather than written past the buffers.
    ///
    /// NOTE(review): strstr()/strlen() on the inflate output assume it is
    /// NUL-terminated; nothing in this function guarantees that - confirm.
    bool readLine(FILE* infile,char* line,bool gzipped,size_t line_size=0x10000){
        if(line==NULL || line_size==0)
            return false;
        if(!gzipped){
            // sizeof(line) would be the size of a pointer, not of the buffer;
            // use the real capacity, clamped to what fgets() accepts.
            const size_t max_request=0x7fffffff;
            const int request=static_cast<int>(line_size<max_request ? line_size : max_request);
            return fgets(line, request, infile) != NULL;
        }

        bool ok=true;
        current_line=next_line;
        if(!current_line || current_line[0]==0){
            // Current block exhausted: inflate the next one and start the
            // result with whatever partial line was left over.
            current_line=first_line;
            size_t bytes_read = fread(gzip_in, sizeof(char), CHUNK, infile);
            ok=inflate_gzip(infile,strm,bytes_read);
            snprintf(line, line_size, "%s", hangover);
        }
        if(!ok)
            return false;

        next_line=strstr(current_line,"\n");
        if(next_line){
            // Terminate the line in place and remember where the next starts.
            next_line[0]=0;
            next_line++;
            size_t prefix=strlen(hangover);
            if(prefix>line_size-1)
                prefix=line_size-1;
            snprintf(line+prefix, line_size-prefix, "%s", current_line);
            hangover[0]=0;
        }else{
            // No newline in the rest of this block: append it to the pending
            // partial line (keeping any earlier part) and report "no line yet".
            const size_t kept=strlen(hangover);
            snprintf(hangover+kept, sizeof(hangover)-kept, "%s", current_line);
            line[0]=0;// skip that one!!
        }
        return true;
    }
    
    0 讨论(0)
  • 2020-12-29 23:26

    The zlib library supports decompressing files in memory in blocks, so you don't have to decompress the entire file in order to process it.

    0 讨论(0)
  • 2020-12-29 23:26

    Chilkat (http://www.chilkatsoft.com/) has libraries to read compressed files from a C++, .Net, VB, ... application.

    0 讨论(0)
  • 2020-12-29 23:28

    For something that is going to be used regularly, you probably want to use one of the previous suggestions. Alternatively, you can do

    gzcat file.gz | yourprogram
    

    and have yourprogram read from cin. This will decompress parts of the file in memory as they are needed, and send the uncompressed output to yourprogram.

    0 讨论(0)
  • 2020-12-29 23:30

    You will most probably have to use zlib's inflate functions (inflate is the decompression counterpart of deflate); an example is available on the zlib site.

    Alternatively you may have a look at BOOST C++ wrapper

    The example from BOOST page (decompresses data from a file and writes it to standard output)

    #include <fstream>
    #include <iostream>
    #include <boost/iostreams/filtering_streambuf.hpp>
    #include <boost/iostreams/copy.hpp>
    #include <boost/iostreams/filter/zlib.hpp>
    
    int main() 
    {
        using namespace std;
    
        ifstream file("hello.z", ios_base::in | ios_base::binary);
        filtering_streambuf<input> in;
        in.push(zlib_decompressor());
        in.push(file);
        boost::iostreams::copy(in, cout);
    }
    
    0 讨论(0)
提交回复
热议问题