How to replace/ignore invalid Unicode/UTF8 characters � from C stdio.h getline()?

前端 未结 3 985
旧时难觅i
旧时难觅i 2021-01-03 08:36

In Python, there is the option errors='ignore' for the built-in open function:

open( \'/filepath.txt\',          


        
3条回答
  •  孤街浪徒
    2021-01-03 09:14

    I also managed to fix it by trimming/cutting out all non-ASCII characters.

    This one takes about 2.6 seconds to parse 319MB:

    #include <clocale>
    #include <iostream>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/types.h>
    
    // Benchmark: reads ./test.txt line by line with POSIX getline() and copies
    // every byte in the range 32..127 into a second buffer, dropping control
    // characters (including the line terminator) and all non-ASCII bytes.
    // The last filtered line is printed to stderr.
    //
    // Returns 0 on success and -1 when the file cannot be opened or memory
    // cannot be allocated.
    int main(int argc, char const *argv[])
    {
        FILE* cfilestream = fopen( "./test.txt", "r" );
        size_t linebuffersize = 131072;
    
        if( cfilestream == NULL ) {
            perror( "fopen cfilestream" );
            return -1;
        }
    
        char* readline = (char*) malloc( linebuffersize );
        if( readline == NULL ) {
            perror( "malloc readline" );
            fclose( cfilestream );
            return -1;
        }
    
        size_t fixedbuffersize = linebuffersize;
        char* fixedreadline = (char*) malloc( fixedbuffersize );
        if( fixedreadline == NULL ) {
            perror( "malloc fixedreadline" );
            free( readline );
            fclose( cfilestream );
            return -1;
        }
        // Keeps the final print well defined when the file has no lines at all.
        fixedreadline[0] = '\0';
    
        char* source;
        if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
            perror( "setlocale" );
        }
        else {
            std::cerr << "locale='" << source << "'" << std::endl;
        }
    
        int exitcode = 0;
        ssize_t charsread;
    
        while( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
        {
            // getline() reallocates readline for long lines; the output buffer
            // has to follow, otherwise the copy below would overflow it.
            if( fixedbuffersize < linebuffersize ) {
                char* grown = (char*) realloc( fixedreadline, linebuffersize );
                if( grown == NULL ) {
                    perror( "realloc fixedreadline" );
                    exitcode = -1;
                    break;
                }
                fixedreadline = grown;
                fixedbuffersize = linebuffersize;
            }
    
            size_t keptchars = 0;
            for( ssize_t index = 0; index < charsread; ++index )
            {
                // Go through unsigned char so bytes >= 0x80 are not negative.
                const unsigned char fixedchar = static_cast<unsigned char>( readline[index] );
    
                if( 31 < fixedchar && fixedchar < 128 ) {
                    fixedreadline[keptchars] = readline[index];
                    ++keptchars;
                }
            }
    
            fixedreadline[keptchars] = '\0';
            // std::cerr << "fixedreadline=" << fixedreadline << std::endl;
        }
        std::cerr << "fixedreadline=" << fixedreadline << std::endl;
    
        free( readline );
        free( fixedreadline );
    
        fclose( cfilestream );
        return exitcode;
    }
    

    Alternative and slower version using memcpy

    Using memmove does not improve the speed much, so you could use either one.

    This one takes about 3.1 seconds to parse 319MB:

    #include <clocale>
    #include <cstring>
    #include <iostream>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/types.h>
    
    // Benchmark: reads ./test.txt line by line with POSIX getline() and copies
    // each maximal run of bytes in the range 32..127 into a second buffer with
    // memcpy(), dropping control characters (including the line terminator) and
    // all non-ASCII bytes. The last filtered line is printed to stderr.
    //
    // Every line is written to the output buffer, so the result is always the
    // filtered text, even when the dropped bytes sit at the very end of a line.
    //
    // Returns 0 on success and -1 when the file cannot be opened or memory
    // cannot be allocated.
    int main(int argc, char const *argv[])
    {
        FILE* cfilestream = fopen( "./test.txt", "r" );
        size_t linebuffersize = 131072;
    
        if( cfilestream == NULL ) {
            perror( "fopen cfilestream" );
            return -1;
        }
    
        char* readline = (char*) malloc( linebuffersize );
        if( readline == NULL ) {
            perror( "malloc readline" );
            fclose( cfilestream );
            return -1;
        }
    
        size_t fixedbuffersize = linebuffersize;
        char* fixedreadline = (char*) malloc( fixedbuffersize );
        if( fixedreadline == NULL ) {
            perror( "malloc fixedreadline" );
            free( readline );
            fclose( cfilestream );
            return -1;
        }
        // Keeps the final print well defined when the file has no lines at all.
        fixedreadline[0] = '\0';
    
        char* localename;
        if( ( localename = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
            perror( "setlocale" );
        }
        else {
            std::cerr << "locale='" << localename << "'" << std::endl;
        }
    
        int exitcode = 0;
        ssize_t charsread;
    
        while( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
        {
            // getline() reallocates readline for long lines; the output buffer
            // has to follow, otherwise the copies below would overflow it.
            if( fixedbuffersize < linebuffersize ) {
                char* grown = (char*) realloc( fixedreadline, linebuffersize );
                if( grown == NULL ) {
                    perror( "realloc fixedreadline" );
                    exitcode = -1;
                    break;
                }
                fixedreadline = grown;
                fixedbuffersize = linebuffersize;
            }
    
            const char* const lineend = readline + charsread;
            const char* runstart = NULL;  // first byte of the pending run of kept bytes
            char* destination = fixedreadline;
    
            for( const char* source = readline; source != lineend; ++source )
            {
                // Go through unsigned char so bytes >= 0x80 are not negative.
                const unsigned char fixedchar = static_cast<unsigned char>( *source );
    
                if( 31 < fixedchar && fixedchar < 128 ) {
                    if( runstart == NULL ) {
                        runstart = source;
                    }
                }
                else if( runstart != NULL ) {
                    // A dropped byte ends the run: flush it in one memcpy.
                    const size_t charstocopy = static_cast<size_t>( source - runstart );
                    memcpy( destination, runstart, charstocopy );
                    destination += charstocopy;
                    runstart = NULL;
                }
            }
    
            // Flush a run that reaches the end of the line (no trailing newline).
            if( runstart != NULL ) {
                const size_t charstocopy = static_cast<size_t>( lineend - runstart );
                memcpy( destination, runstart, charstocopy );
                destination += charstocopy;
            }
            *destination = '\0';
    
            // std::cerr << "finalresult=" << fixedreadline << std::endl;
        }
        std::cerr << "finalresult=" << fixedreadline << std::endl;
    
        free( readline );
        free( fixedreadline );
    
        fclose( cfilestream );
        return exitcode;
    }
    

    Optimized solution using iconv

    This takes about 4.6 seconds to parse 319MB of text.

    #include <clocale>
    #include <iostream>
    #include <iconv.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/types.h>
    
    // Compile it with:
    //     g++ -o main test.cpp -O3 -liconv
    //
    // Benchmark: reads ./test.txt line by line with POSIX getline() and passes
    // each line through an iconv "UTF-8" -> "UTF-8//IGNORE" conversion into a
    // second buffer, then strips one trailing "\n" or "\r\n". The last converted
    // line is printed to stderr.
    //
    // Returns 0 on success and -1 when the file cannot be opened, memory cannot
    // be allocated or the conversion descriptor cannot be created.
    int main(int argc, char const *argv[])
    {
        FILE* cfilestream = fopen( "./test.txt", "r" );
        size_t linebuffersize = 131072;
    
        if( cfilestream == NULL ) {
            perror( "fopen cfilestream" );
            return -1;
        }
    
        char* readline = (char*) malloc( linebuffersize );
        if( readline == NULL ) {
            perror( "malloc readline" );
            fclose( cfilestream );
            return -1;
        }
    
        size_t fixedbuffersize = linebuffersize;
        char* fixedreadline = (char*) malloc( fixedbuffersize );
        if( fixedreadline == NULL ) {
            perror( "malloc fixedreadline" );
            free( readline );
            fclose( cfilestream );
            return -1;
        }
        // Keeps the final print well defined when the file has no lines at all.
        fixedreadline[0] = '\0';
    
        char* localename;
        if( ( localename = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
            perror( "setlocale" );
        }
        else {
            std::cerr << "locale='" << localename << "'" << std::endl;
        }
    
        iconv_t conversiondescriptor = iconv_open("UTF-8//IGNORE", "UTF-8");
        if( conversiondescriptor == (iconv_t)-1 ) {
            // Without a descriptor every iconv() call below would be invalid.
            perror( "iconv_open conversiondescriptor" );
            free( readline );
            free( fixedreadline );
            fclose( cfilestream );
            return -1;
        }
    
        int exitcode = 0;
        ssize_t charsread;
    
        while( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
        {
            // getline() reallocates readline for long lines; the output buffer
            // has to follow so the converted line always fits.
            if( fixedbuffersize < linebuffersize ) {
                char* grown = (char*) realloc( fixedreadline, linebuffersize );
                if( grown == NULL ) {
                    perror( "realloc fixedreadline" );
                    exitcode = -1;
                    break;
                }
                fixedreadline = grown;
                fixedbuffersize = linebuffersize;
            }
    
            char* source = readline;
            size_t inchars = static_cast<size_t>( charsread );
    
            char* destination = fixedreadline;
            size_t outchars = fixedbuffersize - 1;  // keep one byte for the terminator
    
            // iconv() reports failure with (size_t)-1; any other value is the
            // count of non-reversible conversions and not an error.
            if( iconv( conversiondescriptor, &source, &inchars, &destination, &outchars ) == (size_t)-1 )
            {
                perror( "iconv" );
                // Put the descriptor back into its initial state for the next line.
                iconv( conversiondescriptor, NULL, NULL, NULL, NULL );
            }
    
            // Trim out the new line character ("\n" or "\r\n"), never stepping
            // in front of the buffer and never touching other characters.
            if( destination != fixedreadline && destination[-1] == '\n' ) {
                --destination;
                if( destination != fixedreadline && destination[-1] == '\r' ) {
                    --destination;
                }
            }
            *destination = '\0';
    
            // std::cerr << "fixedreadline='" << fixedreadline << "'" << std::endl;
        }
        std::cerr << "fixedreadline='" << fixedreadline << "'" << std::endl;
    
        free( readline );
        free( fixedreadline );
    
        if( fclose( cfilestream ) ) {
            perror( "fclose cfilestream" );
        }
    
        if( iconv_close( conversiondescriptor ) ) {
            perror( "iconv_close conversiondescriptor" );
        }
    
        return exitcode;
    }
    

    Slowest solution ever using mbtowc

    This takes about 24.2 seconds to parse 319MB of text.

    If you comment out the line fixedchar = mbtowc(NULL, source, charsread); and uncomment the line fixedchar = 1; (breaking the invalid characters removal) this will take 1.9 seconds instead of 24.2 seconds (also compiled with -O3 optimization level).

    #include <clocale>
    #include <cstring>
    
    #include <iostream>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/types.h>
    
    // Benchmark: reads ./test.txt line by line with POSIX getline() and compacts
    // each line in place, keeping only the byte sequences that mbtowc() accepts
    // as a character in the current locale and skipping every other byte.
    // One trailing "\n" or "\r\n" is stripped afterwards. The content of the
    // line buffer is printed to stderr at the end.
    //
    // Returns 0 on success and -1 when the file cannot be opened or memory
    // cannot be allocated.
    int main(int argc, char const *argv[])
    {
        FILE* cfilestream = fopen( "./test.txt", "r" );
        size_t linebuffersize = 131072;
    
        if( cfilestream == NULL ) {
            perror( "fopen cfilestream" );
            return -1;
        }
    
        char* readline = (char*) malloc( linebuffersize );
        if( readline == NULL ) {
            perror( "malloc readline" );
            fclose( cfilestream );
            return -1;
        }
        // Keeps the final print well defined when the file has no lines at all.
        readline[0] = '\0';
    
        char* source;
        char* lineend;
        char* destination;
        ssize_t charsread;  // bytes of the current line that are still unscanned
        int fixedchar;
    
        if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
            perror( "setlocale" );
        }
        else {
            std::cerr << "locale='" << source << "'" << std::endl;
        }
    
        while( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
        {
            lineend = readline + charsread;
            destination = readline;
    
            // Start every line from the initial conversion state.
            mbtowc( NULL, NULL, 0 );
    
            for( source = readline; source != lineend; )
            {
                // fixedchar = 1;
                fixedchar = mbtowc(NULL, source, charsread);
    
                // std::ostringstream contents;
                // for( int index = 0; index < fixedchar; ++index )
                //         contents << source[index];
    
                // std::cerr << "fixedchar=" << std::setw(10)
                //         << fixedchar << " -> '"
                //         << contents.str().c_str() << "'" << std::endl;
    
                if( fixedchar > 0 ) {
                    // memmove because source and destination share one buffer.
                    memmove( destination, source, fixedchar );
                    source += fixedchar;
                    destination += fixedchar;
                    charsread -= fixedchar;
                }
                else if( fixedchar < 0 ) {
                    // Invalid sequence: skip exactly one byte, shrink the
                    // remaining count accordingly and reset the state.
                    source += 1;
                    charsread -= 1;
                    mbtowc( NULL, NULL, 0 );
                    // std::cerr << "errno=" << strerror( errno ) << std::endl;
                }
                else {
                    // mbtowc() returned 0: a NUL byte inside the line ends it.
                    break;
                }
            }
    
            // Trim out the new line character ("\n" or "\r\n"), never stepping
            // in front of the buffer and never touching other characters.
            if( destination != readline && destination[-1] == '\n' ) {
                --destination;
                if( destination != readline && destination[-1] == '\r' ) {
                    --destination;
                }
            }
            *destination = '\0';
    
            // std::cerr << "readline='" << readline << "'" << std::endl;
        }
        std::cerr << "readline='" << readline << "'" << std::endl;
    
        if( fclose( cfilestream ) ) {
            perror( "fclose cfilestream" );
        }
    
        free( readline );
        return 0;
    }
    

    Fastest version from all my others above using memmove

    You cannot use memcpy here because the memory regions overlap!

    This takes about 2.4 seconds to parse 319MB.

    If you comment out the lines *destination = *source and memmove( destination, source, 1 ) (breaking the invalid characters removal), the performance is still almost the same as when memmove is being called. Here, calling memmove( destination, source, 1 ) is a little slower than directly doing *destination = *source;

    #include <clocale>
    #include <iostream>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/types.h>
    
    // Benchmark: reads ./test.txt line by line with POSIX getline() and compacts
    // each line in place, keeping only bytes in the range 32..127. Control
    // characters (so also "\r" and "\n") and all non-ASCII bytes are dropped.
    // The content of the line buffer is printed to stderr at the end.
    //
    // Returns 0 on success and -1 when the file cannot be opened or memory
    // cannot be allocated.
    int main(int argc, char const *argv[])
    {
        FILE* cfilestream = fopen( "./test.txt", "r" );
        size_t linebuffersize = 131072;
    
        if( cfilestream == NULL ) {
            perror( "fopen cfilestream" );
            return -1;
        }
    
        char* readline = (char*) malloc( linebuffersize );
        if( readline == NULL ) {
            perror( "malloc readline" );
            fclose( cfilestream );
            return -1;
        }
        // Keeps the final print well defined when the file has no lines at all.
        readline[0] = '\0';
    
        char* source;
        char* lineend;
        char* destination;
    
        ssize_t charsread;
        unsigned char fixedchar;
    
        if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
            perror( "setlocale" );
        }
        else {
            std::cerr << "locale='" << source << "'" << std::endl;
        }
    
        while( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
        {
            lineend = readline + charsread;
            destination = readline;
            for( source = readline; source != lineend; ++source )
            {
                // Go through unsigned char so bytes >= 0x80 are not negative.
                fixedchar = static_cast<unsigned char>( *source );
                // std::cerr << "fixedchar=" << std::setw(10)
                //         << fixedchar << " -> '" << *source << "'" << std::endl;
    
                if( 31 < fixedchar && fixedchar < 128 ) {
                    *destination = *source;
                    ++destination;
                }
            }
    
            // The filter above already removed the line terminator, so the
            // compacted text only needs its end marker. destination never
            // passes source, hence this write stays inside the buffer.
            *destination = '\0';
    
            // std::cerr << "readline='" << readline << "'" << std::endl;
        }
        std::cerr << "readline='" << readline << "'" << std::endl;
    
        if( fclose( cfilestream ) ) {
            perror( "fclose cfilestream" );
        }
    
        free( readline );
        return 0;
    }
    

    Bonus

    You can also use Python C Extensions (API).

    It takes about 2.3 seconds to parse 319MB without converting them to cached version UTF-8 char*

    And takes about 3.2 seconds to parse 319MB converting them to UTF-8 char*. And also takes about 3.2 seconds to parse 319MB converting them to cached ASCII char*.

    #define PY_SSIZE_T_CLEAN
    #include <Python.h>
    #include <iostream>
    
    // Instance layout of the FastFile Python type. It holds nothing besides the
    // object header supplied by the PyObject_HEAD macro.
    typedef struct
    {
        PyObject_HEAD
    }
    PyFastFile;
    
    // Definition of the "fastfilepackage" module, handed to PyModule_Create()
    // in PyInit_fastfilepackage(). The module exposes no functions of its own
    // (m_methods is NULL).
    static PyModuleDef fastfilepackagemodule =
    {
        // https://docs.python.org/3/c-api/module.html#c.PyModuleDef
        PyModuleDef_HEAD_INIT,
        "fastfilepackage", /* name of module */
        "Example module that wrapped a C++ object", /* module documentation, may be NULL */
        -1, /* size of per-interpreter state of the module, or 
                    -1 if the module keeps state in global variables. */
    
        NULL, /* PyMethodDef* m_methods */
        NULL, /* inquiry m_reload -- NOTE(review): the linked documentation names
                 this field m_slots in current Python 3 releases; confirm against
                 the targeted Python version */
        NULL, /* traverseproc m_traverse */
        NULL, /* inquiry m_clear */
        NULL, /* freefunc m_free */
    };
    
    // initialize PyFastFile Object
    //
    // tp_init slot of FastFile. Expects one positional string argument, the
    // file path. Opens the file through the built-in open() in text mode with
    // encoding "ASCII" and errors "ignore", iterates over all lines, requests
    // the UTF-8 representation of each line, prints the number of lines to
    // stderr and closes the file.
    //
    // Returns 0 on success. On failure it writes a diagnostic to stderr and
    // returns -1 with the Python exception left set, so the caller of
    // FastFile(...) receives the real error. Every reference acquired here is
    // released on all paths.
    static int PyFastFile_init(PyFastFile* self, PyObject* args, PyObject* kwargs) {
        (void) self;
        (void) kwargs;
        char* filepath;
    
        if( !PyArg_ParseTuple( args, "s", &filepath ) ) {
            return -1;
        }
    
        // All owned references are declared up front (and start as NULL) so the
        // single cleanup block below can release whatever was acquired.
        int status = -1;
        int linecount = 0;
        PyObject* iomodule = NULL;
        PyObject* openfunction = NULL;
        PyObject* openfile = NULL;
        PyObject* fileiterator = NULL;
        PyObject* readline = NULL;
        PyObject* closefunction = NULL;
        PyObject* closefileresult = NULL;
    
        iomodule = PyImport_ImportModule( "builtins" );
        if( iomodule == NULL ) {
            std::cerr << "ERROR: FastFile failed to import the io module '"
                    "(and open the file " << filepath << "')!" << std::endl;
            goto cleanup;
        }
    
        openfunction = PyObject_GetAttrString( iomodule, "open" );
        if( openfunction == NULL ) {
            std::cerr << "ERROR: FastFile failed get the io module open "
                    << "function (and open the file '" << filepath << "')!" << std::endl;
            goto cleanup;
        }
    
        openfile = PyObject_CallFunction(
                openfunction, "ssiss", filepath, "r", -1, "ASCII", "ignore" );
        if( openfile == NULL ) {
            std::cerr << "ERROR: FastFile failed to open the file'"
                    << filepath << "'!" << std::endl;
            goto cleanup;
        }
    
        fileiterator = PyObject_GetIter( openfile );
        if( fileiterator == NULL ) {
            std::cerr << "ERROR: FastFile failed get the io module iterator object"
                    << " (and open the file '" << filepath << "')!" << std::endl;
            goto cleanup;
        }
    
        // PyIter_Next() returns NULL both at the end of the iteration and on
        // error; PyErr_Occurred() below tells the two apart.
        while( ( readline = PyIter_Next( fileiterator ) ) != NULL ) {
            linecount += 1;
            const char* utf8line = PyUnicode_AsUTF8( readline );
            Py_DECREF( readline );
            // std::cerr << "linecount " << linecount
            //         << " '" << utf8line << "'" << std::endl;
            if( utf8line == NULL ) {
                break;
            }
        }
        std::cerr << "linecount " << linecount << std::endl;
    
        if( PyErr_Occurred() ) {
            std::cerr << "ERROR: FastFile failed to read the file '"
                    << filepath << "'!" << std::endl;
            goto cleanup;
        }
    
        closefunction = PyObject_GetAttrString( openfile, "close" );
        if( closefunction == NULL ) {
            std::cerr << "ERROR: FastFile failed get the close file function for '"
                    << filepath << "')!" << std::endl;
            goto cleanup;
        }
    
        closefileresult = PyObject_CallObject( closefunction, NULL );
        if( closefileresult == NULL ) {
            std::cerr << "ERROR: FastFile failed close open file '"
                    << filepath << "')!" << std::endl;
            goto cleanup;
        }
        status = 0;
    
    cleanup:
        Py_XDECREF( closefileresult );
        Py_XDECREF( closefunction );
        Py_XDECREF( fileiterator );
        Py_XDECREF( openfile );
        Py_XDECREF( openfunction );
        Py_XDECREF( iomodule );
        return status;
    }
    
    // Deallocator of FastFile: hands the instance back to the tp_free slot of
    // its own type.
    static void PyFastFile_dealloc(PyFastFile* self) {
        PyObject* const instance = (PyObject*) self;
        freefunc const releasememory = Py_TYPE( instance )->tp_free;
        releasememory( instance );
    }
    
    // Type object of the FastFile class. Only the header and tp_name are set
    // statically; the remaining slots used by this module (tp_new,
    // tp_basicsize, tp_dealloc, tp_flags, tp_doc, tp_init) are assigned in
    // PyInit_fastfilepackage() before PyType_Ready() is called.
    static PyTypeObject PyFastFileType =
    {
        PyVarObject_HEAD_INIT( NULL, 0 )
        "fastfilepackage.FastFile" /* tp_name */
    };
    
    // create the module
    //
    // Module initialization function of "fastfilepackage". Fills in the slots
    // of PyFastFileType, readies the type, creates the module and registers
    // the type under the name "FastFile".
    //
    // Returns the new module object, or NULL when readying the type, creating
    // the module or registering the type fails.
    PyMODINIT_FUNC PyInit_fastfilepackage(void)
    {
        PyObject* thismodule;
    
        // https://docs.python.org/3/c-api/typeobj.html
        PyFastFileType.tp_new = PyType_GenericNew;
        PyFastFileType.tp_basicsize = sizeof(PyFastFile);
        PyFastFileType.tp_dealloc = (destructor) PyFastFile_dealloc;
        PyFastFileType.tp_flags = Py_TPFLAGS_DEFAULT;
        PyFastFileType.tp_doc = "FastFile objects";
        PyFastFileType.tp_init = (initproc) PyFastFile_init;
    
        if( PyType_Ready( &PyFastFileType) < 0 ) {
            return NULL;
        }
    
        thismodule = PyModule_Create(&fastfilepackagemodule);
        if( thismodule == NULL ) {
            return NULL;
        }
    
        // Add FastFile class to thismodule allowing the use to create objects
        Py_INCREF( &PyFastFileType );
        if( PyModule_AddObject( thismodule, "FastFile", (PyObject*) &PyFastFileType ) < 0 ) {
            // PyModule_AddObject() takes over the reference only on success, so
            // on failure both the type reference and the module are released.
            Py_DECREF( &PyFastFileType );
            Py_DECREF( thismodule );
            return NULL;
        }
        return thismodule;
    }
    

    To build it, create the file source/fastfilewrapper.cpp with the contents of the above file and the setup.py with the following contents:

    #! /usr/bin/env python
    # -*- coding: utf-8 -*-
    # Build script for the fastfilepackage C++ extension module.
    from setuptools import Extension, setup
    
    # The same language-standard flag is handed to the compiler and the linker.
    cxx_standard_flags = ["-std=c++11"]
    
    fastfile_extension = Extension(
        name = 'fastfilepackage',
        sources = ['source/fastfilewrapper.cpp'],
        include_dirs = ['source'],
        language = "c++",
        extra_compile_args = list( cxx_standard_flags ),
        extra_link_args = list( cxx_standard_flags ),
    )
    
    setup(
        name = 'fastfilepackage',
        ext_modules = [fastfile_extension],
    )
    

    To run the example, use the following Python script:

    import time
    import datetime
    import fastfilepackage
    
    # Measure how long constructing a FastFile for the test file takes and
    # report the elapsed wall-clock time as a timedelta.
    input_path = './test.txt'
    started_at = time.time()
    fastfile = fastfilepackage.FastFile( input_path )
    
    elapsed = datetime.timedelta( seconds=time.time() - started_at )
    print( 'FastFile timedifference', elapsed, flush=True )
    

    Example:

    user@user-pc$ /usr/bin/pip3.6 install .
    Processing /fastfilepackage
    Building wheels for collected packages: fastfilepackage
      Building wheel for fastfilepackage (setup.py) ... done
      Stored in directory: /pip-ephem-wheel-cache-j313cpzc/wheels/e5/5f/bc/52c820
    Successfully built fastfilepackage
    Installing collected packages: fastfilepackage
      Found existing installation: fastfilepackage 0.0.0
        Uninstalling fastfilepackage-0.0.0:
          Successfully uninstalled fastfilepackage-0.0.0
    Successfully installed fastfilepackage-0.0.0
    
    user@user-pc$ /usr/bin/python3.6 fastfileperformance.py
    linecount 820800
    FastFile timedifference 0:00:03.204614
    

    Using std::getline

    This takes about 4.7 seconds to parse 319MB.

    If you remove the UTF-8 removal algorithm borrowed from the fastest benchmark using stdio.h getline(), it takes 1.7 seconds to run.

    #include <clocale>
    #include <fstream>
    #include <iostream>
    #include <stdio.h>
    #include <stdlib.h>
    
    // Benchmark: reads ./test.txt line by line with std::ifstream::getline()
    // into a fixed-size buffer and compacts each line in place, keeping only
    // bytes in the range 32..127. The number of lines is printed to stderr.
    //
    // A line longer than the buffer is processed in several chunks and counted
    // once. A last line without a trailing newline is counted as well.
    //
    // Returns 0 on success and -1 when the file cannot be opened or memory
    // cannot be allocated.
    int main(int argc, char const *argv[])
    {
        unsigned char fixedchar;
        int linecount = 0;
    
        char* source;
        char* lineend;
        char* destination;
    
        // The byte filter below does not depend on the locale, so a missing
        // locale is reported but not fatal (same as the other benchmarks).
        if( ( source = std::setlocale( LC_ALL, "en_US.ascii" ) ) == NULL ) {
            perror( "setlocale" );
        }
        else {
            std::cerr << "locale='" << source << "'" << std::endl;
        }
    
        std::ifstream fileifstream{ "./test.txt" };
        if( fileifstream.fail() ) {
            std::cerr << "ERROR: FastFile failed to open the file!" << std::endl;
            return -1;
        }
    
        size_t linebuffersize = 131072;
        char* readline = (char*) malloc( linebuffersize );
    
        if( readline == NULL ) {
            perror( "malloc readline" );
            return -1;
        }
    
        // True while the chunks read so far belong to a line whose end has not
        // been seen yet (the line did not fit into the buffer).
        bool partialline = false;
    
        while( true )
        {
            fileifstream.getline( readline, linebuffersize );
            std::streamsize length = fileifstream.gcount();
    
            if( fileifstream.bad() ) {
                break;
            }
    
            if( fileifstream.eof() ) {
                // End of file: either nothing was left, or this is the last
                // line, which has no trailing newline.
                if( length == 0 ) {
                    break;
                }
                linecount += 1;
                partialline = false;
            }
            else if( fileifstream.fail() ) {
                // The buffer filled up before the delimiter was found.
                if( length == 0 ) {
                    break;  // no progress possible, avoid looping forever
                }
                fileifstream.clear();
                partialline = true;
            }
            else {
                // gcount() includes the extracted delimiter, which is not
                // stored in the buffer.
                length -= 1;
                linecount += 1;
                partialline = false;
            }
    
            lineend = readline + length;
            destination = readline;
    
            for( source = readline; source != lineend; ++source )
            {
                // Go through unsigned char so bytes >= 0x80 are not negative.
                fixedchar = static_cast<unsigned char>( *source );
                // std::cerr << "fixedchar=" << std::setw(10)
                //         << fixedchar << " -> '" << *source << "'" << std::endl;
    
                if( 31 < fixedchar && fixedchar < 128 ) {
                    *destination = *source;
                    ++destination;
                }
            }
    
            // The delimiter is never stored and "\r" is removed by the filter,
            // so the compacted text only needs its end marker.
            *destination = '\0';
    
            // std::cerr << "readline='" << readline << "'" << std::endl;
        }
    
        // An over-long line that ran into the end of the file still counts.
        if( partialline ) {
            linecount += 1;
        }
        std::cerr << "linecount='" << linecount << "'" << std::endl;
    
        if( fileifstream.is_open() ) {
            fileifstream.close();
        }
    
        free( readline );
        return 0;
    }
    

    Summary

    1. 2.6 seconds trimming UTF-8 using two buffers with indexing
    2. 3.1 seconds trimming UTF-8 using two buffers with memcpy
    3. 4.6 seconds removing invalid UTF-8 with iconv
    4. 24.2 seconds removing invalid UTF-8 with mbtowc
    5. 2.4 seconds trimming UTF-8 using one buffer with pointer direct assigning

    Bonus

    1. 2.3 seconds removing invalid UTF-8 without converting them to a cached UTF-8 char*
    2. 3.2 seconds removing invalid UTF-8 converting them to a cached UTF-8 char*
    3. 3.2 seconds trimming UTF-8 and caching as ASCII char*
    4. 4.7 seconds trimming UTF-8 with std::getline() using one buffer with pointer direct assigning

    The file used, ./test.txt, had 820,800 lines, where each line was equal to:

    id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char\r\n

    And all versions were compiled with

    1. g++ (GCC) 7.4.0
    2. iconv (GNU libiconv 1.14)
    3. g++ -o main test.cpp -O3 -liconv && time ./main

提交回复
热议问题