Why is valarray so slow?

后端 未结 7 1961
情书的邮戳 2020-12-23 15:18

I am trying to use valarray since it behaves much like MATLAB when operating on vectors and matrices. I first did some performance checks and found that valarray cannot achieve the performance I expected.

  • 2020-12-23 15:21

    I finally got this working by using delayed evaluation. The code may be ugly, since I am only just starting to learn these advanced C++ concepts.

    Here is the code:

    #include <iostream>
    #include <valarray>
    #include <iostream>
    #include "windows.h"
    using namespace std;
    SYSTEMTIME stime;
    LARGE_INTEGER sys_freq;
    double gettime_hp();
    // Goal: speed up c = a*b. The naive operator* builds a temporary, assigns it to 'c',
    // and then destroys the temporary, which makes the program really slow.
    // The solution is an expression template: let the evaluation happen once the whole expression is known.
    // (Delayed evaluation.)
    //typedef valarray<double> Vector;
    class Vector;

    // Lazy element-wise product of two Vectors.
    // Holds only references to its operands, so building it allocates nothing;
    // the multiplication itself is carried out by Vector::operator=(const VecMul&).
    // Because it stores references, a VecMul must not outlive the Vectors it refers to.
    class VecMul
    {
    public:
        const Vector& va;
        const Vector& vb;
        //Vector& vc;
        VecMul(const Vector& v1, const Vector& v2): va(v1), vb(v2) {}
        operator Vector();
    };

    // Fixed-size vector of doubles whose product with another Vector is evaluated lazily.
    // The elements live in a heap-allocated valarray owned by this object; the valarray
    // base sub-object is kept for interface compatibility.
    class Vector:public std::valarray<double>
    {
        std::valarray<double> *p;   // owned storage, released in the destructor

    public:
        // Creates a vector of n value-initialised (zero) elements.
        explicit Vector(int n)
            : p(new std::valarray<double>(n))
        {
        }

        // Deep copy: the new object gets its own storage instead of aliasing other's.
        Vector(const Vector& other)
            : std::valarray<double>(other), p(new std::valarray<double>(*other.p))
        {
        }

        ~Vector()
        {
            delete p;
        }

        // Deep copy assignment; safe against self-assignment.
        Vector& operator = (const Vector& other)
        {
            if (this != &other)
            {
                std::valarray<double>::operator=(other);
                // Resize first: assigning valarrays of different length is not portable.
                if (p->size() != other.p->size())
                    p->resize(other.p->size());
                *p = *other.p;
            }
            return *this;
        }

        // Evaluates the deferred product straight into this vector's storage,
        // so no temporary vector is created. Iterates over m.va.size() elements;
        // this vector and m.vb are expected to be at least that long.
        Vector& operator = (const VecMul &m)
        {
            const int count = m.va.size();
            for (int i = 0; i < count; i++)
                (*p)[i] = (m.va)[i]*(m.vb)[i];
            return *this;
        }

        // Element access. Declared const yet returns a mutable reference because the
        // elements live behind the owned pointer; it hides valarray's own operator[].
        double& operator[](int i) const {return (*p)[i];}

        // Number of elements in the owned storage.
        int size()const {return static_cast<int>(p->size());}
    };

    // Builds the deferred product; nothing is computed until it is assigned to a Vector.
    inline VecMul operator*(const Vector& v1, const Vector& v2)
    {
        return VecMul(v1, v2);
    }
    int main()
        enum {N = 5*1024*1024};
        Vector a(N), b(N), c(N);
        int i, j;
        for (j=0 ; j<8 ; ++j)
            for (i=0 ; i<N ; ++i)
                a[i] = rand();
                b[i] = rand();
            double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
            double dtime = gettime_hp();
            for (i=0 ; i<N ; ++i)
                c1[i] = a1[i] * b1[i];
            dtime = gettime_hp()-dtime;
            cout << "double operator* " << dtime << " ms\n";
            dtime = gettime_hp();
            c = a*b;
            dtime = gettime_hp()-dtime;
            cout << "valarray operator* " << dtime << " ms\n";
            dtime = gettime_hp();
            for (i=0 ; i<N ; ++i)
                c[i] = a[i] * b[i];
            dtime = gettime_hp() - dtime;
            cout << "valarray[i] operator* " << dtime << " ms\n";
            cout << "------------------------------------------------------\n";
    double gettime_hp()
        LARGE_INTEGER tick;
        extern LARGE_INTEGER sys_freq;
        return (double)tick.QuadPart*1000.0/sys_freq.QuadPart;

    The running result on Visual studio is:

    double operator* 41.2031 ms
    valarray operator* 43.8407 ms
    valarray[i] operator* 42.49 ms
    0 讨论(0)
  • 2020-12-23 15:22

    The whole point of valarray is to be fast on vector machines, which x86 machines just aren't.

    A good implementation on a nonvector machine should be able to match the performance that you get with something like

    for (i=0; i < N; ++i) 
        c1[i] = a1[i] * b1[i];

    and a bad one of course won't. Unless there is something in the hardware to expedite parallel processing, that is going to be pretty close to the best that you can do.

    0 讨论(0)
  • 2020-12-23 15:24

    I think Michael Burr's reply is right. Maybe you can create a proxy ("virtual") type to serve as the return type of operator+, and then overload another operator= for this proxy type, like operator=(virtual type& v){&valarray=&v;v=NULL;} (roughly speaking).

    Of course, it is difficult to implement the idea on valarray. But when you create a new class, you can try this idea. And then, the efficiency for operator+ is almost the same as operator+=.

    0 讨论(0)
  • 2020-12-23 15:27

    Hmm... I tested Blitz++ and it's the same as valarray. Moreover, the Blitz++ [] operator is very slow.

    #include <blitz/array.h>
    #include <iostream>
    // Platform-specific timer support.
    #ifdef WIN32
    #include "windows.h"
    LARGE_INTEGER sys_freq;   // performance-counter frequency (ticks per second)
    SYSTEMTIME stime;         // Windows-only type, so it is declared under the WIN32 guard
    #endif
    #ifdef LINUX
    #include <time.h>         // clock_gettime, struct timespec
    #endif
    using namespace std;
    // Returns a timestamp in milliseconds; defined below.
    double gettime_hp();
    // Returns a timestamp in milliseconds.
    // On WIN32 it reads the high-resolution performance counter and scales it by sys_freq;
    // on LINUX it reads CLOCK_REALTIME via clock_gettime.
    // Only differences between two calls are meaningful for timing.
    double gettime_hp()
    {
        #if defined(WIN32)
            LARGE_INTEGER tick;
            extern LARGE_INTEGER sys_freq;
            // sys_freq is a zero-initialised global; query it on first use so the
            // division below never divides by zero.
            if (sys_freq.QuadPart == 0)
                QueryPerformanceFrequency(&sys_freq);
            // Read the counter; without this call 'tick' would be used uninitialised.
            QueryPerformanceCounter(&tick);
            return (double)tick.QuadPart * 1000.0 / sys_freq.QuadPart;
        #elif defined(LINUX)
            struct timespec timestamp;
            clock_gettime(CLOCK_REALTIME, &timestamp);
            // seconds -> ms plus nanoseconds -> ms
            return timestamp.tv_sec * 1000.0 + timestamp.tv_nsec * 1.0e-6;
        #endif
    }
    int main()
        int N = 5*1024*1024;
        // Create three-dimensional arrays of double
        Array<double, 1> a(N), b(N), c(N);
        int i, j;
        #ifdef WIN32
        for (j=0 ; j<8 ; ++j)
            for (i=0 ; i<N ; ++i)
                a[i] = rand();
                b[i] = rand();
            double* a1 = a.data(), *b1 = b.data(), *c1 = c.data();
            double dtime = gettime_hp();
            for (i=0 ; i<N ; ++i)
                c1[i] = a1[i] * b1[i];
            dtime = gettime_hp() - dtime;
            cout << "double operator* " << dtime << " ms\n";
            dtime = gettime_hp();
            c = a*b;
            dtime = gettime_hp() - dtime;
            cout << "blitz operator* " << dtime << " ms\n";
            dtime = gettime_hp();
            for (i=0 ; i<N ; ++i)
                c[i] = a[i] * b[i];
            dtime = gettime_hp() - dtime;
            cout << "blitz[i] operator* " << dtime<< " ms\n";
            cout << "------------------------------------------------------\n";
    0 讨论(0)
  • 2020-12-23 15:28

    I'm compiling in release x64, Visual Studio 2010. I changed your code very slightly:

        // Fragment of a benchmark loop body: times three in-place variants of a *= b.
        // a, b, c, N, i and gettime_hp() come from the enclosing code, not shown here.
        // c1 is taken but not used in this fragment.
        double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
        // Variant 1: plain loop over raw double pointers.
        double dtime = gettime_hp();
        for (i=0 ; i<N ; ++i)
            a1[i] *= b1[i];
        dtime = gettime_hp() - dtime;   // elapsed time; printed with an "ms" label
        cout << "double operator* " << dtime << " ms\n";
        // Variant 2: whole-array compound assignment, which writes into a directly
        // instead of assigning the result of a*b to a third array.
        dtime = gettime_hp();
        a *= b;
        dtime = gettime_hp() - dtime;
        cout << "valarray operator* " << dtime << " ms\n";
        // Variant 3: element-wise loop through the container's operator[].
        dtime = gettime_hp();
        for (i=0 ; i<N ; ++i)
            a[i] *= b[i];
        dtime = gettime_hp() - dtime;
        cout << "valarray[i] operator* " << dtime<< " ms\n";
        cout << "------------------------------------------------------\n" ;

    Here you can see that I used *= instead of c = a * b. In more modern mathematical libraries, very complex expression-template mechanisms are used to eliminate this problem. In this case, I actually got very slightly faster results from valarray, although that's probably just because the contents were already in the cache. The overhead that you are seeing comes simply from redundant temporaries and is nothing intrinsic to valarray specifically; you'd see the same behaviour with something like std::string.

    0 讨论(0)
  • 2020-12-23 15:30

    I suspect that the reason c = a*b is so much slower than performing the operations an element at a time is that the

    template<class T> valarray<T> operator*
        (const valarray<T>&, const valarray<T>&);

    operator must allocate memory to put the result into, then returns that by value.

    Even if a "swaptimization" is used to perform the copy, that function still has the overhead of

    • allocating the new block for the resulting valarray
    • initializing the new valarray (it's possible that this might be optimized away)
    • putting the results into the new valarray
    • paging in the memory for the new valarray as it is initialized or set with result values
    • deallocating the old valarray that gets replaced by the result
    0 讨论(0)