Why is valarray so slow?

后端 未结 7 1959
情书的邮戳
情书的邮戳 2020-12-23 15:18

I am trying to use valarray since it works much like MATLAB when operating on vectors and matrices. I first did some performance checks and found that valarray cannot achieve the p

相关标签:
7条回答
  • 2020-12-23 15:21

    I finally got this working by using delayed evaluation. The code may be ugly, since I am just starting to learn these advanced C++ concepts.

    Here is the code:

    #include <iostream>
    #include <valarray>
    #include <iostream>
    #include "windows.h"
    
    // Pull the std names used below (cout, valarray) into the global namespace.
    using namespace std;
    // NOTE(review): stime is not referenced anywhere in the visible code.
    SYSTEMTIME stime;
    // Performance-counter frequency: filled in by QueryPerformanceFrequency() in main()
    // and read by gettime_hp() to scale raw tick counts.
    LARGE_INTEGER sys_freq;
    
    // Returns the current high-resolution timestamp (ticks * 1000 / sys_freq); defined after main().
    double gettime_hp();
    
    // Goal: speed up c = a*b. The plain operator* first builds a temporary, assigns it to 'c',
    // and then destroys the temporary, which makes the program really slow.
    // The solution is an expression template: defer the work until the whole expression is known.
    
    
    // Delayed evaluation
    //typedef valarray<double> Vector;
    class Vector;

    /**
     * Deferred element-wise product of two Vectors (a minimal expression template).
     *
     * operator*(const Vector&, const Vector&) returns this proxy instead of computing
     * anything. The multiplication runs only when the proxy is assigned to, or converted
     * to, a Vector, so the result is written straight into its destination without an
     * intermediate array.
     *
     * The proxy stores references only: both operands must outlive it.
     */
    class VecMul
    {
        public:
            const Vector& va;   // left operand
            const Vector& vb;   // right operand
            VecMul(const Vector& v1, const Vector& v2): va(v1), vb(v2) {}
            // Materializes the product into a new Vector (defined after Vector).
            operator Vector();
    };

    /**
     * Array of doubles whose product expression (a*b) is evaluated lazily.
     *
     * The elements live in the std::valarray<double> base subobject, so the class owns
     * no raw resource: destruction, copying and moving are the compiler-generated ones
     * (deep copies, no leak), and a Vector can be passed wherever a valarray<double>
     * is expected.
     */
    class Vector: public std::valarray<double>
    {
        public:
            /// Creates a vector of n zero-initialized elements.
            explicit Vector(int n): std::valarray<double>(static_cast<std::size_t>(n)) {}

            /**
             * Evaluates the deferred product m and stores it in this vector.
             *
             * The result has min(m.va.size(), m.vb.size()) elements. When this vector
             * already has that size the product is written in place (safe even if this
             * vector is one of the operands, because element i depends only on element i).
             * Otherwise the product is built in a fresh array and swapped in, so an
             * operand that aliases the destination is never resized while being read.
             */
            Vector& operator = (const VecMul &m)
            {
                const int lhs = m.va.size();
                const int rhs = m.vb.size();
                const int n = lhs < rhs ? lhs : rhs;

                if (size() == n)
                {
                    for (int i = 0; i < n; i++)
                        (*this)[i] = m.va[i] * m.vb[i];
                }
                else
                {
                    std::valarray<double> product(static_cast<std::size_t>(n));
                    for (int i = 0; i < n; i++)
                        product[static_cast<std::size_t>(i)] = m.va[i] * m.vb[i];
                    std::valarray<double>::swap(product);
                }
                return *this;
            }

            /// Mutable element access; no bounds checking.
            double& operator[](int i)
            {
                return std::valarray<double>::operator[](static_cast<std::size_t>(i));
            }

            /// Read-only element access; no bounds checking.
            const double& operator[](int i) const
            {
                return std::valarray<double>::operator[](static_cast<std::size_t>(i));
            }

            /// Number of elements, as int to match the int-indexed interface above.
            int size() const
            {
                return static_cast<int>(std::valarray<double>::size());
            }
    };

    // Materializes the deferred product, e.g. for `Vector d = a * b;`.
    inline VecMul::operator Vector()
    {
        Vector result(va.size() < vb.size() ? va.size() : vb.size());
        result = *this;
        return result;
    }

    /// Builds the lazy product proxy; no arithmetic happens here.
    inline VecMul operator*(const Vector& v1, const Vector& v2)
    {
        return VecMul(v1, v2);
    }
    
    
    int main()
    {
        enum {N = 5*1024*1024};
        Vector a(N), b(N), c(N);
        QueryPerformanceFrequency(&sys_freq);
        int i, j;
        for (j=0 ; j<8 ; ++j)
        {
            for (i=0 ; i<N ; ++i)
            {
                a[i] = rand();
                b[i] = rand();
            }
    
            double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
            double dtime = gettime_hp();
            for (i=0 ; i<N ; ++i)
                c1[i] = a1[i] * b1[i];
            dtime = gettime_hp()-dtime;
            cout << "double operator* " << dtime << " ms\n";
    
            dtime = gettime_hp();
            c = a*b;
            dtime = gettime_hp()-dtime;
            cout << "valarray operator* " << dtime << " ms\n";
    
            dtime = gettime_hp();
            for (i=0 ; i<N ; ++i)
                c[i] = a[i] * b[i];
            dtime = gettime_hp() - dtime;
            cout << "valarray[i] operator* " << dtime << " ms\n";
    
            cout << "------------------------------------------------------\n";
        }
    }
    
    double gettime_hp()
    {
        LARGE_INTEGER tick;
        extern LARGE_INTEGER sys_freq;
        QueryPerformanceCounter(&tick);
        return (double)tick.QuadPart*1000.0/sys_freq.QuadPart;
    }
    

    The output when run under Visual Studio is:

    double operator* 41.2031 ms
    valarray operator* 43.8407 ms
    valarray[i] operator* 42.49 ms
    
    0 讨论(0)
  • 2020-12-23 15:22

    The whole point of valarray is to be fast on vector machines, which x86 machines just aren't.

    A good implementation on a nonvector machine should be able to match the performance that you get with something like

    for (i=0; i < N; ++i) 
        c1[i] = a1[i] * b1[i];
    

    and a bad one of course won't. Unless there is something in the hardware to expedite parallel processing, that is going to be pretty close to the best that you can do.

    0 讨论(0)
  • 2020-12-23 15:24

    I think Michael Burr's reply is right. Maybe you can create a virtual (proxy) type to serve as the return type of operator+, and overload another operator= for this virtual type, like operator=(virtual type& v){&valarray=&v;v=NULL;} (roughly speaking).

    Of course, it is difficult to implement this idea on valarray itself. But when you create a new class, you can try it; then the efficiency of operator+ is almost the same as that of operator+=.

    0 讨论(0)
  • 2020-12-23 15:27

    Hmm... I tested Blitz++ and it's the same as valarray. Moreover, the Blitz++ [] operator is very slow.

    #include <blitz/array.h>
    #include <iostream>
    
    // Windows: Win32 API plus the globals that depend on its types.
    #ifdef WIN32
    #include "windows.h"
    LARGE_INTEGER sys_freq;   // performance-counter frequency, set in main()
    SYSTEMTIME stime;         // NOTE(review): not referenced in the visible code
    #endif
    
    // struct timespec / clock_gettime() for the non-Windows timer path.
    // <ctime> is a standard header, so it is safe to include on every platform.
    #include <ctime>
    
    using namespace std;
    
    /**
     * Returns a high-resolution timestamp in milliseconds, intended for measuring
     * elapsed time as the difference of two calls. The origin is arbitrary.
     *
     * Windows (WIN32 defined): performance-counter ticks scaled by the global sys_freq,
     * which must have been filled in by QueryPerformanceFrequency() beforehand.
     * Everything else: POSIX clock_gettime() on CLOCK_MONOTONIC, which, unlike
     * CLOCK_REALTIME, cannot jump when the wall clock is adjusted.
     *
     * Exactly one branch is compiled, so the function always returns a value.
     */
    double gettime_hp()
    {
        #ifdef WIN32
            LARGE_INTEGER tick;
            extern LARGE_INTEGER sys_freq;
            QueryPerformanceCounter(&tick);
            return static_cast<double>(tick.QuadPart) * 1000.0 / sys_freq.QuadPart;
        #else
            struct timespec timestamp;
    
            clock_gettime(CLOCK_MONOTONIC, &timestamp);
            return timestamp.tv_sec * 1000.0 + timestamp.tv_nsec * 1.0e-6;
        #endif
    }
    BZ_USING_NAMESPACE(blitz)
    
    int main()
    {
        int N = 5*1024*1024;
    
        // Create three-dimensional arrays of double
        Array<double, 1> a(N), b(N), c(N);
    
        int i, j;
    
        #ifdef WIN32
            QueryPerformanceFrequency(&sys_freq);
        #endif
    
        for (j=0 ; j<8 ; ++j)
        {
            for (i=0 ; i<N ; ++i)
            {
                a[i] = rand();
                b[i] = rand();
            }
    
            double* a1 = a.data(), *b1 = b.data(), *c1 = c.data();
            double dtime = gettime_hp();
            for (i=0 ; i<N ; ++i)
                c1[i] = a1[i] * b1[i];
            dtime = gettime_hp() - dtime;
            cout << "double operator* " << dtime << " ms\n";
    
            dtime = gettime_hp();
            c = a*b;
            dtime = gettime_hp() - dtime;
            cout << "blitz operator* " << dtime << " ms\n";
    
            dtime = gettime_hp();
            for (i=0 ; i<N ; ++i)
                c[i] = a[i] * b[i];
            dtime = gettime_hp() - dtime;
            cout << "blitz[i] operator* " << dtime<< " ms\n";
    
            cout << "------------------------------------------------------\n";
        }
    }
    
    0 讨论(0)
  • 2020-12-23 15:28

    I'm compiling in release x64, Visual Studio 2010. I changed your code very slightly:

        // Excerpt from the body of a benchmark loop: a, b, c, i, N and gettime_hp() are
        // declared in the enclosing code, which is not shown here.
        // Every variant below multiplies in place (result stored back into a).

        // Variant 1: in-place multiply through raw element pointers.
        // c1 is still declared but is not used by any variant in this excerpt.
        double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
        double dtime = gettime_hp();
        for (i=0 ; i<N ; ++i)
            a1[i] *= b1[i];
        dtime = gettime_hp() - dtime;
        cout << "double operator* " << dtime << " ms\n";
    
        // Variant 2: whole-array compound assignment. Per the accompanying answer this
        // avoids the temporary that c = a * b would create.
        dtime = gettime_hp();
        a *= b;
        dtime = gettime_hp() - dtime;
        cout << "valarray operator* " << dtime << " ms\n";
    
        // Variant 3: element-wise compound assignment through operator[].
        dtime = gettime_hp();
        for (i=0 ; i<N ; ++i)
            a[i] *= b[i];
        dtime = gettime_hp() - dtime;
        cout << "valarray[i] operator* " << dtime<< " ms\n";
    
        cout << "------------------------------------------------------\n" ;
    

    Here you can see that I used *= instead of c = a * b. In more modern mathematical libraries, very complex expression template mechanisms are used that eliminate this problem. In this case, I actually got very slightly faster results from valarray, although that's probably just because the contents were already in the cache. The overhead that you are seeing is simply redundant temporaries and nothing intrinsic to valarray specifically; you'd see the same behaviour with something like std::string.

    0 讨论(0)
  • 2020-12-23 15:30

    I suspect that the reason c = a*b is so much slower than performing the operations an element at a time is that the

    template<class T> valarray<T> operator*
        (const valarray<T>&, const valarray<T>&);
    

    operator must allocate memory to put the result into, then returns that by value.

    Even if a "swaptimization" is used to perform the copy, that function still has the overhead of

    • allocating the new block for the resulting valarray
    • initializing the new valarray (it's possible that this might be optimized away)
    • putting the results into the new valarray
    • paging in the memory for the new valarray as it is initialized or set with result values
    • deallocating the old valarray that gets replaced by the result
    0 讨论(0)
提交回复
热议问题