问题:

Sorry to ask about valarray again. I am trying to use it because it works much like MATLAB when operating on vectors and matrices. I first ran a performance check and found that valarray does not achieve the performance claimed in Stroustrup's book The C++ Programming Language.

The test program performs 5M multiplications of doubles. I thought that c=a*b would be at least comparable to a for loop that multiplies the double elements one by one, but I was totally wrong. I tried this on several computers, with both VC6.0 and VS2008.

By the way, I tested on matlab using the following code:

len=5*1024*1024; a=rand(len,1);b=rand(len,1);c=zeros(len,1); tic;c=a.*b;toc;

and the result is 46 ms. This timing is not high-precision; it only serves as a reference.

The code is:

#include <iostream> #include <valarray> #include <iostream> #include "windows.h"  using namespace std ; SYSTEMTIME stime; LARGE_INTEGER sys_freq;  double gettime_hp();  int main() {     enum { N = 5*1024*1024 };     valarray<double> a(N), b(N), c(N) ;     QueryPerformanceFrequency(&sys_freq);        int i,j;     for(  j=0 ; j<8 ; ++j )     {         for(  i=0 ; i<N ; ++i )          {             a[i]=rand();             b[i]=rand();         }          double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0] ;         double dtime=gettime_hp();         for(  i=0 ; i<N ; ++i ) c1[i] = a1[i] * b1[i] ;         dtime=gettime_hp()-dtime;         cout << "double operator* " << dtime << " ms\n" ;          dtime=gettime_hp();         c = a*b ;         dtime=gettime_hp()-dtime;         cout << "valarray operator* " << dtime << " ms\n" ;          dtime=gettime_hp();         for(  i=0 ; i<N ; ++i ) c[i] = a[i] * b[i] ;         dtime=gettime_hp()-dtime;         cout << "valarray[i] operator* " << dtime<< " ms\n" ;          cout << "------------------------------------------------------\n" ;     } }  double gettime_hp() {     LARGE_INTEGER tick;     extern LARGE_INTEGER sys_freq;     QueryPerformanceCounter(&tick);     return (double)tick.QuadPart*1000.0/sys_freq.QuadPart; }

The running results: (release mode with maximal speed optimization)

double operator* 52.3019 ms valarray operator* 128.338 ms valarray[i] operator* 43.1801 ms ------------------------------------------------------ double operator* 43.4036 ms valarray operator* 145.533 ms valarray[i] operator* 44.9121 ms ------------------------------------------------------ double operator* 43.2619 ms valarray operator* 158.681 ms valarray[i] operator* 43.4871 ms ------------------------------------------------------ double operator* 42.7317 ms valarray operator* 173.164 ms valarray[i] operator* 80.1004 ms ------------------------------------------------------ double operator* 43.2236 ms valarray operator* 158.004 ms valarray[i] operator* 44.3813 ms ------------------------------------------------------

debugging mode with same optimization:

double operator* 41.8123 ms valarray operator* 201.484 ms valarray[i] operator* 41.5452 ms ------------------------------------------------------ double operator* 40.2238 ms valarray operator* 215.351 ms valarray[i] operator* 40.2076 ms ------------------------------------------------------ double operator* 40.5859 ms valarray operator* 232.007 ms valarray[i] operator* 40.8803 ms ------------------------------------------------------ double operator* 40.9734 ms valarray operator* 234.325 ms valarray[i] operator* 40.9711 ms ------------------------------------------------------ double operator* 41.1977 ms valarray operator* 234.409 ms valarray[i] operator* 41.1429 ms ------------------------------------------------------ double operator* 39.7754 ms valarray operator* 234.26 ms valarray[i] operator* 39.6338 ms ------------------------------------------------------

回答1:

I suspect that the reason c = a*b is so much slower than performing the operations an element at a time is that the

template<class T> valarray<T> operator*     (const valarray<T>&, const valarray<T>&);

operator must allocate memory to put the result into, then returns that by value.

Even if a "swaptimization" is used to perform the copy, that function still has the overhead of

allocating the new block for the resulting valarray
initializing the new valarray (it's possible that this might be optimized away)
putting the results into the new valarray
paging in the memory for the new valarray as it is initialized or set with result values
deallocating the old valarray that gets replaced by the result

回答2:

I just tried it on a Linux x86-64 system (Sandy Bridge CPU):

gcc 4.5.0:

double operator* 9.64185 ms valarray operator* 9.36987 ms valarray[i] operator* 9.35815 ms

Intel ICC 12.0.2:

double operator* 7.76757 ms valarray operator* 9.60208 ms valarray[i] operator* 7.51409 ms

In both cases I just used -O3 and no other optimisation-related flags.

It looks like the MS C++ compiler and/or valarray implementation suck.

Here's the OP's code modified for Linux:

// Linux port of the benchmark: element-wise multiplication of 5*1024*1024
// doubles, timed three ways (raw pointers, valarray operator*, per-element
// valarray indexing).
#include <cstdlib>   // rand
#include <ctime>     // clock_gettime, timespec
#include <iostream>
#include <valarray>

using namespace std ;

double gettime_hp();

int main()
{
    enum { N = 5*1024*1024 };
    valarray<double> a(N), b(N), c(N) ;
    int i,j;
    // Eight rounds, each printing one timing per variant.
    for(  j=0 ; j<8 ; ++j )
    {
        // Refill both inputs with fresh pseudo-random values.
        for(  i=0 ; i<N ; ++i )
        {
            a[i]=rand();
            b[i]=rand();
        }

        // Variant 1: plain loop over raw pointers into the valarray storage.
        double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0] ;
        double dtime=gettime_hp();
        for(  i=0 ; i<N ; ++i ) c1[i] = a1[i] * b1[i] ;
        dtime=gettime_hp()-dtime;
        cout << "double operator* " << dtime << " ms\n" ;

        // Variant 2: whole-array expression through valarray's operator*.
        dtime=gettime_hp();
        c = a*b ;
        dtime=gettime_hp()-dtime;
        cout << "valarray operator* " << dtime << " ms\n" ;

        // Variant 3: element-by-element through valarray::operator[].
        dtime=gettime_hp();
        for(  i=0 ; i<N ; ++i ) c[i] = a[i] * b[i] ;
        dtime=gettime_hp()-dtime;
        cout << "valarray[i] operator* " << dtime<< " ms\n" ;

        cout << "------------------------------------------------------\n" ;
    }
}

// Returns a timestamp in milliseconds. Only differences between two readings
// are meaningful. CLOCK_MONOTONIC is used because the value is only ever
// subtracted to measure an interval, and a wall-clock adjustment in the
// middle of a measurement would otherwise distort the result.
double gettime_hp()
{
    struct timespec timestamp;

    clock_gettime(CLOCK_MONOTONIC, &timestamp);
    return timestamp.tv_sec * 1000.0 + timestamp.tv_nsec * 1.0e-6;
}

回答3:

The whole point of valarray is to be fast on vector machines, which x86 machines just aren't. A good implementation on a nonvector machine should be able to match the performance that you get with something like
for (i=0; i < N; ++i) c1[i] = a1[i] * b1[i];

and a bad one of course won't. Unless there is something in the hardware to expedite parallel processing, that is going to be pretty close to the best that you can do.

回答4:

I finally solved this by using delayed evaluation. The code may be ugly since I am just starting to learn these advanced C++ concepts. Please correct me if you have a better idea. Thanks a lot for all your assistance. Here is the code:

#include <iostream> #include <valarray> #include <iostream> #include "windows.h"  using namespace std ; SYSTEMTIME stime; LARGE_INTEGER sys_freq;  double gettime_hp(); //to improve the c=a*b (it will generate a temp first, assigned to c and delete the temp //which causes the program really slow //the solution is the expression template and let the compiler to decide when all the expression is known //delayed evaluation //typedef valarray<double> Vector; class Vector; class VecMul { public:     const Vector& va;     const Vector& vb;     //Vector& vc;     VecMul(const Vector& v1,const Vector& v2):va(v1),vb(v2){}     operator Vector(); };  class Vector:public valarray<double> {     valarray<double> *p; public:     explicit Vector(int n)     {         p=new valarray<double>(n);     }     Vector& operator=(const VecMul &m)     {         for(int i=0;i<m.va.size();i++) (*p)[i]=(m.va)[i]*(m.vb)[i];//ambiguous         return *this;     }     double& operator[](int i) const {return (*p)[i];}  //const vector_type[i]     int size()const {return (*p).size();} };    inline VecMul operator*(const Vector& v1,const Vector& v2) {     return VecMul(v1,v2); }   int main() {     enum { N = 5*1024*1024 };     Vector a(N), b(N), c(N) ;     QueryPerformanceFrequency(&sys_freq);        int i,j;     for(  j=0 ; j<8 ; ++j )     {         for(  i=0 ; i<N ; ++i )          {             a[i]=rand();             b[i]=rand();         }          double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0] ;         double dtime=gettime_hp();         for(  i=0 ; i<N ; ++i ) c1[i] = a1[i] * b1[i] ;         dtime=gettime_hp()-dtime;         cout << "double operator* " << dtime << " ms\n" ;          dtime=gettime_hp();         c = a*b ;         dtime=gettime_hp()-dtime;         cout << "valarray operator* " << dtime << " ms\n" ;          dtime=gettime_hp();         for(  i=0 ; i<N ; ++i ) c[i] = a[i] * b[i] ;         dtime=gettime_hp()-dtime;         cout << "valarray[i] operator* " << dtime<< " ms\n" ;          
cout << "------------------------------------------------------\n" ;     } }  double gettime_hp() {     LARGE_INTEGER tick;     extern LARGE_INTEGER sys_freq;     QueryPerformanceCounter(&tick);     return (double)tick.QuadPart*1000.0/sys_freq.QuadPart; }

The running result on Visual studio is:

double operator* 41.2031 ms valarray operator* 43.8407 ms valarray[i] operator* 42.49 ms

回答5:

I'm compiling in release x64, VS 2010. I changed your code very slightly:

    double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0] ;     double dtime=gettime_hp();     for(  i=0 ; i<N ; ++i ) a1[i] *= b1[i] ;     dtime=gettime_hp()-dtime;     cout << "double operator* " << dtime << " ms\n" ;      dtime=gettime_hp();     a *= b;     dtime=gettime_hp()-dtime;     cout << "valarray operator* " << dtime << " ms\n" ;      dtime=gettime_hp();     for(  i=0 ; i<N ; ++i ) a[i] *= b[i] ;     dtime=gettime_hp()-dtime;     cout << "valarray[i] operator* " << dtime<< " ms\n" ;      cout << "------------------------------------------------------\n" ;

Here you can see that I used *= instead of c = a * b. In more modern mathematical libraries, very complex expression template mechanisms are used that eliminate this problem. In this case, I actually got very slightly faster results from valarray, although that's probably just because the contents were already in cache. The overhead that you are seeing is simply redundant temporaries and nothing intrinsic to valarray specifically; you'd see the same behaviour with something like std::string.

回答6:

Hmm... I tested Blitz++ and it performs the same as valarray. What's more, the Blitz++ [] operator is very slow.

 // Benchmark: element-wise multiplication of 5*1024*1024 doubles held in
 // Blitz++ arrays, timed three ways (raw pointers, Blitz++ operator*,
 // per-element Blitz++ indexing). Builds against either the Win32
 // performance counter or POSIX clock_gettime.
 #include <blitz/array.h>
 #include <cstdlib>   // rand
 #include <iostream>
 #if defined(WIN32) || defined(_WIN32)
 #include "windows.h"
 // Performance-counter frequency (ticks per second). Filled in by main()
 // before the first call to gettime_hp().
 LARGE_INTEGER sys_freq;
 #elif defined(LINUX) || defined(__linux__)
 #include <ctime>     // clock_gettime, timespec
 #else
 #error "gettime_hp() has no timer implementation for this platform"
 #endif

 using namespace std ;

 // Returns a timestamp in milliseconds. Only differences between two
 // readings are meaningful.
 double gettime_hp()
 {
 #if defined(WIN32) || defined(_WIN32)
     LARGE_INTEGER tick;
     QueryPerformanceCounter(&tick);
     return static_cast<double>(tick.QuadPart)*1000.0/sys_freq.QuadPart;
 #else
     // CLOCK_MONOTONIC: the value is only ever subtracted to measure an
     // interval, so it must not jump when the wall clock is adjusted.
     struct timespec timestamp;

     clock_gettime(CLOCK_MONOTONIC, &timestamp);
     return timestamp.tv_sec * 1000.0 + timestamp.tv_nsec * 1.0e-6;
 #endif
 }

 BZ_USING_NAMESPACE(blitz)

 int main()
 {
     int N = 5*1024*1024 ;

     // Create one-dimensional arrays of double
     Array<double,1> a(N), b(N),c(N);

     int i,j;
 #if defined(WIN32) || defined(_WIN32)
     QueryPerformanceFrequency(&sys_freq);
 #endif
     // Eight rounds, each printing one timing per variant.
     for(  j=0 ; j<8 ; ++j )
     {
         // Refill both inputs with fresh pseudo-random values.
         for(  i=0 ; i<N ; ++i )
         {
             a[i]=rand();
             b[i]=rand();
         }

         // Variant 1: plain loop over the raw storage of the arrays.
         double* a1 = a.data() , *b1 = b.data(), *c1 = c.data() ;
         double dtime=gettime_hp();
         for(  i=0 ; i<N ; ++i ) c1[i] = a1[i] * b1[i] ;
         dtime=gettime_hp()-dtime;
         cout << "double operator* " << dtime << " ms\n" ;

         // Variant 2: whole-array expression through Blitz++.
         dtime=gettime_hp();
         c = a*b ;
         dtime=gettime_hp()-dtime;
         cout << "blitz operator* " << dtime << " ms\n" ;

         // Variant 3: element-by-element through Array::operator[].
         dtime=gettime_hp();
         for(  i=0 ; i<N ; ++i ) c[i] = a[i] * b[i] ;
         dtime=gettime_hp()-dtime;
         cout << "blitz[i] operator* " << dtime<< " ms\n" ;

         cout << "------------------------------------------------------\n" ;
     }
 }

回答7:

I think Michael Burr's reply is right. Maybe you can create a "virtual" (proxy) type to serve as the return type of operator+, and overload another operator= for this virtual type, like operator=(virtual type& v){&valarray=&v;v=NULL;} (roughly speaking). Of course, it is difficult to implement this idea on valarray itself. But when you create a new class, you can try this idea. Then the efficiency of operator+ is almost the same as that of operator+=.

转载请标明出处:why is valarray so slow?

文章来源: why is valarray so slow?

标签

vector

const