I am trying to compare the performance of boost::multi_array to native dynamically allocated arrays, with the following test program:
#include
I was looking at this question because I had the same question. I had some thoughts to give a more rigorous test.
On a Mac, the following code is configured to give more meaningful answers. There are 4 tests here.
#include <stdint.h>
#include <sys/time.h>

#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <exception>
#include <string>
#include <vector>

#define BOOST_DISABLE_ASSERTS
#include "boost/multi_array.hpp"
/// Returns the current wall-clock time, as reported by gettimeofday(),
/// expressed in whole milliseconds.
///
/// The microsecond field is truncated (not rounded) to milliseconds.
uint64_t GetTimeMs64()
{
    struct timeval tv;
    gettimeofday( &tv, nullptr );

    // Widen the seconds field BEFORE multiplying: doing the multiplication in
    // time_t's own width can overflow where time_t is only 32 bits wide.
    const uint64_t secondsAsMs = static_cast<uint64_t>( tv.tv_sec ) * 1000;

    // Convert from microseconds (10^-6) to milliseconds (10^-3).
    const uint64_t fractionAsMs = static_cast<uint64_t>( tv.tv_usec ) / 1000;

    return secondsAsMs + fractionAsMs;
}
// Benchmark 1 ("Native Pointer"): treats the X_SIZE x Y_SIZE matrix as one
// flat buffer and adds a random addend buffer to it ITERATIONS times using a
// single loop over all elements. Prints the elapsed wall-clock time.
//
// X_SIZE, Y_SIZE  matrix extents; non-positive values yield an empty matrix.
// ITERATIONS      number of times the full element-wise addition is repeated.
void function1( const int X_SIZE, const int Y_SIZE, const int ITERATIONS )
{
    // Compute the element count in size_t so large extents do not overflow int.
    const std::size_t elementCount = ( X_SIZE > 0 && Y_SIZE > 0 )
        ? static_cast<std::size_t>( X_SIZE ) * static_cast<std::size_t>( Y_SIZE )
        : 0;

    // Heap-backed addend: a stack array sized at run time is a compiler
    // extension and can exhaust the stack for large extents.
    std::vector<double> addend( elementCount );
    for( std::size_t k = 0 ; k < elementCount ; ++k )
    {
        addend[k] = rand();
    }

    // Zero-initialised accumulator: "+=" below must not read indeterminate
    // values, and the vector releases its storage when the function returns.
    std::vector<double> accumulator( elementCount, 0.0 );

    // The timed loop still works on plain pointers so that raw-array access
    // (not std::vector::operator[]) is what gets measured.
    const double* const nativeMatrix1add = addend.data();
    double* __restrict const nativeMatrix1p = accumulator.data();

    uint64_t startTime = GetTimeMs64();
    for( int i = 0 ; i < ITERATIONS ; ++i )
    {
        for( std::size_t xy = 0 ; xy < elementCount ; ++xy )
        {
            nativeMatrix1p[xy] += nativeMatrix1add[xy];
        }
    }
    uint64_t endTime = GetTimeMs64();

    printf( "[Native Pointer] Elapsed time: %6.3f seconds\n", ( endTime - startTime ) / 1000.0 );
}
// Benchmark 2 ("Native 1D Array"): stores the X_SIZE x Y_SIZE matrix in one
// flat buffer and adds a random addend buffer to it ITERATIONS times, using a
// nested x/y loop with explicit "y + x * Y_SIZE" index arithmetic.
// Prints the elapsed wall-clock time.
//
// X_SIZE, Y_SIZE  matrix extents; non-positive values yield an empty matrix.
// ITERATIONS      number of times the full element-wise addition is repeated.
void function2( const int X_SIZE, const int Y_SIZE, const int ITERATIONS )
{
    // Compute the element count in size_t so large extents do not overflow int.
    const std::size_t elementCount = ( X_SIZE > 0 && Y_SIZE > 0 )
        ? static_cast<std::size_t>( X_SIZE ) * static_cast<std::size_t>( Y_SIZE )
        : 0;

    // Heap-backed addend: a stack array sized at run time is a compiler
    // extension and can exhaust the stack for large extents.
    std::vector<double> addend( elementCount );
    for( std::size_t k = 0 ; k < elementCount ; ++k )
    {
        addend[k] = rand();
    }

    // Zero-initialised accumulator: "+=" below must not read indeterminate
    // values, and the vector releases its storage when the function returns.
    std::vector<double> accumulator( elementCount, 0.0 );

    // The timed loop works on plain pointers so that raw-array indexing is
    // what gets measured.
    const double* const nativeMatrix1add = addend.data();
    double* __restrict const nativeMatrix1 = accumulator.data();

    uint64_t startTime = GetTimeMs64();
    for( int i = 0 ; i < ITERATIONS ; ++i )
    {
        for( int x = 0 ; x < X_SIZE ; ++x )
        {
            for( int y = 0 ; y < Y_SIZE ; ++y )
            {
                nativeMatrix1[y + ( x * Y_SIZE )] += nativeMatrix1add[y + ( x * Y_SIZE )];
            }
        }
    }
    uint64_t endTime = GetTimeMs64();

    printf( "[Native 1D Array] Elapsed time: %6.3f seconds\n", ( endTime - startTime ) / 1000.0 );
}
// Benchmark 3 ("Native 2D Array"): adds a random X_SIZE x Y_SIZE matrix to an
// accumulator matrix ITERATIONS times using built-in two-dimensional array
// indexing. Prints the elapsed wall-clock time.
//
// Both matrices are arrays whose extents are only known at run time. That is
// a compiler extension rather than standard C++, and it is kept on purpose
// because this array form is what the benchmark measures.
// NOTE(review): both extents must be positive, and the two arrays must fit in
// the available stack space -- confirm for the sizes you intend to run.
void function3( const int X_SIZE, const int Y_SIZE, const int ITERATIONS )
{
    double nativeMatrix2add[X_SIZE][Y_SIZE];
    double nativeMatrix2[X_SIZE][Y_SIZE];

    for( int x = 0 ; x < X_SIZE ; ++x )
    {
        for( int y = 0 ; y < Y_SIZE ; ++y )
        {
            nativeMatrix2add[x][y] = rand();
            // Run-time-sized arrays cannot take an initialiser, so clear the
            // accumulator by hand; otherwise "+=" below would read
            // indeterminate values.
            nativeMatrix2[x][y] = 0.0;
        }
    }

    uint64_t startTime = GetTimeMs64();
    for( int i = 0 ; i < ITERATIONS ; ++i )
    {
        for( int x = 0 ; x < X_SIZE ; ++x )
        {
            for( int y = 0 ; y < Y_SIZE ; ++y )
            {
                nativeMatrix2[x][y] += nativeMatrix2add[x][y];
            }
        }
    }
    uint64_t endTime = GetTimeMs64();

    printf( "[Native 2D Array] Elapsed time: %6.3f seconds\n", ( endTime - startTime ) / 1000.0 );
}
void function4( const int X_SIZE, const int Y_SIZE, const int ITERATIONS )
{
boost::multi_array<double, 2> boostMatrix2add( boost::extents[X_SIZE][Y_SIZE] );
for( int x = 0 ; x < X_SIZE ; ++x )
{
for( int y = 0 ; y < Y_SIZE ; ++y )
{
boostMatrix2add[x][y] = rand();
}
}
// Create the native array
boost::multi_array<double, 2> boostMatrix( boost::extents[X_SIZE][Y_SIZE] );
uint64_t startTime = GetTimeMs64();
for( int i = 0 ; i < ITERATIONS ; ++i )
{
for( int x = 0 ; x < X_SIZE ; ++x )
{
for( int y = 0 ; y < Y_SIZE ; ++y )
{
boostMatrix[x][y] += boostMatrix2add[x][y];
}
}
}
uint64_t endTime = GetTimeMs64();
printf( "[Boost Array] Elapsed time: %6.3f seconds\n", ( endTime - startTime ) / 1000.0 );
}
// Entry point: parses "xsize ysize iterations" from the command line and runs
// the four benchmarks in order with those parameters.
//
// Returns 0 on success, 1 when the arguments are missing or invalid.
int main( int argc, char* argv[] )
{
    // All three positional arguments are required; without this check the
    // argv[] reads below would touch entries that were never supplied.
    if( argc < 4 )
    {
        fprintf( stderr, "Usage: %s xsize ysize iterations\n", argc > 0 ? argv[0] : "test_array" );
        return 1;
    }

    int X_SIZE = 0;
    int Y_SIZE = 0;
    int ITERATIONS = 0;
    try
    {
        X_SIZE = std::stoi( argv[1] );
        Y_SIZE = std::stoi( argv[2] );
        ITERATIONS = std::stoi( argv[3] );
    }
    catch( const std::exception& )
    {
        // std::stoi throws on non-numeric or out-of-range input.
        fprintf( stderr, "xsize, ysize and iterations must be integers\n" );
        return 1;
    }

    // The benchmarks size their arrays from these values, so reject extents
    // that cannot describe a matrix.
    if( X_SIZE <= 0 || Y_SIZE <= 0 || ITERATIONS < 0 )
    {
        fprintf( stderr, "xsize and ysize must be positive; iterations must not be negative\n" );
        return 1;
    }

    srand( static_cast<unsigned int>( time( nullptr ) ) );

    function1( X_SIZE, Y_SIZE, ITERATIONS );
    function2( X_SIZE, Y_SIZE, ITERATIONS );
    function3( X_SIZE, Y_SIZE, ITERATIONS );
    function4( X_SIZE, Y_SIZE, ITERATIONS );
    return 0;
}
One with just a single dimensional array using the [] with integer math and a double loop
One with the same single dimensional array using pointer incrementing
A multidimensional C array
A boost multi_array
To try it, run the program from the command line as
./test_array xsize ysize iterations
and you can get a good idea of how these approaches will perform. Here is what I got with the following compiler flags:
g++4.9.2 -O3 -march=native -funroll-loops -mno-avx --fast-math -DNDEBUG -c -std=c++11
./test_array 51200 1 20000
[Native 1-Loop ] Elapsed time: 0.537 seconds
[Native 1D Array] Elapsed time: 2.045 seconds
[Native 2D Array] Elapsed time: 2.749 seconds
[Boost Array] Elapsed time: 1.167 seconds
./test_array 25600 2 20000
[Native 1-Loop ] Elapsed time: 0.531 seconds
[Native 1D Array] Elapsed time: 1.241 seconds
[Native 2D Array] Elapsed time: 1.631 seconds
[Boost Array] Elapsed time: 0.954 seconds
./test_array 12800 4 20000
[Native 1-Loop ] Elapsed time: 0.536 seconds
[Native 1D Array] Elapsed time: 1.214 seconds
[Native 2D Array] Elapsed time: 1.223 seconds
[Boost Array] Elapsed time: 0.798 seconds
./test_array 6400 8 20000
[Native 1-Loop ] Elapsed time: 0.540 seconds
[Native 1D Array] Elapsed time: 0.845 seconds
[Native 2D Array] Elapsed time: 0.878 seconds
[Boost Array] Elapsed time: 0.803 seconds
./test_array 3200 16 20000
[Native 1-Loop ] Elapsed time: 0.537 seconds
[Native 1D Array] Elapsed time: 0.661 seconds
[Native 2D Array] Elapsed time: 0.673 seconds
[Boost Array] Elapsed time: 0.708 seconds
./test_array 1600 32 20000
[Native 1-Loop ] Elapsed time: 0.532 seconds
[Native 1D Array] Elapsed time: 0.592 seconds
[Native 2D Array] Elapsed time: 0.596 seconds
[Boost Array] Elapsed time: 0.764 seconds
./test_array 800 64 20000
[Native 1-Loop ] Elapsed time: 0.546 seconds
[Native 1D Array] Elapsed time: 0.594 seconds
[Native 2D Array] Elapsed time: 0.606 seconds
[Boost Array] Elapsed time: 0.764 seconds
./test_array 400 128 20000
[Native 1-Loop ] Elapsed time: 0.536 seconds
[Native 1D Array] Elapsed time: 0.560 seconds
[Native 2D Array] Elapsed time: 0.564 seconds
[Boost Array] Elapsed time: 0.746 seconds
So, I think that it is safe to say that the boost multi_array performs pretty well. Nothing beats a single-loop evaluation, but depending on the dimensions of the array, the boost::multi_array may beat a standard C array with a double loop.
Your test is flawed.
What you're likely seeing is the result of your optimizing compiler seeing that most or all of your "native array" loops can be removed. The same is theoretically true of your boost::MultiArray loops, but MultiArray is probably complex enough to defeat your optimizer.
Make this small change to your testbed and you'll see more true-to-life results: replace both occurrences of "= 2.345
" with "*= 2.345
" and compile again with optimizations. This will prevent your compiler from discovering that the outer loop of each test is redundant.
I did it and got a speed comparison closer to 2:1.
Consider using Blitz++ instead. I tried out Blitz, and its performance is on par with C-style arrays!
Check out your code with Blitz added below:
#include <windows.h>
#define _SCL_SECURE_NO_WARNINGS
#define BOOST_DISABLE_ASSERTS
#include <boost/multi_array.hpp>
#include <blitz/array.h>
// Benchmark driver: fills a 200 x 200 matrix with the constant 2.345, 500
// times over, once each for boost::multi_array, blitz::Array and a flat
// native array, and prints the elapsed time of each variant as measured with
// GetTickCount().
//
// In all three timed loops y is the outer loop and x the inner loop; the loop
// order is left untouched because it is part of what is being compared.
int main(int argc, char* argv[])
{
    const int X_SIZE = 200;
    const int Y_SIZE = 200;
    const int ITERATIONS = 500;
    unsigned int startTime = 0;
    unsigned int endTime = 0;

    // Create the boost array
    typedef boost::multi_array<double, 2> ImageArrayType;
    ImageArrayType boostMatrix(boost::extents[X_SIZE][Y_SIZE]);

    //------------------Measure boost----------------------------------------------
    startTime = ::GetTickCount();
    for (int i = 0; i < ITERATIONS; ++i)
    {
        for (int y = 0; y < Y_SIZE; ++y)
        {
            for (int x = 0; x < X_SIZE; ++x)
            {
                boostMatrix[x][y] = 2.345;
            }
        }
    }
    endTime = ::GetTickCount();
    // Tick counts are divided by 1000.0 to report seconds.
    printf("[Boost] Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);

    //------------------Measure blitz-----------------------------------------------
    blitz::Array<double, 2> blitzArray( X_SIZE, Y_SIZE );
    startTime = ::GetTickCount();
    for (int i = 0; i < ITERATIONS; ++i)
    {
        for (int y = 0; y < Y_SIZE; ++y)
        {
            for (int x = 0; x < X_SIZE; ++x)
            {
                blitzArray(x,y) = 2.345;
            }
        }
    }
    endTime = ::GetTickCount();
    printf("[Blitz] Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);

    //------------------Measure native-----------------------------------------------
    // Create the native array
    double *nativeMatrix = new double [X_SIZE * Y_SIZE];
    startTime = ::GetTickCount();
    for (int i = 0; i < ITERATIONS; ++i)
    {
        for (int y = 0; y < Y_SIZE; ++y)
        {
            for (int x = 0; x < X_SIZE; ++x)
            {
                nativeMatrix[x + (y * X_SIZE)] = 2.345;
            }
        }
    }
    endTime = ::GetTickCount();
    printf("[Native]Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);

    // Release the buffer allocated with new[] above.
    delete[] nativeMatrix;
    return 0;
}
Here's the result in debug and release.
DEBUG:
Boost 2.093 secs
Blitz 0.375 secs
Native 0.078 secs
RELEASE:
Boost 0.266 secs
Blitz 0.016 secs
Native 0.015 secs
I used MSVC 2008 SP1 compiler for this.
Can we now say good-bye to C-style arrays? =p
I tested on a Snow Leopard Mac OS using gcc 4.2.1
Debug:
[Boost] Elapsed time: 2.268 seconds
[Native]Elapsed time: 0.076 seconds
Release:
[Boost] Elapsed time: 0.065 seconds
[Native]Elapsed time: 0.020 seconds
Here is the code (modified so that it can be compiled on Unix):
#define BOOST_DISABLE_ASSERTS
#include <boost/multi_array.hpp>
#include <ctime>
// Benchmark driver: fills a 200 x 200 matrix with the constant 2.345, 500
// times over, once for boost::multi_array and once for a flat native array,
// and prints the time each variant took as measured with clock().
//
// In both timed loops y is the outer loop and x the inner loop; the loop
// order is left untouched because it is part of what is being compared.
int main(int argc, char* argv[])
{
    const int X_SIZE = 200;
    const int Y_SIZE = 200;
    const int ITERATIONS = 500;

    // clock() returns clock_t; keep that type so no tick count is narrowed.
    clock_t startTime = 0;
    clock_t endTime = 0;

    // Create the boost array
    typedef boost::multi_array<double, 2> ImageArrayType;
    ImageArrayType boostMatrix(boost::extents[X_SIZE][Y_SIZE]);

    // Create the native array
    double *nativeMatrix = new double [X_SIZE * Y_SIZE];

    //------------------Measure boost----------------------------------------------
    startTime = clock();
    for (int i = 0; i < ITERATIONS; ++i)
    {
        for (int y = 0; y < Y_SIZE; ++y)
        {
            for (int x = 0; x < X_SIZE; ++x)
            {
                boostMatrix[x][y] = 2.345;
            }
        }
    }
    endTime = clock();
    printf("[Boost] Elapsed time: %6.3f seconds\n", (endTime - startTime) / static_cast<double>(CLOCKS_PER_SEC));

    //------------------Measure native-----------------------------------------------
    startTime = clock();
    for (int i = 0; i < ITERATIONS; ++i)
    {
        for (int y = 0; y < Y_SIZE; ++y)
        {
            for (int x = 0; x < X_SIZE; ++x)
            {
                nativeMatrix[x + (y * X_SIZE)] = 2.345;
            }
        }
    }
    endTime = clock();
    printf("[Native]Elapsed time: %6.3f seconds\n", (endTime - startTime) / static_cast<double>(CLOCKS_PER_SEC));

    // Release the buffer allocated with new[] above.
    delete[] nativeMatrix;
    return 0;
}
As answered by rodrigob, activating the proper optimization (GCC's default is -O0) is the key to get good performance. In addition, I also tested with Blaze DynamicMatrix , which yielded an additional factor 2 performance improvement with the exact same optimization flags. https://bitbucket.org/account/user/blaze-lib/projects/BLAZE
On my machine using
g++ -O3 -march=native -mtune=native --fast-math -DNDEBUG test.cpp -o test && ./test
I get
[Boost] Elapsed time: 0.020 seconds
[Native]Elapsed time: 0.020 seconds
However changing const int ITERATIONS
to 5000
I get
[Boost] Elapsed time: 0.240 seconds
[Native]Elapsed time: 0.180 seconds
then with ITERATIONS
back to 500
but X_SIZE
and Y_SIZE
set to 400
I get a much more significant difference
[Boost] Elapsed time: 0.460 seconds
[Native]Elapsed time: 0.070 seconds
finally inverting the inner loop for the [Boost]
case so it looks like
for (int x = 0; x < X_SIZE; ++x)
{
for (int y = 0; y < Y_SIZE; ++y)
{
and keeping ITERATIONS
, X_SIZE
and Y_SIZE
to 500
, 400
and 400
I get
[Boost] Elapsed time: 0.060 seconds
[Native]Elapsed time: 0.080 seconds
If I invert the inner loop also for the [Native]
case (so it is in the wrong order for that case), I get, unsurprisingly,
[Boost] Elapsed time: 0.070 seconds
[Native]Elapsed time: 0.450 seconds
I am using gcc (Ubuntu/Linaro 4.4.4-14ubuntu5) 4.4.5
on Ubuntu 10.10
So in conclusion: