I\'ve always thought it\'s the general wisdom that std::vector
is \"implemented as an array,\" blah blah blah. Today I went down and tested it, and it seems to
A better benchmark (I think...), compiler due to optimizations can change code, becouse results of allocated vectors/arrays are not used anywhere. Results:
$ g++ test.cpp -o test -O3 -march=native
$ ./test
UseArray inner completed in 0.652 seconds
UseArray completed in 0.773 seconds
UseVector inner completed in 0.638 seconds
UseVector completed in 0.757 seconds
UseVectorPushBack inner completed in 6.732 seconds
UseVectorPush completed in 6.856 seconds
The whole thing completed in 8.387 seconds
Compiler:
gcc version 6.2.0 20161019 (Debian 6.2.0-9)
CPU:
model name : Intel(R) Core(TM) i7-3630QM CPU @ 2.40GHz
And the code:
#include <cstdlib>
#include <vector>
#include <iostream>
#include <string>
#include <boost/date_time/posix_time/ptime.hpp>
#include <boost/date_time/microsec_time_clock.hpp>
class TestTimer
{
public:
TestTimer(const std::string & name) : name(name),
start(boost::date_time::microsec_clock<boost::posix_time::ptime>::local_time())
{
}
~TestTimer()
{
using namespace std;
using namespace boost;
posix_time::ptime now(date_time::microsec_clock<posix_time::ptime>::local_time());
posix_time::time_duration d = now - start;
cout << name << " completed in " << d.total_milliseconds() / 1000.0 <<
" seconds" << endl;
}
private:
std::string name;
boost::posix_time::ptime start;
};
struct Pixel
{
Pixel()
{
}
Pixel(unsigned char r, unsigned char g, unsigned char b) : r(r), g(g), b(b)
{
}
unsigned char r, g, b;
};
void UseVector(std::vector<std::vector<Pixel> >& results)
{
TestTimer t("UseVector inner");
for(int i = 0; i < 1000; ++i)
{
int dimension = 999;
std::vector<Pixel>& pixels = results.at(i);
pixels.resize(dimension * dimension);
for(int i = 0; i < dimension * dimension; ++i)
{
pixels[i].r = 255;
pixels[i].g = 0;
pixels[i].b = 0;
}
}
}
void UseVectorPushBack(std::vector<std::vector<Pixel> >& results)
{
TestTimer t("UseVectorPushBack inner");
for(int i = 0; i < 1000; ++i)
{
int dimension = 999;
std::vector<Pixel>& pixels = results.at(i);
pixels.reserve(dimension * dimension);
for(int i = 0; i < dimension * dimension; ++i)
pixels.push_back(Pixel(255, 0, 0));
}
}
void UseArray(Pixel** results)
{
TestTimer t("UseArray inner");
for(int i = 0; i < 1000; ++i)
{
int dimension = 999;
Pixel * pixels = (Pixel *)malloc(sizeof(Pixel) * dimension * dimension);
results[i] = pixels;
for(int i = 0 ; i < dimension * dimension; ++i)
{
pixels[i].r = 255;
pixels[i].g = 0;
pixels[i].b = 0;
}
// free(pixels);
}
}
void UseArray()
{
TestTimer t("UseArray");
Pixel** array = (Pixel**)malloc(sizeof(Pixel*)* 1000);
UseArray(array);
for(int i=0;i<1000;++i)
free(array[i]);
free(array);
}
void UseVector()
{
TestTimer t("UseVector");
{
std::vector<std::vector<Pixel> > vector(1000, std::vector<Pixel>());
UseVector(vector);
}
}
void UseVectorPushBack()
{
TestTimer t("UseVectorPush");
{
std::vector<std::vector<Pixel> > vector(1000, std::vector<Pixel>());
UseVectorPushBack(vector);
}
}
int main()
{
TestTimer t1("The whole thing");
UseArray();
UseVector();
UseVectorPushBack();
return 0;
}
I did some extensive tests that I wanted to for a while now. Might as well share this.
This is my dual boot machine i7-3770, 16GB Ram, x86_64, on Windows 8.1 and on Ubuntu 16.04. More information and conclusions, remarks below. Tested both MSVS 2017 and g++ (both on Windows and on Linux).
#include <iostream>
#include <chrono>
//#include <algorithm>
#include <array>
#include <locale>
#include <vector>
#include <queue>
#include <deque>
// Note: total size of array must not exceed 0x7fffffff B = 2,147,483,647B
// which means that largest int array size is 536,870,911
// Also image size cannot be larger than 80,000,000B
constexpr int long g_size = 100000;
int g_A[g_size];
int main()
{
std::locale loc("");
std::cout.imbue(loc);
constexpr int long size = 100000; // largest array stack size
// stack allocated c array
std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
int A[size];
for (int i = 0; i < size; i++)
A[i] = i;
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
std::cout << "c-style stack array duration=" << duration / 1000.0 << "ms\n";
std::cout << "c-style stack array size=" << sizeof(A) << "B\n\n";
// global stack c array
start = std::chrono::steady_clock::now();
for (int i = 0; i < g_size; i++)
g_A[i] = i;
duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
std::cout << "global c-style stack array duration=" << duration / 1000.0 << "ms\n";
std::cout << "global c-style stack array size=" << sizeof(g_A) << "B\n\n";
// raw c array heap array
start = std::chrono::steady_clock::now();
int* AA = new int[size]; // bad_alloc() if it goes higher than 1,000,000,000
for (int i = 0; i < size; i++)
AA[i] = i;
duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
std::cout << "c-style heap array duration=" << duration / 1000.0 << "ms\n";
std::cout << "c-style heap array size=" << sizeof(AA) << "B\n\n";
delete[] AA;
// std::array<>
start = std::chrono::steady_clock::now();
std::array<int, size> AAA;
for (int i = 0; i < size; i++)
AAA[i] = i;
//std::sort(AAA.begin(), AAA.end());
duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
std::cout << "std::array duration=" << duration / 1000.0 << "ms\n";
std::cout << "std::array size=" << sizeof(AAA) << "B\n\n";
// std::vector<>
start = std::chrono::steady_clock::now();
std::vector<int> v;
for (int i = 0; i < size; i++)
v.push_back(i);
//std::sort(v.begin(), v.end());
duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
std::cout << "std::vector duration=" << duration / 1000.0 << "ms\n";
std::cout << "std::vector size=" << v.size() * sizeof(v.back()) << "B\n\n";
// std::deque<>
start = std::chrono::steady_clock::now();
std::deque<int> dq;
for (int i = 0; i < size; i++)
dq.push_back(i);
//std::sort(dq.begin(), dq.end());
duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
std::cout << "std::deque duration=" << duration / 1000.0 << "ms\n";
std::cout << "std::deque size=" << dq.size() * sizeof(dq.back()) << "B\n\n";
// std::queue<>
start = std::chrono::steady_clock::now();
std::queue<int> q;
for (int i = 0; i < size; i++)
q.push(i);
duration = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
std::cout << "std::queue duration=" << duration / 1000.0 << "ms\n";
std::cout << "std::queue size=" << q.size() * sizeof(q.front()) << "B\n\n";
}
//////////////////////////////////////////////////////////////////////////////////////////
// with MSVS 2017:
// >> cl /std:c++14 /Wall -O2 array_bench.cpp
//
// c-style stack array duration=0.15ms
// c-style stack array size=400,000B
//
// global c-style stack array duration=0.130ms
// global c-style stack array size=400,000B
//
// c-style heap array duration=0.90ms
// c-style heap array size=4B
//
// std::array duration=0.20ms
// std::array size=400,000B
//
// std::vector duration=0.544ms
// std::vector size=400,000B
//
// std::deque duration=1.375ms
// std::deque size=400,000B
//
// std::queue duration=1.491ms
// std::queue size=400,000B
//
//////////////////////////////////////////////////////////////////////////////////////////
//
// with g++ version:
// - (tdm64-1) 5.1.0 on Windows
// - (Ubuntu 5.4.0-6ubuntu1~16.04.10) 5.4.0 20160609 on Ubuntu 16.04
// >> g++ -std=c++14 -Wall -march=native -O2 array_bench.cpp -o array_bench
//
// c-style stack array duration=0ms
// c-style stack array size=400,000B
//
// global c-style stack array duration=0.124ms
// global c-style stack array size=400,000B
//
// c-style heap array duration=0.648ms
// c-style heap array size=8B
//
// std::array duration=1ms
// std::array size=400,000B
//
// std::vector duration=0.402ms
// std::vector size=400,000B
//
// std::deque duration=0.234ms
// std::deque size=400,000B
//
// std::queue duration=0.304ms
// std::queue size=400,000
//
//////////////////////////////////////////////////////////////////////////////////////////
Notes
std::sort()
too (you can see it commented out) but removed them later because there were no significant relative differences. std::array
's time variations between consecutive runs, while others especially std:: data structs varied wildly in comparisonstd::array
and c-style arrays faster on Windows without optimizationOf course this is code for an optimized build. And since the question was about std::vector
then yes it is !much! slower than plain arrays (optimized/unoptimized). But when you're doing a benchmark, you naturally want to produce optimized code.
The star of the show for me though has been std::array
.
Well, because vector::resize() does much more processing than plain memory allocation (by malloc).
Try to put a breakpoint in your copy constructor (define it so that you can breakpoint!) and there goes the additional processing time.
Martin York's answer bothers me because it seems like an attempt to brush the initialisation problem under the carpet. But he is right to identify redundant default construction as the source of performance problems.
[EDIT: Martin's answer no longer suggests changing the default constructor.]
For the immediate problem at hand, you could certainly call the 2-parameter version of the vector<Pixel>
ctor instead:
std::vector<Pixel> pixels(dimension * dimension, Pixel(255, 0, 0));
That works if you want to initialise with a constant value, which is a common case. But the more general problem is: How can you efficiently initialise with something more complicated than a constant value?
For this you can use a back_insert_iterator
, which is an iterator adaptor. Here's an example with a vector of int
s, although the general idea works just as well for Pixel
s:
#include <iterator>
// Simple functor return a list of squares: 1, 4, 9, 16...
struct squares {
squares() { i = 0; }
int operator()() const { ++i; return i * i; }
private:
int i;
};
...
std::vector<int> v;
v.reserve(someSize); // To make insertions efficient
std::generate_n(std::back_inserter(v), someSize, squares());
Alternatively you could use copy()
or transform()
instead of generate_n()
.
The downside is that the logic to construct the initial values needs to be moved into a separate class, which is less convenient than having it in-place (although lambdas in C++1x make this much nicer). Also I expect this will still not be as fast as a malloc()
-based non-STL version, but I expect it will be close, since it only does one construction for each element.