I have code that computes a Gaussian Mixture Model with Expectation Maximization in order to identify the clusters from a given input data sample.
A piece of the code
OP's question attracted my attention because number-crunching with speed-up earned by multi-threading is one of the top todo's on my personal list.
I must admit that my experience with the Eigen library is very limited. (I once used the decomposition of 3×3 rotation matrices into Euler angles, which is solved very cleverly and in a general way in the Eigen library.)
Hence, I defined another sample task consisting of a stupid counting of values in a sample data set.
This is done multiple times (using the same evaluation function):
test-multi-threading.cc
:
#include <cstdint>
#include <cstdlib>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <limits>
#include <thread>
#include <vector>
// a sample function to process a certain amount of data
// Counts how many of the n values in data[] fall into the half-open
// interval [begin, end).
template <typename T>
size_t countFrequency(
size_t n, const T data[], const T &begin, const T &end)
{
    size_t hits = 0;
    const T *const last = data + n;
    for (const T *p = data; p != last; ++p) {
        if (*p >= begin && *p < end) ++hits;
    }
    return hits;
}
// Type aliases for the benchmark. C++11 `using` aliases are preferred over
// `typedef` (clearer, and required anyway for alias templates); the file is
// compiled with -std=c++11, so they are available.
using Value = std::uint16_t;                       // sample element type
using Clock = std::chrono::high_resolution_clock;  // timing clock
using MuSecs = std::chrono::microseconds;          // reporting resolution
// Time is whatever duration_cast<MuSecs> yields (i.e. MuSecs), derived via
// decltype so it always matches the cast used in duration() below.
using Time = decltype(std::chrono::duration_cast<MuSecs>(Clock::now() - Clock::now()));
// Returns the time elapsed since t0, truncated to microseconds.
Time duration(const Clock::time_point &t0)
{
    const Clock::time_point now = Clock::now();
    return std::chrono::duration_cast<MuSecs>(now - t0);
}
// Builds a random sample and measures the identical counting experiment in
// four variants:
//   1. single-threaded baseline,
//   2. one thread per group, started/joined in batches of NThreads,
//   3. NThreads threads, groups distributed interleaved (i, i+NThreads, ...),
//   4. NThreads threads, groups distributed in contiguous chunks.
// Returns the four wall-clock durations in that order. All four variants must
// produce identical results; mismatches are reported on std::cerr.
std::vector<Time> makeTest()
{
    const Value SizeGroup = 4, NGroups = 10000, N = SizeGroup * NGroups;
    // std::thread::hardware_concurrency() may legally return 0 when the hint
    // cannot be determined. Without the fallback to 1, the batching loop of
    // approach 2 (and the stride of approach 3) would never advance — an
    // infinite loop. Hence the guard:
    const size_t NThreadsHint = std::thread::hardware_concurrency();
    const size_t NThreads = NThreadsHint ? NThreadsHint : 1;
    // make a test sample (rand() is unseeded on purpose: reproducible data)
    std::vector<Value> sample(N);
    for (Value &value : sample) value = (Value)rand();
    // prepare result vectors — one per experiment, compared at the end
    std::vector<size_t> results4[4] = {
        std::vector<size_t>(NGroups, 0),
        std::vector<size_t>(NGroups, 0),
        std::vector<size_t>(NGroups, 0),
        std::vector<size_t>(NGroups, 0)
    };
    // run the four experiments; each lambda returns its measured duration
    std::vector<Time> times{
        [&]() { // single threading
            // make a copy of test sample (fresh cache state per experiment)
            std::vector<Value> data(sample);
            std::vector<size_t> &results = results4[0];
            // remember start time
            const Clock::time_point t0 = Clock::now();
            // do experiment single-threaded
            for (size_t i = 0; i < NGroups; ++i) {
                results[i] = countFrequency(data.size(), data.data(),
                    (Value)(i * SizeGroup), (Value)((i + 1) * SizeGroup));
            }
            // done
            return duration(t0);
        }(),
        [&]() { // multi-threading - stupid approach: one thread per group
            // make a copy of test sample
            std::vector<Value> data(sample);
            std::vector<size_t> &results = results4[1];
            // remember start time
            const Clock::time_point t0 = Clock::now();
            // do experiment multi-threaded, NThreads threads at a time
            std::vector<std::thread> threads(NThreads);
            for (Value i = 0; i < NGroups;) {
                size_t nT = 0;
                for (; nT < NThreads && i < NGroups; ++nT, ++i) {
                    // a temporary std::thread is a prvalue — move-assignment
                    // happens anyway; an explicit std::move is redundant
                    threads[nT] = std::thread(
                        [i, &results, &data, SizeGroup]() {
                            size_t result = countFrequency(data.size(), data.data(),
                                (Value)(i * SizeGroup), (Value)((i + 1) * SizeGroup));
                            results[i] = result;
                        });
                }
                for (size_t iT = 0; iT < nT; ++iT) threads[iT].join();
            }
            // done
            return duration(t0);
        }(),
        [&]() { // multi-threading - interleaved distribution
            // make a copy of test sample
            std::vector<Value> data(sample);
            std::vector<size_t> &results = results4[2];
            // remember start time
            const Clock::time_point t0 = Clock::now();
            // do experiment multi-threaded: thread iT handles groups
            // iT, iT + NThreads, iT + 2*NThreads, ...
            std::vector<std::thread> threads(NThreads);
            for (Value iT = 0; iT < NThreads; ++iT) {
                threads[iT] = std::thread(
                    [iT, &results, &data, NGroups, SizeGroup, NThreads]() {
                        for (Value i = iT; i < NGroups; i += NThreads) {
                            size_t result = countFrequency(data.size(), data.data(),
                                (Value)(i * SizeGroup), (Value)((i + 1) * SizeGroup));
                            results[i] = result;
                        }
                    });
            }
            for (std::thread &threadI : threads) threadI.join();
            // done
            return duration(t0);
        }(),
        [&]() { // multi-threading - grouped (contiguous chunks)
            std::vector<Value> data(sample);
            std::vector<size_t> &results = results4[3];
            // remember start time
            const Clock::time_point t0 = Clock::now();
            // do experiment multi-threaded: thread iT handles the contiguous
            // range [iT*NGroups/NThreads, (iT+1)*NGroups/NThreads)
            std::vector<std::thread> threads(NThreads);
            for (Value iT = 0; iT < NThreads; ++iT) {
                threads[iT] = std::thread(
                    [iT, &results, &data, NGroups, SizeGroup, NThreads]() {
                        for (Value i = iT * NGroups / NThreads,
                            iN = (iT + 1) * NGroups / NThreads; i < iN; ++i) {
                            size_t result = countFrequency(data.size(), data.data(),
                                (Value)(i * SizeGroup), (Value)((i + 1) * SizeGroup));
                            results[i] = result;
                        }
                    });
            }
            for (std::thread &threadI : threads) threadI.join();
            // done
            return duration(t0);
        }()
    };
    // check results (must be equal for every kind of computation)
    const unsigned nResults = sizeof results4 / sizeof *results4;
    for (unsigned i = 1; i < nResults; ++i) {
        size_t nErrors = 0;
        for (Value j = 0; j < NGroups; ++j) {
            if (results4[0][j] != results4[i][j]) {
                ++nErrors;
#ifdef _DEBUG
                std::cerr
                    << "results4[0][" << j << "]: " << results4[0][j]
                    << " != results4[" << i << "][" << j << "]: " << results4[i][j]
                    << "!\n";
#endif // _DEBUG
            }
        }
        if (nErrors) std::cerr << nErrors << " errors in results4[" << i << "]!\n";
    }
    // done
    return times;
}
int main()
{
std::cout << "std::thread::hardware_concurrency(): "
<< std::thread::hardware_concurrency() << '\n';
// heat up
std::cout << "Heat up...\n";
for (unsigned i = 0; i < 3; ++i) makeTest();
// repeat NTrials times
const unsigned NTrials = 10;
std::cout << "Measuring " << NTrials << " runs...\n"
<< " "
<< " | " << std::setw(10) << "Single"
<< " | " << std::setw(10) << "Multi 1"
<< " | " << std::setw(10) << "Multi 2"
<< " | " << std::setw(10) << "Multi 3"
<< '\n';
std::vector<double> sumTimes;
for (unsigned i = 0; i < NTrials; ++i) {
std::vector<Time> times = makeTest();
std::cout << std::setw(2) << (i + 1) << ".";
for (const Time &time : times) {
std::cout << " | " << std::setw(10) << time.count();
}
std::cout << '\n';
sumTimes.resize(times.size(), 0.0);
for (size_t j = 0; j < times.size(); ++j) sumTimes[j] += times[j].count();
}
std::cout << "Average Values:\n ";
for (const double &sumTime : sumTimes) {
std::cout << " | "
<< std::setw(10) << std::fixed << std::setprecision(1)
<< sumTime / NTrials;
}
std::cout << '\n';
std::cout << "Ratio:\n ";
for (const double &sumTime : sumTimes) {
std::cout << " | "
<< std::setw(10) << std::fixed << std::setprecision(3)
<< sumTime / sumTimes.front();
}
std::cout << '\n';
// done
return 0;
}
Compiled and tested on cygwin64 on Windows 10:
$ g++ --version
g++ (GCC) 7.3.0
$ g++ -std=c++11 -O2 -o test-multi-threading test-multi-threading.cc
$ ./test-multi-threading
std::thread::hardware_concurrency(): 8
Heat up...
Measuring 10 runs...
| Single | Multi 1 | Multi 2 | Multi 3
1. | 384008 | 1052937 | 130662 | 138411
2. | 386500 | 1103281 | 133030 | 132576
3. | 382968 | 1078988 | 137123 | 137780
4. | 395158 | 1120752 | 138731 | 138650
5. | 385870 | 1105885 | 144825 | 129405
6. | 366724 | 1071788 | 137684 | 130289
7. | 352204 | 1104191 | 133675 | 130505
8. | 331679 | 1072299 | 135476 | 138257
9. | 373416 | 1053881 | 138467 | 137613
10. | 370872 | 1096424 | 136810 | 147960
Average Values:
| 372939.9 | 1086042.6 | 136648.3 | 136144.6
Ratio:
| 1.000 | 2.912 | 0.366 | 0.365
I did the same on coliru.com. (I had to reduce the heat up cycles and the sample size as I exceeded the time limit with the original values.):
g++ (GCC) 8.1.0
Copyright (C) 2018 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
std::thread::hardware_concurrency(): 4
Heat up...
Measuring 10 runs...
| Single | Multi 1 | Multi 2 | Multi 3
1. | 224684 | 297729 | 48334 | 39016
2. | 146232 | 337222 | 66308 | 59994
3. | 195750 | 344056 | 61383 | 63172
4. | 198629 | 317719 | 62695 | 50413
5. | 149125 | 356471 | 61447 | 57487
6. | 155355 | 322185 | 50254 | 35214
7. | 140269 | 316224 | 61482 | 53889
8. | 154454 | 334814 | 58382 | 53796
9. | 177426 | 340723 | 62195 | 54352
10. | 151951 | 331772 | 61802 | 46727
Average Values:
| 169387.5 | 329891.5 | 59428.2 | 51406.0
Ratio:
| 1.000 | 1.948 | 0.351 | 0.303
Live Demo on coliru
I am a little surprised that the ratios on coliru (with only 4 threads) are even better than on my PC (with 8 threads). Actually, I don't know how to explain this. However, there are a lot of other differences between the two setups which may or may not be responsible. At least, both measurements show a rough speed-up of 3 for the 3rd and 4th approaches, whereas the 2nd squanders every potential speed-up (probably through the overhead of starting and joining all those threads).
Looking at the sample code, you will recognize that there is no mutex or any other explicit locking. This is intentional. As I learned (many, many years ago), every attempt at parallelization may cause a certain extra amount of communication overhead (for concurrent tasks which have to exchange data). If the communication overhead becomes too big, it simply consumes the speed advantage of concurrency. So, the best speed-up can be achieved by:
In my sample code, I
Concerning 3., I struggled a bit over whether this is legal or not, i.e. whether it is guaranteed that data written in threads appears correctly in the main thread after joining. (The fact that something seems to work fine is deceptive in general, but especially deceptive concerning multi-threading.)
cppreference.com provides the following explanations
for std::thread::thread()
The completion of the invocation of the constructor synchronizes-with (as defined in std::memory_order) the beginning of the invocation of the copy of f on the new thread of execution.
for std::thread::join()
The completion of the thread identified by
*this
synchronizes with the corresponding successful return from join().
In Stack Overflow, I found the following related Q/A's:
which convinced me that it is OK.
However, the drawback is that
An alternative approach could be the usage of a thread pool to overcome this. I googled a bit and found e.g. Jakob Progsch's ThreadPool on github. However, I guess, with a thread pool the locking issue is back “in the game”.
Whether this will work for Eigen functions as well, depends on how the resp. Eigen functions are implemented. If there are accesses to global variables in them (which become shared when the same function is called concurrently), this will cause a data race.
Googling a bit, I found the following doc.
Eigen and multi-threading – Using Eigen in a multi-threaded application:
In the case your own application is multithreaded, and multiple threads make calls to Eigen, then you have to initialize Eigen by calling the following routine before creating the threads:
#include <Eigen/Core>
int main(int argc, char** argv)
{
Eigen::initParallel();
...
}
Note
With Eigen 3.3, and a fully C++11 compliant compiler (i.e., thread-safe static local variable initialization), calling
initParallel()
is optional.
Warning:
note that all functions generating random matrices are not re-entrant nor thread-safe. Those include DenseBase::Random(), and DenseBase::setRandom() despite a call to Eigen::initParallel(). This is because these functions are based on std::rand which is not re-entrant. For thread-safe random generator, we recommend the use of boost::random or c++11 random feature.