Question
When allocating a lot of memory on 4 distinct NVIDIA V100 GPUs, I observe the following behavior with regard to parallelization via OpenMP: using the #pragma omp parallel for directive, and therefore making the cudaMalloc calls on each GPU in parallel, yields the same performance as doing it completely serially. I tested this and reproduced the effect on two HPC systems: an IBM Power AC922 and an AWS EC2 p3dn.24xlarge instance. (The numbers below were obtained on the Power machine.)
./test 4000000000
# serial
GPU 0: 0.472018550
GPU 1: 0.325776811
GPU 2: 0.334342752
GPU 3: 0.337432169
total: 1.469773541
# parallel
GPU 0: 1.199741600
GPU 2: 1.200597044
GPU 3: 1.200619017
GPU 1: 1.482700315
total: 1.493352924
How can I make the parallelization faster?
Here is my code:
#include <chrono>
#include <iomanip>
#include <iostream>
#include <string>

int main(int argc, char* argv[]) {
    size_t num_elements = std::stoull(argv[1]);

    auto t0s = std::chrono::high_resolution_clock::now();
    // One loop iteration per GPU; the pragma runs the allocations in parallel.
    #pragma omp parallel for
    for (int i = 0; i < 4; ++i)
    {
        auto t0is = std::chrono::high_resolution_clock::now();
        cudaSetDevice(i);
        int* ptr;
        cudaMalloc((void**)&ptr, sizeof(int) * num_elements);
        auto t1is = std::chrono::high_resolution_clock::now();
        std::cout << "GPU " << i << ": " << std::fixed << std::setprecision(9)
                  << std::chrono::duration<double>(t1is - t0is).count() << std::endl;
    }
    auto t1s = std::chrono::high_resolution_clock::now();
    std::cout << "total: " << std::fixed << std::setprecision(9)
              << std::chrono::duration<double>(t1s - t0s).count() << std::endl;
    return 0;
}
You can compile the microbenchmark with:
nvcc -std=c++11 -Xcompiler -fopenmp -O3 test.cu -o test
I also tried std::thread instead of OpenMP, with the same results (sketch below).
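Roughly, the std::thread variant looks like this (a minimal sketch; the per-GPU and total timing output from the OpenMP version is omitted here for brevity): one thread per GPU, each binding to its device and allocating.

#include <cuda_runtime.h>
#include <string>
#include <thread>
#include <vector>

// One thread per device: bind to the device, then allocate on it.
void alloc_on_device(int device, size_t num_elements) {
    cudaSetDevice(device);
    int* ptr;
    cudaMalloc((void**)&ptr, sizeof(int) * num_elements);
}

int main(int argc, char* argv[]) {
    size_t num_elements = std::stoull(argv[1]);
    std::vector<std::thread> threads;
    for (int i = 0; i < 4; ++i)
        threads.emplace_back(alloc_on_device, i, num_elements);
    for (auto& t : threads)
        t.join();
    return 0;
}

This compiles with the same nvcc invocation as above (minus -fopenmp); adding -Xcompiler -pthread may be needed for std::thread depending on the host toolchain.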
Source: https://stackoverflow.com/questions/63874103/how-to-make-parallel-cudamalloc-fast