问题
When I came across this question on SO, I was curious to know the answer. so I wrote below piece of code to test atomic operation performance in different scenarios. The OS is Ubuntu 12.04 with CUDA 5.5 and the device is GeForce GTX780 (Kepler architecture). I compiled the code with -O3
flag and for CC=3.5.
#include <stdio.h>
static void HandleError( cudaError_t err, const char *file, int line ) {
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define BLOCK_SIZE 256
#define RESTRICTION_SIZE 32
__global__ void CoalescedAtomicOnGlobalMem(int* data, int nElem)
{
unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
atomicAdd( data+i, 6); //arbitrary number to add
}
}
__global__ void AddressRestrictedAtomicOnGlobalMem(int* data, int nElem)
{
unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
atomicAdd( data+(i&(RESTRICTION_SIZE-1)), 6); //arbitrary number to add
}
}
__global__ void WarpRestrictedAtomicOnGlobalMem(int* data, int nElem)
{
unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
atomicAdd( data+(i>>5), 6); //arbitrary number to add
}
}
__global__ void SameAddressAtomicOnGlobalMem(int* data, int nElem)
{
unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
atomicAdd( data, 6); //arbitrary number to add
}
}
__global__ void CoalescedAtomicOnSharedMem(int* data, int nElem)
{
__shared__ int smem_data[BLOCK_SIZE];
unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
atomicAdd( smem_data+threadIdx.x, data[i]);
}
}
__global__ void AddressRestrictedAtomicOnSharedMem(int* data, int nElem)
{
__shared__ int smem_data[BLOCK_SIZE];
unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
atomicAdd( smem_data+(threadIdx.x&(RESTRICTION_SIZE-1)), data[i&(RESTRICTION_SIZE-1)]);
}
}
__global__ void WarpRestrictedAtomicOnSharedMem(int* data, int nElem)
{
__shared__ int smem_data[BLOCK_SIZE];
unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
atomicAdd( smem_data+(threadIdx.x>>5), data[i>>5]);
}
}
__global__ void SameAddressAtomicOnSharedMem(int* data, int nElem)
{
__shared__ int smem_data[BLOCK_SIZE];
unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
atomicAdd( smem_data, data[0]);
}
}
int main(void)
{
const int n = 2 << 24;
int* data = new int[n];
int i;
for(i=0; i<n; i++) {
data[i] = i%1024+1;
}
int* dev_data;
HANDLE_ERROR( cudaMalloc((void **)&dev_data, sizeof(int) * size_t(n)) );
HANDLE_ERROR( cudaMemset(dev_data, 0, sizeof(int) * size_t(n)) );
HANDLE_ERROR( cudaMemcpy( dev_data, data, n * sizeof(int), cudaMemcpyHostToDevice) );
for(int i=0; i<50; i++)
{
dim3 blocksize(BLOCK_SIZE);
dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
CoalescedAtomicOnGlobalMem<<<griddize, blocksize>>>( dev_data, n);
HANDLE_ERROR( cudaPeekAtLastError() );
}
HANDLE_ERROR( cudaDeviceSynchronize() );
for(int i=0; i<50; i++)
{
dim3 blocksize(BLOCK_SIZE);
dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
AddressRestrictedAtomicOnGlobalMem<<<griddize, blocksize>>>( dev_data, n);
HANDLE_ERROR( cudaPeekAtLastError() );
}
HANDLE_ERROR( cudaDeviceSynchronize() );
for(int i=0; i<50; i++)
{
dim3 blocksize(BLOCK_SIZE);
dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
WarpRestrictedAtomicOnGlobalMem<<<griddize, blocksize>>>( dev_data, n);
HANDLE_ERROR( cudaPeekAtLastError() );
}
HANDLE_ERROR( cudaDeviceSynchronize() );
for(int i=0; i<50; i++)
{
dim3 blocksize(BLOCK_SIZE);
dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
SameAddressAtomicOnGlobalMem<<<griddize, blocksize>>>( dev_data, n);
HANDLE_ERROR( cudaPeekAtLastError() );
}
HANDLE_ERROR( cudaDeviceSynchronize() );
for(int i=0; i<50; i++)
{
dim3 blocksize(BLOCK_SIZE);
dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
CoalescedAtomicOnSharedMem<<<griddize, blocksize>>>( dev_data, n);
HANDLE_ERROR( cudaPeekAtLastError() );
}
HANDLE_ERROR( cudaDeviceSynchronize() );
for(int i=0; i<50; i++)
{
dim3 blocksize(BLOCK_SIZE);
dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
AddressRestrictedAtomicOnSharedMem<<<griddize, blocksize>>>( dev_data, n);
HANDLE_ERROR( cudaPeekAtLastError() );
}
HANDLE_ERROR( cudaDeviceSynchronize() );
for(int i=0; i<50; i++)
{
dim3 blocksize(BLOCK_SIZE);
dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
WarpRestrictedAtomicOnSharedMem<<<griddize, blocksize>>>( dev_data, n);
HANDLE_ERROR( cudaPeekAtLastError() );
}
HANDLE_ERROR( cudaDeviceSynchronize() );
for(int i=0; i<50; i++)
{
dim3 blocksize(BLOCK_SIZE);
dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
SameAddressAtomicOnSharedMem<<<griddize, blocksize>>>( dev_data, n);
HANDLE_ERROR( cudaPeekAtLastError() );
}
HANDLE_ERROR( cudaDeviceSynchronize() );
HANDLE_ERROR( cudaDeviceReset() );
printf("Program finished without error.\n");
return 0;
}
Basically in above code there are 8 kernels in which all threads do atomicAdd
on all data.
- Coalesced atomic addition on global memory.
- Atomic addition on a restricted address space in global memory. (32 in the code)
- Atomic addition for warp lanes on the same address in global memory.
- Atomic addition of all threads on the same address in global memory.
Items 5 to 8 can be found by replacing global with shared in above items. Chosen block size is 256.
I used nvprof
to profile the program. The output is:
Time(%) Time Calls Avg Min Max Name
44.33% 2.35113s 50 47.023ms 46.987ms 47.062ms SameAddressAtomicOnSharedMem(int*, int)
31.89% 1.69104s 50 33.821ms 33.818ms 33.826ms SameAddressAtomicOnGlobalMem(int*, int)
10.10% 535.88ms 50 10.718ms 10.707ms 10.738ms WarpRestrictedAtomicOnSharedMem(int*, int)
3.96% 209.95ms 50 4.1990ms 4.1895ms 4.2103ms AddressRestrictedAtomicOnSharedMem(int*, int)
3.95% 209.47ms 50 4.1895ms 4.1893ms 4.1900ms AddressRestrictedAtomicOnGlobalMem(int*, int)
3.33% 176.48ms 50 3.5296ms 3.5050ms 3.5498ms WarpRestrictedAtomicOnGlobalMem(int*, int)
1.08% 57.428ms 50 1.1486ms 1.1460ms 1.1510ms CoalescedAtomicOnGlobalMem(int*, int)
0.84% 44.784ms 50 895.68us 888.65us 905.77us CoalescedAtomicOnSharedMem(int*, int)
0.51% 26.805ms 1 26.805ms 26.805ms 26.805ms [CUDA memcpy HtoD]
0.01% 543.61us 1 543.61us 543.61us 543.61us [CUDA memset]
Obviously coalesced conflict-free atomic operations had the best performance, and same-address had the the worst. One thing I couldn't explain was that why same address atomic on shared memory (inside a block) is slower comparing to on global memory (common between all threads).
When all the warps lanes access the same place in shared memory the performance is very bad but it's (surprisingly) not the case when they perform it onto global memory. I cannot explain why. Another confusion case is address restricted atomic on global is performing worse than when all the threads inside the warp perform it on the same address while it seems memory contentions in the first case is lower.
Anyways I would be happy if anyone could explain above profiling results.
回答1:
As a forward-looking statement, to some extent my comments here may be architecture-specific. But for the architectures at hand (up to cc 3.5, AFAIK) shared memory atomics get implemented via a code sequence (created by the assembler). This code sequence, operating on shared memory, is subject to serialization if multiple threads are contending for access to the same bank/location.
The R-M-W operation itself is atomic in the sense that no other thread can disrupt the operation (i.e. create incorrect results), but when the threads are contending to do an atomic operation on a single shared memory location, the contention gives rise to serialization, exacerbating the delay associated with atomics.
To quote Nick from the CUDA Handbook:
Unlike global memory, which implements atomics using single instructions (either GATOM or GRED, depending on whether the return value is used), shared memory atomics are implemented with explicit lock/unlock semantics, and the compiler emits code that causes each thread to loop over these lock operations until the thread has performed its atomic operation.
and:
Take care to avoid contention, or the loop in Listing 8-2 may iterate up to 32 times.
I suggest you read the full section 8.1.5, at least.
来源:https://stackoverflow.com/questions/22367238/cuda-atomic-operation-performance-in-different-scenarios