I am a CUDA newbie, playing with CUDA kernels for the first time. I've got the following kernel that implements convolution (very naively), with a dummy loop that performs ...
You're doing arbitrary arithmetic on float values and expecting perfect accuracy. float values can store integers exactly only up to the limit of the mantissa (2^24, i.e. 16,777,216). Once we exceed that value, float operations begin to become imprecise. Naturally, the values in your result that tend to accumulate to the largest numbers (those towards the end of the res array) will show this effect first.
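To see that limit in isolation, here is a minimal host-only sketch (my own illustration, independent of the CUDA code below) showing where float stops representing integers exactly:

#include <stdio.h>

int main()
{
    // 2^24 = 16,777,216 is the point past which not every integer
    // fits in a 32-bit float's 24-bit significand
    float ok    = 16777215.0f;         // exactly representable
    float limit = 16777216.0f;         // exactly representable (it is 2^24)
    float lost  = 16777216.0f + 1.0f;  // 16,777,217 is not representable; rounds back to 16,777,216
    printf("%.1f %.1f %.1f\n", ok, limit, lost);  // prints 16777215.0 16777216.0 16777216.0
    return 0;
}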
Let's call the product of the loop count in your kernel (LOOPS1) and the loop count in your host code around the kernel launch (LOOPS2) the total_loops value. For a total_loops value up to around 700, I get "precise" results, that is, all results are evenly divisible by total_loops. After that, as you gradually increase total_loops, errors start to creep in, starting at the end of the res array.
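That ~700 figure lines up with the 2^24 limit; here is a quick back-of-the-envelope sketch (my own arithmetic, assuming the 32x32 ramp image and the all-ones 5x5 kernel from the code below) computing how fast the last res element grows and where it crosses that limit:

#include <stdio.h>

int main()
{
    // per-pass contribution to the last output element (res[783]):
    // the bottom-right 5x5 window of the ramp image, values 891..1023,
    // simply summed because the kernel is all ones
    double window_sum = 0.0;
    for (int r = 27; r < 32; r++)
        for (int c = 27; c < 32; c++)
            window_sum += r * 32 + c;                 // image[i] = i

    printf("per-pass sum: %.0f\n", window_sum);       // 23925
    printf("passes to reach 2^24: %.1f\n", 16777216.0 / window_sum);  // ~701, matching the ~700 observation
    return 0;
}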
You could switch to double instead of float and your results would be different, except that a version of atomicAdd for double isn't conveniently available. However, the programming guide shows how to create arbitrary atomic operations, and the example it gives just happens to be implementing atomicAdd for double.
So the following modification of your code allows you to explore both ideas: to try double, change the USE_FLOAT define to USE_DOUBLE; to see that a reduced total_loops fixes the issue, change the LOOPS1 define from 100 to 70. Here's the code:
#include <stdio.h>
#define LOOPS1 100
#define LOOPS2 10
// set to USE_DOUBLE or USE_FLOAT
#define USE_FLOAT
#ifndef USE_DOUBLE
typedef float mytype;
#else
typedef double mytype;
#endif
// atomicAdd for double, from the CUDA C Programming Guide; guarded so it
// doesn't clash with the native double atomicAdd on compute capability 6.0+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);  // retry if another thread updated the value in between
    return __longlong_as_double(old);
}
#endif
__global__ void conv(mytype *input, mytype *kernel, mytype *target)
{
    // one block per output element; each thread handles one kernel tap
    // and atomically accumulates its product into that element
    for (long i = 0; i < LOOPS1; i++)
    {
        atomicAdd(target + gridDim.y*blockIdx.x + blockIdx.y,
                  input[(blockIdx.x+threadIdx.x)*(blockDim.y+gridDim.y-1) + (blockIdx.y+threadIdx.y)]
                  * kernel[threadIdx.x*blockDim.y + threadIdx.y]);
    }
}
int main(){
    mytype image[1024] = {0.0};
    mytype kernel[] =
    {
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f
    };
    mytype res[784] = {0};
    for (int i = 0; i < 1024; i++)
    {
        image[i] = (mytype)i;
    } // got a 32x32 matrix
    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        exit(-1);
    }
    mytype *dev_image = 0;
    mytype *dev_kernel = 0;
    mytype *dev_res = 0;
    // Allocate GPU buffers for the two inputs and the output.
    cudaStatus = cudaMalloc((void**)&dev_image, sizeof(image));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        exit(-10);
    }
    cudaStatus = cudaMalloc((void**)&dev_kernel, sizeof(kernel));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        exit(-10);
    }
    cudaStatus = cudaMalloc((void**)&dev_res, sizeof(res));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        exit(-10);
    }
    cudaMemcpy(dev_image, image, sizeof(image), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_kernel, kernel, sizeof(kernel), cudaMemcpyHostToDevice);
    cudaMemset(dev_res, 0, sizeof(res));
    // Convolving the 32x32 matrix with the 5x5 kernel gives a 28x28 result
    dim3 blocks(28,28,1);
    dim3 threads(5,5,1);
    for (int itr = 0; itr < LOOPS2; itr++)
    {
        conv<<<blocks, threads>>>(dev_image, dev_kernel, dev_res);
    }
    cudaMemcpy(res, dev_res, sizeof(res), cudaMemcpyDeviceToHost);
    printf("results:\n");
    // every result should be an exact multiple of total_loops = LOOPS1*LOOPS2
    for (int i = 0; i < (28*28); i++)
        if ((((int)res[i]) % (LOOPS1*LOOPS2)) != 0) {
            printf("first error index: %d, value: %f\n", i, res[i]);
            return 1;
        }
    cudaFree(dev_kernel);
    cudaFree(dev_image);
    cudaFree(dev_res);
    return 0;
}
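Nothing special is needed to build it; something like the following should work (conv.cu is just an example file name, use whatever you saved it as):

nvcc -o conv conv.cu
./conv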
Note that even if you use double, the problem will eventually show up again if you accumulate to large enough values; the exact-integer limit just moves from 2^24 up to 2^53.

Also note that this isn't really a CUDA/GPU issue. float in host code has the same restrictions.
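To illustrate that last point, here is a CPU-only sketch (my own example, no CUDA involved; the 30,000,000 iteration count is arbitrary, just large enough to cross 2^24) showing the same accumulation error in plain host code:

#include <stdio.h>

int main()
{
    float  f = 0.0f;
    double d = 0.0;
    // add 1.0 thirty million times; the exact answer is 30,000,000
    for (long i = 0; i < 30000000; i++) {
        f += 1.0f;
        d += 1.0;
    }
    // once f reaches 2^24 = 16,777,216, adding 1.0f no longer changes it
    printf("float : %.1f\n", f);   // 16777216.0
    printf("double: %.1f\n", d);   // 30000000.0
    return 0;
}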