I am a CUDA newbie, playing with CUDA kernels for the first time. I've got the following kernel that implements convolution (very naively), with a dummy loop that performs ...
You're doing arbitrary arithmetic on float values and expecting perfect accuracy. float values can store integers exactly only up to the limit of the mantissa (2^24, i.e. 16,777,216). Once we exceed that value, float operations begin to become imprecise. Naturally, the values in your result that tend to accumulate to the largest numbers (those towards the end of the res array) will show this effect first.
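To see that limit in isolation, here is a minimal host-only sketch (my own illustration, independent of the CUDA code below) showing where float stops representing integers exactly:

#include <stdio.h>

int main()
{
    // 2^24 = 16,777,216 is the point past which not every integer
    // fits in a 32-bit float's 24-bit significand
    float ok    = 16777215.0f;         // exactly representable
    float limit = 16777216.0f;         // exactly representable (it is 2^24)
    float lost  = 16777216.0f + 1.0f;  // 16,777,217 is not representable; rounds back to 16,777,216
    printf("%.1f %.1f %.1f\n", ok, limit, lost);  // prints 16777215.0 16777216.0 16777216.0
    return 0;
}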
Let's call the product of the loop count in your kernel (LOOPS1) and the loop count in your host code around the kernel launch (LOOPS2) the total_loops value. For a total_loops value up to around 700, I get "precise" results, that is, all results are evenly divisible by total_loops. After that, as you gradually increase total_loops, errors start to creep in, starting at the end of the res array.
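That ~700 figure lines up with the 2^24 limit; here is a quick back-of-the-envelope sketch (my own arithmetic, assuming the 32x32 ramp image and the all-ones 5x5 kernel from the code below) computing how fast the last res element grows and where it crosses that limit:

#include <stdio.h>

int main()
{
    // per-pass contribution to the last output element (res[783]):
    // the bottom-right 5x5 window of the ramp image, values 891..1023,
    // simply summed because the kernel is all ones
    double window_sum = 0.0;
    for (int r = 27; r < 32; r++)
        for (int c = 27; c < 32; c++)
            window_sum += r * 32 + c;                 // image[i] = i

    printf("per-pass sum: %.0f\n", window_sum);       // 23925
    printf("passes to reach 2^24: %.1f\n", 16777216.0 / window_sum);  // ~701, matching the ~700 observation
    return 0;
}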
You could switch to double instead of float and your results would be different, except that a version of atomicAdd for double isn't conveniently available. However, the programming guide shows how to create arbitrary atomic operations, and the example it gives just happens to be implementing atomicAdd for double.
So the following modification of your code allows you to explore both ideas: to try double, change the USE_FLOAT define to USE_DOUBLE; to see that a reduced total_loops fixes the issue, change the LOOPS1 define from 100 to 70. Here's the code:
#include <stdio.h>
#define LOOPS1 100
#define LOOPS2 10
// set to USE_DOUBLE or USE_FLOAT
#define USE_FLOAT
#ifndef USE_DOUBLE
typedef float mytype;
#else
typedef double mytype;
#endif
// atomicAdd for double, from the CUDA C Programming Guide; guarded so it
// doesn't clash with the native double atomicAdd on compute capability 6.0+
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);  // retry if another thread updated the value in between
    return __longlong_as_double(old);
}
#endif
__global__ void conv(mytype *input, mytype *kernel, mytype *target)
{
    // one block per output element; each thread handles one kernel tap
    // and atomically accumulates its product into that element
    for (long i = 0; i < LOOPS1; i++)
    {
        atomicAdd(target + gridDim.y*blockIdx.x + blockIdx.y,
                  input[(blockIdx.x+threadIdx.x)*(blockDim.y+gridDim.y-1) + (blockIdx.y+threadIdx.y)]
                  * kernel[threadIdx.x*blockDim.y + threadIdx.y]);
    }
}
int main(){
    mytype image[1024] = {0.0};
    mytype kernel[] =
    {
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f
    };
    mytype res[784] = {0};
    for (int i = 0; i < 1024; i++)
    {
        image[i] = (mytype)i;
    } // got a 32x32 matrix
    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        exit(-1);
    }
    mytype *dev_image = 0;
    mytype *dev_kernel = 0;
    mytype *dev_res = 0;
    // Allocate GPU buffers for the two inputs and the output.
    cudaStatus = cudaMalloc((void**)&dev_image, sizeof(image));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        exit(-10);
    }
    cudaStatus = cudaMalloc((void**)&dev_kernel, sizeof(kernel));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        exit(-10);
    }
    cudaStatus = cudaMalloc((void**)&dev_res, sizeof(res));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        exit(-10);
    }
    cudaMemcpy(dev_image, image, sizeof(image), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_kernel, kernel, sizeof(kernel), cudaMemcpyHostToDevice);
    cudaMemset(dev_res, 0, sizeof(res));
    // Convolving the 32x32 matrix with the 5x5 kernel gives a 28x28 result
    dim3 blocks(28,28,1);
    dim3 threads(5,5,1);
    for (int itr = 0; itr < LOOPS2; itr++)
    {
        conv<<<blocks, threads>>>(dev_image, dev_kernel, dev_res);
    }
    cudaMemcpy(res, dev_res, sizeof(res), cudaMemcpyDeviceToHost);
    printf("results:\n");
    // every result should be an exact multiple of total_loops = LOOPS1*LOOPS2
    for (int i = 0; i < (28*28); i++)
        if ((((int)res[i]) % (LOOPS1*LOOPS2)) != 0) {
            printf("first error index: %d, value: %f\n", i, res[i]);
            return 1;
        }
    cudaFree(dev_kernel);
    cudaFree(dev_image);
    cudaFree(dev_res);
    return 0;
}
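Nothing special is needed to build it; something like the following should work (conv.cu is just an example file name, use whatever you saved it as):

nvcc -o conv conv.cu
./conv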
Note that even if you use double, the problem will eventually show up again if you accumulate to large enough values; the exact-integer limit just moves from 2^24 up to 2^53.

Also note that this isn't really a CUDA/GPU issue. float in host code has the same restrictions.
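To illustrate that last point, here is a CPU-only sketch (my own example, no CUDA involved; the 30,000,000 iteration count is arbitrary, just large enough to cross 2^24) showing the same accumulation error in plain host code:

#include <stdio.h>

int main()
{
    float  f = 0.0f;
    double d = 0.0;
    // add 1.0 thirty million times; the exact answer is 30,000,000
    for (long i = 0; i < 30000000; i++) {
        f += 1.0f;
        d += 1.0;
    }
    // once f reaches 2^24 = 16,777,216, adding 1.0f no longer changes it
    printf("float : %.1f\n", f);   // 16777216.0
    printf("double: %.1f\n", d);   // 30000000.0
    return 0;
}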