I ran into some difficulty when I tried to access a global array from a function that's executed on the device:
float globTemp[3][3] = "some value in here";
__device__
There are a variety of problems with your code:
- Your kernel expects a 2D thread layout (it uses the `.x` and `.y` built-in variables). A single thread certainly won't get the work done, and a 1D threadblock won't work with your kernel code.
- `__device__`
variables are not accessed with `cudaMalloc` and `cudaMemcpy`. We use a different set of API calls, like `cudaMemcpyToSymbol`.
- You are mixing `float`
variables (`a_d`) with `int` variables in the kernel parameters (`int *a`), so I don't think this code would compile without at least a warning. And that can, of course, lead to strange behavior if you ignore it.

This is the closest I could come to your code while fixing all the errors:
#include <stdio.h>
// Device-side global pointer. It holds no storage of its own: main() writes a
// cudaMalloc'ed address into it with cudaMemcpyToSymbol, and the compute
// kernel then reads input values through it.
__device__ float* globTemp_d;
// Kernel: adds 1.0f to each element read through globTemp_d and stores the
// result in the matching slot of `a`.
//
//   a - output buffer, indexed as row * w + col
//   w - row width (in elements) used to flatten the 2D coordinate
//
// Expects a 2D launch (uses both the .x and .y built-ins). Only threads whose
// global coordinates lie inside the 3x3 region do any work; every other
// thread exits immediately.
__global__ void compute(float *a, int w)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard clause: skip threads outside the 3x3 region.
    if (col >= 3 || row >= 3)
        return;

    const int idx = row * w + col;
    a[idx] = 1.0f + globTemp_d[idx];
}
// Evaluates a CUDA runtime call; on failure prints the error string with its
// source location to stderr and returns 1 from the enclosing function.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Host driver: uploads a 3x3 float table to the device, publishes its address
// through the __device__ pointer globTemp_d, runs `compute` on a single 3x3
// thread block, then copies the 9 results back and prints them.
// Returns 0 on success, 1 if any CUDA call fails.
int main(){
    float *a_d, *d_globTemp;
    float globTemp[3][3] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f};
    float a[(3*3)];

    // The kernel indexes with both .x and .y, so it needs a 2D block.
    dim3 threads(3,3);
    dim3 blocks(1);

    CUDA_CHECK(cudaMalloc((void**)&a_d, 3*3*sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_globTemp, 3*3*sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_globTemp, globTemp, 3*3*sizeof(float), cudaMemcpyHostToDevice));

    // globTemp_d is a pointer living in device memory: copy the *address*
    // held in d_globTemp into it, hence sizeof(float *) rather than the
    // size of the array.
    CUDA_CHECK(cudaMemcpyToSymbol(globTemp_d, &d_globTemp, sizeof(float *)));

    compute<<<blocks, threads>>>(a_d, 3);
    // A launch returns no status itself; query it explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy on the default stream: it runs after the kernel finishes.
    CUDA_CHECK(cudaMemcpy(a, a_d, 3*3*sizeof(float), cudaMemcpyDeviceToHost));

    printf("results:\n");
    for (int i = 0; i<(3*3); i++)
        printf("a[%d] = %f\n", i, a[i]);

    CUDA_CHECK(cudaFree(a_d));
    CUDA_CHECK(cudaFree(d_globTemp));
    return 0;
}
This code can be simplified by dispensing with the `__device__` variable and just passing `d_globTemp` as a parameter to the kernel, using it in place of references to `globTemp_d`. However, I did not make that simplification.