OpenCL: Correct results on CPU not on GPU: how to manage memory correctly?

问题

// CKmix: each work-item (i, ii) adds one element of MPCL into MCL[i].
//   MCL  - accumulated in place; indexed by get_global_id(0)
//   MPCL - source values; read at index B*ii + i + C[ii] + S
//   C    - per-ii offset added to the source index
//   S    - constant offset added to the source index
//   B    - multiplier applied to ii in the source index
//          (presumably the row stride of MPCL -- confirm against the host code)
// NOTE(review): when launched over a 2-D range, all work-items that share
// get_global_id(0) but differ in get_global_id(1) perform a non-atomic
// read-modify-write on the same MCL[i].
__kernel void CKmix(__global short* MCL, __global short* MPCL,__global short *C,  int S,  int B)
{       
    // i selects the output element; ii selects which MPCL slice contributes to it.
    unsigned int i=get_global_id(0);
    unsigned int ii=get_global_id(1);
    MCL[i]+=MPCL[B*ii+i+C[ii]+S];
}

The kernel seems OK: it compiles successfully, and I have obtained correct results using the CPU as the device. However, that was when the program released and recreated my memory objects each time the kernel was called, which for my testing purposes is about 16000 times.

The code I am posting is where I am at now, trying to use pinned memory and mapping.

// Create the program from the OpenCLSource string array (the count argument is 11,
// and the lengths argument is NULL), build it with default options, and look up
// the "CKmix" kernel.
// NOTE(review): no error code is captured in this snippet (errcode_ret is NULL and
// the clBuildProgram return value is discarded).
OpenCLProgram = clCreateProgramWithSource(hContext[Plat-1][Dev-1],11, OpenCLSource, NULL ,NULL);
clBuildProgram(OpenCLProgram, 0,NULL,NULL, NULL,NULL);
ocKernel = clCreateKernel(OpenCLProgram, "CKmix", NULL);

This is also successful. The reason I have a 2d array of contexts is that I iterate through all platforms and devices and allow the user to select the platform and device to use.

// Global work size: dimension 0 spans SN work-items, dimension 1 spans NF.
WorkSize[0]=SN;
WorkSize[1]=NF;  

// Staging buffers created with CL_MEM_ALLOC_HOST_PTR (host-accessible allocation).
// Judging by the names they are intended as pinned host memory; whether the
// allocation is actually page-locked depends on the implementation -- confirm.
// Sizes: NF, Z*NF, Z and Z shorts respectively. errcode_ret is NULL for all four.
PinnedCCL = clCreateBuffer(hContext[Plat-1][Dev-1], CL_MEM_READ_WRITE| CL_MEM_ALLOC_HOST_PTR, sizeof(short) *NF, NULL, NULL);
PinnedMCL = clCreateBuffer(hContext[Plat-1][Dev-1], CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(short) * Z*NF, NULL, NULL);
PinnedMO =  clCreateBuffer(hContext[Plat-1][Dev-1], CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(short) * Z,NULL, NULL);
PinnedMTEMP =  clCreateBuffer(hContext[Plat-1][Dev-1], CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(short) * Z,NULL, NULL);

// Plain read/write device buffers of NF, Z*NF and Z shorts.
// NOTE(review): the clSetKernelArg and clEnqueueWriteBuffer calls in this listing
// refer to DevCCL and DevMCL, not DevComboCCL / DevMappedMCL -- confirm which
// names the real program uses.
DevComboCCL = clCreateBuffer(hContext[Plat-1][Dev-1], CL_MEM_READ_WRITE, sizeof(short) *NF, NULL, NULL);    
DevMappedMCL = clCreateBuffer(hContext[Plat-1][Dev-1], CL_MEM_READ_WRITE , sizeof(short) * Z*NF, NULL,NULL);
DevMO =  clCreateBuffer(hContext[Plat-1][Dev-1], CL_MEM_READ_WRITE , sizeof(short) * Z,NULL, NULL);

// Blocking maps (CL_TRUE) of the four staging buffers to obtain host pointers.
// CCL and MCL are mapped for writing; MO and MTEMP are mapped for reading only.
// NOTE(review): MTEMP is written by the host after this point, and MO is used as
// the destination of clEnqueueReadBuffer -- confirm that CL_MAP_READ is the
// intended flag for these two mappings.
MO = (short*) clEnqueueMapBuffer(hCmdQueue[Plat-1][Dev-1], PinnedMO, CL_TRUE, CL_MAP_READ, 0, sizeof(short)*Z, 0, NULL, NULL, NULL);
CCL = (short*) clEnqueueMapBuffer(hCmdQueue[Plat-1][Dev-1], PinnedCCL, CL_TRUE, CL_MAP_WRITE, 0, sizeof(short)*NF, 0, NULL, NULL,NULL);
MCL = (short*) clEnqueueMapBuffer(hCmdQueue[Plat-1][Dev-1], PinnedMCL, CL_TRUE, CL_MAP_WRITE, 0, sizeof(short)*Z*NF, 0, NULL, NULL, NULL);
MTEMP = (short*) clEnqueueMapBuffer(hCmdQueue[Plat-1][Dev-1], PinnedMTEMP, CL_TRUE, CL_MAP_READ, 0, sizeof(short)*Z, 0, NULL, NULL, NULL);

// Zero all Z elements of the host-side MTEMP array.
for (n=0; n < Z; ++n) {
    MTEMP[n]=0;
    }

// Bind the kernel arguments once, matching the CKmix parameter order:
//   0: MCL  <- DevMO   (the accumulated output)
//   1: MPCL <- DevMCL  (the source values)
//   2: C    <- DevCCL  (the per-ii offsets)
//   3: S    <- SH
//   4: B    <- SN      (the same value used for WorkSize[0])
// NOTE(review): DevMCL and DevCCL are not created in the visible code -- confirm
// they refer to the intended device buffers.
clSetKernelArg(ocKernel, 0, sizeof(cl_mem), (void*) &DevMO);
clSetKernelArg(ocKernel, 1, sizeof(cl_mem), (void*) &DevMCL);    
clSetKernelArg(ocKernel, 2, sizeof(cl_mem), (void*) &DevCCL);
clSetKernelArg(ocKernel, 3, sizeof(int),    (void*) &SH);
clSetKernelArg(ocKernel, 4, sizeof(int),    (void*) &SN);

The above constitutes my initialization; the rest, below, happens repeatedly.

// Per-iteration step. Blocking (CL_TRUE) uploads from the mapped host pointers:
// MCL -> DevMCL, CCL -> DevCCL, and MTEMP (zeroed during initialization) -> DevMO.
clEnqueueWriteBuffer(hCmdQueue[Plat-1][Dev-1], DevMCL, CL_TRUE, 0, Z*NF*sizeof(short), MCL, 0, NULL, NULL);
clEnqueueWriteBuffer(hCmdQueue[Plat-1][Dev-1], DevCCL, CL_TRUE, 0, NF*sizeof(short), CCL, 0, NULL, NULL);
clEnqueueWriteBuffer(hCmdQueue[Plat-1][Dev-1], DevMO, CL_TRUE, 0, Z*sizeof(short), MTEMP, 0, NULL, NULL);

// Launch over the 2-D global range in WorkSize; the local work size is left to
// the implementation (NULL). Then do a blocking read of Z shorts from DevMO into MO.
clEnqueueNDRangeKernel(hCmdQueue[Plat-1][Dev-1], ocKernel, 2, NULL, WorkSize, NULL, 0, NULL, NULL);
clEnqueueReadBuffer(hCmdQueue[Plat-1][Dev-1],DevMO, CL_TRUE, 0, Z * sizeof(short),(void*) MO , 0, NULL, NULL);

I have checked for errors, and I am not getting any. The kernel is launched many times in succession with fresh data. I am not sure what I am doing wrong.

NVIDIA GTX 550 Ti (compute capability 2.1), latest dev driver, CUDA SDK 4.0.

回答1:

I don't know if it's the only problem with the code, but this:

// Quoted from the kernel in the question: the output index depends only on i,
// so every work-item with the same i (and any ii) updates the same MCL[i].
unsigned int i=get_global_id(0);
unsigned int ii=get_global_id(1);
MCL[i]+=MPCL[B*ii+i+C[ii]+S];

is definitely not a good idea. You will generally get multiple threads sharing the same get_global_id(0), so several threads might try to update MCL[i] simultaneously (note that += is not atomic). I would assume that on the CPU not enough threads are generated to show such behaviour in most cases, while having thousands of threads on the GPU will almost surely lead to problems.

The most reasonable way to do this is to use only a one-dimensional work size and, in each thread, accumulate all the values that go to one position:

// One work-item per output element: sum every contribution to MCL[i] in a
// private accumulator and store it once.
// NOTE(review): `size` is not defined in this snippet; presumably it is the
// extent of the former second dimension (NF in the host code) -- confirm.
unsigned int i=get_global_id(0);
short accum = MCL[i]; // or 0, if that's the start
for(int ii = 0; ii < size; ++ii)
  accum += MPCL[B*ii+i+C[ii]+S];
MCL[i] = accum;

Of course, that might or might not be feasible. If it isn't, the fix probably won't be quite that simple.

来源：https://stackoverflow.com/questions/8823862/opencl-correct-results-on-cpu-not-on-gpu-how-to-manage-memory-correctly

标签

c++

opencl

gpgpu

nvidia