Unable to execute device kernel in CUDA

问题

I am trying to call a device kernel within a global kernel. My global kernel is a Matrix Multiplication and my device kernel is finding the maximum value and the index in each column of the product matrix. Following is the code :

__device__ void MaxFunction(float* Pd, float* max)
{
  int x = (threadIdx.x + blockIdx.x * blockDim.x);  
  int y = (threadIdx.y + blockIdx.y * blockDim.y); 
  int k = 0;
  int temp = 0; int temp_idx = 0;
  for (k = 0; k < wB; ++k) {
   if(Pd[x*wB + y] > temp){
    temp = Pd[x*wB + y];
    temp_idx = x*wB + y;
   }
       max[y*2 + 0] = temp;
       max[y*2 + 1] = temp_idx;
  }
}

__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
  // declare cache in the shared memory
  __shared__ float Mds[blockD][blockD];
  __shared__ float Nds[blockD][blockD];

  float Pvalue = 0;
  // Loop over the Md and Nd block dimension required to compute the Pd element
  for (int m = (wA * blockD * blockIdx.y), n = (blockD * blockIdx.x); 
                            m < ((wA * blockD * blockIdx.y)+wA-1); 
                                        m += blockD, n += (blockD*hB)){

    // collaboratively loading of Md and Nd blocks into shared memory    
    Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
    Nds[threadIdx.y][threadIdx.x] = Nd[n + wA * threadIdx.y + threadIdx.x];
    __syncthreads();

    // keep track of the running sum    
    for (int k = 0; k < blockD; k++)
      Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
    __syncthreads();
  }

  // write back to the global memory
  int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
  Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
  __syncthreads();

  MaxFunction(Pd, max);

}

The Main code :

#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>

#define blockD 32


const int wA = 128;
const int hA = 1024;

const int wB = 128;
const int hB = wA;

main(void){

    void MatrixMultiplication(float *, float *, float *, float *);

    int size_A = wA * hA * sizeof(float);
    int size_B = wB * hB * sizeof(float);
    int size_C = wB * hA * sizeof(float);
    int size_max = 2 * wB * sizeof(float);
    float *M, *N, *P, *C;   


    // allocate memory on the CPU
    M = (float*)malloc(size_A);
    N = (float*)malloc(size_B);
    P = (float*)malloc(size_max);
    C = (float*)malloc(size_C);

    // initialize the matrices
    for (int y=0; y < hA; y++) {
        for (int x=0; x < wA; x++){
            M[y*wA + x] = x;
       }
    }

    for (int y=0; y<hB; y++) {
        for (int x=0; x<wB; x++){
            N[y*wB + x] = x;
       }
    }

    MatrixMultiplication(M, N, P, C);

    //Write
    FILE *f1;
    int i, j;
    f1 = fopen("max_val.txt","w");
    for(i=0; i < (wB * 2); i+=2){
    fprintf(f1,"%d\t%d\n",int(P[i]),int(P[i+1]));
    }
    fclose(f1);

    f1 = fopen("Prod_mat.txt","w");
    for(i=0; i < 2; i++){
    for(j=0; j < wB; j++){
        fprintf(f1,"%d\t",int(C[i*wB + j]));
    }
    fprintf(f1,"\n");
    }
    fclose(f1);

    free( M );
    free( N );
    free( P ); 
            free( C );

    cudaDeviceReset();
    return 0;
}


void MatrixMultiplication(float *M, float *N, float *P, float *C) {

    int size_A = wA * hA * sizeof(float);
    int size_B = wB * hB * sizeof(float);
    int size_C = wB * hA * sizeof(float);
    int size_max = 2 * wB * sizeof(float);
    float *Md, *Nd, *Pd, *max; 

    // allocate memory on the GPU
    cudaMalloc((void**)&Md, size_A);
    cudaMalloc((void**)&Nd, size_B);
    cudaMalloc((void**)&Pd, size_C);
    cudaMalloc((void**)&max, size_max);

    // transfer M and N to device memory
    cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice);

    // kernel invocation code
    dim3 dimBlock(blockD, blockD);
    dim3 dimGrid(wA/blockD, hB/blockD);

    //Execute Kernel
    MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);

    // transfer P from device    
    cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost);
    cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost);

    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);
    cudaFree(max);
}

The Matrix Multiplication result is fine (Verified using Matlab), but I am not able to get the max values and their corresponding index. I would appreciate if anyone can kindly point out at what I am doing wrong. The max variable has only garbage when I run the above code.

回答1:

Apparently you are attempting to find the maximum value in each column, as well as the offset to that value.

But all of your threads in y are hammering on the same location for max value (max[x*2 + 0]). This isn't recommended, as there is no way to sort out a race condition. You should use atomic operations, or other methods (e.g. reduction) to handle multiple threads updating a single max value this way.

Since you have a need to update two values atomically (the max value and it's location), it's not a simple matter of replacing your plain access with a standard atomic function. However, since you are dealing with two 32-bit adjacent quantities, you may be interested in my answer here.

By the way I think matlab's native matrix multiply on gpuArray should be faster than any matrix multiply code you write. But it would require the Parallel Compute Toolbox.

来源：https://stackoverflow.com/questions/18099031/unable-to-execute-device-kernel-in-cuda

标签

cuda

nvidia

matrix-multiplication

gpu-programming