Why does my CUDA kernel crash (unspecified launch failure) with a different dataset size?

后端 未结 2 1921
别跟我提以往
别跟我提以往 2020-11-29 12:02

I have a kernel to calculate different elements of a matrix, based on their position (diagonal or off-diagonal). The kernel works as expected when calculating matrices of si

相关标签:
2条回答
  • 2020-11-29 12:27

    Solved the problem. It turns out that WDDM TDR (Timeout Detection and Recovery) was enabled and the delay was set to 2 seconds. This means that if a kernel's execution time exceeds 2 s, the driver resets the GPU and recovers. That behaviour is appropriate for graphics and rendering workloads; for general-purpose use of the GPU, however, TDR must either be disabled or the delay increased. After increasing the delay to 10 s, the "unspecified launch failure" error ceased to appear and kernel execution continued as before.

    The TDR delay can be changed (and TDR enabled or disabled) through the Nsight options in the Nsight Monitor, or through the Registry (HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers) via the DWORD values TdrDelay and TdrLevel.

    0 讨论(0)
  • 2020-11-29 12:34

    I tried to reproduce your code with the following complete example. The code compiles, runs with no error.

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    #include <stdio.h>
    
    #include "cuComplex.h"
    
    /**
     * Fills a dense numberOfBuses x numberOfBuses complex matrix `y` (row-major)
     * from a list of branches. Presumably this is a power-system bus admittance
     * (Y-bus) matrix -- confirm against the caller.
     *
     * For branch i, with g = R[i]/(R[i]^2 + X[i]^2) and b = X[i]/(R[i]^2 + X[i]^2):
     *   - diagonal elements (fromBus[i], fromBus[i]) and (toBus[i], toBus[i])
     *     are incremented by (g, -b + B[i]/2);
     *   - off-diagonal elements (fromBus[i], toBus[i]) and (toBus[i], fromBus[i])
     *     are overwritten with (-g, b).
     *
     * Launch layout: 2D grid of 2D blocks, one thread per matrix element;
     * the x dimension indexes columns and the y dimension indexes rows.
     * Threads that fall outside the matrix do nothing.
     *
     * Preconditions: R, X, B, fromBus and toBus hold numberOfBranches entries.
     * Diagonal elements are accumulated into, so `y` must be initialised
     * (e.g. zeroed) by the caller before the launch.
     *
     * Each thread scans every branch, so the run time grows with
     * numberOfBuses^2 * numberOfBranches.
     */
    __global__ void createYBus(float *R, float *X, float *B, int numberOfBuses, int numberOfBranches, int *fromBus, int *toBus, cuComplex *y)
    {
        const int rowIdx = blockIdx.y*blockDim.y + threadIdx.y;
        const int colIdx = blockIdx.x*blockDim.x + threadIdx.x;
        if (rowIdx >= numberOfBuses || colIdx >= numberOfBuses)
            return;

        // Computed in size_t: rowIdx*numberOfBuses overflows a 32-bit int
        // once numberOfBuses exceeds 46340.
        const size_t index = (size_t)rowIdx*numberOfBuses + colIdx;

        for (int i = 0; i < numberOfBranches; ++i)
        {
            const int from = fromBus[i];
            const int to = toBus[i];
            const bool rowIsFrom = (rowIdx == from);
            const bool rowIsTo = (rowIdx == to);
            const bool colIsFrom = (colIdx == from);
            const bool colIsTo = (colIdx == to);

            // Branch i touches this element only if both the row and the
            // column are one of its end buses; otherwise skip the loads
            // and the divisions entirely.
            if (!((rowIsFrom || rowIsTo) && (colIsFrom || colIsTo)))
                continue;

            // Shared by all four updates below.
            const float r = R[i];
            const float x = X[i];
            const float denom = (r*r) + (x*x);
            const float g = r/denom;
            const float b = x/denom;

            // The order of the four updates is significant when
            // fromBus[i] == toBus[i]: the two accumulations run first and
            // are then overwritten by the off-diagonal assignments.
            if (rowIsFrom && colIsFrom) { //diagonal element
                y[index] = cuCaddf(y[index], make_cuComplex(g, -b + (B[i]/2)));
            }
            if (rowIsTo && colIsTo) { //diagonal element
                y[index] = cuCaddf(y[index], make_cuComplex(g, -b + (B[i]/2)));
            }
            if (rowIsFrom && colIsTo) { //off-diagonal element
                y[index] = make_cuComplex(-g, b);
            }
            if (rowIsTo && colIsFrom) { //off-diagonal element
                y[index] = make_cuComplex(-g, b);
            }
        }
    }
    
    
    // Reports a failed CUDA status together with the step that produced it.
    // Returns true when `status` is cudaSuccess, false otherwise.
    static bool cudaOk(cudaError_t status, const char *step)
    {
        if (status == cudaSuccess)
            return true;
        printf ("failure : (%d) - %s\n", status, ::cudaGetErrorString(status)) ;
        printf ("  while: %s\n", step) ;
        return false;
    }

    // Stand-alone driver: allocates the device buffers, launches createYBus on a
    // 2383 x 2383 matrix with 32 branches and copies the result back to the host.
    // Returns 0 on success and 1 if any CUDA call fails.
    int main ()
    {
        const int numLines = 32 ;
        const int numberOfBuses = 2383 ;

        // Sizes are computed in size_t so the element count cannot overflow int.
        const size_t matrixElems = (size_t)numberOfBuses*numberOfBuses;
        const size_t matrixBytes = matrixElems*sizeof(cuComplex);
        const size_t intBytes = numLines*sizeof(int);
        const size_t floatBytes = numLines*sizeof(float);

        int *dev_fromBus = NULL, *dev_toBus = NULL;
        float *dev_R = NULL, *dev_X = NULL, *dev_B = NULL;
        cuComplex *dev_y = NULL;
        cuComplex *y_bus = NULL;
        int exitCode = 1;

        // Single-pass block: any failure breaks out to the common cleanup below.
        do {
            if (!cudaOk(cudaMalloc((void**)&dev_fromBus, intBytes), "cudaMalloc(dev_fromBus)")) break;
            if (!cudaOk(cudaMalloc((void**)&dev_toBus, intBytes), "cudaMalloc(dev_toBus)")) break;
            if (!cudaOk(cudaMalloc((void**)&dev_R, floatBytes), "cudaMalloc(dev_R)")) break;
            if (!cudaOk(cudaMalloc((void**)&dev_X, floatBytes), "cudaMalloc(dev_X)")) break;
            if (!cudaOk(cudaMalloc((void**)&dev_B, floatBytes), "cudaMalloc(dev_B)")) break;
            if (!cudaOk(cudaMalloc((void**)&dev_y, matrixBytes), "cudaMalloc(dev_y)")) break;

            // The kernel accumulates into y, so it must start from a defined value.
            if (!cudaOk(cudaMemset(dev_y, 0, matrixBytes), "cudaMemset(dev_y)")) break;

            // Placeholder inputs: zero-filled so the kernel never reads
            // uninitialised device memory. The resulting values are not
            // meaningful (0/0 yields NaN); copy real branch data here with
            // cudaMemcpy before relying on the output.
            if (!cudaOk(cudaMemset(dev_fromBus, 0, intBytes), "cudaMemset(dev_fromBus)")) break;
            if (!cudaOk(cudaMemset(dev_toBus, 0, intBytes), "cudaMemset(dev_toBus)")) break;
            if (!cudaOk(cudaMemset(dev_R, 0, floatBytes), "cudaMemset(dev_R)")) break;
            if (!cudaOk(cudaMemset(dev_X, 0, floatBytes), "cudaMemset(dev_X)")) break;
            if (!cudaOk(cudaMemset(dev_B, 0, floatBytes), "cudaMemset(dev_B)")) break;

            dim3 dimBlock(16, 16); //number of threads
            dim3 dimGrid((numberOfBuses+15)/16, (numberOfBuses+15)/16);  //number of blocks

            //launch kernel once data has been copied to GPU
            createYBus<<<dimGrid, dimBlock>>>(dev_R, dev_X, dev_B, numberOfBuses, numLines, dev_fromBus, dev_toBus, dev_y);

            // Launch-configuration errors are only visible through cudaGetLastError().
            if (!cudaOk(cudaGetLastError(), "createYBus launch")) break;

            y_bus = new cuComplex[matrixElems] ;

            //copy results back to CPU
            // This blocking copy also surfaces any error raised while the kernel ran.
            if (!cudaOk(cudaMemcpy(y_bus, dev_y, matrixBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(y_bus <- dev_y)")) break;

            exitCode = 0;
        } while (0);

        // Common cleanup; cudaFree and delete[] both accept null pointers.
        delete[] y_bus;
        cudaFree(dev_y);
        cudaFree(dev_B);
        cudaFree(dev_X);
        cudaFree(dev_R);
        cudaFree(dev_toBus);
        cudaFree(dev_fromBus);

        return exitCode ;
    }
    

    Your error seems to be somewhere else.

    You want to run your code in NSIGHT debug mode with cuda mem check activated. If compiled with debug information, the tool should point out the location of your error.

    EDIT: The problem appears to be caused by WDDM TDR, as discussed in the comments.

    0 讨论(0)
提交回复
热议问题