Block reduction in CUDA

前端 未结 3 1630
一个人的身影
一个人的身影 2020-12-16 01:43

I am trying to do reduction in CUDA and I am really a newbie. I am currently studying a sample code from NVIDIA.

I guess I am really not sure how to set up the block

3条回答
  •  囚心锁ツ
    2020-12-16 02:16

    Robert Crovella has already answered this question, which is mainly about understanding rather than performance.

    However, for everyone who lands on this question, I just want to point out that CUB provides ready-made block reduction primitives. Below is a simple worked example showing how to use CUB's BlockReduce.

    #include <cub/cub.cuh>
    #include <cuda.h>
    
    #include "Utilities.cuh"
    
    #include <cstdlib>
    #include <iostream>
    
    // Divisor used to size the result arrays (N / BLOCKSIZE entries on host and device).
    // NOTE(review): presumably also the number of threads per block — confirm against the kernel launch.
    #define BLOCKSIZE   32
    
    // Number of input elements; also the bound the kernel checks the global thread index against.
    const int N = 1024;
    
    /**************************/
    /* BLOCK REDUCTION KERNEL */
    /**************************/
    /**
     * Computes one partial sum per thread block using cub::BlockReduce.
     *
     * Launch layout: 1D grid of 1D blocks. Each block must contain exactly
     * BLOCKSIZE threads, because BLOCKSIZE is baked into the BlockReduce
     * specialization below.
     *
     * Shared memory: one statically allocated BlockReduceT::TempStorage per block.
     *
     * @param indata   Input array; element tid is read only when tid < N.
     * @param outdata  Output array with one slot per block; thread 0 of block b
     *                 writes that block's sum to outdata[b].
     */
    __global__ void sum(const float * __restrict__ indata, float * __restrict__ outdata) {

        const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

        // --- Specialize BlockReduce for type float and a 1D block of BLOCKSIZE threads.
        typedef cub::BlockReduce<float, BLOCKSIZE> BlockReduceT;

        // --- Allocate temporary storage in shared memory
        __shared__ typename BlockReduceT::TempStorage temp_storage;

        // --- Only the global load is guarded. Out-of-range threads contribute the
        //     additive identity so that they can still take part in the reduction.
        const float value = (tid < N) ? indata[tid] : 0.0f;

        // --- BlockReduce::Sum is a block-wide collective: every thread of the block
        //     must call it, so it must not sit inside a thread-dependent branch.
        //     The returned aggregate is only meaningful in thread 0.
        const float result = BlockReduceT(temp_storage).Sum(value);

        // --- Update block reduction value
        if (threadIdx.x == 0) outdata[blockIdx.x] = result;
    }
    
    /********/
    /* MAIN */
    /********/
    /**
     * Host driver: fills an N-element array with 0, 1, ..., N-1, launches the
     * block-reduction kernel with BLOCKSIZE threads per block, copies the
     * per-block sums back and prints them on one line.
     *
     * CUDA runtime calls are wrapped in gpuErrchk (not defined in this file;
     * presumably provided by Utilities.cuh).
     *
     * @return 0 on success, 1 if a host allocation fails.
     */
    int main() {

        // --- Number of blocks, rounded up so a partial tail block is still launched
        //     when N is not a multiple of BLOCKSIZE (the kernel guards its loads).
        const int numBlocks = (N + BLOCKSIZE - 1) / BLOCKSIZE;

        // --- Allocate host side space for the input data and the per-block results
        float *h_data       = (float *)malloc(N * sizeof(float));
        float *h_result     = (float *)malloc(numBlocks * sizeof(float));
        if (!h_data || !h_result) {
            std::cerr << "host allocation failed" << std::endl;
            free(h_data);       // free() on a null pointer is a no-op
            free(h_result);
            return 1;
        }

        // --- Allocate device side space
        float *d_data;      gpuErrchk(cudaMalloc(&d_data, N * sizeof(float)));
        float *d_result;    gpuErrchk(cudaMalloc(&d_result, numBlocks * sizeof(float)));

        for (int i = 0; i < N; i++) h_data[i] = (float)i;

        gpuErrchk(cudaMemcpy(d_data, h_data, N * sizeof(float), cudaMemcpyHostToDevice));

        // --- One block per group of BLOCKSIZE elements, BLOCKSIZE threads per block
        sum<<<numBlocks, BLOCKSIZE>>>(d_data, d_result);
        gpuErrchk(cudaPeekAtLastError());       // launch-configuration errors
        gpuErrchk(cudaDeviceSynchronize());     // errors raised while the kernel runs

        gpuErrchk(cudaMemcpy(h_result, d_result, numBlocks * sizeof(float), cudaMemcpyDeviceToHost));

        std::cout << "output: ";
        for (int i = 0; i < numBlocks; i++) std::cout << h_result[i] << " ";
        std::cout << std::endl;

        gpuErrchk(cudaFree(d_data));
        gpuErrchk(cudaFree(d_result));

        // --- Release the host buffers as well
        free(h_data);
        free(h_result);

        return 0;
    }
    

    In this example, an array of length N is created, and each element of the result is the sum of BLOCKSIZE = 32 consecutive input elements, one result per thread block. That is:

    result[0] = data[0] + ... + data[31];
    result[1] = data[32] + ... + data[63];
    ....
    

提交回复
热议问题