I am trying to do reduction in CUDA and I am really a newbie. I am currently studying a sample code from NVIDIA.
I guess I am really not sure how to set up the block
Robert Crovella has already answered this question, which is mainly about understanding rather than performance.
However, for all those bumping into this question, I just want to highlight that CUB makes block reduction features available. Below, I'm providing a simple worked example of how to use CUB's BlockReduce.
#include <cstdlib>
#include <iostream>

#include <cub/cub.cuh>

#include "Utilities.cuh"
// Number of consecutive input elements that contribute to each output value;
// main() sizes the result buffers as N / BLOCKSIZE.
#define BLOCKSIZE 32
// Length of the input array (number of floats allocated and filled in main()).
const int N = 1024;
/**
 * Block-wise sum reduction built on cub::BlockReduce.
 *
 * Every thread block adds up the elements of indata addressed by its threads,
 * and thread 0 of the block stores that total in outdata[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid of 1D blocks with exactly BLOCKSIZE threads per block
 *     (BlockReduce is specialized for that block size);
 *   - outdata must have room for gridDim.x floats.
 * Threads whose global index is >= N contribute 0 to their block's sum.
 *
 * @param indata   input values, read at the global thread index
 * @param outdata  one partial sum per block
 */
__global__ void sum(const float * __restrict__ indata, float * __restrict__ outdata) {

    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // --- Specialize BlockReduce for type float and a 1D block of BLOCKSIZE threads.
    typedef cub::BlockReduce<float, BLOCKSIZE> BlockReduceT;

    // --- Allocate temporary storage in shared memory
    __shared__ typename BlockReduceT::TempStorage temp_storage;

    // --- Out-of-range threads feed the additive identity instead of skipping the
    //     reduction, so the bounds check never guards the collective call below.
    const float value = (tid < N) ? indata[tid] : 0.0f;

    // --- Collective operation: must be reached by every thread of the block.
    const float result = BlockReduceT(temp_storage).Sum(value);

    // --- Update block reduction value (the aggregate is only defined in thread 0)
    if (threadIdx.x == 0) outdata[blockIdx.x] = result;
}
/* MAIN */
/**
 * Host driver: fills an N-element array with 0, 1, ..., N-1, launches sum() so
 * that each block reduces BLOCKSIZE consecutive elements, copies the per-block
 * sums back and prints them on one line prefixed by "output: ".
 *
 * gpuErrchk is not defined in this file; it is presumably provided by
 * Utilities.cuh and is used here to check every CUDA runtime call.
 *
 * @return 0 on success, 1 if a host allocation fails.
 */
int main() {

    // --- One partial sum per block; ceil-div keeps the grid and the result
    //     buffers in agreement even if N is not a multiple of BLOCKSIZE.
    const int numBlocks = (N + BLOCKSIZE - 1) / BLOCKSIZE;

    // --- Allocate host side space for the input data and the per-block results
    float *h_data   = (float *)malloc(N * sizeof(float));
    float *h_result = (float *)malloc(numBlocks * sizeof(float));
    if (!h_data || !h_result) {
        std::cerr << "host allocation failed" << std::endl;
        free(h_data);
        free(h_result);
        return 1;
    }

    // --- Allocate device side space for the same two buffers
    float *d_data;   gpuErrchk(cudaMalloc(&d_data,   N * sizeof(float)));
    float *d_result; gpuErrchk(cudaMalloc(&d_result, numBlocks * sizeof(float)));

    for (int i = 0; i < N; i++) h_data[i] = (float)i;

    gpuErrchk(cudaMemcpy(d_data, h_data, N * sizeof(float), cudaMemcpyHostToDevice));

    // --- The block size must equal BLOCKSIZE: the kernel's BlockReduce is built for it.
    sum<<<numBlocks, BLOCKSIZE>>>(d_data, d_result);
    // --- A kernel launch returns nothing; fetch launch-configuration errors explicitly.
    gpuErrchk(cudaGetLastError());

    // --- Blocking copy: waits for the kernel and reports any execution error.
    gpuErrchk(cudaMemcpy(h_result, d_result, numBlocks * sizeof(float), cudaMemcpyDeviceToHost));

    std::cout << "output: ";
    for (int i = 0; i < numBlocks; i++) std::cout << h_result[i] << " ";
    std::cout << std::endl;

    // --- Release device and host buffers
    gpuErrchk(cudaFree(d_data));
    gpuErrchk(cudaFree(d_result));
    free(h_data);
    free(h_result);

    return 0;
}
In this example, an array of length N is created, and each element of the result is the sum of 32 (BLOCKSIZE) consecutive elements of the input. So
result[0] = data[0] + ... + data[31];
result[1] = data[32] + ... + data[63];