I am working on a stochastic process and I want to generate a different series of random numbers in a CUDA kernel each time I run the program. This is similar to what we do in C++.
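(By the C++ case I mean, roughly, the usual idiom of seeding the generator from the current time so that every run produces a different sequence:)

#include <cstdio>
#include <cstdlib>
#include <ctime>

int main() {
    srand(time(NULL));           // different seed on each run
    printf("%d\n", rand());      // hence a different series each run
    return 0;
}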
You don't need to pass an array of random seeds: when you use the cuRAND library, it is enough to properly set the sequence number parameter of curand_init. For example [Disclaimer: this is an untested function]:
#include <curand_kernel.h>

__global__ void generate_random_numbers(float* numbers, unsigned long seed, int Np) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < Np) {
        curandState state;
        // Same seed for all threads, but a distinct sequence number per thread,
        // so each thread draws from an independent subsequence
        curand_init(seed, i, 0, &state);
        numbers[i] = curand_uniform(&state);
    }
}
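A minimal host-side sketch of how you could launch it, equally untested, and assuming the kernel above is in the same .cu file (the array size, block size, and the use of time(NULL) as the seed are illustrative choices, not part of the original function):

#include <ctime>

int main() {
    const int Np = 1024;                      // illustrative array size
    float *d_numbers;
    cudaMalloc(&d_numbers, Np * sizeof(float));
    // time(NULL) changes between runs, so every run yields a different series
    generate_random_numbers<<<(Np + 255) / 256, 256>>>(d_numbers, time(NULL), Np);
    cudaDeviceSynchronize();
    cudaFree(d_numbers);
    return 0;
}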
You can also avoid passing the seed from outside if you change the curand_init
instruction to
curand_init(clock64(), i, 0, &state);
EDIT
Following Roger Dahl's comment, I have done a comparison (Kepler K20c) between four different possibilities for the generation of arrays of 131072 elements:

1. separate kernels for cuRAND state initialization and random number generation (one number per thread);
2. a single kernel performing both initialization and generation (one number per thread);
3. separate kernels, with the state initialized once and then reused for multiple generations;
4. a single kernel re-initializing the state for each of multiple generations.

Below is the code. The timings for generation have been the following:

1. 861 ms;
2. 852 ms;
3. 866 ms;
4. 2556 ms.

I hope I have correctly understood the performance issue raised by Roger Dahl.
#include <stdio.h>
#include <stdlib.h>
#include <curand.h>
#include <curand_kernel.h>
#define DSIZE 8192*16
#define nTPB 256
/***********************/
/* CUDA ERROR CHECKING */
/***********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/*************************/
/* CURAND INITIALIZATION */
/*************************/
// --- Initialization kernel: sets up one curandState per thread in global memory
__global__ void initCurand(curandState *state, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &state[idx]);
}

// --- Generation kernel: draws from a previously initialized state
__global__ void testrand1(curandState *state, float *a){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    a[idx] = curand_uniform(&state[idx]);
}

// --- Single-kernel version: initializes a local state and draws in one launch
__global__ void testrand2(unsigned long seed, float *a){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curandState state;
    curand_init(seed, idx, 0, &state);
    a[idx] = curand_uniform(&state);
}
/********/
/* MAIN */
/********/
int main() {
    int n_iter = 20;

    curandState *devState; gpuErrchk(cudaMalloc((void**)&devState, DSIZE*sizeof(curandState)));
    float *d_a;            gpuErrchk(cudaMalloc((void**)&d_a, DSIZE*sizeof(float)));

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // --- Case 1: separate kernels, one random number per thread
    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for separate kernels: %3.1f ms \n", time);

    // --- Case 2: single kernel, one random number per thread
    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for single kernels: %3.1f ms \n", time);

    // --- Case 3: separate kernels, multiple generations reusing the same state
    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for separate kernels with multiple random number generation: %3.1f ms \n", time);

    // --- Case 4: single kernel, state re-initialized on every generation
    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for single kernels for multiple random number generation: %3.1f ms \n", time);

    getchar();
    return 0;
}
Output on GTX660:
Elapsed time for separate kernels: 1960.3 ms
Elapsed time for single kernels: 1536.9 ms
Elapsed time for separate kernels with multiple random number generation: 1576.0 ms
Elapsed time for single kernels for multiple random number generation: 4612.2 ms
Output on GTX570:
Elapsed time for separate kernels: 957.2 ms
Elapsed time for single kernels: 947.7 ms
Elapsed time for separate kernels with multiple random number generation: 964.6 ms
Elapsed time for single kernels for multiple random number generation: 2839.0 ms
Approximately the same performance as the K20c.
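For anyone wanting to reproduce the timings: the benchmark is a single .cu file, and a build line along these lines should work (the file name is hypothetical and the architecture flag is my assumption; pick the one matching your GPU, e.g. sm_35 for the K20c):

nvcc -arch=sm_35 -o curand_test curand_test.cu
./curand_test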