I am working on a stochastic process and I want to generate a different series of random numbers in a CUDA kernel each time I run the program. This is similar to what we do in C++.
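(By the C++ case I mean, roughly, the usual idiom of seeding the generator from the current time so that every run produces a different sequence:)

#include <cstdio>
#include <cstdlib>
#include <ctime>

int main() {
    srand(time(NULL));           // different seed on each run
    printf("%d\n", rand());      // hence a different series each run
    return 0;
}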
You don't need to pass an array of random seeds: when you use the cuRAND library, it is enough to properly set the sequence number parameter of curand_init. For example [Disclaimer: this is an untested function]:
#include <curand_kernel.h>

__global__ void generate_random_numbers(float* numbers, unsigned long seed, int Np) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < Np) {
        curandState state;
        // Same seed for all threads, but a distinct sequence number per thread,
        // so each thread draws from an independent subsequence
        curand_init(seed, i, 0, &state);
        numbers[i] = curand_uniform(&state);
    }
}
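A minimal host-side sketch of how you could launch it, equally untested, and assuming the kernel above is in the same .cu file (the array size, block size, and the use of time(NULL) as the seed are illustrative choices, not part of the original function):

#include <ctime>

int main() {
    const int Np = 1024;                      // illustrative array size
    float *d_numbers;
    cudaMalloc(&d_numbers, Np * sizeof(float));
    // time(NULL) changes between runs, so every run yields a different series
    generate_random_numbers<<<(Np + 255) / 256, 256>>>(d_numbers, time(NULL), Np);
    cudaDeviceSynchronize();
    cudaFree(d_numbers);
    return 0;
}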
You can also avoid passing the seed from outside if you change the curand_init
instruction to
curand_init(clock64(), i, 0, &state);
EDIT
Following Roger Dahl's comment, I have done a comparison (Kepler K20c) between four different possibilities for the generation of arrays of 131072 elements:

1. separate kernels for cuRAND state initialization and random number generation (one number per thread);
2. a single kernel performing both initialization and generation (one number per thread);
3. separate kernels, with the state initialized once and then reused for multiple generations;
4. a single kernel re-initializing the state for each of multiple generations.

Below is the code. The timings for generation have been the following:

1. 861 ms;
2. 852 ms;
3. 866 ms;
4. 2556 ms.

I hope I have correctly understood the performance issue raised by Roger Dahl.
#include <stdio.h>
#include <stdlib.h>
#include <curand.h>
#include <curand_kernel.h>
#define DSIZE 8192*16
#define nTPB 256
/***********************/
/* CUDA ERROR CHECKING */
/***********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/*************************/
/* CURAND INITIALIZATION */
/*************************/
// --- Initialization kernel: sets up one curandState per thread in global memory
__global__ void initCurand(curandState *state, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &state[idx]);
}

// --- Generation kernel: draws from a previously initialized state
__global__ void testrand1(curandState *state, float *a){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    a[idx] = curand_uniform(&state[idx]);
}

// --- Single-kernel version: initializes a local state and draws in one launch
__global__ void testrand2(unsigned long seed, float *a){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curandState state;
    curand_init(seed, idx, 0, &state);
    a[idx] = curand_uniform(&state);
}
/********/
/* MAIN */
/********/
int main() {
    int n_iter = 20;

    curandState *devState; gpuErrchk(cudaMalloc((void**)&devState, DSIZE*sizeof(curandState)));
    float *d_a;            gpuErrchk(cudaMalloc((void**)&d_a, DSIZE*sizeof(float)));

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // --- Case 1: separate kernels, one random number per thread
    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for separate kernels: %3.1f ms \n", time);

    // --- Case 2: single kernel, one random number per thread
    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for single kernels: %3.1f ms \n", time);

    // --- Case 3: separate kernels, multiple generations reusing the same state
    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for separate kernels with multiple random number generation: %3.1f ms \n", time);

    // --- Case 4: single kernel, state re-initialized on every generation
    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for single kernels for multiple random number generation: %3.1f ms \n", time);

    getchar();
    return 0;
}
Output on GTX660:
Elapsed time for separate kernels: 1960.3 ms
Elapsed time for single kernels: 1536.9 ms
Elapsed time for separate kernels with multiple random number generation: 1576.0 ms
Elapsed time for single kernels for multiple random number generation: 4612.2 ms
Output on GTX570:
Elapsed time for separate kernels: 957.2 ms
Elapsed time for single kernels: 947.7 ms
Elapsed time for separate kernels with multiple random number generation: 964.6 ms
Elapsed time for single kernels for multiple random number generation: 2839.0 ms
Approximately the same performance as the K20c.
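For anyone wanting to reproduce the timings: the benchmark is a single .cu file, and a build line along these lines should work (the file name is hypothetical and the architecture flag is my assumption; pick the one matching your GPU, e.g. sm_35 for the K20c):

nvcc -arch=sm_35 -o curand_test curand_test.cu
./curand_test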