How could we generate random numbers in CUDA C with different seed on each run?

前端 未结 3 617
情深已故
情深已故 2021-02-10 01:16

I am working on a stochastic process and I wanted to generate a different series of random numbers in the CUDA kernel each time I run the program. This is similar to what we do in C++

3条回答
  •  南笙
    南笙 (楼主)
    2021-02-10 02:11

    You don't need to pass an array of random seeds, but, when you use the cuRAND library, you can properly set the sequence number parameter of curand_init. For example [Disclaimer: it is a non-tested function]

    // Fills numbers[0..Np) with uniform randoms in (0, 1]. Each thread seeds
    // a private curandState with the shared seed and its own global index as
    // the cuRAND sequence number, so threads draw from statistically
    // independent subsequences of the same generator.
    __global__ void generate_random_numbers(float* numbers, unsigned long seed, int Np) {

        // Flat global index over a 1D grid of 1D blocks.
        int tid = blockIdx.x * blockDim.x + threadIdx.x;

        // Guard the grid tail: ceil-div launches may overshoot Np.
        if (tid >= Np) return;

        curandState localState;
        curand_init(seed, tid, 0, &localState);     // (seed, sequence, offset, state)
        numbers[tid] = curand_uniform(&localState);
    }
    

    You can also avoid passing the seed from outside if you change the curand_init instruction to

    curand_init(clock64(), i, 0, &state);
    

    EDIT

    Following Roger Dahl's comment, I have done a comparison (Kepler K20c) between four different possibilities for the generation of arrays of 131072 elements:

    1. Single random number generation: separate kernels for initialization and random number generation;
    2. Single random number generation: unique kernel for initialization and random number generation;
    3. Multiple random number generation: separate kernels for initialization and random number generation;
    4. Multiple random number generation: unique kernel for initialization and random number generation;

    Below is the code. The timing for generating has been the following:

    1. 861ms;
    2. 852ms;
    3. 866ms;
    4. 2556ms;

    I hope I have correctly understood the performance issue raised by Roger Dahl.

    #include <stdio.h>
    #include <stdlib.h>
    #include <curand_kernel.h>
    
    #define DSIZE 8192*16
    #define nTPB 256
    
    /***********************/
    /* CUDA ERROR CHECKING */
    /***********************/
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    // Prints a CUDA runtime error with its source location and, by default,
    // terminates the process with the error code.
    // Fix: `file` is now const-qualified — __FILE__ expands to a string
    // literal, and binding a string literal to a non-const `char*` is
    // ill-formed in modern C++ (deprecated since C++98, removed in C++11).
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
    {
        if (code != cudaSuccess) 
        {
            fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
            if (abort) exit(code);
        }
    }
    
    /*************************/
    /* CURAND INITIALIZATION */
    /*************************/
    // Seeds one curandState per thread (same seed, per-thread sequence number
    // -> independent subsequences). No bounds guard: the caller must allocate
    // at least gridDim.x * blockDim.x states.
    __global__ void initCurand(curandState *state, unsigned long seed){
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        curand_init(seed, tid, 0, &state[tid]);
    }
    
    // Draws one uniform float per thread from a pre-initialized state array.
    // NOTE(review): no bounds guard — the launch must cover exactly the
    // allocated state/output length.
    __global__ void testrand1(curandState *state, float *a){
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        a[tid] = curand_uniform(&state[tid]);
    }
    
    // Self-contained variant: seeds a thread-local generator and draws one
    // uniform float in a single kernel. Paying the curand_init cost on every
    // launch is what makes the repeated-generation benchmark (case 4) slow.
    __global__ void testrand2(unsigned long seed, float *a){
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        curandState localState;
        curand_init(seed, tid, 0, &localState);
        a[tid] = curand_uniform(&localState);
    }
    
    /********/
    /* MAIN */
    /********/
    // Benchmarks four strategies for generating DSIZE uniform randoms,
    // n_iter times each:
    //   1. separate init + draw kernels, one draw per iteration;
    //   2. single fused (init+draw) kernel, one draw per iteration;
    //   3. separate kernels: init once + three draws per iteration;
    //   4. fused kernel launched three times per iteration (re-seeds every
    //      launch, which is why it is the slowest case).
    // Fixes vs. the posted text: the loop headers and <<<...>>> launch
    // configurations were destroyed by HTML extraction and are reconstructed
    // here; event API calls are now error-checked; device buffers and events
    // (originally leaked) are released before exit.
    int main() {

        int n_iter = 20;

        // Per-thread generator states for the "separate kernels" variants.
        curandState *devState;  gpuErrchk(cudaMalloc((void**)&devState, DSIZE*sizeof(curandState)));

        // Output buffer for the generated random numbers.
        float *d_a;             gpuErrchk(cudaMalloc((void**)&d_a, DSIZE*sizeof(float)));

        float time;
        cudaEvent_t start, stop;

        gpuErrchk(cudaEventCreate(&start));
        gpuErrchk(cudaEventCreate(&stop));

        // --- Case 1: separate init/draw kernels, single generation ---
        gpuErrchk(cudaEventRecord(start, 0));

        for (int i = 0; i < n_iter; i++) {

            initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

            testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

        }

        gpuErrchk(cudaEventRecord(stop, 0));
        gpuErrchk(cudaEventSynchronize(stop));
        gpuErrchk(cudaEventElapsedTime(&time, start, stop));
        printf("Elapsed time for separate kernels:  %3.1f ms \n", time);

        // --- Case 2: fused init+draw kernel, single generation ---
        gpuErrchk(cudaEventRecord(start, 0));

        for (int i = 0; i < n_iter; i++) {

            testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

        }

        gpuErrchk(cudaEventRecord(stop, 0));
        gpuErrchk(cudaEventSynchronize(stop));
        gpuErrchk(cudaEventElapsedTime(&time, start, stop));
        printf("Elapsed time for single kernels:  %3.1f ms \n", time);

        // --- Case 3: separate kernels, init once + three draws per iteration ---
        gpuErrchk(cudaEventRecord(start, 0));

        for (int i = 0; i < n_iter; i++) {

            initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

            testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

            testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

            testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

        }

        gpuErrchk(cudaEventRecord(stop, 0));
        gpuErrchk(cudaEventSynchronize(stop));
        gpuErrchk(cudaEventElapsedTime(&time, start, stop));
        printf("Elapsed time for separate kernels with multiple random number generation:  %3.1f ms \n", time);

        // --- Case 4: fused kernel launched three times per iteration ---
        // (pays the curand_init cost on every single launch)
        gpuErrchk(cudaEventRecord(start, 0));

        for (int i = 0; i < n_iter; i++) {

            testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

            testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

            testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
        }

        gpuErrchk(cudaEventRecord(stop, 0));
        gpuErrchk(cudaEventSynchronize(stop));
        gpuErrchk(cudaEventElapsedTime(&time, start, stop));
        printf("Elapsed time for single kernels for multiple random number generation:  %3.1f ms \n", time);

        // Release GPU resources (the original leaked all of these).
        gpuErrchk(cudaEventDestroy(start));
        gpuErrchk(cudaEventDestroy(stop));
        gpuErrchk(cudaFree(d_a));
        gpuErrchk(cudaFree(devState));

        getchar();

        return 0;
    }
    

    Output on GTX660:

    Elapsed time for separate kernels:  1960.3 ms
    Elapsed time for single kernels:  1536.9 ms
    Elapsed time for separate kernels with multiple random number generation:  1576.0 ms
    Elapsed time for single kernels for multiple random number generation:  4612.2 ms
    

    Output on GTX570:

    Elapsed time for separate kernels:  957.2 ms 
    Elapsed time for single kernels:  947.7 ms 
    Elapsed time for separate kernels with multiple random number generation:  964.6 ms 
    Elapsed time for single kernels for multiple random number generation:  2839.0 ms 
    

    Approximately same performance as the K20c.

提交回复
热议问题