How could we generate random numbers in CUDA C with different seed on each run?

前端 未结 3 618
情深已故
情深已故 2021-02-10 01:16

I am working on a stochastic process and I want to generate a different series of random numbers in the CUDA kernel each time I run the program. This is similar to what we do in C++.

相关标签:
3条回答
  • 2021-02-10 01:56

    You can create more than one global function for random number initialization and generation. or create a loop to go over the global function example: for (int rns = 0; rns < 5; rns++) { // too seed 'loop' times

        init << < N, 10 >> > (devState, time(0));
        gpuErrchk(cudaMalloc((void**)&gpu_no, N * sizeof(double))); // allocate memory for random numbers on device/GPU
    //rndn << < N, 10 >> > (devState, gpu_no);//invoke kernel to launch the random numbers
        gpuErrchk(cudaMemcpy(cpu_no, gpu_no, N * sizeof(double), cudaMemcpyDeviceToHost))
    } cout << "the transition matrix  " << ++generate << " seed generation is:  " << init << endl;
    

    This does not have any noticeable effect on the random numbers generated, but there is a risk of correlation between sequences and a lack of convergence in the long run. Why would you want to seed more than once in an iteration anyway? You can use the library functions to generate different types of random number distributions, such as curand_uniform, curand_normal, curand_poisson, and so on.

    I don't know if this answers your question.

    0 讨论(0)
  • 2021-02-10 01:59

    Using a different seed on each run should be straightforward. The exact method will depend on which generator you are using, but if you're using one of the cuRAND generators then you can cast your time_t from time(NULL) to a 64-bit integer and pass it in to the seed functions.

    If you're calling the generator from your kernel you will need to pass this seed in either as a kernel argument or via a __device__ variable. You can then use an offset to curand_init() or use skip_ahead() to get distinct sub-sequences.

    If you have a specific generator for which this will not work, please post more information.

    0 讨论(0)
  • 2021-02-10 02:11

    You don't need to pass an array of random seeds, but, when you use the cuRAND library, you can properly set the sequence number parameter of curand_init. For example [Disclaimer: it is a non-tested function]

    // Fills numbers[0..Np) with uniform random floats, one thread per element.
    // Each thread initializes its own curandState using the global thread id as
    // the cuRAND sequence number, so all threads draw from statistically
    // independent sub-sequences of the same seeded generator.
    __global__ void generate_random_numbers(float* numbers, unsigned long seed, int Np) {

        const int tid = blockDim.x * blockIdx.x + threadIdx.x;

        // Guard the grid tail: the launch may create more threads than Np.
        if (tid >= Np) return;

        curandState localState;
        curand_init(seed, tid, 0, &localState);
        numbers[tid] = curand_uniform(&localState);
    }
    

    You can also avoid passing the seed from outside if you change the curand_init instruction to

    curand_init(clock64(), i, 0, &state);
    

    EDIT

    Following Roger Dahl's comment, I have done a comparison (Kepler K20c) between four different possibilities for the generation of arrays of 131072 elements:

    1. Single random number generation: separate kernels for initialization and random number generation;
    2. Single random number generation: unique kernel for initialization and random number generation;
    3. Multiple random number generation: separate kernels for initialization and random number generation;
    4. Multiple random number generation: unique kernel for initialization and random number generation;

    Below is the code. The timing for generating has been the following:

    1. 861ms;
    2. 852ms;
    3. 866ms;
    4. 2556ms;

    I hope I have correctly understood the performance issue raised by Roger Dahl.

    #include <stdio.h>
    #include <curand.h>
    #include <curand_kernel.h>
    
    #define DSIZE 8192*16
    #define nTPB 256
    
    /***********************/
    /* CUDA ERROR CHECKING */
    /***********************/
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    // Prints a CUDA error with its source location and (by default) exits.
    // 'file' must be const char*: __FILE__ expands to a string literal, and
    // binding a string literal to a non-const char* is ill-formed in C++11+.
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
    {
        if (code != cudaSuccess) 
        {
            fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
            if (abort) exit(code);
        }
    }
    
    /*************************/
    /* CURAND INITIALIZATION */
    /*************************/
    // Seeds one curandState per thread, using the global thread id as the
    // cuRAND sequence number so every thread owns an independent sub-sequence.
    // NOTE(review): no bounds guard -- assumes the grid exactly covers the
    // state array, which holds for the launches in this file (DSIZE % nTPB == 0).
    __global__ void initCurand(curandState *state, unsigned long seed){
        const int gid = blockDim.x * blockIdx.x + threadIdx.x;
        curand_init(seed, gid, 0, &state[gid]);
    }
    
    // Draws one uniform float per thread from its previously initialized state.
    // NOTE(review): no bounds guard -- assumes the grid exactly covers 'a',
    // which holds for the launches in this file (DSIZE % nTPB == 0).
    __global__ void testrand1(curandState *state, float *a){
        const int gid = blockDim.x * blockIdx.x + threadIdx.x;
        curandState *myState = state + gid;
        a[gid] = curand_uniform(myState);
    }
    
    // Fused variant: initializes a fresh local state and draws in one kernel.
    // Re-running curand_init on every call is what makes repeated invocations
    // of this kernel expensive compared to the split init/draw approach.
    __global__ void testrand2(unsigned long seed, float *a){
        const int gid = blockDim.x * blockIdx.x + threadIdx.x;
        curandState localState;
        curand_init(seed, gid, 0, &localState);
        a[gid] = curand_uniform(&localState);
    }
    
    /********/
    /* MAIN */
    /********/
    // Benchmarks four strategies for generating DSIZE uniform random floats:
    //   1. separate init/draw kernels, one draw per iteration;
    //   2. fused init+draw kernel, one draw per iteration;
    //   3. separate kernels, one init then three draws per iteration;
    //   4. fused kernel invoked three times per iteration (re-inits each time).
    // Fixes over the original: event-API return codes are now checked, device
    // memory and events are released before exit, and the triple-duplicated
    // launch sequences are collapsed into loops.
    int main() {

        int n_iter = 20;

        // Per-thread generator states (separate-kernel variants) and output array.
        curandState *devState;  gpuErrchk(cudaMalloc((void**)&devState, DSIZE*sizeof(curandState)));

        float *d_a;             gpuErrchk(cudaMalloc((void**)&d_a, DSIZE*sizeof(float)));

        float time;
        cudaEvent_t start, stop;

        // Event API calls return cudaError_t too -- check them like any other call.
        gpuErrchk(cudaEventCreate(&start));
        gpuErrchk(cudaEventCreate(&stop));

        /* 1. Separate init + draw kernels, single draw per iteration. */
        gpuErrchk(cudaEventRecord(start, 0));

        for (int i=0; i<n_iter; i++) {

            // Fixed seed (1): this benchmark measures timing, not randomness.
            initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

            testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

        }

        gpuErrchk(cudaEventRecord(stop, 0));
        gpuErrchk(cudaEventSynchronize(stop));
        gpuErrchk(cudaEventElapsedTime(&time, start, stop));
        printf("Elapsed time for separate kernels:  %3.1f ms \n", time);

        /* 2. Fused init+draw kernel, single draw per iteration. */
        gpuErrchk(cudaEventRecord(start, 0));

        for (int i=0; i<n_iter; i++) {

            testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

        }

        gpuErrchk(cudaEventRecord(stop, 0));
        gpuErrchk(cudaEventSynchronize(stop));
        gpuErrchk(cudaEventElapsedTime(&time, start, stop));
        printf("Elapsed time for single kernels:  %3.1f ms \n", time);

        /* 3. Separate kernels: one init, then three draws per iteration. */
        gpuErrchk(cudaEventRecord(start, 0));

        for (int i=0; i<n_iter; i++) {

            initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());

            for (int j=0; j<3; j++) {
                testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
                gpuErrchk(cudaPeekAtLastError());
                gpuErrchk(cudaDeviceSynchronize());
            }

        }

        gpuErrchk(cudaEventRecord(stop, 0));
        gpuErrchk(cudaEventSynchronize(stop));
        gpuErrchk(cudaEventElapsedTime(&time, start, stop));
        printf("Elapsed time for separate kernels with multiple random number generation:  %3.1f ms \n", time);

        /* 4. Fused kernel three times per iteration -- each call repeats the
           expensive curand_init, which explains the ~3x slowdown observed. */
        gpuErrchk(cudaEventRecord(start, 0));

        for (int i=0; i<n_iter; i++) {

            for (int j=0; j<3; j++) {
                testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
                gpuErrchk(cudaPeekAtLastError());
                gpuErrchk(cudaDeviceSynchronize());
            }

        }

        gpuErrchk(cudaEventRecord(stop, 0));
        gpuErrchk(cudaEventSynchronize(stop));
        gpuErrchk(cudaEventElapsedTime(&time, start, stop));
        printf("Elapsed time for single kernels for multiple random number generation:  %3.1f ms \n", time);

        // Release device resources before exiting (the original leaked these).
        gpuErrchk(cudaEventDestroy(start));
        gpuErrchk(cudaEventDestroy(stop));
        gpuErrchk(cudaFree(devState));
        gpuErrchk(cudaFree(d_a));

        getchar();

        return 0;
    }
    

    Output on GTX660:

    Elapsed time for separate kernels:  1960.3 ms
    Elapsed time for single kernels:  1536.9 ms
    Elapsed time for separate kernels with multiple random number generation:  1576.0 ms
    Elapsed time for single kernels for multiple random number generation:  4612.2 ms
    

    Output on GTX570:

    Elapsed time for separate kernels:  957.2 ms 
    Elapsed time for single kernels:  947.7 ms 
    Elapsed time for separate kernels with multiple random number generation:  964.6 ms 
    Elapsed time for single kernels for multiple random number generation:  2839.0 ms 
    

    Approximately same performance as the K20c.

    0 讨论(0)
提交回复
热议问题