Getting CUDA Thrust to use a CUDA stream of your choice

前端 未结 2 1150
面向向阳花
面向向阳花 2021-01-01 03:50

Looking at kernel launches within the code of CUDA Thrust, it seems they always use the default stream. Can I make Thrust use a stream of my choice? Am I missing something i

相关标签:
2条回答
  • 2021-01-01 04:24

    No you are not missing anything (at least up to the release snapshot which ships with CUDA 6.0).

    The original Thrust tag based dispatch system deliberately abstracts all of the underlying CUDA API calls away, sacrificing some performance for ease of use and consistency (keep in mind that thrust has backends other than CUDA). If you want that level of flexibility, you will need to try another library (CUB, for example).

    In versions since the CUDA 7.0 snapshot it has become possible to set a stream of choice for thrust operations via the execution policy and dispatch feature.

    0 讨论(0)
  • 2021-01-01 04:41

    I want to update the answer provided by talonmies following the release of Thrust 1.8 which introduces the possibility of indicating the CUDA execution stream as

    thrust::cuda::par.on(stream)
    

    see also

    Thrust Release 1.8.0.

    In the following, I'm recasting the example in

    False dependency issue for the Fermi architecture

    in terms of CUDA Thrust APIs.

    #include <iostream>
    
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    #include <stdio.h>
    
    #include <thrust\device_vector.h>
    #include <thrust\execution_policy.h>
    
    #include "Utilities.cuh"
    
    using namespace std;
    
    #define NUM_THREADS 32
    #define NUM_BLOCKS 16
    #define NUM_STREAMS 3
    
    struct BinaryOp{ __host__ __device__ int operator()(const int& o1,const int& o2) { return o1 * o2; } };
    
    int main()
    {
        const int N = 6000000;
    
        // --- Host side input data allocation and initialization. Registering host memory as page-locked (required for asynch cudaMemcpyAsync).
        int *h_in = new int[N]; for(int i = 0; i < N; i++) h_in[i] = 5;
        gpuErrchk(cudaHostRegister(h_in, N * sizeof(int), cudaHostRegisterPortable));
    
        // --- Host side input data allocation and initialization. Registering host memory as page-locked (required for asynch cudaMemcpyAsync).
        int *h_out = new int[N]; for(int i = 0; i < N; i++) h_out[i] = 0;
        gpuErrchk(cudaHostRegister(h_out, N * sizeof(int), cudaHostRegisterPortable));
    
        // --- Host side check results vector allocation and initialization
        int *h_checkResults = new int[N]; for(int i = 0; i < N; i++) h_checkResults[i] = h_in[i] * h_in[i];
    
        // --- Device side input data allocation.
        int *d_in = 0;              gpuErrchk(cudaMalloc((void **)&d_in, N * sizeof(int)));
    
        // --- Device side output data allocation. 
        int *d_out = 0;             gpuErrchk( cudaMalloc((void **)&d_out, N * sizeof(int)));
    
        int streamSize = N / NUM_STREAMS;
        size_t streamMemSize = N * sizeof(int) / NUM_STREAMS;
    
        // --- Set kernel launch configuration
        dim3 nThreads       = dim3(NUM_THREADS,1,1);
        dim3 nBlocks        = dim3(NUM_BLOCKS, 1,1);
        dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));
    
        // --- Create CUDA streams
        cudaStream_t streams[NUM_STREAMS];
        for(int i = 0; i < NUM_STREAMS; i++)
            gpuErrchk(cudaStreamCreate(&streams[i]));
    
        /**************************/
        /* BREADTH-FIRST APPROACH */
        /**************************/
    
        for(int i = 0; i < NUM_STREAMS; i++) {
            int offset = i * streamSize;
            cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice,     streams[i]);
        }
    
        for(int i = 0; i < NUM_STREAMS; i++)
        {
            int offset = i * streamSize;
    
            thrust::transform(thrust::cuda::par.on(streams[i]), thrust::device_pointer_cast(&d_in[offset]), thrust::device_pointer_cast(&d_in[offset]) + streamSize/2, 
                                                                thrust::device_pointer_cast(&d_in[offset]), thrust::device_pointer_cast(&d_out[offset]), BinaryOp());
            thrust::transform(thrust::cuda::par.on(streams[i]), thrust::device_pointer_cast(&d_in[offset + streamSize/2]), thrust::device_pointer_cast(&d_in[offset + streamSize/2]) + streamSize/2, 
                                                                thrust::device_pointer_cast(&d_in[offset + streamSize/2]), thrust::device_pointer_cast(&d_out[offset + streamSize/2]), BinaryOp());
    
        }
    
        for(int i = 0; i < NUM_STREAMS; i++) {
            int offset = i * streamSize;
            cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost,   streams[i]);
        }
    
        for(int i = 0; i < NUM_STREAMS; i++)
            gpuErrchk(cudaStreamSynchronize(streams[i]));
    
        gpuErrchk(cudaDeviceSynchronize());
    
        // --- Release resources
        gpuErrchk(cudaHostUnregister(h_in));
        gpuErrchk(cudaHostUnregister(h_out));
        gpuErrchk(cudaFree(d_in));
        gpuErrchk(cudaFree(d_out));
    
        for(int i = 0; i < NUM_STREAMS; i++)
            gpuErrchk(cudaStreamDestroy(streams[i]));
    
        cudaDeviceReset();  
    
        // --- GPU output check
        int sum = 0;
        for(int i = 0; i < N; i++) {     
            //printf("%i %i\n", h_out[i], h_checkResults[i]);
            sum += h_checkResults[i] - h_out[i];
        }
    
        cout << "Error between CPU and GPU: " << sum << endl;
    
        delete[] h_in;
        delete[] h_out;
        delete[] h_checkResults;
    
        return 0;
    }
    

    The Utilities.cu and Utilities.cuh files needed to run such an example are maintained at this github page.

    The Visual Profiler timeline shows the concurrency of CUDA Thrust operations and memory transfers

    enter image description here

    0 讨论(0)
提交回复
热议问题