What is the most efficient way to transpose a matrix in CUDA?

后端 未结 3 1686
予麋鹿
予麋鹿 2021-01-06 16:27

I have a M*N host memory matrix, and upon copying into a device memory, I need it to be transposed into a N*M matrix. Is there any cuda (cuBLAS...)

相关标签:
3条回答
  • 2021-01-06 17:10

    To answer your question on efficiency, I have compared two ways to perform matrix transposition, one using the Thrust library and one using cublas<t>geam, as suggested by Robert Crovella. The result of the comparison is the following on a Kepler K20c card:

    | Matrix size   | Thrust [ms]   | cuBLAS [ms]   |
    |               |               |               |
    | 32x32         | 0.015         | 0.016         |
    | 64x64         | 0.015         | 0.017         |
    | 128x128       | 0.019         | 0.017         |
    | 256x256       | 0.028         | 0.017         |
    | 512x512       | 0.088         | 0.042         |
    | 1024x1024     | 0.34          | 0.13          |
    | 2048x2048     | 1.24          | 0.48          |
    | 4096x4096     | 11.02         | 1.98          |
    

    As it can be seen, the cublas<t>geam outperforms the version using Thrust. Below is the code to perform the comparison.

    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/functional.h>
    #include <thrust/gather.h>
    #include <thrust/scan.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <iostream>
    #include <iomanip>
    #include <cublas_v2.h>
    #include <conio.h>
    #include <assert.h>
    
    /**********************/
    /* cuBLAS ERROR CHECK */
    /**********************/
    #ifndef cublasSafeCall
    #define cublasSafeCall(err)     __cublasSafeCall(err, __FILE__, __LINE__)
    #endif
    
    inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line)
    {
        if( CUBLAS_STATUS_SUCCESS != err) {
            fprintf(stderr, "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n",__FILE__, __LINE__,err); 
            getch(); cudaDeviceReset(); assert(0); 
        }
    }
    
    // convert a linear index to a linear index in the transpose 
    struct transpose_index : public thrust::unary_function<size_t,size_t>
    {
        size_t m, n;
    
        __host__ __device__
        transpose_index(size_t _m, size_t _n) : m(_m), n(_n) {}
    
        __host__ __device__
        size_t operator()(size_t linear_index)
        {
            size_t i = linear_index / n;
            size_t j = linear_index % n;
    
            return m * j + i;
        }
    };
    
    // convert a linear index to a row index
    struct row_index : public thrust::unary_function<size_t,size_t>
    {
        size_t n;
    
        __host__ __device__
        row_index(size_t _n) : n(_n) {}
    
        __host__ __device__
    
        size_t operator()(size_t i)
        {
            return i / n;
        }
    };
    
    // transpose an M-by-N array
    template <typename T>
    void transpose(size_t m, size_t n, thrust::device_vector<T>& src, thrust::device_vector<T>& dst)
    {
        thrust::counting_iterator<size_t> indices(0);
    
        thrust::gather
        (thrust::make_transform_iterator(indices, transpose_index(n, m)),
        thrust::make_transform_iterator(indices, transpose_index(n, m)) + dst.size(),
        src.begin(),dst.begin());
    }
    
    // print an M-by-N array
    template <typename T>
    void print(size_t m, size_t n, thrust::device_vector<T>& d_data)
    {
        thrust::host_vector<T> h_data = d_data;
    
        for(size_t i = 0; i < m; i++)
        {
            for(size_t j = 0; j < n; j++)
                std::cout << std::setw(8) << h_data[i * n + j] << " ";
                std::cout << "\n";
        }
    }
    
    int main(void)
    {
        size_t m = 5; // number of rows
        size_t n = 4; // number of columns
    
        // 2d array stored in row-major order [(0,0), (0,1), (0,2) ... ]
        thrust::device_vector<double> data(m * n, 1.);
        data[1] = 2.;
        data[3] = 3.;
    
        std::cout << "Initial array" << std::endl;
        print(m, n, data);
    
        std::cout << "Transpose array - Thrust" << std::endl;
        thrust::device_vector<double> transposed_thrust(m * n);
        transpose(m, n, data, transposed_thrust);
        print(n, m, transposed_thrust);
    
        std::cout << "Transpose array - cuBLAS" << std::endl;
        thrust::device_vector<double> transposed_cuBLAS(m * n);
        double* dv_ptr_in  = thrust::raw_pointer_cast(data.data());
        double* dv_ptr_out = thrust::raw_pointer_cast(transposed_cuBLAS.data());
        double alpha = 1.;
        double beta  = 0.;
        cublasHandle_t handle;
        cublasSafeCall(cublasCreate(&handle));
        cublasSafeCall(cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha, dv_ptr_in, n, &beta, dv_ptr_in, n, dv_ptr_out, m)); 
        print(n, m, transposed_cuBLAS);
    
        getch();
    
        return 0;
    }
    
    0 讨论(0)
  • 2021-01-06 17:16

    CULA has auxiliary routines to compute the transpose (culaDevice?geTranspose). In case of a square matrix you could also use inplace transposition (culaDevise?geTransposeInplace).

    Note: CULA has a free license available, if you meet certain conditions.

    0 讨论(0)
  • 2021-01-06 17:30

    In the cublas API:

    cublas<t>geam()
    
    This function performs the matrix-matrix addition/transposition
    the user can transpose matrix A by setting *alpha=1 and *beta=0.  
    

    (and specifying the transa operator as CUBLAS_OP_T for transpose)

    0 讨论(0)
提交回复
热议问题