Using cuBLAS with complex numbers from Thrust

后端 未结 1 1004
醉梦人生
醉梦人生 2021-01-17 05:05

In my code I use arrays with complex numbers from thrust library and I would like to use cublasZgeam() in order to transpose the array.

Using complex numbers from c

相关标签:
1条回答
  • 2021-01-17 05:41

    Despite your protestations to the contrary, the C++ standard library complex (or thrust::complex) most certainly does work with CUBLAS. The cuComplex and cuDoubleComplex are design to be binary compatible with standard host complex types so that data does not be translated when passed to CUBLAS functions which use complex data on the device.

    A simple modification to the code you posted in comments works exactly as you might imagine:

    #include <algorithm>
    #include <iostream>
    #include <complex>
    #include <cublas_v2.h>
    
    using namespace std;
    
    int main()
    {
      int xmax = 100;
      complex<double>  u[xmax][xmax];
      double arrSize = sizeof(complex<double>) * xmax * xmax;
    
      fill(&u[0][0], &u[0][0] + (xmax * xmax), complex<double>(1.0,1.0));
      u[49][51] += complex<double>(665.0,665.0);
      u[51][49] *= 2.0;
    
      cout << "Before:" << endl;
      cout << u[49][51] << endl;
      cout << u[51][49] << endl;
    
      complex<double> alpha(1.0, 0.0);
      complex<double> beta(0.0, 0.0);
      cublasHandle_t handle;
      cublasCreate(&handle);
    
      cuDoubleComplex* d_u;
      cuDoubleComplex* d_v;
      cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
      cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
      cudaMalloc(&d_u, arrSize);
      cudaMalloc(&d_v, arrSize);
      cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
      cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                      _alpha, d_u, xmax,
                      _beta,  d_u, xmax,
                      d_v, xmax);
    
      cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
    
      cout << "After:" << endl;
      cout << u[49][51] << endl;
      cout << u[51][49] << endl;
    
      return 0;
    }
    

    built and run like so:

    ~/SO$ nvcc -std=c++11 -arch=sm_52 -o complex_transpose complex_transpose.cu -lcublas
    ~/SO$ ./complex_transpose 
    Before:
    (666,666)
    (2,2)
    After:
    (2,2)
    (666,666)
    

    The only modifications required are explicit casts of the std::complex<double> types to cuDoubleComplex. Do that and everything works as expected.

    Use thrust, the code looks almost identical:

    #include <iostream>
    #include <thrust/fill.h>
    #include <thrust/complex.h>
    #include <cublas_v2.h>
    
    using namespace std;
    
    int main()
    {
      int xmax = 100;
      thrust::complex<double>  u[xmax][xmax];
      double arrSize = sizeof(thrust::complex<double>) * xmax * xmax;
    
      thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0));
      u[49][51] += thrust::complex<double>(665.0,665.0);
      u[51][49] *= 2.0;
    
      cout << "Before:" << endl;
      cout << u[49][51] << endl;
      cout << u[51][49] << endl;
    
      thrust::complex<double> alpha(1.0, 0.0);
      thrust::complex<double> beta(0.0, 0.0);
      cublasHandle_t handle;
      cublasCreate(&handle);
    
      cuDoubleComplex* d_u;
      cuDoubleComplex* d_v;
      cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
      cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
      cudaMalloc(&d_u, arrSize);
      cudaMalloc(&d_v, arrSize);
      cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
      cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                      _alpha, d_u, xmax,
                      _beta,  d_u, xmax,
                      d_v, xmax);
    
      cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
    
      cout << "After:" << endl;
      cout << u[49][51] << endl;
      cout << u[51][49] << endl;
    
      return 0;
    }
    

    Perhaps something closer to your use case, using thrust device containers with a kernel performing some initialisation prior to a CUBLAS call:

    #include <iostream>
    #include <thrust/device_vector.h>
    #include <thrust/complex.h>
    #include <thrust/execution_policy.h>
    #include <thrust/copy.h>
    #include <cublas_v2.h>
    
    __global__ void setup_kernel(thrust::complex<double>* u, int xmax)
    {
      u[51 + 49*xmax] += thrust::complex<double>(665.0,665.0);
      u[49 + 51*xmax] *= 2.0;
    }
    
    int main()
    {
      int xmax = 100;
    
      thrust::complex<double> alpha(1.0, 0.0);
      thrust::complex<double> beta(0.0, 0.0);
      cublasHandle_t handle;
      cublasCreate(&handle);
    
      thrust::device_vector<thrust::complex<double>> d_u(xmax * xmax, thrust::complex<double>(1.0,1.0));
      thrust::device_vector<thrust::complex<double>> d_v(xmax * xmax, thrust::complex<double>(0.,0.));
      setup_kernel<<<1,1>>>(thrust::raw_pointer_cast(d_u.data()), xmax);
    
      cuDoubleComplex* _d_u = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_u.data()));
      cuDoubleComplex* _d_v = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_v.data()));
      cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
      cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
    
      cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                      _alpha, _d_u, xmax,
                      _beta, _d_u, xmax,
                      _d_v, xmax);
    
      thrust::complex<double>  u[xmax][xmax];
    
      thrust::copy(d_u.begin(), d_u.end(), &u[0][0]); 
      std::cout << "Before:" << std::endl;
      std::cout << u[49][51] << std::endl;
      std::cout << u[51][49] << std::endl;
    
      thrust::copy(d_v.begin(), d_v.end(), &u[0][0]); 
      std::cout << "After:" << std::endl;
      std::cout << u[49][51] << std::endl;
      std::cout << u[51][49] << std::endl;
    
      return 0;
    
    }
    
    0 讨论(0)
提交回复
热议问题