Using cuBLAS with complex numbers from Thrust

后端 未结 1 1005
醉梦人生
醉梦人生 2021-01-17 05:05

In my code I use arrays of complex numbers from the Thrust library, and I would like to use cublasZgeam() to transpose the array.

Using complex numbers from c

1条回答
  •  爱一瞬间的悲伤
    2021-01-17 05:41

    Despite your protestations to the contrary, the C++ standard library complex (or thrust::complex) most certainly does work with CUBLAS. The cuComplex and cuDoubleComplex types are designed to be binary compatible with the standard host complex types, so that data does not need to be translated when it is passed to CUBLAS functions which use complex data on the device.

    A simple modification to the code you posted in comments works exactly as you might imagine:

    #include <algorithm>
    #include <complex>
    #include <cstdio>
    #include <cstdlib>
    #include <iostream>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    
    // Abort with a readable message if a CUDA runtime call did not succeed.
    static void checkCuda(cudaError_t status, const char* what)
    {
      if (status != cudaSuccess) {
        std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
      }
    }
    
    // Abort with a readable message if a cuBLAS call did not succeed.
    static void checkCublas(cublasStatus_t status, const char* what)
    {
      if (status != CUBLAS_STATUS_SUCCESS) {
        std::fprintf(stderr, "%s failed: cuBLAS status %d\n", what, static_cast<int>(status));
        std::exit(EXIT_FAILURE);
      }
    }
    
    // Transposes a 100x100 matrix of std::complex<double> on the GPU with
    // cublasZgeam and prints two mirrored elements before and after the call.
    int main()
    {
      typedef std::complex<double> cplx;
    
      // The reinterpret_casts below rely on both types having the same size.
      static_assert(sizeof(cplx) == sizeof(cuDoubleComplex),
                    "std::complex<double> and cuDoubleComplex must have the same size");
    
      const int xmax = 100;  // const so that u is a standard fixed-size array
      cplx u[xmax][xmax];
      const size_t arrSize = sizeof(cplx) * xmax * xmax;  // byte count, not a double
    
      std::fill(&u[0][0], &u[0][0] + (xmax * xmax), cplx(1.0, 1.0));
      u[49][51] += cplx(665.0, 665.0);
      u[51][49] *= 2.0;
    
      std::cout << "Before:" << std::endl;
      std::cout << u[49][51] << std::endl;
      std::cout << u[51][49] << std::endl;
    
      // Scalars stay on the host; they are passed to cuBLAS by pointer.
      const cplx alpha(1.0, 0.0);
      const cplx beta(0.0, 0.0);
      const cuDoubleComplex* _alpha = reinterpret_cast<const cuDoubleComplex*>(&alpha);
      const cuDoubleComplex* _beta = reinterpret_cast<const cuDoubleComplex*>(&beta);
    
      cublasHandle_t handle;
      checkCublas(cublasCreate(&handle), "cublasCreate");
    
      cuDoubleComplex* d_u = NULL;
      cuDoubleComplex* d_v = NULL;
      checkCuda(cudaMalloc(&d_u, arrSize), "cudaMalloc(d_u)");
      checkCuda(cudaMalloc(&d_v, arrSize), "cudaMalloc(d_v)");
      checkCuda(cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice),
                "cudaMemcpy host->device");
    
      // d_v = alpha * transpose(d_u) + beta * d_u, with beta == 0.
      checkCublas(cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                              _alpha, d_u, xmax,
                              _beta,  d_u, xmax,
                              d_v, xmax),
                  "cublasZgeam");
    
      // The blocking copy also surfaces any error raised while the GEAM ran.
      checkCuda(cudaMemcpy(&u[0][0], d_v, arrSize, cudaMemcpyDeviceToHost),
                "cudaMemcpy device->host");
    
      std::cout << "After:" << std::endl;
      std::cout << u[49][51] << std::endl;
      std::cout << u[51][49] << std::endl;
    
      // Release device buffers and the cuBLAS handle.
      checkCuda(cudaFree(d_u), "cudaFree(d_u)");
      checkCuda(cudaFree(d_v), "cudaFree(d_v)");
      checkCublas(cublasDestroy(handle), "cublasDestroy");
    
      return 0;
    }
    

    built and run like so:

    ~/SO$ nvcc -std=c++11 -arch=sm_52 -o complex_transpose complex_transpose.cu -lcublas
    ~/SO$ ./complex_transpose 
    Before:
    (666,666)
    (2,2)
    After:
    (2,2)
    (666,666)
    

    The only modifications required are explicit casts of the std::complex types to cuDoubleComplex. Do that and everything works as expected.

    Using Thrust, the code looks almost identical:

    #include <cstdio>
    #include <cstdlib>
    #include <iostream>
    #include <cuda_runtime.h>
    #include <thrust/complex.h>
    #include <thrust/fill.h>
    #include <cublas_v2.h>
    
    // Abort with a readable message if a CUDA runtime call did not succeed.
    static void checkCuda(cudaError_t status, const char* what)
    {
      if (status != cudaSuccess) {
        std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
      }
    }
    
    // Abort with a readable message if a cuBLAS call did not succeed.
    static void checkCublas(cublasStatus_t status, const char* what)
    {
      if (status != CUBLAS_STATUS_SUCCESS) {
        std::fprintf(stderr, "%s failed: cuBLAS status %d\n", what, static_cast<int>(status));
        std::exit(EXIT_FAILURE);
      }
    }
    
    // Transposes a 100x100 host matrix of thrust::complex<double> on the GPU
    // with cublasZgeam and prints two mirrored elements before and after.
    int main()
    {
      typedef thrust::complex<double> cplx;
    
      // The reinterpret_casts below rely on both types having the same size.
      static_assert(sizeof(cplx) == sizeof(cuDoubleComplex),
                    "thrust::complex<double> and cuDoubleComplex must have the same size");
    
      const int xmax = 100;  // const so that u is a standard fixed-size array
      cplx u[xmax][xmax];
      const size_t arrSize = sizeof(cplx) * xmax * xmax;  // byte count, not a double
    
      thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), cplx(1.0, 1.0));
      u[49][51] += cplx(665.0, 665.0);
      u[51][49] *= 2.0;
    
      std::cout << "Before:" << std::endl;
      std::cout << u[49][51] << std::endl;
      std::cout << u[51][49] << std::endl;
    
      // Scalars stay on the host; they are passed to cuBLAS by pointer.
      cplx alpha(1.0, 0.0);
      cplx beta(0.0, 0.0);
      const cuDoubleComplex* _alpha = reinterpret_cast<const cuDoubleComplex*>(&alpha);
      const cuDoubleComplex* _beta = reinterpret_cast<const cuDoubleComplex*>(&beta);
    
      cublasHandle_t handle;
      checkCublas(cublasCreate(&handle), "cublasCreate");
    
      cuDoubleComplex* d_u = NULL;
      cuDoubleComplex* d_v = NULL;
      checkCuda(cudaMalloc(&d_u, arrSize), "cudaMalloc(d_u)");
      checkCuda(cudaMalloc(&d_v, arrSize), "cudaMalloc(d_v)");
      checkCuda(cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice),
                "cudaMemcpy host->device");
    
      // d_v = alpha * transpose(d_u) + beta * d_u, with beta == 0.
      checkCublas(cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                              _alpha, d_u, xmax,
                              _beta,  d_u, xmax,
                              d_v, xmax),
                  "cublasZgeam");
    
      // The blocking copy also surfaces any error raised while the GEAM ran.
      checkCuda(cudaMemcpy(&u[0][0], d_v, arrSize, cudaMemcpyDeviceToHost),
                "cudaMemcpy device->host");
    
      std::cout << "After:" << std::endl;
      std::cout << u[49][51] << std::endl;
      std::cout << u[51][49] << std::endl;
    
      // Release device buffers and the cuBLAS handle.
      checkCuda(cudaFree(d_u), "cudaFree(d_u)");
      checkCuda(cudaFree(d_v), "cudaFree(d_v)");
      checkCublas(cublasDestroy(handle), "cublasDestroy");
    
      return 0;
    }
    

    Perhaps something closer to your use case, using thrust device containers with a kernel performing some initialisation prior to a CUBLAS call:

    #include <cstdio>
    #include <cstdlib>
    #include <iostream>
    #include <cuda_runtime.h>
    #include <thrust/complex.h>
    #include <thrust/copy.h>
    #include <thrust/device_ptr.h>
    #include <thrust/device_vector.h>
    #include <cublas_v2.h>
    
    // Abort with a readable message if a CUDA runtime call did not succeed.
    static void checkCuda(cudaError_t status, const char* what)
    {
      if (status != cudaSuccess) {
        std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
      }
    }
    
    // Abort with a readable message if a cuBLAS call did not succeed.
    static void checkCublas(cublasStatus_t status, const char* what)
    {
      if (status != CUBLAS_STATUS_SUCCESS) {
        std::fprintf(stderr, "%s failed: cuBLAS status %d\n", what, static_cast<int>(status));
        std::exit(EXIT_FAILURE);
      }
    }
    
    // Perturbs two mirrored elements of an xmax-by-xmax matrix stored in a flat
    // array: element 51 + 49*xmax gains (665,665), element 49 + 51*xmax doubles.
    // Launch layout: a single thread (<<<1,1>>>); every launched thread would
    // perform the same unguarded update. Precondition: u holds at least
    // 49 + 51*xmax + 1 elements.
    __global__ void setup_kernel(thrust::complex<double>* u, int xmax)
    {
      u[51 + 49*xmax] += thrust::complex<double>(665.0,665.0);
      u[49 + 51*xmax] *= 2.0;
    }
    
    // Initialises a matrix in a thrust::device_vector, modifies it in a kernel,
    // transposes it with cublasZgeam and prints two mirrored elements of the
    // input ("Before") and of the transposed output ("After").
    int main()
    {
      typedef thrust::complex<double> cplx;
    
      // The reinterpret_casts below rely on both types having the same size.
      static_assert(sizeof(cplx) == sizeof(cuDoubleComplex),
                    "thrust::complex<double> and cuDoubleComplex must have the same size");
    
      const int xmax = 100;  // const so that u is a standard fixed-size array
    
      // Scalars stay on the host; they are passed to cuBLAS by pointer.
      cplx alpha(1.0, 0.0);
      cplx beta(0.0, 0.0);
      cublasHandle_t handle;
      checkCublas(cublasCreate(&handle), "cublasCreate");
    
      thrust::device_vector<cplx> d_u(xmax * xmax, cplx(1.0,1.0));
      thrust::device_vector<cplx> d_v(xmax * xmax, cplx(0.,0.));
      setup_kernel<<<1,1>>>(thrust::raw_pointer_cast(d_u.data()), xmax);
      checkCuda(cudaGetLastError(), "setup_kernel launch");  // launch errors are not returned directly
    
      cuDoubleComplex* _d_u = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_u.data()));
      cuDoubleComplex* _d_v = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_v.data()));
      const cuDoubleComplex* _alpha = reinterpret_cast<const cuDoubleComplex*>(&alpha);
      const cuDoubleComplex* _beta = reinterpret_cast<const cuDoubleComplex*>(&beta);
    
      // d_v = alpha * transpose(d_u) + beta * d_u, with beta == 0.
      checkCublas(cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                              _alpha, _d_u, xmax,
                              _beta, _d_u, xmax,
                              _d_v, xmax),
                  "cublasZgeam");
    
      // Surface any asynchronous kernel/GEAM failure before reading results.
      checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    
      cplx u[xmax][xmax];
    
      // d_u is only read by the GEAM, so it still holds the pre-transpose data.
      thrust::copy(d_u.begin(), d_u.end(), &u[0][0]);
      std::cout << "Before:" << std::endl;
      std::cout << u[49][51] << std::endl;
      std::cout << u[51][49] << std::endl;
    
      thrust::copy(d_v.begin(), d_v.end(), &u[0][0]);
      std::cout << "After:" << std::endl;
      std::cout << u[49][51] << std::endl;
      std::cout << u[51][49] << std::endl;
    
      // The device_vectors release their own storage; the handle does not.
      checkCublas(cublasDestroy(handle), "cublasDestroy");
    
      return 0;
    
    }
    

    0 讨论(0)
提交回复
热议问题