Using cuBLAS with complex numbers from Thrust

后端未结

关注

 1  1005

醉梦人生 2021-01-17 05:05

In my code I use arrays with complex numbers from thrust library and I would like to use cublasZgeam() in order to transpose the array.

Using complex numbers from c

1条回答

爱一瞬间的悲伤 (楼主)

2021-01-17 05:41

Despite your protestations to the contrary, the C++ standard library complex (or thrust::complex) most certainly does work with CUBLAS. The cuComplex and cuDoubleComplex types are designed to be binary compatible with the standard host complex types, so that data does not need to be translated when it is passed to CUBLAS functions which use complex data on the device.

A simple modification to the code you posted in comments works exactly as you might imagine:

#include <algorithm>
#include <complex>
#include <cstdlib>
#include <iostream>

#include <cublas_v2.h>
#include <cuda_runtime.h>

using namespace std;

// cuBLAS is handed the host complex values through cuDoubleComplex pointers, so
// the two types must at least agree in size for the reinterpret_casts below.
static_assert(sizeof(complex<double>) == sizeof(cuDoubleComplex),
              "std::complex<double> and cuDoubleComplex must have the same size");

// Abort with a readable message when a CUDA runtime call reports an error.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    cerr << what << " failed: " << cudaGetErrorString(status) << endl;
    exit(EXIT_FAILURE);
  }
}

// Abort with a readable message when a cuBLAS call reports an error.
static void checkCublas(cublasStatus_t status, const char* what)
{
  if (status != CUBLAS_STATUS_SUCCESS) {
    cerr << what << " failed with cuBLAS status " << static_cast<int>(status) << endl;
    exit(EXIT_FAILURE);
  }
}

// Fills an xmax-by-xmax matrix of std::complex<double> on the host, marks two
// mirrored elements, transposes the matrix on the device with cublasZgeam and
// prints the two marked elements before and after the transpose.
int main()
{
  // A compile-time extent keeps u a standard C++ array rather than a
  // variable-length array (a compiler extension).
  constexpr int xmax = 100;
  complex<double> u[xmax][xmax];
  // Byte counts belong in size_t; a double is the wrong type for a size.
  const size_t arrSize = sizeof(complex<double>) * xmax * xmax;

  fill(&u[0][0], &u[0][0] + (xmax * xmax), complex<double>(1.0, 1.0));
  u[49][51] += complex<double>(665.0, 665.0);
  u[51][49] *= 2.0;

  cout << "Before:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  // Scalars for C = alpha * op(A) + beta * op(B); they are passed as host pointers.
  complex<double> alpha(1.0, 0.0);
  complex<double> beta(0.0, 0.0);
  cublasHandle_t handle;
  checkCublas(cublasCreate(&handle), "cublasCreate");

  cuDoubleComplex* d_u = nullptr;
  cuDoubleComplex* d_v = nullptr;
  const cuDoubleComplex* _alpha = reinterpret_cast<const cuDoubleComplex*>(&alpha);
  const cuDoubleComplex* _beta = reinterpret_cast<const cuDoubleComplex*>(&beta);
  checkCuda(cudaMalloc(&d_u, arrSize), "cudaMalloc(d_u)");
  checkCuda(cudaMalloc(&d_v, arrSize), "cudaMalloc(d_v)");
  checkCuda(cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice),
            "cudaMemcpy host->device");

  // With beta == 0 the B operand contributes nothing, so the call writes the
  // transpose of d_u into the separate output buffer d_v.
  checkCublas(cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                          _alpha, d_u, xmax,
                          _beta,  d_u, xmax,
                          d_v, xmax),
              "cublasZgeam");

  checkCuda(cudaMemcpy(&u[0][0], d_v, arrSize, cudaMemcpyDeviceToHost),
            "cudaMemcpy device->host");

  cout << "After:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  // Release device buffers and the cuBLAS handle; the original listing leaked them.
  checkCuda(cudaFree(d_u), "cudaFree(d_u)");
  checkCuda(cudaFree(d_v), "cudaFree(d_v)");
  checkCublas(cublasDestroy(handle), "cublasDestroy");

  return 0;
}

built and run like so:

~/SO$ nvcc -std=c++11 -arch=sm_52 -o complex_transpose complex_transpose.cu -lcublas
~/SO$ ./complex_transpose 
Before:
(666,666)
(2,2)
After:
(2,2)
(666,666)

The only modifications required are explicit casts of the std::complex types to cuDoubleComplex. Do that and everything works as expected.

Using thrust, the code looks almost identical:

#include <cstdlib>
#include <iostream>

#include <thrust/complex.h>
#include <thrust/fill.h>

#include <cublas_v2.h>
#include <cuda_runtime.h>

using namespace std;

// cuBLAS is handed the thrust complex values through cuDoubleComplex pointers,
// so the two types must at least agree in size for the reinterpret_casts below.
static_assert(sizeof(thrust::complex<double>) == sizeof(cuDoubleComplex),
              "thrust::complex<double> and cuDoubleComplex must have the same size");

// Abort with a readable message when a CUDA runtime call reports an error.
static void checkCuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess) {
    cerr << what << " failed: " << cudaGetErrorString(status) << endl;
    exit(EXIT_FAILURE);
  }
}

// Abort with a readable message when a cuBLAS call reports an error.
static void checkCublas(cublasStatus_t status, const char* what)
{
  if (status != CUBLAS_STATUS_SUCCESS) {
    cerr << what << " failed with cuBLAS status " << static_cast<int>(status) << endl;
    exit(EXIT_FAILURE);
  }
}

// Same experiment as the std::complex version, but with thrust::complex<double>
// as the host element type: fill, mark two mirrored elements, transpose on the
// device with cublasZgeam, and print the marked elements before and after.
int main()
{
  // A compile-time extent keeps u a standard C++ array rather than a
  // variable-length array (a compiler extension).
  constexpr int xmax = 100;
  thrust::complex<double> u[xmax][xmax];
  // Byte counts belong in size_t; a double is the wrong type for a size.
  const size_t arrSize = sizeof(thrust::complex<double>) * xmax * xmax;

  thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0, 1.0));
  u[49][51] += thrust::complex<double>(665.0, 665.0);
  u[51][49] *= 2.0;

  cout << "Before:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  // Scalars for C = alpha * op(A) + beta * op(B); they are passed as host pointers.
  thrust::complex<double> alpha(1.0, 0.0);
  thrust::complex<double> beta(0.0, 0.0);
  cublasHandle_t handle;
  checkCublas(cublasCreate(&handle), "cublasCreate");

  cuDoubleComplex* d_u = nullptr;
  cuDoubleComplex* d_v = nullptr;
  const cuDoubleComplex* _alpha = reinterpret_cast<const cuDoubleComplex*>(&alpha);
  const cuDoubleComplex* _beta = reinterpret_cast<const cuDoubleComplex*>(&beta);
  checkCuda(cudaMalloc(&d_u, arrSize), "cudaMalloc(d_u)");
  checkCuda(cudaMalloc(&d_v, arrSize), "cudaMalloc(d_v)");
  checkCuda(cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice),
            "cudaMemcpy host->device");

  // With beta == 0 the B operand contributes nothing, so the call writes the
  // transpose of d_u into the separate output buffer d_v.
  checkCublas(cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                          _alpha, d_u, xmax,
                          _beta,  d_u, xmax,
                          d_v, xmax),
              "cublasZgeam");

  checkCuda(cudaMemcpy(&u[0][0], d_v, arrSize, cudaMemcpyDeviceToHost),
            "cudaMemcpy device->host");

  cout << "After:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  // Release device buffers and the cuBLAS handle; the original listing leaked them.
  checkCuda(cudaFree(d_u), "cudaFree(d_u)");
  checkCuda(cudaFree(d_v), "cudaFree(d_v)");
  checkCublas(cublasDestroy(handle), "cublasDestroy");

  return 0;
}

Perhaps something closer to your use case, using thrust device containers with a kernel performing some initialisation prior to a CUBLAS call:

#include <iostream>

#include <thrust/complex.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>

#include <cublas_v2.h>
#include <cuda_runtime.h>

/**
 * Marks two mirrored elements of a flat xmax-by-xmax complex matrix so that a
 * later transpose is visible: adds (665, 665) to flat index 51 + 49*xmax and
 * doubles the element at flat index 49 + 51*xmax.
 *
 * Launch layout: intended for a single thread (<<<1,1>>>). The updates are
 * unsynchronised read-modify-writes, so any additional threads return early.
 *
 * @param u     device pointer to at least xmax * xmax elements
 * @param xmax  matrix extent; must be greater than 51 or the kernel does nothing
 */
__global__ void setup_kernel(thrust::complex<double>* u, int xmax)
{
  // Only one thread may touch the two elements; extra threads would race.
  if (blockIdx.x != 0 || threadIdx.x != 0) return;
  // The hard-coded indices 49 and 51 must lie inside the matrix.
  if (xmax <= 51) return;

  u[51 + 49*xmax] += thrust::complex<double>(665.0, 665.0);
  u[49 + 51*xmax] *= 2.0;
}

// Builds two xmax-by-xmax device vectors of thrust::complex<double>, lets
// setup_kernel mark two mirrored elements of the first one, transposes it into
// the second with cublasZgeam, and prints the marked elements of both.
int main()
{
  // A compile-time extent keeps the host staging array below a standard C++
  // array rather than a variable-length array (a compiler extension).
  constexpr int xmax = 100;

  // Scalars for C = alpha * op(A) + beta * op(B); they are passed as host pointers.
  const thrust::complex<double> alpha(1.0, 0.0);
  const thrust::complex<double> beta(0.0, 0.0);
  cublasHandle_t handle;
  if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
    std::cerr << "cublasCreate failed" << std::endl;
    return 1;
  }

  thrust::device_vector<thrust::complex<double>> d_u(xmax * xmax, thrust::complex<double>(1.0, 1.0));
  thrust::device_vector<thrust::complex<double>> d_v(xmax * xmax, thrust::complex<double>(0., 0.));
  setup_kernel<<<1,1>>>(thrust::raw_pointer_cast(d_u.data()), xmax);

  // A kernel launch returns no status itself; launch errors are read back here.
  const cudaError_t launchStatus = cudaGetLastError();
  if (launchStatus != cudaSuccess) {
    std::cerr << "setup_kernel launch failed: " << cudaGetErrorString(launchStatus) << std::endl;
    cublasDestroy(handle);
    return 1;
  }

  // cuBLAS takes cuDoubleComplex pointers; reinterpret the thrust storage in place.
  cuDoubleComplex* _d_u = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_u.data()));
  cuDoubleComplex* _d_v = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_v.data()));
  const cuDoubleComplex* _alpha = reinterpret_cast<const cuDoubleComplex*>(&alpha);
  const cuDoubleComplex* _beta = reinterpret_cast<const cuDoubleComplex*>(&beta);

  // With beta == 0 the B operand contributes nothing, so the call writes the
  // transpose of d_u into d_v.
  const cublasStatus_t geamStatus =
      cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                  _alpha, _d_u, xmax,
                  _beta, _d_u, xmax,
                  _d_v, xmax);
  if (geamStatus != CUBLAS_STATUS_SUCCESS) {
    std::cerr << "cublasZgeam failed with cuBLAS status " << static_cast<int>(geamStatus) << std::endl;
    cublasDestroy(handle);
    return 1;
  }

  // Host staging buffer reused for both the input and the transposed output.
  thrust::complex<double> u[xmax][xmax];

  thrust::copy(d_u.begin(), d_u.end(), &u[0][0]);
  std::cout << "Before:" << std::endl;
  std::cout << u[49][51] << std::endl;
  std::cout << u[51][49] << std::endl;

  thrust::copy(d_v.begin(), d_v.end(), &u[0][0]);
  std::cout << "After:" << std::endl;
  std::cout << u[49][51] << std::endl;
  std::cout << u[51][49] << std::endl;

  // The device vectors free themselves; the cuBLAS handle has to be destroyed
  // explicitly (the original listing leaked it).
  cublasDestroy(handle);

  return 0;

}

0 讨论(0)