In my code I use arrays of complex numbers from the Thrust library, and I would like to use cublasZgeam() to transpose the array.
Using complex numbers from c
Despite your protestations to the contrary, the C++ standard library complex
(or thrust::complex
) most certainly does work with CUBLAS. The cuComplex
and cuDoubleComplex
are designed to be binary compatible with the standard host complex types, so that data does not need to be translated when it is passed to CUBLAS functions which use complex data on the device.
A simple modification to the code you posted in comments works exactly as you might imagine:
#include <algorithm>
#include <complex>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <cublas_v2.h>
#include <cuda_runtime.h>

// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,          \
                         __LINE__, cudaGetErrorString(err_));                 \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

// Abort with a readable message if a cuBLAS call fails.
#define CUBLAS_CHECK(call)                                                    \
    do {                                                                      \
        cublasStatus_t st_ = (call);                                          \
        if (st_ != CUBLAS_STATUS_SUCCESS) {                                   \
            std::fprintf(stderr, "cuBLAS error %s:%d: status %d\n", __FILE__, \
                         __LINE__, static_cast<int>(st_));                    \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

// Transposes a square matrix of std::complex<double> on the GPU with
// cublasZgeam (C = alpha * A^T + beta * B, with alpha = 1 and beta = 0) and
// prints two marker elements before and after to show they swapped places.
int main()
{
    // Compile-time constant so that u is an ordinary array, not a VLA.
    constexpr int xmax = 100;
    using cplx = std::complex<double>;

    cplx u[xmax][xmax];
    // Byte count of the matrix; an integer type, since it is passed as a size.
    const std::size_t arrSize = sizeof(cplx) * xmax * xmax;

    std::fill(&u[0][0], &u[0][0] + (xmax * xmax), cplx(1.0, 1.0));
    u[49][51] += cplx(665.0, 665.0);
    u[51][49] *= 2.0;

    std::cout << "Before:" << std::endl;
    std::cout << u[49][51] << std::endl;
    std::cout << u[51][49] << std::endl;

    // The scalars are read through cuDoubleComplex pointers, so give them
    // that type's alignment before reinterpreting their address.
    alignas(cuDoubleComplex) cplx alpha(1.0, 0.0);
    alignas(cuDoubleComplex) cplx beta(0.0, 0.0);

    cublasHandle_t handle;
    CUBLAS_CHECK(cublasCreate(&handle));

    cuDoubleComplex* d_u = nullptr;
    cuDoubleComplex* d_v = nullptr;
    const cuDoubleComplex* _alpha = reinterpret_cast<const cuDoubleComplex*>(&alpha);
    const cuDoubleComplex* _beta = reinterpret_cast<const cuDoubleComplex*>(&beta);

    CUDA_CHECK(cudaMalloc(&d_u, arrSize));
    CUDA_CHECK(cudaMalloc(&d_v, arrSize));
    CUDA_CHECK(cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice));

    // Out-of-place transpose: d_v = 1 * d_u^T + 0 * d_u.
    CUBLAS_CHECK(cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                             _alpha, d_u, xmax,
                             _beta, d_u, xmax,
                             d_v, xmax));

    // Blocking copy back to the host array.
    CUDA_CHECK(cudaMemcpy(&u[0][0], d_v, arrSize, cudaMemcpyDeviceToHost));

    std::cout << "After:" << std::endl;
    std::cout << u[49][51] << std::endl;
    std::cout << u[51][49] << std::endl;

    // Release device buffers and the cuBLAS handle.
    CUDA_CHECK(cudaFree(d_u));
    CUDA_CHECK(cudaFree(d_v));
    CUBLAS_CHECK(cublasDestroy(handle));

    return 0;
}
built and run like so:
~/SO$ nvcc -std=c++11 -arch=sm_52 -o complex_transpose complex_transpose.cu -lcublas
~/SO$ ./complex_transpose
Before:
(666,666)
(2,2)
After:
(2,2)
(666,666)
The only modifications required are explicit casts of the std::complex
types to cuDoubleComplex
. Do that and everything works as expected.
Using thrust, the code looks almost identical:
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <thrust/complex.h>
#include <thrust/fill.h>

#include <cublas_v2.h>
#include <cuda_runtime.h>

// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,          \
                         __LINE__, cudaGetErrorString(err_));                 \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

// Abort with a readable message if a cuBLAS call fails.
#define CUBLAS_CHECK(call)                                                    \
    do {                                                                      \
        cublasStatus_t st_ = (call);                                          \
        if (st_ != CUBLAS_STATUS_SUCCESS) {                                   \
            std::fprintf(stderr, "cuBLAS error %s:%d: status %d\n", __FILE__, \
                         __LINE__, static_cast<int>(st_));                    \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

// Same demonstration as the std::complex version, but the host matrix holds
// thrust::complex<double>: transpose it on the GPU with cublasZgeam and print
// two marker elements before and after.
int main()
{
    // Compile-time constant so that u is an ordinary array, not a VLA.
    constexpr int xmax = 100;
    using cplx = thrust::complex<double>;

    cplx u[xmax][xmax];
    // Byte count of the matrix; an integer type, since it is passed as a size.
    const std::size_t arrSize = sizeof(cplx) * xmax * xmax;

    thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), cplx(1.0, 1.0));
    u[49][51] += cplx(665.0, 665.0);
    u[51][49] *= 2.0;

    std::cout << "Before:" << std::endl;
    std::cout << u[49][51] << std::endl;
    std::cout << u[51][49] << std::endl;

    cplx alpha(1.0, 0.0);
    cplx beta(0.0, 0.0);

    cublasHandle_t handle;
    CUBLAS_CHECK(cublasCreate(&handle));

    cuDoubleComplex* d_u = nullptr;
    cuDoubleComplex* d_v = nullptr;
    // cuBLAS takes cuDoubleComplex pointers; reinterpret the thrust scalars.
    const cuDoubleComplex* _alpha = reinterpret_cast<const cuDoubleComplex*>(&alpha);
    const cuDoubleComplex* _beta = reinterpret_cast<const cuDoubleComplex*>(&beta);

    CUDA_CHECK(cudaMalloc(&d_u, arrSize));
    CUDA_CHECK(cudaMalloc(&d_v, arrSize));
    CUDA_CHECK(cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice));

    // Out-of-place transpose: d_v = 1 * d_u^T + 0 * d_u.
    CUBLAS_CHECK(cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                             _alpha, d_u, xmax,
                             _beta, d_u, xmax,
                             d_v, xmax));

    // Blocking copy back to the host array.
    CUDA_CHECK(cudaMemcpy(&u[0][0], d_v, arrSize, cudaMemcpyDeviceToHost));

    std::cout << "After:" << std::endl;
    std::cout << u[49][51] << std::endl;
    std::cout << u[51][49] << std::endl;

    // Release device buffers and the cuBLAS handle.
    CUDA_CHECK(cudaFree(d_u));
    CUDA_CHECK(cudaFree(d_v));
    CUBLAS_CHECK(cublasDestroy(handle));

    return 0;
}
Perhaps something closer to your use case, using thrust device containers with a kernel performing some initialisation prior to a CUBLAS call:
#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <thrust/complex.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>

#include <cublas_v2.h>
#include <cuda_runtime.h>

// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,          \
                         __LINE__, cudaGetErrorString(err_));                 \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

// Abort with a readable message if a cuBLAS call fails.
#define CUBLAS_CHECK(call)                                                    \
    do {                                                                      \
        cublasStatus_t st_ = (call);                                          \
        if (st_ != CUBLAS_STATUS_SUCCESS) {                                   \
            std::fprintf(stderr, "cuBLAS error %s:%d: status %d\n", __FILE__, \
                         __LINE__, static_cast<int>(st_));                    \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

// Marks two elements of the flat xmax-by-xmax matrix u so the transpose is
// visible afterwards: element 51 + 49*xmax gets (665,665) added and element
// 49 + 51*xmax is doubled.
//
// Launch layout: written for a <<<1,1>>> launch. The updates are not indexed
// by thread, so only thread 0 of block 0 performs them; any extra threads
// return immediately instead of racing on the same two elements.
// Precondition: u has at least 51 + 51*xmax elements (xmax >= 52 for a square
// matrix).
__global__ void setup_kernel(thrust::complex<double>* u, int xmax)
{
    if (blockIdx.x != 0 || threadIdx.x != 0) {
        return;
    }
    u[51 + 49*xmax] += thrust::complex<double>(665.0, 665.0);
    u[49 + 51*xmax] *= 2.0;
}

// Builds the matrix in a thrust::device_vector, initialises it with a kernel,
// transposes it into a second device_vector with cublasZgeam, and prints the
// two marker elements from the input ("Before") and the output ("After").
int main()
{
    // Compile-time constant so that the host array u below is not a VLA.
    constexpr int xmax = 100;
    using cplx = thrust::complex<double>;

    cplx alpha(1.0, 0.0);
    cplx beta(0.0, 0.0);

    cublasHandle_t handle;
    CUBLAS_CHECK(cublasCreate(&handle));

    thrust::device_vector<cplx> d_u(xmax * xmax, cplx(1.0, 1.0));
    thrust::device_vector<cplx> d_v(xmax * xmax, cplx(0., 0.));

    setup_kernel<<<1,1>>>(thrust::raw_pointer_cast(d_u.data()), xmax);
    // Kernel launches return no status; fetch launch errors explicitly.
    CUDA_CHECK(cudaGetLastError());

    // cuBLAS takes cuDoubleComplex pointers; reinterpret the thrust storage.
    cuDoubleComplex* _d_u = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_u.data()));
    cuDoubleComplex* _d_v = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_v.data()));
    const cuDoubleComplex* _alpha = reinterpret_cast<const cuDoubleComplex*>(&alpha);
    const cuDoubleComplex* _beta = reinterpret_cast<const cuDoubleComplex*>(&beta);

    // Out-of-place transpose: d_v = 1 * d_u^T + 0 * d_u.
    CUBLAS_CHECK(cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                             _alpha, _d_u, xmax,
                             _beta, _d_u, xmax,
                             _d_v, xmax));

    cplx u[xmax][xmax];

    // d_u is only read by the transpose, so it still holds the input matrix.
    thrust::copy(d_u.begin(), d_u.end(), &u[0][0]);
    std::cout << "Before:" << std::endl;
    std::cout << u[49][51] << std::endl;
    std::cout << u[51][49] << std::endl;

    thrust::copy(d_v.begin(), d_v.end(), &u[0][0]);
    std::cout << "After:" << std::endl;
    std::cout << u[49][51] << std::endl;
    std::cout << u[51][49] << std::endl;

    CUBLAS_CHECK(cublasDestroy(handle));

    return 0;
}