I'm currently trying to use the CUSPARSE library in order to speed up an HPCG implementation. However, it appears I'm making some kind of mistake during device data allocation.
This is the code segment that results in CUSPARSE_STATUS_MAPPING_ERROR:
/*
 * Sparse matrix-vector product through cuSPARSE: hands the CSR arrays stored
 * in A_crs_d (vals, col_ind, row_ptr) together with x_d and y_d to
 * cusparse<t>csrmv, using alpha = 1 and beta = 0 and no transpose.
 *
 * NOTE(review): the function's braces and its return statement are not
 * visible in this excerpt.
 */
int HPC_sparsemv( CRS_Matrix *A_crs_d,
FP * x_d, FP * y_d)
/* scaling factors handed to csrmv by address */
FP alpha = 1.0f;
FP beta = 0.0f;
/* local aliases for the three CSR arrays held by A_crs_d */
FP* vals = A_crs_d->vals;
int* inds = A_crs_d->col_ind;
int* row_ptr = A_crs_d->row_ptr;
/*generate Matrix descriptor for SparseMV computation*/
/* NOTE(review): matDescr is only declared here; no cusparseCreateMatDescr()
   call is visible in this excerpt before it is passed to csrmv. */
cusparseMatDescr_t matDescr;
cusparseStatus_t status;
/*hand off control to CUSPARSE routine*/
/* NOTE(review): no #else / #endif is visible for this #ifdef, so as shown
   both calls below belong to the DOUBLE branch. */
#ifdef DOUBLE
status = cusparseDcsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows,
A_crs_d->ncols,A_crs_d->nnz, &alpha, matDescr, vals, row_ptr,
inds, x_d, &beta, y_d);
/* NOTE(review): this call passes col_ind, which is not declared in this
   function; the double-precision call above passes inds instead. */
status = cusparseScsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows,
A_crs_d->ncols,A_crs_d->nnz, &alpha, matDescr, vals, row_ptr,
col_ind, x_d, &beta, y_d);
NOTE: FP is a typedef wrapped by conditional compilation guards, meaning it gets evaluated to be either a float or a double alias at compile-time.
And here is the function handling the data allocation:
/*
 * Allocates the device buffers for the solver vectors, copies b and x to the
 * device, uploads the three CSR arrays of the global host matrix CRS, and
 * fills a CRS_Matrix obtained from cudaMallocHost with the resulting device
 * pointers and the dimensions read from CRS.
 *
 * Returns the bitwise OR of all status codes produced by the CUDA calls
 * (0 when every call reported success).
 *
 * NOTE(review): every pointer parameter (r_d, p_d, Ap_d, b_d, x_d, A_crs_d)
 * is received by value. The cudaMalloc/cudaMallocHost calls below store the
 * new address in the local parameter copy (&r_d, &b_d, ...), so the caller's
 * own pointers are not updated by this function.
 *
 * NOTE(review): the closing brace of the function is not visible in this
 * excerpt.
 */
int cudaAlloc(FP* r_d, FP* p_d, FP* Ap_d, FP* b_d, const FP* const b, FP * x_d, FP * const x,
struct CRS_Matrix* A_crs_d, int nrows, int ncols, int nnz){
std::cout << "Beginning device allocation..." << std::endl;
/* byte counts for row-length, column-length and nnz-length FP arrays */
int size_r = nrows * sizeof(FP);
int size_c = ncols * sizeof(FP);
int size_nnz = nnz * sizeof(FP);
/* accumulates the status codes of all calls below via bitwise OR */
int allocStatus = 0;
/*device alloc r_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &r_d, size_r) );
/*device alloc p_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &p_d, size_c) );
/*device alloc Ap_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &Ap_d, size_r) );
/*device alloc b_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &b_d, size_r ) );
allocStatus |= (int) checkCuda( cudaMemcpy(b_d, b, size_r, cudaMemcpyHostToDevice));
/*device alloc x_d*/
/* NOTE(review): x_d is allocated and copied with size_r (row count) even
   though size_c (column count) is computed above -- confirm which length
   x is supposed to have. */
allocStatus |= (int) checkCuda( cudaMalloc((void **) &x_d, size_r ) );
allocStatus |= (int) checkCuda( cudaMemcpy(x_d, x, size_r, cudaMemcpyHostToDevice));
/*device alloc A_crs_d*/
/* device copy of CRS->vals (nnz values of type FP) */
FP * valtmp;
allocStatus |= (int) checkCuda( cudaMalloc((void **) &valtmp, size_nnz) );
allocStatus |= (int) checkCuda( cudaMemcpy(valtmp, CRS->vals, size_nnz, cudaMemcpyHostToDevice) );
/* device copy of CRS->col_ind (nnz ints) */
int * indtmp;
allocStatus |= (int) checkCuda( cudaMalloc((void **) &indtmp, nnz* sizeof(int)) );
allocStatus |= (int) checkCuda( cudaMemcpy(indtmp, CRS->col_ind,
nnz * sizeof(int) , cudaMemcpyHostToDevice) );
/* device copy of CRS->row_ptr (nrows + 1 ints) */
int * rowtmp;
allocStatus |= (int) checkCuda( cudaMalloc((void **) &rowtmp, (nrows + 1) * sizeof(int)) );
allocStatus |= (int) checkCuda( cudaMemcpy(rowtmp, CRS->row_ptr,
(nrows + 1) * sizeof(int), cudaMemcpyHostToDevice) );
/* The struct itself is allocated with cudaMallocHost (host memory); only its
   array members point to device memory. The address is written into the
   by-value parameter A_crs_d (see the note at the top). */
allocStatus |= (int) checkCuda( cudaMallocHost( &A_crs_d, sizeof(CRS_Matrix)) );
A_crs_d->vals = valtmp;
A_crs_d->col_ind = indtmp;
A_crs_d->row_ptr = rowtmp;
/* dimensions are taken from the global CRS, not from the nrows/ncols/nnz
   parameters used for the allocations above */
A_crs_d->nrows = CRS->nrows;
A_crs_d->ncols = CRS->ncols;
A_crs_d->nnz = CRS->nnz;
std::cout << "Device allocation done." << std::endl;
return allocStatus;
During my first stop at StackOverflow I found this solved issue posted by somebody else: Cusparse status mapping error while using cuda constant memory
However, as I'm not using constant memory on the arguments passed to csrmv() that didn't solve my problem. I also checked data integrity and the CRS_Matrix on the device exactly matches the original in host memory.
I'm quite at a loss with this issue and couldn't find anything that would indicate a problem in the CUDA Toolkit Documentation, so any help would be greatly appreciated.
Thanks in advance.
There are some errors in the code you have shown.
1. It's not possible to pass a pointer parameter by value to a routine, perform a `cudaMalloc` operation on that pointer, and then expect the result to show up in the calling environment. You are doing this for (at least) the `x_d`, `b_d`, and `A_crs_d` parameters that you are passing to `cudaAlloc`. One possible fix is to handle those parameters as double-pointer (`**`) parameters within the routine, and pass the address of the pointer to the routine. This allows the modified pointer value to show up in the calling environment. This is really a question of proper C coding, and is not specific to CUDA.
2. At least with respect to `HPC_sparsemv`, it appears that you intend to implement `Ax=b`. In that case, the length of the `x` vector is the number of columns of `A`, and the length of the `b` vector is the number of rows of `A`. In your `cudaAlloc` routine, you are allocating both of these with the size of the rows of `A`, so this can't be correct. This also affects the size used in the subsequent `cudaMemcpy` operation.
It appears that the code you have shown was only tested for the `double` case, since there is a difference in the column index parameter you are passing to each call (presumably one for `float` and one for `double`). In any event, I've built a complete code around what you have shown (for the `double` case), plus the above changes, and it runs without error and produces the correct result for me:
$ cat t1216.cu
#include <cusparse.h>
#include <iostream>
#define checkCuda(x) x
/*
 * Floating-point type used throughout the file.
 * Compile with -DUSE_FLOAT for single precision; otherwise double precision
 * is selected and DOUBLE is defined so that the matching cuSPARSE routine
 * is chosen in HPC_sparsemv.
 */
#ifdef USE_FLOAT
typedef float FP;
#else
#define DOUBLE
typedef double FP;
#endif
/*
 * Sparse matrix in compressed-row storage.
 * The array members are plain pointers; in this file they refer either to
 * host arrays (the global CRS) or to device arrays (the copy filled in by
 * cudaAlloc).
 */
struct CRS_Matrix {
  FP  *vals;     /* nonzero values, nnz entries */
  int *col_ind;  /* column index of each nonzero, nnz entries */
  int *row_ptr;  /* row offsets into vals/col_ind, nrows + 1 entries */
  int  ncols;    /* number of columns */
  int  nnz;      /* number of stored nonzeros */
  int  nrows;    /* number of rows */
};

/* Global host-side matrix that cudaAlloc reads its CSR arrays from. */
struct CRS_Matrix *CRS;
/* Global cuSPARSE library handle passed to the csrmv calls in HPC_sparsemv;
   it has to be initialized with cusparseCreate() before the first call. */
cusparseHandle_t cuspHandle;
/*
 * Sparse matrix-vector product y_d = A * x_d using the cuSPARSE CSR routine
 * cusparse<t>csrmv (alpha = 1, beta = 0, no transpose).
 *
 * Parameters:
 *   A_crs_d  host-readable CRS_Matrix; its vals / col_ind / row_ptr members
 *            are handed to cuSPARSE as the CSR arrays and therefore have to
 *            point to device memory
 *   x_d      device input vector (ncols entries)
 *   y_d      device output vector (nrows entries); since beta is 0 its
 *            previous contents do not contribute to the result
 *
 * Returns the cusparseStatus_t of the operation cast to int
 * (0 == CUSPARSE_STATUS_SUCCESS). Null arguments are rejected with
 * CUSPARSE_STATUS_INVALID_VALUE.
 *
 * Uses the global cuspHandle, which must already have been created.
 */
int HPC_sparsemv( CRS_Matrix *A_crs_d,
                  FP * x_d, FP * y_d)
{
  if (!A_crs_d || !x_d || !y_d)
    return (int)CUSPARSE_STATUS_INVALID_VALUE;

  FP alpha = 1.0f;
  FP beta = 0.0f;
  FP* vals = A_crs_d->vals;
  int* inds = A_crs_d->col_ind;
  int* row_ptr = A_crs_d->row_ptr;

  /* generate matrix descriptor for the SparseMV computation; a freshly
     created descriptor describes a general matrix with zero-based indices,
     which matches the CSR data used in this file */
  cusparseMatDescr_t matDescr;
  cusparseStatus_t status = cusparseCreateMatDescr(&matDescr);
  if (status != CUSPARSE_STATUS_SUCCESS)
    return (int)status;

  /* hand off control to the cuSPARSE routine matching the precision of FP */
#ifdef DOUBLE
  status = cusparseDcsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows,
                          A_crs_d->ncols, A_crs_d->nnz, &alpha, matDescr, vals, row_ptr,
                          inds, x_d, &beta, y_d);
#else
  /* same argument list as the double-precision call: the column indices
     are the local inds */
  status = cusparseScsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows,
                          A_crs_d->ncols, A_crs_d->nnz, &alpha, matDescr, vals, row_ptr,
                          inds, x_d, &beta, y_d);
#endif

  /* release the descriptor; report a destroy failure only if the product
     itself succeeded, so the more relevant error is never masked */
  cusparseStatus_t destroyStatus = cusparseDestroyMatDescr(matDescr);
  if (status == CUSPARSE_STATUS_SUCCESS)
    status = destroyStatus;

  return (int)status;
}
/*
 * Allocates the device buffers, uploads b, x and the CSR arrays of the
 * global host matrix CRS, and builds a host-resident CRS_Matrix (allocated
 * with cudaMallocHost) whose array members point to the device copies.
 *
 * Parameters:
 *   r_d, p_d, Ap_d  received BY VALUE: the buffers allocated for them are
 *                   not visible to the caller (kept as-is for interface
 *                   compatibility; pass them by address if they are needed)
 *   b_d     [out]   device copy of b, nrows entries
 *   b               host right-hand side, nrows entries
 *   x_d     [out]   device copy of x, ncols entries
 *   x               host vector, ncols entries
 *   A_crs_d [out]   host struct holding device CSR pointers and the
 *                   dimensions read from CRS; set to 0 on failure
 *   nrows, ncols, nnz  dimensions used for the allocation sizes
 *
 * Returns the bitwise OR of the status codes of all CUDA calls, 0 on
 * success. Invalid arguments yield cudaErrorInvalidValue.
 */
int cudaAlloc(FP* r_d, FP* p_d, FP* Ap_d, FP** b_d, const FP* const b, FP ** x_d, FP * const x,
              struct CRS_Matrix** A_crs_d, int nrows, int ncols, int nnz){
  std::cout << "Beginning device allocation..." << std::endl;

  if (!b_d || !x_d || !A_crs_d || !CRS || nrows < 0 || ncols < 0 || nnz < 0)
    return (int) cudaErrorInvalidValue;

  /* byte counts are computed in size_t so that large problems cannot
     overflow an int */
  const size_t size_r   = (size_t) nrows * sizeof(FP);
  const size_t size_c   = (size_t) ncols * sizeof(FP);
  const size_t size_nnz = (size_t) nnz * sizeof(FP);
  const size_t size_ind = (size_t) nnz * sizeof(int);
  const size_t size_ptr = ((size_t) nrows + 1) * sizeof(int);

  int allocStatus = 0;

  /* start from well-defined output values in case an allocation fails */
  *b_d = 0;
  *x_d = 0;
  *A_crs_d = 0;

  /*device alloc r_d (local copy only, see header comment)*/
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &r_d, size_r) );
  /*device alloc p_d (local copy only)*/
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &p_d, size_c) );
  /*device alloc Ap_d (local copy only)*/
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &Ap_d, size_r) );

  /*device alloc b_d: one entry per matrix row*/
  allocStatus |= (int) checkCuda( cudaMalloc((void **) b_d, size_r ) );
  allocStatus |= (int) checkCuda( cudaMemcpy(*b_d, b, size_r, cudaMemcpyHostToDevice));

  /*device alloc x_d: one entry per matrix column*/
  allocStatus |= (int) checkCuda( cudaMalloc((void **) x_d, size_c ) );
  allocStatus |= (int) checkCuda( cudaMemcpy(*x_d, x, size_c, cudaMemcpyHostToDevice));

  /*device alloc of the three CSR arrays*/
  FP * valtmp = 0;
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &valtmp, size_nnz) );
  allocStatus |= (int) checkCuda( cudaMemcpy(valtmp, CRS->vals, size_nnz, cudaMemcpyHostToDevice) );

  int * indtmp = 0;
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &indtmp, size_ind) );
  allocStatus |= (int) checkCuda( cudaMemcpy(indtmp, CRS->col_ind, size_ind, cudaMemcpyHostToDevice) );

  int * rowtmp = 0;
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &rowtmp, size_ptr) );
  allocStatus |= (int) checkCuda( cudaMemcpy(rowtmp, CRS->row_ptr, size_ptr, cudaMemcpyHostToDevice) );

  /* the struct itself lives in host memory; only its arrays are on the device */
  const int hostStatus = (int) checkCuda( cudaMallocHost((void **) A_crs_d, sizeof(CRS_Matrix)) );
  allocStatus |= hostStatus;
  if (hostStatus != 0 || !*A_crs_d) {
    /* without the struct the caller cannot reach the CSR arrays, so release
       them here instead of leaking them (cudaFree(0) is a no-op) */
    cudaFree(valtmp);
    cudaFree(indtmp);
    cudaFree(rowtmp);
    *A_crs_d = 0;
    return allocStatus;
  }

  (*A_crs_d)->vals = valtmp;
  (*A_crs_d)->col_ind = indtmp;
  (*A_crs_d)->row_ptr = rowtmp;
  (*A_crs_d)->nrows = CRS->nrows;
  (*A_crs_d)->ncols = CRS->ncols;
  (*A_crs_d)->nnz = CRS->nnz;

  std::cout << "Device allocation done." << std::endl;
  return allocStatus;
}
/*
 * Test driver: builds a 5x5 tridiagonal matrix in CSR form, multiplies it
 * by a vector of ones on the GPU through HPC_sparsemv and prints the result
 * (each entry is the corresponding row sum of the matrix).
 *
 * Returns 0 when every step succeeded and 1 otherwise.
 */
int main(){
  // simple test matrix
  #define M0_M 5
  #define M0_N 5
  FP m0_csr_vals[] = {2.0f, 1.0f, 1.0f, 2.0f, 1.0f, 1.0f, 2.0f, 1.0f, 1.0f, 2.0f, 1.0f, 1.0f, 2.0f};
  int m0_col_idxs[] = { 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4};
  int m0_row_ptrs[] = { 0, 2, 5, 8, 11, 13};
  FP m0_d[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
  int m0_nnz = 13;

  // r_d, p_d and Ap_d are passed to cudaAlloc by value and are not used
  // afterwards; they are zero-initialized so no indeterminate value is read
  FP *r_d = 0, *p_d = 0, *Ap_d = 0, *b_d = 0, *x_d = 0;
  CRS_Matrix *A_crs_d = 0;

  // b has one entry per matrix row; value-initialized so the host-to-device
  // copy in cudaAlloc does not read indeterminate memory
  FP *b = new FP[M0_M]();

  CRS = (struct CRS_Matrix *)malloc(sizeof(struct CRS_Matrix));
  if (!CRS) {
    std::cout << "host allocation of CRS failed" << std::endl;
    delete[] b;
    return 1;
  }
  CRS->vals = m0_csr_vals;
  CRS->col_ind = m0_col_idxs;
  CRS->row_ptr = m0_row_ptrs;
  CRS->nrows = M0_M;
  CRS->ncols = M0_N;
  CRS->nnz = m0_nnz;

  int exitCode = 1;

  // the cuSPARSE handle has to exist before any cuSPARSE routine is called
  cusparseStatus_t handleStatus = cusparseCreate(&cuspHandle);
  if (handleStatus == CUSPARSE_STATUS_SUCCESS) {
    // Ax = b
    int stat = cudaAlloc(r_d, p_d, Ap_d, &b_d, b, &x_d, m0_d, &A_crs_d, M0_M, M0_N, m0_nnz);
    std::cout << "cudaAlloc status: " << stat << std::endl;

    if (stat == 0 && A_crs_d) {
      stat = HPC_sparsemv( A_crs_d, x_d, b_d);
      std::cout << "HPC_sparsemv status: " << stat << std::endl;

      if (stat == 0) {
        FP *results = new FP[M0_M];
        // blocking copy: also waits for the preceding device work
        if (cudaMemcpy(results, b_d, M0_M*sizeof(FP), cudaMemcpyDeviceToHost) == cudaSuccess) {
          std::cout << "Results:" << std::endl;
          for (int i = 0; i < M0_M; i++) std::cout << results[i] << std::endl;
          exitCode = 0;
        } else {
          std::cout << "copying the results back to the host failed" << std::endl;
        }
        delete[] results;
      }
    }

    // release everything that is reachable from here
    if (A_crs_d) {
      cudaFree(A_crs_d->vals);
      cudaFree(A_crs_d->col_ind);
      cudaFree(A_crs_d->row_ptr);
      cudaFreeHost(A_crs_d);
    }
    cudaFree(x_d);
    cudaFree(b_d);
    cusparseDestroy(cuspHandle);
  } else {
    std::cout << "cusparseCreate status: " << (int)handleStatus << std::endl;
  }

  delete[] b;
  free(CRS);
  CRS = 0;
  return exitCode;
}
$ nvcc -o t1216 t1216.cu -lcusparse
t1216.cu(153): warning: variable "r_d" is used before its value is set
t1216.cu(153): warning: variable "p_d" is used before its value is set
t1216.cu(153): warning: variable "Ap_d" is used before its value is set
t1216.cu(153): warning: variable "r_d" is used before its value is set
t1216.cu(153): warning: variable "p_d" is used before its value is set
t1216.cu(153): warning: variable "Ap_d" is used before its value is set
$ cuda-memcheck ./t1216
========= CUDA-MEMCHECK
Beginning device allocation...
Device allocation done.
cudaAlloc status: 0
HPC_sparsemv status: 0
Results:
3
4
4
4
3
========= ERROR SUMMARY: 0 errors
A few notes:
1. It's unclear what you intend for `r_d`, `p_d`, and `Ap_d` in the `cudaAlloc` routine. I've left them as-is. But if you intend to use them for something, they will likely be subject to the pass-by-value issue I describe in the first point above.
2. As mentioned, your code doesn't seem to be consistent between `float` and `double` in the parameters you pass to the cusparse routines in `HPC_sparsemv`. In particular, the column index parameter does not match, and the `double` version seems sensible to me, so I used that. If you work with `float`, you will probably need to modify that parameter.
3. In the future, I'd recommend that you provide a complete code, just as I have shown, to demonstrate the failure. It's not that much more code than what you have shown already, and it will make it easier for others to help you.