Question
I'm currently trying to use the CUSPARSE library in order to speed up an HPCG implementation. However, it appears I'm making some kind of mistake during device data allocation.
This is the code segment that results in CUSPARSE_STATUS_MAPPING_ERROR:
int HPC_sparsemv( CRS_Matrix *A_crs_d, FP * x_d, FP * y_d)
{
    FP alpha = 1.0f;
    FP beta = 0.0f;

    FP* vals = A_crs_d->vals;
    int* inds = A_crs_d->col_ind;
    int* row_ptr = A_crs_d->row_ptr;

    /*generate Matrix descriptor for SparseMV computation*/
    cusparseMatDescr_t matDescr;
    cusparseCreateMatDescr(&matDescr);

    cusparseStatus_t status;

    /*hand off control to CUSPARSE routine*/
#ifdef DOUBLE
    status = cusparseDcsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows,
                            A_crs_d->ncols, A_crs_d->nnz, &alpha, matDescr, vals, row_ptr,
                            inds, x_d, &beta, y_d);
#else
    status = cusparseScsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows,
                            A_crs_d->ncols, A_crs_d->nnz, &alpha, matDescr, vals, row_ptr,
                            col_ind, x_d, &beta, y_d);
#endif
NOTE: FP is a typedef wrapped in conditional compilation guards, so it resolves to either a float or a double alias at compile time.
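Roughly, the guard looks like this (a simplified sketch; the actual typedef in my headers may differ slightly):

/* simplified sketch: FP resolves to double when DOUBLE is defined, float otherwise */
#ifdef DOUBLE
typedef double FP;
#else
typedef float FP;
#endif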
And here is the function handling the data allocation:
int cudaAlloc(FP* r_d, FP* p_d, FP* Ap_d, FP* b_d, const FP* const b, FP * x_d, FP * const x,
              struct CRS_Matrix* A_crs_d, int nrows, int ncols, int nnz){

    std::cout << "Beginning device allocation..." << std::endl;

    int size_r = nrows * sizeof(FP);
    int size_c = ncols * sizeof(FP);
    int size_nnz = nnz * sizeof(FP);

    int allocStatus = 0;

    /*device alloc r_d*/
    allocStatus |= (int) checkCuda( cudaMalloc((void **) &r_d, size_r) );

    /*device alloc p_d*/
    allocStatus |= (int) checkCuda( cudaMalloc((void **) &p_d, size_c) );

    /*device alloc Ap_d*/
    allocStatus |= (int) checkCuda( cudaMalloc((void **) &Ap_d, size_r) );

    /*device alloc b_d*/
    allocStatus |= (int) checkCuda( cudaMalloc((void **) &b_d, size_r ) );
    allocStatus |= (int) checkCuda( cudaMemcpy(b_d, b, size_r, cudaMemcpyHostToDevice));

    /*device alloc x_d*/
    allocStatus |= (int) checkCuda( cudaMalloc((void **) &x_d, size_r ) );
    allocStatus |= (int) checkCuda( cudaMemcpy(x_d, x, size_r, cudaMemcpyHostToDevice));

    /*device alloc A_crs_d*/
    FP * valtmp;
    allocStatus |= (int) checkCuda( cudaMalloc((void **) &valtmp, size_nnz) );
    allocStatus |= (int) checkCuda( cudaMemcpy(valtmp, CRS->vals, size_nnz, cudaMemcpyHostToDevice) );

    int * indtmp;
    allocStatus |= (int) checkCuda( cudaMalloc((void **) &indtmp, nnz * sizeof(int)) );
    allocStatus |= (int) checkCuda( cudaMemcpy(indtmp, CRS->col_ind,
                                               nnz * sizeof(int), cudaMemcpyHostToDevice) );

    int * rowtmp;
    allocStatus |= (int) checkCuda( cudaMalloc((void **) &rowtmp, (nrows + 1) * sizeof(int)) );
    allocStatus |= (int) checkCuda( cudaMemcpy(rowtmp, CRS->row_ptr,
                                               (nrows + 1) * sizeof(int), cudaMemcpyHostToDevice) );

    allocStatus |= (int) checkCuda( cudaMallocHost( &A_crs_d, sizeof(CRS_Matrix)) );

    A_crs_d->vals = valtmp;
    A_crs_d->col_ind = indtmp;
    A_crs_d->row_ptr = rowtmp;

    A_crs_d->nrows = CRS->nrows;
    A_crs_d->ncols = CRS->ncols;
    A_crs_d->nnz = CRS->nnz;

    std::cout << "Device allocation done." << std::endl;

    return allocStatus;
}
During my first stop at StackOverflow I found this solved issue posted by somebody else: Cusparse status mapping error while using cuda constant memory
However, as I'm not using constant memory for the arguments passed to csrmv(), that didn't solve my problem. I also checked data integrity, and the CRS_Matrix on the device exactly matches the original in host memory.
I'm quite at a loss with this issue and couldn't find anything that would indicate a problem in the CUDA Toolkit Documentation, so any help would be greatly appreciated.
Thanks in advance.
Answer 1:
There are some errors in the code you have shown.

1. It's not possible to pass a pointer parameter by value to a routine, perform a cudaMalloc operation on that pointer, and then expect the result to show up in the calling environment. You are doing this for the x_d, b_d, and A_crs_d (with cudaMallocHost) parameters that you are passing to cudaAlloc. One possible fix is to handle those parameters as double-pointer (**) parameters within the routine, and pass the address of the pointer to the routine. This allows the modified pointer value to show up in the calling environment. This is really a question of proper C coding, and is not specific to CUDA. (A short standalone sketch of this pattern follows the listing and output below.)

2. At least with respect to cudaAlloc, it appears that you intend to implement Ax=b. In that case, the length of the x vector is the number of columns of A, and the length of the b vector is the number of rows of A. In your cudaAlloc routine, you are allocating both of these as the size of the rows of A, so this can't be correct. This also affects the subsequent cudaMemcpy operation (size).

3. It appears that the code you have shown was only tested for the double case, since there is a difference in the column index parameter you are passing to each call (presumably for float and double). In any event, I've built a complete code around what you have shown (for the double case), plus the above changes, and it runs without error and produces the correct result for me:
$ cat t1216.cu
#include <cusparse.h>
#include <iostream>

#define checkCuda(x) x

#ifdef USE_FLOAT
typedef float FP;
#else
#define DOUBLE
typedef double FP;
#endif

struct CRS_Matrix{
    FP *vals;
    int *col_ind;
    int *row_ptr;
    int ncols;
    int nnz;
    int nrows;
} *CRS;

cusparseHandle_t cuspHandle;

int HPC_sparsemv( CRS_Matrix *A_crs_d, FP * x_d, FP * y_d)
{
    FP alpha = 1.0f;
    FP beta = 0.0f;

    FP* vals = A_crs_d->vals;
    int* inds = A_crs_d->col_ind;
    int* row_ptr = A_crs_d->row_ptr;

    /*generate Matrix descriptor for SparseMV computation*/
    cusparseMatDescr_t matDescr;
    cusparseCreateMatDescr(&matDescr);

    cusparseStatus_t status;

    /*hand off control to CUSPARSE routine*/
#ifdef DOUBLE
    status = cusparseDcsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows,
                            A_crs_d->ncols, A_crs_d->nnz, &alpha, matDescr, vals, row_ptr,
                            inds, x_d, &beta, y_d);
#else
    status = cusparseScsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows,
                            A_crs_d->ncols, A_crs_d->nnz, &alpha, matDescr, vals, row_ptr,
                            col_ind, x_d, &beta, y_d); // col_ind here should probably be inds
#endif
    return (int)status;
}

int cudaAlloc(FP* r_d, FP* p_d, FP* Ap_d, FP** b_d, const FP* const b, FP ** x_d, FP * const x,
              struct CRS_Matrix** A_crs_d, int nrows, int ncols, int nnz){

    std::cout << "Beginning device allocation..." << std::endl;

    int size_r = nrows * sizeof(FP);
    int size_c = ncols * sizeof(FP);
    int size_nnz = nnz * sizeof(FP);

    int allocStatus = 0;

    /*device alloc r_d*/
    allocStatus |= (int) checkCuda( cudaMalloc((void **) &r_d, size_r) );

    /*device alloc p_d*/
    allocStatus |= (int) checkCuda( cudaMalloc((void **) &p_d, size_c) );

    /*device alloc Ap_d*/
    allocStatus |= (int) checkCuda( cudaMalloc((void **) &Ap_d, size_r) );

    /*device alloc b_d*/
    allocStatus |= (int) checkCuda( cudaMalloc((void **) b_d, size_r ) );
    allocStatus |= (int) checkCuda( cudaMemcpy(*b_d, b, size_r, cudaMemcpyHostToDevice));

    /*device alloc x_d*/
    allocStatus |= (int) checkCuda( cudaMalloc((void **) x_d, size_c ) );
    allocStatus |= (int) checkCuda( cudaMemcpy(*x_d, x, size_c, cudaMemcpyHostToDevice));

    /*device alloc A_crs_d*/
    FP * valtmp;
    allocStatus |= (int) checkCuda( cudaMalloc((void **) &valtmp, size_nnz) );
    allocStatus |= (int) checkCuda( cudaMemcpy(valtmp, CRS->vals, size_nnz, cudaMemcpyHostToDevice) );

    int * indtmp;
    allocStatus |= (int) checkCuda( cudaMalloc((void **) &indtmp, nnz * sizeof(int)) );
    allocStatus |= (int) checkCuda( cudaMemcpy(indtmp, CRS->col_ind,
                                               nnz * sizeof(int), cudaMemcpyHostToDevice) );

    int * rowtmp;
    allocStatus |= (int) checkCuda( cudaMalloc((void **) &rowtmp, (nrows + 1) * sizeof(int)) );
    allocStatus |= (int) checkCuda( cudaMemcpy(rowtmp, CRS->row_ptr,
                                               (nrows + 1) * sizeof(int), cudaMemcpyHostToDevice) );

    allocStatus |= (int) checkCuda( cudaMallocHost( A_crs_d, sizeof(CRS_Matrix)) );

    (*A_crs_d)->vals = valtmp;
    (*A_crs_d)->col_ind = indtmp;
    (*A_crs_d)->row_ptr = rowtmp;

    (*A_crs_d)->nrows = CRS->nrows;
    (*A_crs_d)->ncols = CRS->ncols;
    (*A_crs_d)->nnz = CRS->nnz;

    std::cout << "Device allocation done." << std::endl;

    return allocStatus;
}

int main(){

    CRS = (struct CRS_Matrix *)malloc(sizeof(struct CRS_Matrix));
    cusparseCreate(&cuspHandle);

    // simple test matrix
#define M0_M 5
#define M0_N 5
    FP m0_csr_vals[] = {2.0f, 1.0f, 1.0f, 2.0f, 1.0f, 1.0f, 2.0f, 1.0f, 1.0f, 2.0f, 1.0f, 1.0f, 2.0f};
    int m0_col_idxs[] = { 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4};
    int m0_row_ptrs[] = { 0, 2, 5, 8, 11, 13};
    FP m0_d[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
    int m0_nnz = 13;

    FP *r_d, *p_d, *Ap_d, *b_d, *x_d;
    FP *b = new FP[M0_N];
    CRS_Matrix *A_crs_d;

    CRS->vals = m0_csr_vals;
    CRS->col_ind = m0_col_idxs;
    CRS->row_ptr = m0_row_ptrs;
    CRS->nrows = M0_M;
    CRS->ncols = M0_N;
    CRS->nnz = m0_nnz;

    // Ax = b
    // r_d, p_d, Ap_d ??
    int stat = cudaAlloc(r_d, p_d, Ap_d, &b_d, b, &x_d, m0_d, &A_crs_d, M0_M, M0_N, m0_nnz);
    std::cout << "cudaAlloc status: " << stat << std::endl;

    stat = HPC_sparsemv( A_crs_d, x_d, b_d);
    std::cout << "HPC_sparsemv status: " << stat << std::endl;

    FP *results = new FP[M0_M];
    cudaMemcpy(results, b_d, M0_M*sizeof(FP), cudaMemcpyDeviceToHost);
    std::cout << "Results:" << std::endl;
    for (int i = 0; i < M0_M; i++) std::cout << results[i] << std::endl;

    return 0;
}
$ nvcc -o t1216 t1216.cu -lcusparse
t1216.cu(153): warning: variable "r_d" is used before its value is set
t1216.cu(153): warning: variable "p_d" is used before its value is set
t1216.cu(153): warning: variable "Ap_d" is used before its value is set
t1216.cu(153): warning: variable "r_d" is used before its value is set
t1216.cu(153): warning: variable "p_d" is used before its value is set
t1216.cu(153): warning: variable "Ap_d" is used before its value is set
$ cuda-memcheck ./t1216
========= CUDA-MEMCHECK
Beginning device allocation...
Device allocation done.
cudaAlloc status: 0
HPC_sparsemv status: 0
Results:
3
4
4
4
3
========= ERROR SUMMARY: 0 errors
$
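To make point 1 concrete, here is a minimal standalone sketch of the pass-the-address-of-the-pointer pattern, together with the Ax=b sizing from point 2. The helper name devAlloc is made up purely for illustration; it is not part of the HPCG code or of t1216.cu above.

#include <cuda_runtime.h>
#include <cstdio>

typedef double FP;

/* The caller passes the ADDRESS of its pointers, so the pointer values set by
   cudaMalloc inside the routine are visible in the calling environment. */
int devAlloc(FP **x_d, FP **b_d, int nrows, int ncols)
{
    /* for Ax = b: x has ncols entries, b has nrows entries */
    cudaError_t err = cudaMalloc((void **)x_d, ncols * sizeof(FP));
    if (err != cudaSuccess) return (int)err;
    err = cudaMalloc((void **)b_d, nrows * sizeof(FP));
    return (int)err;
}

int main()
{
    FP *x_d = NULL, *b_d = NULL;
    int stat = devAlloc(&x_d, &b_d, 5, 5);   /* pass &x_d / &b_d, not x_d / b_d */
    printf("alloc status: %d  x_d=%p  b_d=%p\n", stat, (void *)x_d, (void *)b_d);
    cudaFree(x_d);
    cudaFree(b_d);
    return 0;
}

The key point is that main's x_d and b_d hold the device addresses after the call returns, which is what the double-pointer parameters in the corrected cudaAlloc above achieve.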
Notes:

1. It's unclear what you intend for r_d, p_d, and Ap_d in the cudaAlloc routine. I've left them as-is. But if you intend to use them for something, they will likely be subject to the issue I describe in 1 above.

2. As mentioned, your code doesn't seem to be consistent for float vs. double in the parameters you pass to the cusparse routines in HPC_sparsemv. In particular, the column index parameter does not match, and the double version seems sensible to me, so I used that. If you work with float, you will probably need to modify that parameter.

3. In the future, I'd recommend that you provide a complete code, just as I have shown, to demonstrate the failure. It's not that much more code than what you have shown already, and it will make it easier for others to help you.
Source: https://stackoverflow.com/questions/38412251/cuda-mapping-error-using-cusparse-csrmv-routine