问题
I am trying to implement Cholesky decomposition using the cuSOLVER library. I am a beginner CUDA programmer and I have always specified block-sizes and grid-sizes, but I am not able to find out how this can be set explicitly by the programmer with cuSOLVER functions.
Here is the documentation: http://docs.nvidia.com/cuda/cusolver/index.html#introduction
The QR decomposition is implemented using the cuSOLVER library (see the example here: http://docs.nvidia.com/cuda/cusolver/index.html#ormqr-example1) and even there the above two parameters are not set.
To summarize, I have the following questions
- How can the parameters: block-size and grid-size can be set with the cuSOLVER library?
- How is the same being done with the mentioned QR example code in the NVIDIA documentation?
回答1:
Robert Crovella has already answered this question. Here, I'm just providing a full example showing how Cholesky decomposition can be easily performed using the potrf
function provided by the cuSOLVER library.
The Utilities.cu
and Utilities.cuh
files are mantained at this page and omitted here. The example implements the CPU as well as the GPU approach.
#include "cuda_runtime.h"
#include "device_launch_paraMeters.h"
#include<iostream>
#include <fstream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include <cusolverDn.h>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>
#include "Utilities.cuh"
#define prec_save 10
/******************************************/
/* SET HERMITIAN POSITIVE DEFINITE MATRIX */
/******************************************/
// --- Credit to: https://math.stackexchange.com/questions/357980/how-to-generate-random-symmetric-positive-definite-matrices-using-matlab
void setPDMatrix(double * __restrict h_A, const int N) {
// --- Initialize random seed
srand(time(NULL));
double *h_A_temp = (double *)malloc(N * N * sizeof(double));
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A_temp[i * N + j] = (float)rand() / (float)RAND_MAX;
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A[i * N + j] = 0.5 * (h_A_temp[i * N + j] + h_A_temp[j * N + i]);
for (int i = 0; i < N; i++) h_A[i * N + i] = h_A[i * N + i] + N;
}
/************************************/
/* SAVE REAL ARRAY FROM CPU TO FILE */
/************************************/
template <class T>
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {
std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
outfile.close();
}
/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {
T *h_in = (T *)malloc(M * sizeof(T));
gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));
std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
outfile.close();
}
/********/
/* MAIN */
/********/
int main(){
const int N = 1000;
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle;
cusolveSafeCall(cusolverDnCreate(&solver_handle));
// --- CUBLAS initialization
cublasHandle_t cublas_handle;
cublasSafeCall(cublasCreate(&cublas_handle));
/***********************/
/* SETTING THE PROBLEM */
/***********************/
// --- Setting the host, N x N matrix
double *h_A = (double *)malloc(N * N * sizeof(double));
setPDMatrix(h_A, N);
// --- Allocate device space for the input matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * sizeof(double)));
// --- Move the relevant matrix from host to device
gpuErrchk(cudaMemcpy(d_A, h_A, N * N * sizeof(double), cudaMemcpyHostToDevice));
/****************************************/
/* COMPUTING THE CHOLESKY DECOMPOSITION */
/****************************************/
// --- cuSOLVE input/output parameters/arrays
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
// --- CUDA CHOLESKY initialization
cusolveSafeCall(cusolverDnDpotrf_bufferSize(solver_handle, CUBLAS_FILL_MODE_LOWER, N, d_A, N, &work_size));
// --- CUDA POTRF execution
double *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));
cusolveSafeCall(cusolverDnDpotrf(solver_handle, CUBLAS_FILL_MODE_LOWER, N, d_A, N, work, work_size, devInfo));
int devInfo_h = 0; gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
if (devInfo_h != 0) std::cout << "Unsuccessful potrf execution\n\n" << "devInfo = " << devInfo_h << "\n\n";
// --- At this point, the lower triangular part of A contains the elements of L.
/***************************************/
/* CHECKING THE CHOLESKY DECOMPOSITION */
/***************************************/
saveCPUrealtxt(h_A, "D:\\Project\\solveSquareLinearSystemCholeskyCUDA\\solveSquareLinearSystemCholeskyCUDA\\h_A.txt", N * N);
saveGPUrealtxt(d_A, "D:\\Project\\solveSquareLinearSystemCholeskyCUDA\\solveSquareLinearSystemCholeskyCUDA\\d_A.txt", N * N);
cusolveSafeCall(cusolverDnDestroy(solver_handle));
return 0;
}
EDIT
Cholesky decomposition requires that the relevant matrix is Hermitian and positive definite. Symmetric and positive definite matrices can be generated by the approach in How to generate random symmetric positive definite matrices using MATLAB?.
The following Matlab code can be used for checking the results
clear all
close all
clc
warning off
N = 1000;
% --- Setting the problem solution
x = ones(N, 1);
load h_A.txt
A = reshape(h_A, N, N);
yMatlab = A * x;
Lmatlab = chol(A, 'lower');
xprime = inv(Lmatlab) * yMatlab;
xMatlab = inv(Lmatlab') * xprime;
fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));
load d_A.txt
LCUDA = tril(reshape(d_A, N, N));
fprintf('Percentage rms of Cholesky decompositions in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(Lmatlab - LCUDA).^2)) / sum(sum(abs(Lmatlab).^2))));
load xCUDA.txt
fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xCUDA - x).^2)) / sum(sum(abs(x).^2))));
回答2:
You don't set block sizes (or grid sizes) explicitly when using a library like cusolver, or cublas, or cusparse.
The library chooses these, when it actually runs device code internal to the library.
来源:https://stackoverflow.com/questions/29196139/cholesky-decomposition-with-cuda