问题
I'm trying to make a pycuda wrapper inspired by scikits-cuda library for some operations provided in the new cuSolver library of Nvidia. I want to solve a linear system of the form AX=B by LU factorization, to perform that first use the cublasSgetrfBatched method from scikits-cuda, that give me the factorization LU; then with that factorization I want to solve the system using cusolverDnSgetrs from cuSolve that I want to wrap, when I perform the computation return status 3, the matrices that supose to give me the answer don't change, BUT the *devInfo is zero, looking in the cusolver's documentation says:
CUSOLVER_STATUS_INVALID_VALUE=An unsupported value or parameter was passed to the function (a negative vector size, for example).
libcusolver.cusolverDnSgetrs.restype=int
libcusolver.cusolverDnSgetrs.argtypes=[_types.handle,
ctypes.c_char,
ctypes.c_int,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_int,
ctypes.c_void_p]
"""
handle is the handle pointer given by calling cusolverDnCreate() from cuSolver
LU is the LU factoriced matrix given by cublasSgetrfBatched() from scikits
P is the pivots matrix given by cublasSgetrfBatched()
B is the right hand matix from AX=B
"""
def cusolverSolveLU(handle,LU,P,B):
rows_LU ,cols_LU = LU.shape
rows_B, cols_B = B.shape
B_gpu = gpuarray.to_gpu(B.astype('float32'))
info_gpu = gpuarray.zeros(1, np.int32)
status=libcusolver.cusolverDnSgetrs(
handle, 'n', rows_LU, cols_B,
int(LU.gpudata), cols_LU,
int(P.gpudata), int(B_gpu.gpudata),
cols_B, int(info_gpu.gpudata))
print info_gpu
print status
handle= cusolverCreate() #get the initialization of cusolver
LU, P = cublasLUFactorization(...)
B = np.asarray(np.random.rand(3, 3), np.float32)
cusolverSolveLU(handle,LU,P,B)
The output:
[0]
3
What I'm doing wrong?
回答1:
Here is a full working example of how to use the library; the result is validated against that obtained using numpy's built-in solver:
import ctypes
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
CUSOLVER_STATUS_SUCCESS = 0
libcusolver = ctypes.cdll.LoadLibrary('libcusolver.so')
libcusolver.cusolverDnCreate.restype = int
libcusolver.cusolverDnCreate.argtypes = [ctypes.c_void_p]
def cusolverDnCreate():
handle = ctypes.c_void_p()
status = libcusolver.cusolverDnCreate(ctypes.byref(handle))
if status != CUSOLVER_STATUS_SUCCESS:
raise RuntimeError('error!')
return handle.value
libcusolver.cusolverDnDestroy.restype = int
libcusolver.cusolverDnDestroy.argtypes = [ctypes.c_void_p]
def cusolverDnDestroy(handle):
status = libcusolver.cusolverDnDestroy(handle)
if status != CUSOLVER_STATUS_SUCCESS:
raise RuntimeError('error!')
libcusolver.cusolverDnSgetrf_bufferSize.restype = int
libcusolver.cusolverDnSgetrf_bufferSize.argtypes = [ctypes.c_void_p,
ctypes.c_int,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_int,
ctypes.c_void_p]
def cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork):
status = libcusolver.cusolverDnSgetrf_bufferSize(handle, m, n,
int(A.gpudata),
n, ctypes.pointer(Lwork))
if status != CUSOLVER_STATUS_SUCCESS:
raise RuntimeError('error!')
libcusolver.cusolverDnSgetrf.restype = int
libcusolver.cusolverDnSgetrf.argtypes = [ctypes.c_void_p,
ctypes.c_int,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_void_p]
def cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo):
status = libcusolver.cusolverDnSgetrf(handle, m, n, int(A.gpudata),
lda,
int(Workspace.gpudata),
int(devIpiv.gpudata),
int(devInfo.gpudata))
if status != CUSOLVER_STATUS_SUCCESS:
raise RuntimeError('error!')
libcusolver.cusolverDnSgetrs.restype = int
libcusolver.cusolverDnSgetrs.argtypes = [ctypes.c_void_p,
ctypes.c_int,
ctypes.c_int,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_int,
ctypes.c_void_p]
def cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo):
status = libcusolver.cusolverDnSgetrs(handle, trans, n, nrhs,
int(A.gpudata), lda,
int(devIpiv.gpudata), int(B.gpudata),
ldb, int(devInfo.gpudata))
if status != CUSOLVER_STATUS_SUCCESS:
raise RuntimeError('error!')
if __name__ == '__main__':
m = 3
n = 3
a = np.asarray(np.random.rand(m, n), np.float32)
a_gpu = gpuarray.to_gpu(a.T.copy())
lda = m
b = np.asarray(np.random.rand(m, n), np.float32)
b_gpu = gpuarray.to_gpu(b.T.copy())
ldb = m
handle = cusolverDnCreate()
Lwork = ctypes.c_int()
cusolverDnSgetrf_bufferSize(handle, m, n, a_gpu, lda, Lwork)
Workspace = gpuarray.empty(Lwork.value, dtype=np.float32)
devIpiv = gpuarray.zeros(min(m, n), dtype=np.int32)
devInfo = gpuarray.zeros(1, dtype=np.int32)
cusolverDnSgetrf(handle, m, n, a_gpu, lda, Workspace, devIpiv, devInfo)
if devInfo.get()[0] != 0:
raise RuntimeError('error!')
CUBLAS_OP_N = 0
nrhs = n
devInfo = gpuarray.zeros(1, dtype=np.int32)
cusolverDnSgetrs(handle, CUBLAS_OP_N, n, nrhs, a_gpu, lda, devIpiv, b_gpu, ldb, devInfo)
x_cusolver = b_gpu.get().T
cusolverDnDestroy(handle)
x_numpy = np.linalg.solve(a, b)
print np.allclose(x_numpy, x_cusolver)
来源:https://stackoverflow.com/questions/29780180/getrs-function-of-cusolver-over-pycuda-doesnt-work-properly