I am looking for a solution with Python to perform matrix inversions. I think there should be a way with CUBLAS or MAGMA to execute these operations in a batch or concurrent
As mentioned in the comments, numpy functions in general cannot be used from pycuda kernel code (or CUDA kernel code, or numba cuda kernels).
CUBLAS offers a batched matrix inversion function, but it is not currently exposed in either pyculib cublas interface or scikit-cuda cublas interface.
We could proceed to implement our own interface (e.g. using python ctypes
), but since its known that the matrices to be inverted are 4x4, I thought the suggestion in the comments from talonmies was an interesting one. Referring to the answer here, there is a fairly concise C code to do a direct inversion of a 4x4 matrix.
What follows first is a realization of this in CUDA. The function inv4x4
is an adaptation of the previous code, allotting 16 threads per matrix (one per matrix element) and using that code as a model. Each thread is responsible for computing one result matrix element. First we will compare it to CUBLAS matinvBatched
for performance:
$ cat t411.cu
#include <iostream>
#include <cublas_v2.h>
#include <cstdlib>
// 4x4 matrix inversion
// https://stackoverflow.com/questions/1148309/inverting-a-4x4-matrix
// assumes warp size is 32
// assumes block size is multiple of warp size
// therefore assumes number of matrices to be inverted (n) is even
// 16 threads per matrix to invert
const unsigned block_size = 256;
typedef float mt;
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__device__ unsigned pat[3][16];
const unsigned hpat[3][16] = {
{ 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50},
{ 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14},
{ 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618}};
__device__ unsigned getoff(unsigned &off){
unsigned ret = off & 0x0F;
off = off >> 4;
return ret;
}
const unsigned tmsk = 0xFFFFFFFF;
// in-place is acceptable i.e. out == in)
// T = float or double only
template <typename T>
__global__ void inv4x4(const T * __restrict__ in, T * __restrict__ out, const size_t n){
__shared__ T si[block_size];
size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < n*16){
si[threadIdx.x] = in[idx];
unsigned lane = threadIdx.x & 15;
unsigned sibase = threadIdx.x & 0x03F0;
__syncwarp();
unsigned off = pat[0][lane];
T a,b;
a = si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
if (!getoff(off)) a = -a;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
off = pat[1][lane];
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
off = pat[2][lane];
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
T det = si[sibase + (lane>>2)]*a;
det += __shfl_down_sync(tmsk, det, 4, 16); // first add
det += __shfl_down_sync(tmsk, det, 8, 16); // second add
det = __shfl_sync(tmsk, det, 0, 16); // broadcast
out[idx] = a / det;
}
}
size_t nr = 2048;
int main(int argc, char *argv[]){
if (argc > 1) nr = atoi(argv[1]);
const mt m1[] = {1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 0.0, 1.0, 0.0, 2.0, 1.0};
const mt i1[] = {-3.0, -0.5, 1.5, 1.0, 1.0, 0.25, -0.25, -0.5, 3.0, 0.25, -1.25, -0.5, -3.0, 0.0, 1.0, 1.0};
const mt m2[] = {1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0};
const mt i2[] = {1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0};
mt *h_d, *d_d;
h_d = (mt *)malloc(nr*2*16*sizeof(mt));
cudaMalloc(&d_d, nr*2*16*sizeof(mt));
cudaMemcpyToSymbol(pat, hpat, 3*16*sizeof(unsigned));
for (int i = 0; i < nr; i++){
memcpy(h_d+i*16*2, m1, sizeof(m1));
memcpy(h_d+i*16*2+16, m2, sizeof(m2));}
cudaMemcpy(d_d, h_d, nr*2*16*sizeof(mt), cudaMemcpyHostToDevice);
long long t = dtime_usec(0);
inv4x4<<<nr*2*16/block_size, block_size>>>(d_d, d_d, nr*2);
cudaDeviceSynchronize();
t = dtime_usec(t);
cudaMemcpy(h_d, d_d, nr*2*16*sizeof(mt), cudaMemcpyDeviceToHost);
for (int i = 0; i < 2; i++){
for (int j = 0; j < 16; j++) std::cout << h_d[i*16 + j] << ",";
std::cout << std::endl;
for (int j = 0; j < 16; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
std::cout << std::endl;}
std::cout << "kernel time: " << t << " microseconds" << std::endl;
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
//cublas
for (int i = 0; i < nr; i++){
memcpy(h_d+i*16*2, m1, sizeof(m1));
memcpy(h_d+i*16*2+16, m2, sizeof(m2));}
cudaMemcpy(d_d, h_d, nr*2*16*sizeof(mt), cudaMemcpyHostToDevice);
cublasHandle_t h;
cublasStatus_t cs = cublasCreate(&h);
if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas create error" << std::endl;
mt **A, **Ai, *Aid, **Ap, **Aip;
A = (mt **)malloc(nr*2*sizeof(mt *));
Ai = (mt **)malloc(nr*2*sizeof(mt *));
cudaMalloc(&Aid, nr*2*16*sizeof(mt));
cudaMalloc(&Ap, nr*2*sizeof(mt *));
cudaMalloc(&Aip, nr*2*sizeof(mt *));
for (int i = 0; i < nr*2; i++) A[i] = d_d + 16*i;
for (int i = 0; i < nr*2; i++) Ai[i] = Aid + 16*i;
cudaMemcpy(Ap, A, nr*2*sizeof(mt *), cudaMemcpyHostToDevice);
cudaMemcpy(Aip, Ai, nr*2*sizeof(mt *), cudaMemcpyHostToDevice);
int *info;
cudaMalloc(&info, nr*2*sizeof(int));
t = dtime_usec(0);
cs = cublasSmatinvBatched(h, 4, Ap, 4, Aip, 4, info, nr*2);
if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas matinv error" << std::endl;
cudaDeviceSynchronize();
t = dtime_usec(t);
cudaMemcpy(h_d, Aid, nr*2*16*sizeof(mt), cudaMemcpyDeviceToHost);
for (int i = 0; i < 2; i++){
for (int j = 0; j < 16; j++) std::cout << h_d[i*16 + j] << ",";
std::cout << std::endl;
for (int j = 0; j < 16; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
std::cout << std::endl;}
std::cout << "cublas time: " << t << " microseconds" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
return 0;
}
$ nvcc -o t411 t411.cu -lcublas
$ ./t411
-3,-0.5,1.5,1,1,0.25,-0.25,-0.5,3,0.25,-1.25,-0.5,-3,-0,1,1,
-3,-0.5,1.5,1,1,0.25,-0.25,-0.5,3,0.25,-1.25,-0.5,-3,0,1,1,
1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,
1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,
kernel time: 49 microseconds
-3,-0.5,1.5,1,1,0.25,-0.25,-0.5,3,0.25,-1.25,-0.5,-3,0,1,1,
-3,-0.5,1.5,1,1,0.25,-0.25,-0.5,3,0.25,-1.25,-0.5,-3,0,1,1,
1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,
1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,
cublas time: 95 microseconds
$
We see that the code appears to provide the correct result for 2 test matrices inverted, and the overall time to invert 4096 matrices on a Tesla P100 is about 50us and is about 2x faster than CUBLAS. Note that I have not exhaustively tested this code.
What follows next is a simple pycuda implementation of a similar function. Here, for simplicity we are just inverting 2 matrices:
$ cat t10.py
import numpy as np
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# kernel
kernel = SourceModule("""
__device__ unsigned getoff(unsigned &off){
unsigned ret = off & 0x0F;
off = off >> 4;
return ret;
}
const int block_size = 256;
const unsigned tmsk = 0xFFFFFFFF;
// in-place is acceptable i.e. out == in)
// T = float or double only
typedef float T;
__global__ void inv4x4(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){
__shared__ T si[block_size];
size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < n*16){
si[threadIdx.x] = in[idx];
unsigned lane = threadIdx.x & 15;
unsigned sibase = threadIdx.x & 0x03F0;
__syncwarp();
unsigned off = pat[lane];
T a,b;
a = si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
if (!getoff(off)) a = -a;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
off = pat[lane+16];
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
off = pat[lane+32];
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
T det = si[sibase + (lane>>2)]*a;
det += __shfl_down_sync(tmsk, det, 4, 16); // first add
det += __shfl_down_sync(tmsk, det, 8, 16); // second add
det = __shfl_sync(tmsk, det, 0, 16); // broadcast
out[idx] = a / det;
}
}
""")
# python function for inverting 4x4 matrices
# n should be an even number
def gpuinv4x4(inp, n):
# internal constants not to be modified
hpat = ( 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50, 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14, 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618)
# Convert parameters into numpy array
inpd = np.array(inp, dtype=np.float32)
hpatd = np.array(hpat, dtype=np.uint32)
output = np.empty((n*16), dtype= np.float32)
# Get kernel function
matinv4x4 = kernel.get_function("inv4x4")
# Define block, grid and compute
blockDim = (256,1,1) # do not change
gridDim = ((n/16)+1,1,1)
# Kernel function
matinv4x4 (
cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
block=blockDim, grid=gridDim)
return output
#example/test case
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 0.0, 1.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv4x4(inp, n)
print(result)
$ python t10.py
[-3. -0.5 1.5 1. 1. 0.25 -0.25 -0.5 3. 0.25 -1.25 -0.5 -3.
-0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
1. 0. 0. 0. 0. 1. ]
$
I've spent very little time creating this pycuda test case, so please consider it as a rough demonstration vehicle.
I suspect that if the only thing you need to do in CUDA is invert these matrices, this won't be an interesting or attractive use case. I expect that the cost to transfer the data to the device and return the results back would outweigh any speed-up benefit from using the GPU, vs. ordinary numpy. However I haven't tested or benchmarked a numpy case.
Note that the use of __syncwarp()
means this kernel code requires CUDA 9.0 or later.
Also note that the code expects an even number of matrices to invert. If you don't have an even number, pad your array with any value to the next even number of matrices.
Also note that the code just assumes the matrices are invertible. There is no test to see if they are not, and for example if the determinant computed were zero, the matrix would not be invertible (using this method) and the results would typically be NaN, due to division-by-zero.
It's not clear what the purpose is here, so this example should not be construed to suggest that general matrix inversion is a good idea or a proper solution method for a particular problem.
Probably a better pythonic method for inversion of dense matrices on the GPU would be to use cupy