Adapt existing code and Kernel code to perform a high number of 3x3 matrix inversion

后端 未结 1 695
伪装坚强ぢ
伪装坚强ぢ 2020-12-22 13:19

Following a previous question ( Performing high number of 4x4 matrix inversion - PyCuda ), considering the inversion of 4x4 matrix, I would like to do the same but with 3x3

相关标签:
1条回答
  • 2020-12-22 13:37

    This answer will closely follow my answer on the 4x4 invert question, both in terms of answer layout and calculation method/kernel design. The formulas are described here.

    First, as before, we will show a CUDA C++ version with comparison to cublas:

    $ cat t432.cu
    #include <iostream>
    #include <cublas_v2.h>
    #include <cstdlib>
    // 3x3 matrix inversion
    // https://stackoverflow.com/questions/1148309/inverting-a-4x4-matrix
    // https://ardoris.wordpress.com/2008/07/18/general-formula-for-the-inverse-of-a-3x3-matrix/
    // 9 threads per matrix to invert
    // 32 matrices per 288 thread block
    
    const unsigned block_size = 288;
    typedef double mt;
    
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    #include <time.h>
    #include <sys/time.h>
    #define USECPSEC 1000000ULL
    
    long long dtime_usec(unsigned long long start){
    
      timeval tv;
      gettimeofday(&tv, 0);
      return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
    }
    
    __device__ unsigned pat[9];
    const unsigned hpat[9] = {0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473, 0x07061, 0x03140};
    
    
    __device__ unsigned getoff(unsigned &off){
      unsigned ret = off & 0x0F;
      off >>= 4;
      return ret;
    }
    
    // in-place is acceptable i.e. out == in)
    // T = float or double only
    template <typename T>
    __global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n){
    
      __shared__ T si[block_size];
      size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
      T det = 1;
      if (idx < n*9)
        det = in[idx];
      unsigned sibase = (threadIdx.x / 9)*9;
      unsigned lane = threadIdx.x - sibase; // cheaper modulo
      si[threadIdx.x] = det;
      __syncthreads();
      unsigned off = pat[lane];
      T a  = si[sibase + getoff(off)];
      a   *= si[sibase + getoff(off)];
      T b  = si[sibase + getoff(off)];
      b   *= si[sibase + getoff(off)];
      a -= b;
      __syncthreads();
      if (lane == 0) si[sibase+3] = a;
      if (lane == 3) si[sibase+4] = a;
      if (lane == 6) si[sibase+5] = a;
      __syncthreads();
      det =  si[sibase]*si[sibase+3]+si[sibase+1]*si[sibase+4]+si[sibase+2]*si[sibase+5];
      if (idx < n*9)
        out[idx] = a / det;
    }
    
    size_t nr = 2048;
    int main(int argc, char *argv[]){
      if (argc > 1) nr = atoi(argv[1]);
    
    
      const mt m2[] = {1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0};
      const mt i2[] = {2.0, 0.0, -1.0, -1.0, -0.33333334, 1.0, 0.0, 0.33333334,  0.0};
      const mt m1[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0};
      const mt i1[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0};
    
      mt *h_d, *d_d;
      h_d = (mt *)malloc(nr*9*sizeof(mt));
      cudaMalloc(&d_d, nr*9*sizeof(mt));
      cudaMemcpyToSymbol(pat, hpat, 9*sizeof(unsigned));
      for (int i = 0; i < nr/2; i++){
        memcpy(h_d+i*2*9, m1, sizeof(m1));
        memcpy(h_d+i*2*9+9, m2, sizeof(m2));}
      cudaMemcpy(d_d, h_d, nr*9*sizeof(mt), cudaMemcpyHostToDevice);
      long long t = dtime_usec(0);
      inv3x3<<<((nr*9)/block_size)+1, block_size>>>(d_d, d_d, nr);
      cudaDeviceSynchronize();
      t = dtime_usec(t);
      cudaMemcpy(h_d, d_d, nr*9*sizeof(mt), cudaMemcpyDeviceToHost);
      for (int i = 0; i < 2; i++){
        for (int j = 0; j < 9; j++) std::cout << h_d[i*9 + j] << ",";
        std::cout << std::endl;
        for (int j = 0; j < 9; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
        std::cout << std::endl;}
      std::cout << "kernel time: " << t << " microseconds" << std::endl;
      cudaError_t err = cudaGetLastError();
      if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
      //cublas
      for (int i = 0; i < nr/2; i++){
        memcpy(h_d+i*2*9, m1, sizeof(m1));
        memcpy(h_d+i*2*9+9, m2, sizeof(m2));}
      cudaMemcpy(d_d, h_d, nr*9*sizeof(mt), cudaMemcpyHostToDevice);
      cublasHandle_t h;
      cublasStatus_t cs = cublasCreate(&h);
      if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas create error" << std::endl;
      mt **A, **Ai, *Aid, **Ap, **Aip;
      A  = (mt **)malloc(nr*sizeof(mt *));
      Ai = (mt **)malloc(nr*sizeof(mt *));
      cudaMalloc(&Aid, nr*9*sizeof(mt));
      cudaMalloc(&Ap,  nr*sizeof(mt *));
      cudaMalloc(&Aip, nr*sizeof(mt *));
      for (int i = 0; i < nr; i++) A[i]  =  d_d + 9*i;
      for (int i = 0; i < nr; i++) Ai[i] =  Aid + 9*i;
      cudaMemcpy(Ap, A, nr*sizeof(mt *), cudaMemcpyHostToDevice);
      cudaMemcpy(Aip, Ai, nr*sizeof(mt *), cudaMemcpyHostToDevice);
      int *info;
      cudaMalloc(&info, nr*sizeof(int));
      t = dtime_usec(0);
      cs = cublasDmatinvBatched(h, 3,  Ap, 3, Aip, 3, info, nr);
      if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas matinv error" << std::endl;
      cudaDeviceSynchronize();
      t = dtime_usec(t);
      cudaMemcpy(h_d, Aid, nr*9*sizeof(mt), cudaMemcpyDeviceToHost);
      for (int i = 0; i < 2; i++){
        for (int j = 0; j < 9; j++) std::cout << h_d[i*9 + j] << ",";
        std::cout << std::endl;
        for (int j = 0; j < 9; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
        std::cout << std::endl;}
      std::cout << "cublas time: " << t << " microseconds" << std::endl;
      err = cudaGetLastError();
      if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
      return 0;
    }
    $ nvcc -o t432 t432.cu -lcublas
    $ ./t432
    1,0,0,0,1,0,0,0,1,
    1,0,0,0,1,0,0,0,1,
    2,-0,-1,-1,-0.333333,1,-0,0.333333,-0,
    2,0,-1,-1,-0.333333,1,0,0.333333,0,
    kernel time: 59 microseconds
    1,0,0,0,1,0,0,0,1,
    1,0,0,0,1,0,0,0,1,
    2,0,-1,-1,-0.333333,1,0,0.333333,0,
    2,0,-1,-1,-0.333333,1,0,0.333333,0,
    cublas time: 68 microseconds
    $
    

    So this is perhaps slightly faster than cublas but not much, for this 2048 matrix test case, CUDA 10.0, Tesla P100, linux.

    Similar to the previous answer, here is a simplified (only 2 matrices) pycuda test case:

    $ cat t14.py
    import numpy as np
    # import matplotlib.pyplot as plt
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    import pycuda.autoinit
    # kernel
    kernel = SourceModule("""
    
    __device__ unsigned getoff(unsigned &off){
      unsigned ret = off & 0x0F;
      off >>= 4;
      return ret;
    }
    
    // in-place is acceptable i.e. out == in)
    // T = float or double only
    const int block_size = 288;
    typedef double T; // *** can set to float or double
    __global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){
    
      __shared__ T si[block_size];
      size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
      T det = 1;
      if (idx < n*9)
        det = in[idx];
      unsigned sibase = (threadIdx.x / 9)*9;
      unsigned lane = threadIdx.x - sibase; // cheaper modulo
      si[threadIdx.x] = det;
      __syncthreads();
      unsigned off = pat[lane];
      T a  = si[sibase + getoff(off)];
      a   *= si[sibase + getoff(off)];
      T b  = si[sibase + getoff(off)];
      b   *= si[sibase + getoff(off)];
      a -= b;
      __syncthreads();
      if (lane == 0) si[sibase+3] = a;
      if (lane == 3) si[sibase+4] = a;
      if (lane == 6) si[sibase+5] = a;
      __syncthreads();
      det =  si[sibase]*si[sibase+3]+si[sibase+1]*si[sibase+4]+si[sibase+2]*si[sibase+5];
      if (idx < n*9)
        out[idx] = a / det;
    }
    
    """)
    # host code
    def gpuinv3x3(inp, n):
        # internal constants not to be modified
        hpat = (0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473, 0x07061, 0x03140)
        # Convert parameters into numpy array
        # *** change next line between float32 and float64 to match float or double
        inpd = np.array(inp, dtype=np.float64)
        hpatd = np.array(hpat, dtype=np.uint32)
        # *** change next line between float32 and float64 to match float or double
        output = np.empty((n*9), dtype= np.float64)
        # Get kernel function
        matinv3x3 = kernel.get_function("inv3x3")
        # Define block, grid and compute
        blockDim = (288,1,1) # do not change
        gridDim = ((n/32)+1,1,1)
        # Kernel function
        matinv3x3 (
            cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
            block=blockDim, grid=gridDim)
        return output
    inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
    n = 2
    result = gpuinv3x3(inp, n)
    print(result.reshape(2,3,3))
    $ python t14.py
    [[[ 2.         -0.         -1.        ]
      [-1.         -0.33333333  1.        ]
      [-0.          0.33333333 -0.        ]]
    
     [[ 1.          0.          0.        ]
      [ 0.          1.          0.        ]
      [ 0.          0.          1.        ]]]
    $
    

    The above happens to be using double i.e. float64 in pycuda. Changing it to float i.e. float32 in pycuda involves changing the same 3 lines as described in this answer.

    0 讨论(0)
提交回复
热议问题