CUDA matrix inverse (Gauss-Jordan)


It seems the problem was in your gaussjordan kernel.

When you are doing Gauss-Jordan elimination on the original (A) matrix, it is acceptable to work only on the row elements to the right of the pivot point.

But when you are applying the same row operations to the identity matrix to create the inverse (I), it's necessary to apply the equivalent row operations to every member of the row, not just those to the right of the pivot point.
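To make the distinction concrete, here is a minimal CPU sketch of one elimination step (my own illustration, not code from the question; it only mirrors the kernel's parameter names A, I, n, i): the factor P is applied across every column of I, but only to columns y >= i of A.

// CPU reference for one forward-elimination step on the pair (A, I),
// both stored row-major as n*n floats. Illustration only; assumes A[i*n+i] != 0.
void eliminate_step_cpu(float *A, float *I, int n, int i)
{
    for (int x = i + 1; x < n; ++x) {           // rows below the pivot
        float P = A[x*n + i] / A[i*n + i];      // elimination factor for this row
        for (int y = 0; y < n; ++y) {
            I[x*n + y] -= I[i*n + y] * P;       // every column of I
            if (y >= i)
                A[x*n + y] -= A[i*n + y] * P;   // only columns at/right of the pivot in A
        }
    }
}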

So if you modify your gaussjordan kernel like this:

__global__ void gaussjordan(float *A, float *I, int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    float P;

    if (x < n && y < n)
        if (x > i) {                          // limit the operation to rows below the pivot point
            P = A[x*n + i] / A[i*n + i];
            I[x*n + y] -= I[i*n + y] * P;     // apply to every row member of I
            if (y >= i) {                     // limit to row members at/right of the pivot
                A[x*n + y] -= A[i*n + y] * P; // apply only to members right of the pivot in A
            }
        }
}

I believe you'll have better results. With the above change, I was able to duplicate your expected results, within the limits of float vs. double accuracy.
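For reference, a host-side loop driving this kernel once per pivot column might look like the sketch below. This is only my own assumption about the surrounding code (device buffers d_A and d_I, a 16x16 block, and the question's remaining normalization/zeroing kernels running where indicated); it is not the questioner's actual host code.

// Hypothetical driver sketch: run one forward-elimination step per pivot.
// d_A and d_I are n*n device buffers, with d_I pre-initialized to the identity.
void run_forward_elimination(float *d_A, float *d_I, int n)
{
    dim3 block(16, 16);
    dim3 grid((n + block.x - 1) / block.x, (n + block.y - 1) / block.y);
    for (int i = 0; i < n; ++i) {
        gaussjordan<<<grid, block>>>(d_A, d_I, n, i);
        cudaDeviceSynchronize();  // optional: same-stream launches already serialize, but this eases debugging
        // ... the question's diagonal-normalization / zeroing kernels would run here ...
    }
}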
