问题
I didn't find any similar question to mine. I'm trying to write the gaussian-jordan inverse matrix algorithm. The idea of the algorithm is simple:)
I want to inverse only a lower triangular matrix. I got almost correct answer. Where did I do sth wrong? Does anyone can help me? I will appreciate it.
- d_ A lower triangular matrix (nxn)
- dI identity matrix (nxn)
n size of a matrix in one direction (n%16=0)
dim3 threadsPerBlock(n/16,n/16);
- dim3 numBlocks(16,16);
I know it is a simple implementation but at first I need it to work correctly :) Does anyone can help me or give me any hint? I will appreciate it. Thanks a lot!
there is the whole cpu code:
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
using namespace std;
__global__ void gaussjordan(float *A, float *I,int n, int i)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
float P;
if(x<n && y<n)
if(x>i)
if(y>=i){
P=A[x*n+i]/A[i*n+i];
I[x*n+y]-= I[i*n+y]*P;
A[x*n+y]-= A[i*n+y]*P;
}
__syncthreads();
}
__global__ void dev(float *d_A, float *dI, int h)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x<h && y<h)
if(d_A[x*h+x]!=0){
dI[x*h+y] /= d_A[x*h+x];
d_A[x*h+y] /= d_A[x*h+x];
}
__syncthreads();
}
void savetofile(float *A, string s, int n, int h)
{
std::ofstream plik;
plik.open(s);
for(int j=0;j<h;j++){
for(int i=0;i<h;i++){
plik<<A[j*n+i]<<"\t";}
plik<<endl;}
plik.close();
}
int main()
{
int n = 16;
// creating input
float *iL = new float [n*n];
float *L = new float [n*n];
for(int i=0;i<n;i++)
for(int j=0;j<n;j++)
if(i==j || i>j) L[i*n+j] = (i*n+j+1)*(i*n+j+1)*0.007 + (i*n+j+1)*0.01 -(i*n+j+1)*(i*n+j+1)*(i*n+j+1)*0.0005;
else L[i*n+j] = 0;
savetofile(L,"L.txt",n,n);
cout<<"inv\n";
float *d_A, *d_L,*I, *dI;
float time;
cudaError_t err;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int ddsize = n*n*sizeof(float);
dim3 threadsPerBlock(n/16,n/16);
dim3 numBlocks(16,16);
// memory allocation
err= cudaMalloc( (void**) &d_A, ddsize); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
err= cudaMalloc( (void**) &dI, ddsize); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
I = new float[n*n];
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j) I[i*n+i]=1.0;
else I[i*n+j]=0.0;}}
//copy data from GPU to CPU
err =cudaMemcpy( d_A, L, ddsize, cudaMemcpyHostToDevice); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
err =cudaMemcpy( dI, I, ddsize, cudaMemcpyHostToDevice); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
//timer start
cudaEventRecord( start, 0);
// L^(-1)
for(int i=0;i<n;i++){
gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
}
dev<<<numBlocks, threadsPerBlock>>>(d_A, dI, n);
err = cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
err = cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &time, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
savetofile(iL,"inv.txt",n,n);
savetofile(L,"I.txt",n,n);
cudaFree(d_A);
cudaFree(dI);
delete []I;
delete []L;
delete []iL;
system("Pause");
return 0;
}
there is my output:
60.6061 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-34.1233 -2.13675 -0 -0 -0 -0 -0 -0 0 0 0 0 0 0 0 0
-48.5115 1.91603 -0.0799201 -0 -0 -0 -0 -0 0 0 0 0 0 0 0 0
-49.4891 1.8697 0.0748167 -0.0196634 -0 -0 -0 -0 0 0 0 0 0 0 0 0
-49.8332 1.84732 0.0725876 0.018747 -0.00767828 -0 -0 -0 0 0 0 0 0 0 0 0
-50.0073 1.83403 0.071321 0.0182352 0.00739595 -0.00376795 -0 -0 0 0 0 0 0 0 0 0
-50.112 1.82521 0.0705011 0.0179073 0.0072164 0.00365346 -0.00212282 -0 0 0 0 0 0 0 0 0
-50.1818 1.81893 0.0699261 0.0176789 0.00709196 0.00357445 0.00206784 -0.00131234 0 0 0 0 0 0 0 0
-50.2316 1.81423 0.0695003 0.0175105 0.00700059 0.00351662 0.0020277 0.00128271 -0.00086736 -0 -0 -0 -0 -0 -0 -0
-50.2689 1.81057 0.0691722 0.0173813 0.00693062 0.00347244 0.00199711 0.00126017 0.000850006 -0.000602925 -0 -0 -0 -0 -0 -0
-50.2979 1.80765 0.0689115 0.0172789 0.0068753 0.00343758 0.00197301 0.00124245 0.000836382 0.000592093 -0.000435975 -0 -0 -0 -0 -0
-50.321 1.80527 0.0686993 0.0171957 0.00683047 0.00340937 0.00195354 0.00122815 0.000825401 0.000583374 0.000428868 -0.00032541 -0 -0 -0 -0
-50.34 1.80328 0.0685233 0.0171269 0.0067934 0.00338607 0.00193748 0.00121637 0.000816362 0.000576204 0.000423029 0.000320554 -0.000249293 -0 -0 -0
-50.3557 1.80159 0.0683749 0.0170689 0.00676223 0.0033665 0.001924 0.00120649 0.000808792 0.000570204 0.000418147 0.000316498 0.000245864 -0.000195186 -0 -0
-50.369 1.80015 0.0682481 0.0170195 0.00673566 0.00334983 0.00191253 0.00119809 0.000802358 0.000565109 0.000414005 0.000313058 0.000242958 0.000192695 -0.000155673 -0
-50.3805 1.7989 0.0681385 0.0169768 0.00671274 0.00333547 0.00190265 0.00119086 0.000796824 0.000560729 0.000410446 0.000310105 0.000240465 0.000190559 0.00015382 -0.000126146
and it should be:
60,6060606060606 4,44089209850063e-16 4,85722573273506e-17 -3,12250225675825e-17 0 1,73472347597681e-18 -1,08420217248550e-18 -7,58941520739853e-19 4,33680868994202e-19 -5,42101086242752e-19 0 -6,93889390390723e-18 0 -1,38777878078145e-17 0 1,18720137887163e-17
-34,1232841232841 -2,13675213675214 0 8,67361737988404e-18 3,03576608295941e-18 8,67361737988404e-19 -1,73472347597681e-18 1,35525271560688e-18 -8,67361737988404e-19 1,00288700954909e-18 0 0 6,93889390390723e-18 6,93889390390723e-18 -1,38777878078145e-17 3,02221355580334e-18
-17,9130271437964 1,91603268526345 -0,0799200799200800 1,30104260698261e-18 1,95156391047391e-18 -9,75781955236954e-19 1,95156391047391e-18 2,16840434497101e-19 -3,52365706057789e-19 -1,62630325872826e-19 1,38777878078145e-17 -3,46944695195361e-18 0 0 0 -2,72405795836983e-18
-2,86140643299924 0,0760191125748172 0,0748166415934231 -0,0196633632216454 -2,41234983378025e-18 7,99599102208060e-19 3,25260651745651e-19 -4,74338450462408e-19 2,67662411332359e-19 2,91379333855479e-19 -2,16840434497101e-18 -4,33680868994202e-19 1,30104260698261e-18 0 0 6,86096687275983e-20
-1,33482739506506 0,0346053236774996 0,00125734163772674 0,0187469132242915 -0,00767825058738617 5,35324822664718e-19 -2,23616698075135e-19 5,08219768352580e-20 5,92923063078010e-20 1,74488787134386e-19 -4,33680868994202e-19 4,33680868994202e-19 -2,16840434497101e-19 2,16840434497101e-19 0 -1,19008129089229e-19
-0,793561224702690 0,0203250367373064 0,000727127971238783 0,000177630032830862 0,00739591005669882 -0,00376795430225022 4,98055372985529e-19 -3,84552958053452e-19 3,20178454062126e-19 -1,35525271560688e-19 6,50521303491303e-19 -1,08420217248550e-19 1,08420217248550e-19 -2,16840434497101e-19 0 -7,15742840429884e-20
-0,532255026297144 0,0135340901236068 0,000479383336751935 0,000115847127348313 4,51920594555328e-05 0,00365346070706817 -0,00212282675610843 1,37219337455197e-19 -5,14996031930615e-19 3,30342849429177e-19 0 -2,71050543121376e-19 1,08420217248550e-19 0 0 5,08219768352580e-20
-0,384130052448431 0,00972113086608457 0,000342250536212794 8,21235560483452e-05 3,18129608485860e-05 1,56232096436654e-05 0,00206784220009096 -0,00131233595800525 6,39509875176997e-20 -3,37542629480839e-19 -1,08420217248550e-19 2,16840434497101e-19 0 0 0 -8,47032947254300e-22
-0,291692030052418 0,00735419164507677 0,000257375648850429 6,15185225200113e-05 2,37495210052671e-05 1,16038017329438e-05 6,53368676878396e-06 0,00128271813402154 -0,000867362869930264 1,77876918923403e-19 1,62630325872826e-19 -1,89735380184963e-19 1,62630325872826e-19 0 0 -9,07384044746169e-20
-0,229596895430646 0,00578230937666655 0,000201707743336976 4,79768824589291e-05 1,84020572663637e-05 8,96002707181433e-06 5,05525466995835e-06 3,12009781742606e-06 0,000850011219708818 -0,000602925394011745 0 2,71050543121376e-20 -8,13151629364128e-20 5,42101086242752e-20 -5,42101086242752e-20 7,73976355553617e-20
-0,185720949479909 0,00466765632076680 0,000162419592307734 3,85318721641536e-05 1,47407053519860e-05 7,17308297585328e-06 4,02178178072207e-06 2,48428717850195e-06 1,64547815065802e-06 0,000592092919336558 -0,000435974905284452 0 0 8,13151629364128e-20 -1,08420217248550e-19 2,64697796016969e-20
-0,153867987373140 0,00385473267086607 0,000133863548213241 3,17506489004575e-05 1,20962229586152e-05 5,86799087221288e-06 3,28276799988068e-06 2,02338706451671e-06 1,33735029942045e-06 9,34275734555363e-07 0,000428867197061432 -0,000325409609345764 0 2,71050543121376e-20 0 -1,09055491958991e-20
-0,129703518509601 0,00324211947468978 0,000112403568308126 2,65969300905272e-05 1,01402805713936e-05 4,89779294849866e-06 2,73496124917826e-06 1,68586638861081e-06 1,11012300345236e-06 7,73556738632873e-07 5,60933254708493e-07 0,000320553621268105 -0,000249293253625970 5,42101086242752e-20 0 -1,01114558078482e-20
-0,110691345431593 0,00276839969825208 9,59884298624889e-05 2,25961759289096e-05 8,63052307521336e-06 4,15554692230644e-06 2,31688356971108e-06 1,42511604039733e-06 9,39229137057347e-07 6,51934526276135e-07 4,72019315851685e-07 3,53897320062806e-07 0,000245863313382516 -0,000195185934120844 0 -1,24407964127975e-20
-0,0958269169656213 0,00239699666599593 8,28626202960276e-05 1,95227026042985e-05 7,41637441475814e-06 3,57424367962823e-06 1,99334817579930e-06 1,21993241781196e-06 8,05577604288488e-07 5,57554928001086e-07 4,03155267486669e-07 3,01723475812485e-07 2,31838854154289e-07 0,000192695260333710 -0,000155673036807333 -2,34522247271034e-20
-0,0838002301027703 0,00209415237243389 7,23249901251223e-05 1,70229067498473e-05 6,46008752692950e-06 3,11455737751181e-06 1,73159030599080e-06 1,06073213436631e-06 6,96842172109705e-07 4,82764206408816e-07 3,49217230232344e-07 2,60145440758586e-07 2,00286821017368e-07 1,56906945950947e-07 0,000153820426928509 -0,000126146355001072
回答1:
It seems the problem was in your gaussjordan
kernel.
When you are doing gauss-jordan elimination on the original (L
) matrix, it is acceptable to work only on the row elements to the right of the pivot point.
But when you are applying the same row operations to the identity matrix to create the inverse (I
), it's necessary to apply the equivalent row operations to every member of the row, not just those to the right of the pivot point.
So if you modify your gaussjordan
kernel like this:
__global__ void gaussjordan(float *A, float *I,int n, int i)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
float P;
if(x<n && y<n)
if(x>i){ // this limits operation to rows below the pivot point
P=A[x*n+i]/A[i*n+i];
I[x*n+y] -= I[i*n+y]*P; // apply for every row member
if(y>=i){ //limits to row members to the right of the pivot
A[x*n+y] -= A[i*n+y]*P; // apply only to members right of pivot
}
}
}
I believe you'll have better results. With the above changes, I was able to duplicate your expected results within the accuracy of float
vs. double
, I believe.
来源:https://stackoverflow.com/questions/22041477/cuda-matrix-inverse-gaussian-jordan