问题
This is a follow-up to another question with the same title (I made a major edit to that one, but I was told it should be a separate question, and I can't think of a different title).
I am using Intel's MKL LAPACKE and CBLAS to calculate
yn = trans(a)*inv(zt)*a + trans(b)*inv(zl)*b
where `a` and `b` are m-by-n real matrices, and `zt` and `zl` are m-by-m complex matrices. The resulting complex matrix `yn` is n-by-n.
Here is how I am doing it:
zt <- inv(zt)
zl <- inv(zl)
c <- zt*a
yn <- trans(a)*c
c <- zl*b
yn <- trans(b)*c + yn
The C code:
#include <math.h>
#include <complex.h>
#include <stdlib.h>
#include <stdio.h>
#include <mkl_types.h>
#define MKL_Complex16 _Complex double //overwrite type
#include <mkl.h>
#include <mkl_lapacke.h>
/*
 * Dump an m-by-n complex matrix to fp as comma-separated text.
 *
 * The matrix is read in row-major order with a row stride of lda elements.
 * Every entry is written as "(re+imj)" with six decimals (the sign of the
 * imaginary part is always printed), entries on a row are separated by ","
 * and each row ends with a newline.
 *
 * Always returns 0.
 */
int print_zmatrix_file(int m, int n, _Complex double* a, int lda, FILE* fp)
{
    for (int row = 0; row < m; ++row) {
        for (int col = 0; col < n; ++col) {
            const _Complex double entry = a[row * lda + col];

            /* Separator goes between entries, never after the last one. */
            if (col > 0) {
                fputs(",", fp);
            }
            fprintf(fp, "(%.6f%+.6fj)", creal(entry), cimag(entry));
        }
        fputc('\n', fp);
    }
    return 0;
}
/*
 * Computes yn = trans(a)*inv(zt)*a + trans(b)*inv(zl)*b with row-major
 * LAPACKE/CBLAS calls.
 *
 *   yn      output buffer, written as an n-by-n complex matrix (ld = n)
 *   a, b    m-by-n real matrices (ld = n)
 *   zl, zt  m-by-m complex matrices; both are overwritten, since the
 *           zgetrf/zgetri calls below operate on them in place
 *   m, n    matrix dimensions
 *
 * Side effects: writes the two intermediate products to "c1.csv" and
 * "c2.csv" in the current directory.
 *
 * Always returns 0. The results of malloc, fopen and the LAPACKE calls are
 * not checked.
 *
 * NOTE(review): a and b are `double*` but are passed directly as matrix
 * operands to cblas_zgemm. zgemm presumably interprets every operand as
 * complex double, in which case it reads these buffers as if they held
 * m*n complex values (twice their real size) -- confirm against the CBLAS
 * reference.
 * NOTE(review): alpha and beta are plain `double` objects passed by address;
 * zgemm presumably expects pointers to complex scalars -- confirm.
 */
int calc_yn(
_Complex double* yn, double* a, double *b, _Complex double* zl,
_Complex double* zt, int m, int n)
{
// Pivot indices shared by both factorizations.
lapack_int* ipiv = (MKL_INT*) malloc(sizeof(lapack_int)*m);
// zt <- inv(zt): factorize (zgetrf), then invert from the factors (zgetri).
LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, zt, m, ipiv);
LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, zt, m, ipiv);
// zl <- inv(zl), same two-step procedure, reusing ipiv.
LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, zl, m, ipiv);
LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, zl, m, ipiv);
free(ipiv);
// Scalars for the GEMM calls (see the NOTE(review) above about their type).
const double alpha = 1.0;
const double beta = 0.0;
// Scratch m-by-n complex matrix, reused for both intermediate products.
lapack_complex_double* c = (lapack_complex_double*) malloc(
sizeof(lapack_complex_double)*(m*n));
// c <- zt*a
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, m,
&alpha, zt, m, a, n,
&beta, c, n);
// Dump the first intermediate product for comparison.
FILE* fp = fopen("c1.csv", "w");
print_zmatrix_file(m, n, c, n, fp);
fclose(fp);
// yn <- aT*c
cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
n, n, m,
&alpha, a, n, c, n,
&beta, yn, n);
// c <- zl*b
cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, m,
&alpha, zl, m, b, n,
&beta, c, n);
// Dump the second intermediate product for comparison.
FILE* fp2 = fopen("c2.csv", "w");
print_zmatrix_file(m, n, c, n, fp2);
fclose(fp2);
// yn <- bT*c + yn
// (&alpha is passed in the beta position so the previous yn is accumulated)
cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
n, n, m,
&alpha, b, n, c, n,
&alpha, yn, n);
free(c);
return 0;
}
/*
 * Test driver: builds the 2-by-3 real matrices a and b and the 2-by-2
 * complex matrices zt and zl, runs calc_yn on them and writes the resulting
 * 3-by-3 matrix to "yn.csv".
 */
int main()
{
    int m = 2;
    int n = 3;

    /* Real m-by-n inputs, stored row by row. */
    double a[] = {
        0.5, 0.0, 0.5,
        0.5, 0.5, 0.0
    };
    double b[] = {
        1.0,  0.0, -1.0,
        1.0, -1.0,  0.0
    };

    /* Complex m-by-m inputs, stored row by row. */
    _Complex double zt[] = {
        (0.004 + 0.09*I),  (-0.004 - 0.12*I),
        (-0.004 - 0.12*I), (0.005 + 0.11*I)
    };
    _Complex double zl[] = {
        (0.1 + 2.13*I),       (-124.004 - 800.12*I),
        (-124.004 - 800.12*I), (0.4 + 4.08*I)
    };

    /* Output buffer for the n-by-n result. */
    _Complex double *yn = malloc((n * n) * sizeof *yn);

    calc_yn(yn, a, b, zl, zt, m, n);

    FILE *out = fopen("yn.csv", "w");
    print_zmatrix_file(n, n, yn, n, out);
    fclose(out);

    free(yn);
    return 0;
}
// compile command (MKLROOT is defined by a bash script that is shipped together with intel's MKL):
//gcc -std=c11 -DMKL_ILP64 -m64 -g -o test.a test.c -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl
The code in the previous question had an error in the malloc for yn (it used sizeof(_Complex double*) instead of sizeof(_Complex double)). With that error corrected, the code compiles and runs successfully. After running it, I compared the results with the ones I get from SciPy. They do not agree.
# Reference computation of yn = a.T @ inv(zt) @ a + b.T @ inv(zl) @ b with
# NumPy/SciPy, followed by a comparison against the CSV files that the C
# program writes (yn.csv, c1.csv, c2.csv must exist in the working directory).
import numpy
from scipy import linalg

# Real 2-by-3 inputs.
a = numpy.array([
    [0.5, 0.0, 0.5],
    [0.5, 0.5, 0.0],
])
b = numpy.array([
    [1.0, 0.0, -1.0],
    [1.0, -1.0, 0.0],
])

# Complex 2-by-2 inputs.
zt = numpy.array([
    [0.004 + 0.09j, -0.004 - 0.12j],
    [-0.004 - 0.12j, 0.005 + 0.11j],
])
# NOTE(review): the off-diagonal real part is +124.004 here, whereas the C
# program initialises zl with -124.004 - 800.12i.
zl = numpy.array([
    [0.1 + 2.13j, 124.004 - 800.12j],
    [124.004 - 800.12j, 0.4 + 4.08j],
])

# Same evaluation order as the C code: invert, multiply from the right by the
# real matrix, then multiply from the left by its transpose.
c1 = linalg.inv(zt) @ a
m1 = a.T @ c1
c2 = linalg.inv(zl) @ b
m2 = b.T @ c2
yn = m1 + m2

# Load what the C program produced.
yn_file, c1_file, c2_file = (
    numpy.genfromtxt(name, delimiter=',', dtype=numpy.complex128)
    for name in ('yn.csv', 'c1.csv', 'c2.csv')
)

# Largest magnitudes, SciPy vs. C output (values observed by the author).
numpy.abs(yn).max()       # 0.004958820819049211
numpy.abs(yn_file).max()  # 60.4590237745794
numpy.abs(c1).max()       # 25.549314567403204
numpy.abs(c1_file).max()  # 41.278805716697306
numpy.abs(c2).max()       # 0.0012411403762584482
numpy.abs(c2_file).max()  # 0.03292682468747935
There is something wrong in either my C code or the Python one. Why am I getting different results?
Edit: further testing, as per @Bwebb's suggestion. He noticed a copy-paste error where -124.004 - 800.12i appears as +124.004 - 800.12i in the Python code. Correcting that does not change the results.
To make it easier to test, I used the matrices:
a = numpy.array([[1.0, 0.0],
[0.0, 1.0]])
b = numpy.array([[0.0, -1.0],
[-1.0, 0.0]])
zt = a
zl = b
Which results in
yn = [[1.0, -1.0]
[-1.0, 1.0]]
The Python code gives that result, but the C one gives
yn = [[0.0 + 2.0j, 1.0 + 2.0j]
[-1.0 + 2.0j, 0.0 + 0.0j]]
That makes me conclude that the C code is wrong, but I do not know where.
回答1:
From the code posted in your question:
Python:
zl = numpy.array([[0.1 + 2.13j, 124.004 - 800.12j], ## <==HERE
[124.004 - 800.12j, 0.4 + 4.08j]]) ## <==HERE ALSO
C:
_Complex double zl[] = {
(0.1 + 2.13*I), (-124.004 - 800.12*I), // <==HERE
(-124.004 - 800.12*I), (0.4 + 4.08*I) // <== HERE ALSO
I noticed that one is -124.004 - 800.12i and the other is 124.004 - 800.12i. I'm not sure which one you are trying to use, but set them both to the same value and see if the results are still different. If they are still different, set them both to a unit-testable value for which you know the outcome (a=[1 0 0; 0 1 0; 0 0 1] or something easy to compute). That will tell you which one (or both) is incorrect.
回答2:
That behavior (most likely undefined) is caused by passing a double array to zgemm instead of a _Complex double array. When I change the matrices a and b to be complex, I get the expected result.
Here is the fixed C code for testing:
#include <math.h>
#include <complex.h>
#include <stdlib.h>
#include <stdio.h>
#include <mkl_types.h>
#define MKL_Complex16 _Complex double //overwrite type
#include <mkl.h>
#include <mkl_lapacke.h>
/*
 * Write an m-by-n complex matrix to fp as comma-separated text.
 *
 * The matrix is read in row-major order with a row stride of lda elements.
 * Every entry is written as "(re+imj)" with six decimals, entries on a row
 * are separated by "," and each row ends with a newline.
 *
 * Returns 0 on success and -1 if fp is NULL (e.g. an unchecked fopen
 * failure), if a is NULL while there is something to print, if lda is
 * smaller than the row length, or if writing to fp fails.
 */
int print_zmatrix_file(int m, int n, _Complex double* a, int lda, FILE* fp)
{
    if (fp == NULL) {
        return -1;
    }
    if (m > 0 && n > 0 && (a == NULL || lda < n)) {
        return -1;
    }

    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            /* Index in size_t so large matrices cannot overflow int. */
            const _Complex double z = a[(size_t)i * (size_t)lda + (size_t)j];

            if (fprintf(fp, "%s(%.6f%+.6fj)", (j > 0) ? "," : "",
                        creal(z), cimag(z)) < 0) {
                return -1;
            }
        }
        if (fputc('\n', fp) == EOF) {
            return -1;
        }
    }
    return 0;
}
/*
 * Compute yn = trans(a)*inv(zt)*a + trans(b)*inv(zl)*b.
 *
 * All matrices are stored row-major and contiguous.
 *
 *   yn      output, n-by-n complex matrix (overwritten)
 *   a, b    m-by-n complex matrices (real data must be stored with a zero
 *           imaginary part, because zgemm treats every operand as complex)
 *   zl, zt  m-by-m complex matrices; both are overwritten with their
 *           inverses
 *   m, n    matrix dimensions, both must be positive
 *
 * Returns  0 on success,
 *         -1 on invalid dimensions or allocation failure,
 *         -2 if zt could not be factorized/inverted (e.g. it is singular),
 *         -3 if zl could not be factorized/inverted.
 * On failure the contents of yn, zt and zl are unspecified.
 */
int calc_yn(
    _Complex double* yn, _Complex double* a, _Complex double *b,
    _Complex double* zl, _Complex double* zt, int m, int n)
{
    /*
     * zgemm reads alpha and beta through untyped pointers as complex
     * doubles, so they must be complex objects. Passing the address of a
     * plain double makes the routine read past the end of that object.
     */
    const _Complex double one = 1.0;
    const _Complex double zero = 0.0;

    int rc = -1;
    lapack_int *ipiv = NULL;
    lapack_complex_double *c = NULL;

    if (m <= 0 || n <= 0) {
        return -1;
    }

    ipiv = malloc((size_t)m * sizeof *ipiv);
    c = malloc((size_t)m * (size_t)n * sizeof *c);
    if (ipiv == NULL || c == NULL) {
        goto cleanup;
    }

    /* zt <- inv(zt): LU factorization followed by inversion from the factors. */
    if (LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, zt, m, ipiv) != 0 ||
        LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, zt, m, ipiv) != 0) {
        rc = -2;
        goto cleanup;
    }

    /* zl <- inv(zl) */
    if (LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, zl, m, ipiv) != 0 ||
        LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, zl, m, ipiv) != 0) {
        rc = -3;
        goto cleanup;
    }

    /* c <- zt*a */
    cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, m,
                &one, zt, m, a, n,
                &zero, c, n);

    /* yn <- aT*c */
    cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                n, n, m,
                &one, a, n, c, n,
                &zero, yn, n);

    /* c <- zl*b */
    cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, m,
                &one, zl, m, b, n,
                &zero, c, n);

    /* yn <- bT*c + yn (beta = 1 accumulates onto the first term) */
    cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                n, n, m,
                &one, b, n, c, n,
                &one, yn, n);

    rc = 0;

cleanup:
    free(c);
    free(ipiv);
    return rc;
}
/*
 * Small self-check: with a = zt = identity and b = zl = the negated exchange
 * matrix, trans(a)*inv(zt)*a + trans(b)*inv(zl)*b equals
 *     [[ 1, -1],
 *      [-1,  1]]
 * The result is written to "yn.csv".
 *
 * Returns EXIT_SUCCESS on success and EXIT_FAILURE if allocation, the
 * computation or the file output fails.
 */
int main(void)
{
    int m = 2;
    int n = 2;
    int status = EXIT_FAILURE;

    _Complex double a[] = {
        1.0, 0.0,
        0.0, 1.0
    };
    _Complex double b[] = {
        0.0, -1.0,
        -1.0, 0.0
    };
    _Complex double zt[] = {
        1.0, 0.0,
        0.0, 1.0
    };
    _Complex double zl[] = {
        0.0, -1.0,
        -1.0, 0.0
    };

    _Complex double *yn = malloc((size_t)(n * n) * sizeof *yn);
    if (yn == NULL) {
        fprintf(stderr, "out of memory\n");
        return EXIT_FAILURE;
    }

    if (calc_yn(yn, a, b, zl, zt, m, n) != 0) {
        fprintf(stderr, "calc_yn failed\n");
        goto done;
    }

    FILE *fp = fopen("yn.csv", "w");
    if (fp == NULL) {
        perror("yn.csv");
        goto done;
    }
    if (print_zmatrix_file(n, n, yn, n, fp) != 0) {
        fprintf(stderr, "failed to write yn.csv\n");
        fclose(fp);
        goto done;
    }
    /* fclose flushes buffered output, so its result matters for a written file. */
    if (fclose(fp) != 0) {
        perror("yn.csv");
        goto done;
    }

    status = EXIT_SUCCESS;

done:
    free(yn);
    return status;
}
// compile command (MKLROOT is defined by a bash script that is shipped together with intel's MKL):
//gcc -std=c11 -DMKL_ILP64 -m64 -g -o test.a test.c -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl
来源:https://stackoverflow.com/questions/53090142/proper-way-to-calculate-transainvba-with-intel-mkl-follow-up-question