Proper way to calculate `trans(a)*inv(b)*a` with Intel MKL (follow-up question)

问题

This is a follow-up to another question with the same title (I made a major edit to that one, but I was told it should be a separate question, and I can't think of another title).

I am using Intel's MKL LAPACKE and CBLAS to calculate

yn = trans(a)*inv(zt)*a + trans(b)*inv(zl)*b

Where a and b are m-by-n real matrices, zt and zl are m-by-m complex matrices. The resulting complex matrix yn is n-by-n.

Here is how I am doing it:

zt <- inv(zt)
zl <- inv(zl)
c <- zt*a
yn <- trans(a)*c
c <- zl*b
yn <- trans(b)*c + yn

The C code:

#include <math.h>
#include <complex.h>
#include <stdlib.h>
#include <stdio.h>
#include <mkl_types.h>
#define MKL_Complex16 _Complex double //overwrite type
#include <mkl.h>
#include <mkl_lapacke.h>

/*
 * Write the m-by-n complex matrix `a` to `fp` as comma-separated text.
 *
 * The matrix is read in row-major order with a row stride of `lda` elements.
 * Each element is printed as "(re+imj)" with six decimals, which is the
 * notation Python's complex() accepts; every row ends with a newline.
 *
 * Returns 0 on success and -1 if `fp` is NULL, if `a` is NULL while there
 * are elements to print, or if a write to `fp` fails.
 */
int print_zmatrix_file(int m, int n, _Complex double* a, int lda, FILE* fp)
{
    /* Callers pass fopen() results straight in, so a NULL stream must not crash. */
    if (fp == NULL || (a == NULL && m > 0 && n > 0)) {
        return -1;
    }

    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            const _Complex double z = a[i*lda + j];
            /* No separator after the last column of a row. */
            const char *sep = (j < n - 1) ? "," : "";
            if (fprintf(fp, "(%.6f%+.6fj)%s", creal(z), cimag(z), sep) < 0) {
                return -1;
            }
        }
        if (fputc('\n', fp) == EOF) {
            return -1;
        }
    }
    return 0;
}

// Intended to compute yn = trans(a)*inv(zt)*a + trans(b)*inv(zl)*b, where
// a and b are m-by-n and zt, zl are m-by-m, all stored row-major.
// zt and zl are overwritten in place by the LU factorisation/inversion calls.
// As a debugging aid the intermediate product c is dumped to "c1.csv"
// (after zt*a) and "c2.csv" (after zl*b).
// Always returns 0; the results of malloc, fopen and the LAPACKE calls are
// not checked.
//
// NOTE(review): a and b are real (double) arrays but are passed to
// cblas_zgemm, the complex double-precision routine. The answer to this
// question attributes the wrong results to exactly this mismatch.
int calc_yn(
    _Complex double* yn, double* a, double *b, _Complex double* zl,
    _Complex double* zt, int m, int n)
{
    // One pivot array is shared by both factorisations.
    lapack_int* ipiv = (MKL_INT*) malloc(sizeof(lapack_int)*m);
    // zt <- inv(zt): LU factorise, then invert from the factors.
    LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, zt, m, ipiv);
    LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, zt, m, ipiv);
    // zl <- inv(zl)
    LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, zl, m, ipiv);
    LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, zl, m, ipiv);
    free(ipiv);
    // NOTE(review): alpha and beta are plain doubles whose addresses are
    // handed to zgemm; presumably zgemm reads complex scalars through these
    // pointers -- confirm against the MKL cblas_zgemm documentation.
    const double alpha = 1.0;
    const double beta = 0.0;
    // Scratch m-by-n complex matrix reused for both intermediate products.
    lapack_complex_double* c = (lapack_complex_double*) malloc(
        sizeof(lapack_complex_double)*(m*n));
    // c <- zt*a
    cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, m,
                &alpha, zt, m, a, n,
                &beta, c, n);
    FILE* fp = fopen("c1.csv", "w");
    print_zmatrix_file(m, n, c, n, fp);
    fclose(fp);
    // yn <- aT*c
    cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                n, n, m,
                &alpha, a, n, c, n,
                &beta, yn, n);
    // c <- zl*b
    cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, m,
                &alpha, zl, m, b, n,
                &beta, c, n);
    FILE* fp2 = fopen("c2.csv", "w");
    print_zmatrix_file(m, n, c, n, fp2);
    fclose(fp2);
    // yn <- bT*c + yn
    // (&alpha is passed in the beta position so the product is added to yn)
    cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                n, n, m,
                &alpha, b, n, c, n,
                &alpha, yn, n);
    free(c);
    return 0;
}

/*
 * Test driver: builds the 2-by-3 real matrices a and b and the 2-by-2
 * complex matrices zt and zl, runs calc_yn on them, and writes the
 * resulting 3-by-3 matrix to "yn.csv".
 *
 * Returns 0 on success and EXIT_FAILURE if allocation, the computation
 * or writing the output file fails.
 */
int main(void)
{
    int m = 2;
    int n = 3;
    double a[] = {
        0.5, 0.0, 0.5,
        0.5, 0.5, 0.0
    };
    double b[] = {
        1.0, 0.0, -1.0,
        1.0, -1.0, 0.0
    };
    _Complex double zt[] = {
        (0.004 + 0.09*I), (-0.004 - 0.12*I),
        (-0.004 - 0.12*I), (0.005 + 0.11*I)
    };
    _Complex double zl[] = {
        (0.1 + 2.13*I), (-124.004 - 800.12*I),
        (-124.004 - 800.12*I), (0.4 + 4.08*I)
    };

    _Complex double *yn = malloc(sizeof *yn * (size_t)n * (size_t)n);
    if (yn == NULL) {
        fprintf(stderr, "out of memory\n");
        return EXIT_FAILURE;
    }

    /* calc_yn overwrites zt and zl with their inverses. */
    if (calc_yn(yn, a, b, zl, zt, m, n) != 0) {
        fprintf(stderr, "calc_yn failed\n");
        free(yn);
        return EXIT_FAILURE;
    }

    FILE *fp = fopen("yn.csv", "w");
    if (fp == NULL) {
        perror("yn.csv");
        free(yn);
        return EXIT_FAILURE;
    }
    int status = print_zmatrix_file(n, n, yn, n, fp) == 0 ? 0 : EXIT_FAILURE;
    /* fclose flushes buffered output, so its result matters for a written file. */
    if (fclose(fp) != 0) {
        perror("yn.csv");
        status = EXIT_FAILURE;
    }
    free(yn);
    return status;
}
// compile command (MKLROOT is defined by a bash script that is shipped together with intel's MKL):
//gcc -std=c11 -DMKL_ILP64 -m64 -g -o test.a test.c -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl

The code in the previous question had an error in the malloc to yn (it was using sizeof(_Complex double*) instead of sizeof(_Complex double)). Having that error corrected, the code compiles and runs successfully. After running it, I compared the results with the ones I get with SciPy. They do not agree.

import numpy
from scipy import linalg

# Real m-by-n inputs (m = 2, n = 3).
a = numpy.array([
    [0.5, 0.0, 0.5],
    [0.5, 0.5, 0.0],
])
b = numpy.array([
    [1.0, 0.0, -1.0],
    [1.0, -1.0, 0.0],
])

# Complex m-by-m matrices.
zt = numpy.array([
    [0.004 + 0.09j, -0.004 - 0.12j],
    [-0.004 - 0.12j, 0.005 + 0.11j],
])
# NOTE: the off-diagonal entries are written with +124.004 here, whereas the
# C program uses -124.004.
zl = numpy.array([
    [0.1 + 2.13j, 124.004 - 800.12j],
    [124.004 - 800.12j, 0.4 + 4.08j],
])

# Reference computation: yn = a.T @ inv(zt) @ a + b.T @ inv(zl) @ b,
# keeping the same intermediates (c1, c2) that the C program dumps.
c1 = linalg.inv(zt) @ a
m1 = a.T @ c1
c2 = linalg.inv(zl) @ b
m2 = b.T @ c2
yn = m1 + m2

# Matrices written by the C program.
yn_file, c1_file, c2_file = (
    numpy.genfromtxt(path, delimiter=',', dtype=numpy.complex128)
    for path in ('yn.csv', 'c1.csv', 'c2.csv')
)

# Largest magnitude of each matrix, SciPy reference vs. C output
# (trailing comments are the values the author observed).
numpy.abs(yn).max()  #0.004958820819049211
numpy.abs(yn_file).max()  #60.4590237745794

numpy.abs(c1).max()  #25.549314567403204
numpy.abs(c1_file).max()  #41.278805716697306

numpy.abs(c2).max()  #0.0012411403762584482
numpy.abs(c2_file).max()  #0.03292682468747935
There is something wrong either in my C code or in the Python one. Why am I getting different results?

Edit: further testing as per @Bwebb's suggestion. He noticed a copy-paste error where -124.004 - 800.12i appears as +124.004 - 800.12i in the Python code. Correcting that does not change the results.

To make it easier to test, I used the matrices:

# Simplified 2-by-2 test case whose result can be checked by hand.
# a is the identity matrix.
a = numpy.array([[1.0, 0.0],
                 [0.0, 1.0]])
# b has -1 on the anti-diagonal; b @ b is the identity, so b is its own inverse.
b = numpy.array([[0.0, -1.0],
                 [-1.0, 0.0]])
# zt and zl reuse a and b, so trans(a)*inv(zt)*a == a and
# trans(b)*inv(zl)*b == b; the expected yn is therefore a + b.
zt = a
zl = b

Which results in

yn = [[1.0, -1.0]
      [-1.0, 1.0]]

The Python code gives that result, but the C one gives

yn = [[0.0 + 2.0j, 1.0 + 2.0j]
      [-1.0 + 2.0j, 0.0 + 0.0j]]

That makes me conclude that the C code is wrong, but I do not know where.

回答1:

From the code posted in your question:

Python:

zl = numpy.array([[0.1 + 2.13j, 124.004 - 800.12j], ## <==HERE
                  [124.004 - 800.12j, 0.4 + 4.08j]]) ## <==HERE ALSO

_Complex double zl[] = {
    (0.1 + 2.13*I), (-124.004 - 800.12*I), // <==HERE
    (-124.004 - 800.12*I), (0.4 + 4.08*I) // <== HERE ALSO

I noticed that one is -124.004 - 800.12i and the other is 124.004 - 800.12i. I'm not sure which one you are trying to use, but set them both to the same value and see if the results are still different. If they are, set both to a unit-testable input whose outcome you know in advance (a=[1 0 0; 0 1 0; 0 0 1] or something else that is easy to compute). That will tell you which one (or both) is incorrect.

回答2:

That behavior (most likely undefined) is caused by passing double arrays to zgemm instead of _Complex double arrays. When I change matrices a and b to be complex, I get the expected result.

Here is the fixed C code for testing:

#include <math.h>
#include <complex.h>
#include <stdlib.h>
#include <stdio.h>
#include <mkl_types.h>
#define MKL_Complex16 _Complex double //overwrite type
#include <mkl.h>
#include <mkl_lapacke.h>

int print_zmatrix_file(int m, int n, _Complex double* a, int lda, FILE* fp)
{
    int i, j;
    for( i = 0; i < m; i++ )
    {
        for( j = 0; j < n; j++ )
        {
            fprintf(fp, "(%.6f%+.6fj)", creal(a[i*lda+j]), cimag(a[i*lda+j]) );
            if (j < n - 1) fprintf(fp, ",");
        }
        fprintf(fp, "\n");
    }
    return 0;
}

/*
 * Invert the m-by-m row-major complex matrix `z` in place: LU-factorise it,
 * then build the inverse from the factors. `ipiv` must have room for m
 * pivot indices. Returns 0 on success, otherwise the non-zero LAPACKE
 * info code of the step that failed.
 */
static lapack_int invert_in_place(_Complex double *z, int m, lapack_int *ipiv)
{
    lapack_int info = LAPACKE_zgetrf(LAPACK_ROW_MAJOR, m, m, z, m, ipiv);
    if (info != 0) {
        return info;
    }
    return LAPACKE_zgetri(LAPACK_ROW_MAJOR, m, z, m, ipiv);
}

/*
 * Compute yn = trans(a)*inv(zt)*a + trans(b)*inv(zl)*b.
 *
 * All matrices are complex and stored row-major: a and b are m-by-n,
 * zt and zl are m-by-m, and yn (output) is n-by-n.
 * zt and zl are overwritten with their inverses.
 *
 * Returns 0 on success and -1 if m or n is not positive, if memory cannot
 * be allocated, or if zt or zl cannot be inverted. On failure the contents
 * of yn are unspecified.
 */
int calc_yn(
    _Complex double* yn, _Complex double* a, _Complex double *b,
    _Complex double* zl, _Complex double* zt, int m, int n)
{
    if (m <= 0 || n <= 0) {
        return -1;
    }

    /* One pivot array is shared by both inversions. */
    lapack_int *ipiv = malloc(sizeof *ipiv * (size_t)m);
    if (ipiv == NULL) {
        return -1;
    }
    lapack_int info = invert_in_place(zt, m, ipiv);
    if (info == 0) {
        info = invert_in_place(zl, m, ipiv);
    }
    free(ipiv);
    if (info != 0) {
        return -1;
    }

    /*
     * zgemm reads its alpha/beta scalars as complex numbers through the
     * pointers it is given, so they must be complex objects; pointing it at
     * a plain double would make it read past the end of that object.
     */
    const _Complex double one = 1.0;
    const _Complex double zero = 0.0;

    /* Scratch m-by-n matrix reused for both intermediate products. */
    _Complex double *c = malloc(sizeof *c * (size_t)m * (size_t)n);
    if (c == NULL) {
        return -1;
    }

    // c <- zt*a
    cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, m,
                &one, zt, m, a, n,
                &zero, c, n);
    // yn <- aT*c
    cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                n, n, m,
                &one, a, n, c, n,
                &zero, yn, n);
    // c <- zl*b
    cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, m,
                &one, zl, m, b, n,
                &zero, c, n);
    // yn <- bT*c + yn (beta = 1 accumulates onto the first term)
    cblas_zgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                n, n, m,
                &one, b, n, c, n,
                &one, yn, n);

    free(c);
    return 0;
}

/*
 * Test driver for the hand-checkable 2-by-2 case: a and zt are the identity,
 * b and zl have -1 on the anti-diagonal. Runs calc_yn and writes the
 * resulting 2-by-2 matrix to "yn.csv".
 *
 * Returns 0 on success and EXIT_FAILURE if allocation, the computation
 * or writing the output file fails.
 */
int main(void)
{
    int m = 2;
    int n = 2;
    _Complex double a[] = {
        1.0, 0.0,
        0.0, 1.0
    };
    _Complex double b[] = {
        0.0, -1.0,
        -1.0, 0.0
    };
    _Complex double zt[] = {
        1.0, 0.0,
        0.0, 1.0
    };
    _Complex double zl[] = {
        0.0, -1.0,
        -1.0, 0.0
    };

    _Complex double *yn = malloc(sizeof *yn * (size_t)n * (size_t)n);
    if (yn == NULL) {
        fprintf(stderr, "out of memory\n");
        return EXIT_FAILURE;
    }

    /* calc_yn overwrites zt and zl with their inverses. */
    if (calc_yn(yn, a, b, zl, zt, m, n) != 0) {
        fprintf(stderr, "calc_yn failed\n");
        free(yn);
        return EXIT_FAILURE;
    }

    FILE *fp = fopen("yn.csv", "w");
    if (fp == NULL) {
        perror("yn.csv");
        free(yn);
        return EXIT_FAILURE;
    }
    int status = print_zmatrix_file(n, n, yn, n, fp) == 0 ? 0 : EXIT_FAILURE;
    /* fclose flushes buffered output, so its result matters for a written file. */
    if (fclose(fp) != 0) {
        perror("yn.csv");
        status = EXIT_FAILURE;
    }
    free(yn);
    return status;
}
// compile command (MKLROOT is defined by a bash script that is shipped together with intel's MKL):
//gcc -std=c11 -DMKL_ILP64 -m64 -g -o test.a test.c -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl

来源：https://stackoverflow.com/questions/53090142/proper-way-to-calculate-transainvba-with-intel-mkl-follow-up-question

标签

python

scipy

intel-mkl