Solving dense linear systems AX = B with CUDA

前端 未结 1 1619
无人共我
无人共我 2021-01-31 23:13

Can I use the new cuSOLVER library (CUDA 7) to solve linear systems of the form

AX = B 

where A, X and B

相关标签:
1条回答
  • 2021-01-31 23:45

    Yes.

    Approach nr. 1

    In the framework of cuSOLVER you can use QR decomposition, see QR decomposition to solve linear systems in CUDA.

    Approach nr. 2

    Alternatively, you can calculate the matrix inverse by the successive involation of

     cublas<t>getrfBatched() 
    

    which calculates the LU decomposition of a matrix, and

    cublas<t>getriBatched() 
    

    which calculates the inverse of the matrix starting from its LU decomposition.

    Approach nr. 3

    A final possibility is using

    cublas<t>getrfBatched() 
    

    followed by a twofold invocation of

    cublas<t>trsm() 
    

    which solves upper or lower triangular linear systems.

    As pointed out by Robert Crovella, the answer may vary on the size and the type of the involved matrices.

    Code for approach nr. 1

    Please, see QR decomposition to solve linear systems in CUDA.

    Code for approaches nr. 2 and nr. 3

    Below, I'm reporting a worked example for the implementation of approaches nr. 2 and 3. Hankel matrices are used to feed the approaches with well-conditioned, invertible matrices. Please, note that approach nr. 3 requires permuting (rearranging) the system coefficients vector according to the pivot array obtained following the invokation of cublas<t>getrfBatched(). This permutation can be conveniently done on the CPU.

    #include <stdio.h>
    #include <fstream>
    #include <iomanip>
    #include <stdlib.h>     /* srand, rand */
    #include <time.h>       /* time */
    
    #include "cuda_runtime.h" 
    #include "device_launch_parameters.h"
    
    #include "cublas_v2.h"
    
    #include "Utilities.cuh"
    #include "TimingGPU.cuh"
    
    #define prec_save 10
    
    #define BLOCKSIZE 256
    
    #define BLOCKSIZEX 16
    #define BLOCKSIZEY 16
    
    /************************************/
    /* SAVE REAL ARRAY FROM CPU TO FILE */
    /************************************/
    template <class T>
    void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {
    
        std::ofstream outfile;
        outfile.open(filename);
        for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
        outfile.close();
    
    }
    
    /************************************/
    /* SAVE REAL ARRAY FROM GPU TO FILE */
    /************************************/
    template <class T>
    void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {
    
        T *h_in = (T *)malloc(M * sizeof(T));
    
        gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));
    
        std::ofstream outfile;
        outfile.open(filename);
        for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
        outfile.close();
    
    }
    
    /***************************************************/
    /* FUNCTION TO SET THE VALUES OF THE HANKEL MATRIX */
    /***************************************************/
    // --- https://en.wikipedia.org/wiki/Hankel_matrix
    void setHankelMatrix(double * __restrict h_A, const int N) {
    
        double *h_atemp = (double *)malloc((2 * N - 1) * sizeof(double));
    
        // --- Initialize random seed
        srand(time(NULL));
    
        // --- Generate random numbers
        for (int k = 0; k < 2 * N - 1; k++) h_atemp[k] = rand();
    
        // --- Fill the Hankel matrix. The Hankel matrix is symmetric, so filling by row or column is equivalent.
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
                h_A[i * N + j] = h_atemp[(i + 1) + (j + 1) - 2];
    
        free(h_atemp);
    
    }
    
    /***********************************************/
    /* FUNCTION TO COMPUTE THE COEFFICIENTS VECTOR */
    /***********************************************/
    void computeCoefficientsVector(const double * __restrict h_A, const double * __restrict h_xref, 
                                   double * __restrict h_y, const int N) {
    
        for (int k = 0; k < N; k++) h_y[k] = 0.f;
    
        for (int m = 0; m < N; m++)
            for (int n = 0; n < N; n++)
                h_y[m] = h_y[m] + h_A[n * N + m] * h_xref[n];
    
    }
    
    /************************************/
    /* COEFFICIENT REARRANGING FUNCTION */
    /************************************/
    void rearrange(double *vec, int *pivotArray, int N){
        for (int i = 0; i < N; i++) {
            double temp = vec[i];
            vec[i] = vec[pivotArray[i] - 1];
            vec[pivotArray[i] - 1] = temp;
        }   
    }
    
    /********/
    /* MAIN */
    /********/
    int main() {
    
        const unsigned int N = 1000;
    
        const unsigned int Nmatrices = 1;
    
        // --- CUBLAS initialization
        cublasHandle_t cublas_handle;
        cublasSafeCall(cublasCreate(&cublas_handle));
    
        TimingGPU timerLU, timerApproach1, timerApproach2;
        double timingLU, timingApproach1, timingApproach2;
    
        /***********************/
        /* SETTING THE PROBLEM */
        /***********************/
        // --- Matrices to be inverted (only one in this example)
        double *h_A = (double *)malloc(N * N * Nmatrices * sizeof(double));
    
        // --- Setting the Hankel matrix
        setHankelMatrix(h_A, N);
    
        // --- Defining the solution
        double *h_xref = (double *)malloc(N * sizeof(double));
        for (int k = 0; k < N; k++) h_xref[k] = 1.f;
    
        // --- Coefficient vectors (only one in this example)
        double *h_y = (double *)malloc(N * sizeof(double));
    
        computeCoefficientsVector(h_A, h_xref, h_y, N);
    
        // --- Result (only one in this example)
        double *h_x = (double *)malloc(N * sizeof(double));
    
        // --- Allocate device space for the input matrices 
        double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * Nmatrices * sizeof(double)));
        double *d_y; gpuErrchk(cudaMalloc(&d_y, N *                 sizeof(double)));
        double *d_x; gpuErrchk(cudaMalloc(&d_x, N *                 sizeof(double)));
    
        // --- Move the relevant matrices from host to device
        gpuErrchk(cudaMemcpy(d_A, h_A, N * N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(d_y, h_y, N *                 sizeof(double), cudaMemcpyHostToDevice));
    
        /**********************************/
        /* COMPUTING THE LU DECOMPOSITION */
        /**********************************/
        timerLU.StartCounter();
    
        // --- Creating the array of pointers needed as input/output to the batched getrf
        double **h_inout_pointers = (double **)malloc(Nmatrices * sizeof(double *));
        for (int i = 0; i < Nmatrices; i++) h_inout_pointers[i] = d_A + i * N * N;
    
        double **d_inout_pointers;
        gpuErrchk(cudaMalloc(&d_inout_pointers, Nmatrices * sizeof(double *)));
        gpuErrchk(cudaMemcpy(d_inout_pointers, h_inout_pointers, Nmatrices * sizeof(double *), cudaMemcpyHostToDevice));
        free(h_inout_pointers);
    
        int *d_pivotArray; gpuErrchk(cudaMalloc(&d_pivotArray, N * Nmatrices * sizeof(int)));
        int *d_InfoArray;  gpuErrchk(cudaMalloc(&d_InfoArray,      Nmatrices * sizeof(int)));
    
        int *h_InfoArray  = (int *)malloc(Nmatrices * sizeof(int));
    
        cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, d_pivotArray, d_InfoArray, Nmatrices));
        //cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices));
    
        gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
    
        for (int i = 0; i < Nmatrices; i++)
            if (h_InfoArray[i] != 0) {
                fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
                cudaDeviceReset();
                exit(EXIT_FAILURE);
            }
    
        timingLU = timerLU.GetCounter();
        printf("Timing LU decomposition %f [ms]\n", timingLU);
    
        /*********************************/
        /* CHECKING THE LU DECOMPOSITION */
        /*********************************/
        saveCPUrealtxt(h_A,          "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\A.txt", N * N);
        saveCPUrealtxt(h_y,          "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\y.txt", N);
        saveGPUrealtxt(d_A,          "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\Adecomposed.txt", N * N);
        saveGPUrealtxt(d_pivotArray, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\pivotArray.txt", N);
    
        /******************************************************************************/
        /* APPROACH NR.1: COMPUTE THE INVERSE OF A STARTING FROM ITS LU DECOMPOSITION */
        /******************************************************************************/
        timerApproach1.StartCounter();
    
        // --- Allocate device space for the inverted matrices 
        double *d_Ainv; gpuErrchk(cudaMalloc(&d_Ainv, N * N * Nmatrices * sizeof(double)));
    
        // --- Creating the array of pointers needed as output to the batched getri
        double **h_out_pointers = (double **)malloc(Nmatrices * sizeof(double *));
        for (int i = 0; i < Nmatrices; i++) h_out_pointers[i] = (double *)((char*)d_Ainv + i * ((size_t)N * N) * sizeof(double));
    
        double **d_out_pointers;
        gpuErrchk(cudaMalloc(&d_out_pointers, Nmatrices*sizeof(double *)));
        gpuErrchk(cudaMemcpy(d_out_pointers, h_out_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice));
        free(h_out_pointers);
    
        cublasSafeCall(cublasDgetriBatched(cublas_handle, N, (const double **)d_inout_pointers, N, d_pivotArray, d_out_pointers, N, d_InfoArray, Nmatrices));
    
        gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
    
        for (int i = 0; i < Nmatrices; i++)
            if (h_InfoArray[i] != 0) {
            fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
            }
    
        double alpha1 = 1.f;
        double beta1 = 0.f;
    
        cublasSafeCall(cublasDgemv(cublas_handle, CUBLAS_OP_N, N, N, &alpha1, d_Ainv, N, d_y, 1, &beta1, d_x, 1));
    
        timingApproach1 = timingLU + timerApproach1.GetCounter();
        printf("Timing approach 1 %f [ms]\n", timingApproach1);
    
        /**************************/
        /* CHECKING APPROACH NR.1 */
        /**************************/
        saveGPUrealtxt(d_x, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach1.txt", N);
    
        /*************************************************************/
        /* APPROACH NR.2: INVERT UPPER AND LOWER TRIANGULAR MATRICES */
        /*************************************************************/
        timerApproach2.StartCounter();
    
        double *d_P; gpuErrchk(cudaMalloc(&d_P, N * N * sizeof(double)));
    
        gpuErrchk(cudaMemcpy(h_y, d_y, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
        int *h_pivotArray = (int *)malloc(N * Nmatrices*sizeof(int));
        gpuErrchk(cudaMemcpy(h_pivotArray, d_pivotArray, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
    
        rearrange(h_y, h_pivotArray, N);
        gpuErrchk(cudaMemcpy(d_y, h_y, N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));
    
        // --- Now P*A=L*U
        //     Linear system A*x=y => P.'*L*U*x=y => L*U*x=P*y
    
        // --- 1st phase - solve Ly = b 
        const double alpha = 1.f;
    
        // --- Function solves the triangular linear system with multiple right hand sides, function overrides b as a result 
    
        // --- Lower triangular part
        cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, d_A, N, d_y, N));
    
        // --- Upper triangular part
        cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, d_A, N, d_y, N));
    
        timingApproach2 = timingLU + timerApproach2.GetCounter();
        printf("Timing approach 2 %f [ms]\n", timingApproach2);
    
        /**************************/
        /* CHECKING APPROACH NR.2 */
        /**************************/
        saveGPUrealtxt(d_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach2.txt", N);
    
        return 0;
    }
    

    The Utilities.cu and Utilities.cuh files needed to run such an example are maintained at this github page. The TimingGPU.cu and TimingGPU.cuh files are maintained at this github page.

    Some useful references on the third approach:

    NAG Fortran Library Routine Document

    Scientific Computing Software Library (SCSL) User’s Guide

    https://www.cs.drexel.edu/~jjohnson/2010-11/summer/cs680/programs/lapack/Danh/verify_sequential.c

    EDIT

    Timings (in ms) for approaches nr. 2 and 3 (tests performed on a GTX960 card, cc. 5.2).

    N        LU decomposition       Approach nr. 2       Approach nr. 3
    100      1.08                   2.75                 1.28
    500      45.4                   161                  45.7
    1000     302                    1053                 303
    

    As it emerges, approach nr. 3 is more convenient and its cost is essentially the cost of computing the LU factorization. Furthermore:

    1. Solving linear systems by LU decomposition is faster than using QR decomposition (see QR decomposition to solve linear systems in CUDA);
    2. LU decomposition is limited to square linear systems, while QR decomposition helps in case of non-square linear systems.

    The below Matlab code can be used for checking the results

    clear all
    close all
    clc
    
    warning off
    
    N = 1000;
    
    % --- Setting the problem solution
    x = ones(N, 1);
    
    %%%%%%%%%%%%%%%%%%%%%
    % NxN HANKEL MATRIX %
    %%%%%%%%%%%%%%%%%%%%%
    % --- https://en.wikipedia.org/wiki/Hankel_matrix
    load A.txt
    load y.txt
    
    A = reshape(A, N, N);
    
    yMatlab = A * x;
    fprintf('Percentage rms between coefficients vectors in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(yMatlab - y).^2)) / sum(sum(abs(yMatlab).^2))));
    
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % COMPUTATION OF THE LU DECOMPOSITION %
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    [Lmatlab, Umatlab] = lu(A);
    
    load Adecomposed.txt
    Adecomposed = reshape(Adecomposed, N, N);
    
    L = eye(N);
    for k = 1 : N
        L(k + 1 : N, k) = Adecomposed(k + 1 : N, k);
    end
    U = zeros(N);
    for k = 1 : N
        U(k, k : N) = Adecomposed(k, k : N);
    end
    
    load pivotArray.txt
    Pj = eye(N);
    for j = 1 : N
       tempVector = Pj(j, :);
       Pj(j, :) = Pj(pivotArray(j), :);
       Pj(pivotArray(j), :) = tempVector;
    end
    
    fprintf('Percentage rms between Pj * A and L * U in CUDA %f\n', 100 * sqrt(sum(sum(abs(Pj * A - L * U).^2)) / sum(sum(abs(Pj * A).^2))));
    
    xprime      = inv(Lmatlab) * yMatlab;
    xMatlab     = inv(Umatlab) * xprime;
    
    fprintf('Percentage rms between reference solution and solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));
    
    load xApproach1.txt
    fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.1 %f\n', 100 * sqrt(sum(sum(abs(xApproach1 - x).^2)) / sum(sum(abs(x).^2))));
    
    load xApproach2.txt
    fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.2 %f\n', 100 * sqrt(sum(sum(abs(xApproach2 - x).^2)) / sum(sum(abs(x).^2))));
    
    0 讨论(0)
提交回复
热议问题