LAPACK: Are operations on packed storage matrices faster?

前端 未结 1 807
隐瞒了意图╮
隐瞒了意图╮ 2021-01-04 20:07

I want to tridiagonalize a real symmetric matrix using Fortran and LAPACK. LAPACK basically provides two routines, one operating on the full matrix, the other on the matrix

相关标签:
1条回答
  • 2021-01-04 20:40

    It's an empirical question, of course: but in general, nothing comes for free, and less memory/more runtime is a pretty common tradeoff.

    In this case, the indexing for the data is more complex for the packed case, so as you traverse the matrix, the cost of getting your data is a little higher. (Complicating this picture is that for symmetric matrices, the lapack routines also assume a certain kind of packing - that you only have the upper or lower component of the matrix available).

    I was messing around with an eigenproblem earlier today, so I'll use that as a measurement benchmark; trying with a simple symmetric test case (The Herdon matrix, from http://people.sc.fsu.edu/~jburkardt/m_src/test_mat/test_mat.html ), and comparing ssyevd with sspevd

    $ ./eigen2 500
     Generating a Herdon matrix: 
     Unpacked array:
     Eigenvalues L_infty err =   1.7881393E-06
     Packed array:
     Eigenvalues L_infty err =   3.0994415E-06
     Packed time:   2.800000086426735E-002
     Unpacked time:   2.500000037252903E-002
    
    $ ./eigen2 1000
     Generating a Herdon matrix: 
     Unpacked array:
     Eigenvalues L_infty err =   4.5299530E-06
     Packed array:
     Eigenvalues L_infty err =   5.8412552E-06
     Packed time:   0.193900004029274     
     Unpacked time:   0.165000006556511  
    
    $ ./eigen2 2500
     Generating a Herdon matrix: 
     Unpacked array:
     Eigenvalues L_infty err =   6.1988831E-06
     Packed array:
     Eigenvalues L_infty err =   8.4638596E-06
     Packed time:    3.21040010452271     
     Unpacked time:    2.70149993896484 
    

    There's about an 18% difference, which I must admit is larger than I expected (also with a slightly larger error for the packed case?). This is with intel's MKL. The performance difference will depend on your matrix in general, of course, as eriktous points out, and on the problem you're doing; the more random access to the matrix you have to do, the worse the overhead would be. The code I used is as follows:

    program eigens
          implicit none
    
          integer :: nargs,n  ! problem size 
          real, dimension(:,:), allocatable :: A, B, Z
          real, dimension(:), allocatable :: PA
          real, dimension(:), allocatable :: work
          integer, dimension(:), allocatable :: iwork
          real, dimension(:), allocatable :: eigenvals, expected
          real :: c, p
          integer :: worksize, iworksize
          character(len=100) :: nstr
          integer :: unpackedclock, packedclock 
          double precision :: unpackedtime, packedtime
          integer :: i,j,info
    
    ! get filename
          nargs = command_argument_count()
          if (nargs /= 1) then
              print *,'Usage: eigen2 n'
              print *,'       Where n = size of array'
              stop
          endif
          call get_command_argument(1, nstr)
          read(nstr,'(I)') n
          if (n < 4 .or. n > 25000) then
              print *, 'Invalid n ', nstr
              stop
          endif
    
    
    ! Initialize local arrays    
    
          allocate(A(n,n),B(n,n))
          allocate(eigenvals(n)) 
    
    ! calculate the matrix - unpacked
    
          print *, 'Generating a Herdon matrix: '
    
          A = 0.
          c = (1.*n * (1.*n + 1.) * (2.*n - 5.))/6.
          forall (i=1:n-1,j=1:n-1)
            A(i,j) = -1.*i*j/c
          endforall
          forall (i=1:n-1)
            A(i,i) = (c - 1.*i*i)/c
            A(i,n) = 1.*i/c
          endforall
          forall (j=1:n-1)
            A(n,j) = 1.*j/c
          endforall
          A(n,n) = -1./c
          B = A
    
          ! expected eigenvalues
          allocate(expected(n))
          p = 3. + sqrt((4. * n - 3.) * (n - 1.)*3./(n+1.))
          expected(1) = p/(n*(5.-2.*n))
          expected(2) = 6./(p*(n+1.))
          expected(3:n) = 1.
    
          print *, 'Unpacked array:'
          allocate(work(1),iwork(1))
          call ssyevd('N','U',n,A,n,eigenvals,work,-1,iwork,-1,info)
          worksize = int(work(1))
          iworksize = int(work(1))
          deallocate(work,iwork)
          allocate(work(worksize),iwork(iworksize))
    
          call tick(unpackedclock)
          call ssyevd('N','U',n,A,n,eigenvals,work,worksize,iwork,iworksize,info)
          unpackedtime = tock(unpackedclock)
          deallocate(work,iwork)
    
          if (info /= 0) then
               print *, 'Error -- info = ', info
          endif
          print *,'Eigenvalues L_infty err = ', maxval(eigenvals-expected)
    
    
          ! pack array
    
          print *, 'Packed array:'
          allocate(PA(n*(n+1)/2))
          allocate(Z(n,n))
          do i=1,n 
            do j=i,n
               PA(i+(j-1)*j/2) = B(i,j)
            enddo
          enddo
    
          allocate(work(1),iwork(1))
          call sspevd('N','U',n,PA,eigenvals,Z,n,work,-1,iwork,-1,info)
          worksize = int(work(1))
          iworksize = iwork(1)
          deallocate(work,iwork)
          allocate(work(worksize),iwork(iworksize))
    
          call tick(packedclock)
          call sspevd('N','U',n,PA,eigenvals,Z,n,work,worksize,iwork,iworksize,info)
          packedtime = tock(packedclock)
          deallocate(work,iwork)
          deallocate(Z,A,B,PA)
    
          if (info /= 0) then
               print *, 'Error -- info = ', info
          endif
          print *,'Eigenvalues L_infty err = ', &
          maxval(eigenvals-expected)
    
          deallocate(eigenvals, expected)
    
    
          print *,'Packed time: ', packedtime
          print *,'Unpacked time: ', unpackedtime
    
    
    contains
        subroutine tick(t)
            integer, intent(OUT) :: t
    
            call system_clock(t)
        end subroutine tick
    
        ! returns time in seconds from now to time described by t
        real function tock(t)
            integer, intent(in) :: t
            integer :: now, clock_rate
    
            call system_clock(now,clock_rate)
    
            tock = real(now - t)/real(clock_rate)
        end function tock
    
    end program eigens
    
    0 讨论(0)
提交回复
热议问题