问题
How do I optimize numerical integration routine (comparing to C)?
What has been done to the moment:
- I replaced lists with unboxed vectors (obvious).
- I applied profiling techniques described in the book "Read World Haskell" http://book.realworldhaskell.org/read/profiling-and-optimization.html. I have inlined some trivial functions and inserted a lot of bangs everywhere. That gave about 10x speedup.
- I refactored the code (i.e. extracted
iterator
function). That gave 3x speedup. - I tried to replace polymorphic signatures with Floats as in the answer to this question Optimizing numerical array performance in Haskell. That gave almost 2x speedup.
- I compile like this
cabal exec ghc -- Simul.hs -O2 -fforce-recomp -fllvm -Wall
- UPDATE As suggested by cchalmers,
type Sample = (F, F)
was replaced withdata Sample = Sample {-# UNPACK #-} !F {-# UNPACK #-} !F
The performance now is almost as good as C code. Can we do better?
{-# LANGUAGE BangPatterns #-}
module Main
where
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Unboxed.Mutable as UM
import qualified Control.Monad.Primitive as PrimitiveM
import Dynamics.Nonlin ( birefrP )
type F = Float
type Delay = U.Vector F
type Input = U.Vector F
-- Sample can be a vector of any length (x, y, z, ...)
data Sample = Sample {-# UNPACK #-} !F {-# UNPACK #-} !F
-- Pair is used to define exactly a pair of values
data Pair = Pair {-# UNPACK #-} !F {-# UNPACK #-} !F
type ParametrizedDelayFunction = (Sample, F) -> Sample
getX :: Sample -> F
getX (Sample a _) = a
{-# INLINE getX #-}
toDelay :: [F] -> Delay
toDelay = U.fromList
stepsPerNode :: Int
stepsPerNode = 40 -- Number of integration steps per node
infixl 6 ..+..
(..+..) :: Sample -> Sample -> Sample
(..+..) (Sample x1 y1) (Sample x2 y2) = Sample (x1 + x2) (y1 + y2)
{-# INLINE (..+..) #-}
infixl 7 .*..
(.*..) :: F -> Sample -> Sample
(.*..) c (Sample x2 y2) = Sample (c * x2) (c * y2)
{-# INLINE (.*..) #-}
-- | Ikeda model (dynamical system, DDE)
ikeda_model2
:: (F -> F) -> (Sample, F) -> Sample
ikeda_model2 f (!(Sample x y), !x_h) = Sample x' y'
where
! x' = recip_epsilon * (-x + (f x_h))
y' = 0
recip_epsilon = 2^(6 :: Int)
-- | Integrate using improved Euler's method (fixed step).
--
-- hOver2 is already half of step size h
-- f is the function to integrate
-- x_i is current argument (x and y)
-- x_h is historical (delayed) value
-- x_h2 it the value after x_h
heun2 :: F -> ParametrizedDelayFunction
-> Sample -> Pair -> Sample
heun2 hOver2 f !x !(Pair x_h x_h2) = x_1
where
! f1 = f (x, x_h)
! x_1' = x ..+.. 2 * hOver2 .*.. f1
! f2 = f (x_1', x_h2)
! x_1 = x ..+.. hOver2 .*.. (f1 ..+.. f2)
initialCond :: Int -> (Sample, Delay, Int)
initialCond nodesN = (initialSampleXY, initialInterval, samplesPerDelay)
where cdi = 1.1247695e-4 :: F -- A fixed point for birefrP
initialInterval = U.replicate samplesPerDelay cdi
samplesPerDelay = nodesN * stepsPerNode
initialSampleXY = Sample 0.0 0.0
integrator
:: PrimitiveM.PrimMonad m =>
(Sample -> Pair -> Sample)
-> Int
-> Int
-> (Sample, (Delay, Input))
-> m (Sample, U.Vector F)
integrator iterate1 len total (xy0, (history0, input)) = do
! v <- UM.new total
go v 0 xy0
history <- U.unsafeFreeze v
-- Zero y value, currently not used
let xy = Sample (history `U.unsafeIndex` (total - 1)) 0.0
return (xy, history)
where
h i = history0 `U.unsafeIndex` i
go !v !i !xy
-- The first iteration
| i == 0 = do
let !r = iterate1 xy (Pair (h 0) (h 1))
UM.unsafeWrite v i (getX r)
go v 1 r
| i < len - 1 = do
let !r = iterate1 xy (Pair (h i) (h $ i + 1))
UM.unsafeWrite v i (getX r)
go v (i + 1) r
| i == total = do
return ()
-- Iterations after the initial history has been exhausted
| otherwise = do
! newX0 <- if i == len - 1
then return (getX xy0)
else UM.unsafeRead v (i - len - 1)
! newX <- UM.unsafeRead v (i - len)
let !r = iterate1 xy (Pair newX0 newX)
UM.unsafeWrite v i (getX r)
go v (i + 1) r
-- Not used in this version
zero :: Input
zero = U.fromList []
nodes :: Int
nodes = 306
main :: IO ()
main = do
let delays = 4000
(sample0, hist0, delayLength) = initialCond nodes
-- Iterator implements Heun's schema
iterator = heun2 (recip 2^(7::Int) :: F) (ikeda_model2 birefrP)
totalComputedIterations = delayLength * delays
-- Calculates all the time trace
(xy1, history1) <- integrator iterator delayLength totalComputedIterations (sample0, (hist0, zero))
putStrLn $ show $ getX xy1
return ()
The nonlinear function (imported) can look like this:
data Parameters = Parameters { beta :: Float
, alpha :: Float
, phi :: Float } deriving Show
paramA :: Parameters
paramA = Parameters { beta = 1.1
, alpha = 1.0
, phi = 0.01 }
birefr :: Parameters -> Float -> Float
birefr par !x = 0.5 * beta' * (1 - alpha' * (cos $ 2.0 * (x + phi')))
where
! beta' = beta par
! alpha' = alpha par
! phi' = phi par
birefrP :: Float -> Float
birefrP = birefr paramA
来源:https://stackoverflow.com/questions/33596474/how-do-i-optimize-numerical-integration-performance-in-haskell-with-example