This is a follow up of this question:
How to do a sum of sums of the square of sum of sums?
Where I was asking for help to use einsum (to achieve a great spe
The solution below presents the 3 different methods to do the simple sum of sums and 4 different methods to do the sum of squares.
sum of sums 3 methods - for loop, JIT for loops, einsum (none run into memory problems)
sum of square of sums 4 methods - for loop, JIT for loops, expanded einsum, intermediate einsum
Here the first three don't run into memory problems and the for loop and the expanded einsum run into speed problems. This leaves the JIT solution seeming the best.
import numpy as np
import time
from numba import jit
def fun1(Fu, Fv, Fx, Fy, P, B):
Nu = Fu.shape[0]
Nv = Fv.shape[0]
Nx = Fx.shape[0]
Ny = Fy.shape[0]
Nk = Fu.shape[1]
Nl = Fv.shape[1]
I1 = np.zeros([Nu, Nv])
for iu in range(Nu):
for iv in range(Nv):
for ix in range(Nx):
for iy in range(Ny):
S = 0.
for ik in range(Nk):
for il in range(Nl):
S += Fu[iu,ik]*Fv[iv,il]*Fx[ix,ik]*Fy[iy,il]*P[ix,iy]*B[ik,il]
I1[iu, iv] += S
return I1
def fun2(Fu, Fv, Fx, Fy, P, B):
Nu = Fu.shape[0]
Nv = Fv.shape[0]
Nx = Fx.shape[0]
Ny = Fy.shape[0]
Nk = Fu.shape[1]
Nl = Fv.shape[1]
I2 = np.zeros([Nu, Nv])
for iu in range(Nu):
for iv in range(Nv):
for ix in range(Nx):
for iy in range(Ny):
S = 0.
for ik in range(Nk):
for il in range(Nl):
S += Fu[iu,ik]*Fv[iv,il]*Fx[ix,ik]*Fy[iy,il]*P[ix,iy]*B[ik,il]
I2[iu, iv] += S**2.
return I2
if __name__ == '__main__':
Nx = 30
Ny = 40
Nk = 50
Nl = 60
Nu = 70
Nv = 8
Fx = np.random.rand(Nx, Nk)
Fy = np.random.rand(Ny, Nl)
Fu = np.random.rand(Nu, Nk)
Fv = np.random.rand(Nv, Nl)
P = np.random.rand(Nx, Ny)
B = np.random.rand(Nk, Nl)
fjit1 = jit(fun1)
fjit2 = jit(fun2)
# For loop - becomes too slow so commented out
# t = time.time()
# I1 = fun1(Fu, Fv, Fx, Fy, P, B)
# print 'fun1 :', time.time() - t
# JIT compiled for loop - After a certain point beats einsum
t = time.time()
I1jit = fjit1(Fu, Fv, Fx, Fy, P, B)
print 'jit1 :', time.time() - t
# einsum great solution when no squaring is needed
t = time.time()
I1_ = np.einsum('uk, vl, xk, yl, xy, kl->uv', Fu, Fv, Fx, Fy, P, B)
print '1 einsum:', time.time() - t
# For loop - becomes too slow so commented out
# t = time.time()
# I2 = fun2(Fu, Fv, Fx, Fy, P, B)
# print 'fun2 :', time.time() - t
# JIT compiled for loop - After a certain point beats einsum
t = time.time()
I2jit = fjit2(Fu, Fv, Fx, Fy, P, B)
print 'jit2 :', time.time() - t
# Expanded einsum - As the size increases becomes very very slow
# t = time.time()
# I2_ = np.einsum('uk,vl,xk,yl,um,vn,xm,yn,kl,mn,xy->uv', Fu,Fv,Fx,Fy,Fu,Fv,Fx,Fy,B,B,P**2)
# print '2 einsum:', time.time() - t
# Intermediate einsum - As the sizes increase memory can become an issue
t = time.time()
temp = np.einsum('uk, vl, xk, yl, xy, kl->uvxy', Fu, Fv, Fx, Fy, P, B)
I2__ = np.einsum('uvxy->uv', np.square(temp))
print '2 einsum:', time.time() - t
# print 'I1 == I1_ :', np.allclose(I1, I1_)
print 'I1_ == Ijit1_:', np.allclose(I1_, I1jit)
# print 'I2 == I2_ :', np.allclose(I2, I2_)
print 'I2_ == Ijit2_:', np.allclose(I2__, I2jit)
Comment: Please feel free to edit / improve this answer. It would be nice if someone had any suggestions with regards to making this parallel.