Is it possible to vectorize recursive calculation of a NumPy array where each element depends on the previous one?

后端 未结 5 767
猫巷女王i
猫巷女王i 2020-11-27 04:31
T(i) = Tm(i) + (T(i-1)-Tm(i))**(-tau(i))

Tm and tau are NumPy vectors of the same length that have been previously calculated …

相关标签:
5条回答
  • 2020-11-27 05:04

    This is a good question. I am also interested to know if this is possible but so far I have not found a way to do it except in some simple cases.

    Option 1. numpy.ufunc.accumulate

    This seems to be a promising option as mentioned by @Karl Knechtel. You need to create a ufunc first. This web page explains how.

    In the simple case of a recurrent function that takes two scalars as input and outputs one scalar, it seems to work:

    import numpy as np
    
    def test_add(x, data):
        return x + data
    
    assert test_add(1, 2) == 3
    assert test_add(2, 3) == 5
    
    # Make a Numpy ufunc from my test_add function
    test_add_ufunc = np.frompyfunc(t_next, 2, 1)
    
    assert test_add_ufunc(1, 2) == 3
    assert test_add_ufunc(2, 3) == 5
    assert np.all(test_add_ufunc([1, 2], [2, 3]) == [3, 5])
    
    data_sequence = np.array([1, 2, 3, 4])
    f_out = test_add_ufunc.accumulate(data_sequence, dtype=object)
    assert np.array_equal(f_out, [1, 3, 6, 10])
    

    [Note the dtype=object argument which is necessary as explained on the web page linked above].

    But in your case (and mine) we want to compute a recurrent equation that has more than one data input (and potentially more than one state variable too).

    When I tried this using the ufunc.accumulate approach above I got ValueError: accumulate only supported for binary functions.

    If anyone knows a way round that constraint I would be very interested.

    Option 2. Python's builtin accumulate function

    In the meantime, this solution doesn't quite achieve what you wanted in terms of a vectorized calculation in numpy, but it does at least avoid a for loop.

    from itertools import accumulate, chain
    
    
    def t_next(t, data):
        """Advance the recurrence by one step.

        ``data`` is a pair ``(Tm, tau)``; the result is
        ``Tm + (t - Tm) ** tau`` where ``t`` is the previous value.
        """
        tm_i, tau_i = data  # one (Tm, tau) pair per step
        offset = t - tm_i
        return tm_i + offset ** tau_i
    
    # Sanity check: with tau = 0 the power term is 1, so the result is Tm + 1.
    assert t_next(2, (0.38, 0)) == 1.38
    
    t0 = 2  # Initial t
    Tm_values = np.array([0.38, 0.88, 0.56, 0.67, 0.45, 0.98, 0.58, 0.72, 0.92, 0.82])
    tau_values = np.linspace(0, 0.9, 10)
    
    # Combine the input data into a 2D array
    # (after the transpose, each row is one (Tm, tau) pair)
    data_sequence = np.vstack([Tm_values, tau_values]).T
    # chain() places t0 in front of the rows so accumulate() takes it as the
    # starting state; every following row is folded in through t_next.
    t_out = np.fromiter(accumulate(chain([t0], data_sequence), t_next), dtype=float)
    print(t_out)
    # [2.         1.38       1.81303299 1.60614649 1.65039964 1.52579703
    #  1.71878078 1.66109554 1.67839293 1.72152195 1.73091672]
    
    # Slightly more readable version possible in Python 3.8+
    # (accumulate() accepts the starting state directly via initial=)
    t_out = np.fromiter(accumulate(data_sequence, t_next, initial=t0), dtype=float)
    print(t_out)
    # [2.         1.38       1.81303299 1.60614649 1.65039964 1.52579703
    #  1.71878078 1.66109554 1.67839293 1.72152195 1.73091672]
    
    0 讨论(0)
  • 2020-11-27 05:10

    Update: 21-10-2018 I have corrected my answer based on comments.

    It is possible to vectorize operations on vectors as long as the calculation is not recursive. Because a recursive operation depends on the previously calculated value, it is not possible to process the operation in parallel. Therefore, this does not work:

    def calc_vect(Tm_, tau_):
        """Evaluate the formula in one vectorized expression (not recursive).

        The "previous value" used here is ``Tm_[:-1]`` (the input shifted by
        one), not the previously computed output, so this does not reproduce
        the recurrence.
        """
        current = Tm_[1:]
        shifted = Tm_[:-1]
        exponent = -tau_[1:]
        return current - (shifted + current) ** exponent
    

    Since (serial processing / a loop) is necessary, the best performance is gained by moving as close as possible to optimized machine code, therefore Numba and Cython are the best answers here.

    A Numba approach can be achieved as follows:

    # Setup code handed to timeit: it builds the input data and defines both
    # implementations of the recurrence. It runs before the timed statement,
    # so data creation and Numba compilation are not part of the measurement.
    #
    # calc_numba allocates its output as float32 so that it matches the
    # declared float32[:] return type, and is compiled with nopython=True so a
    # typing problem raises an error instead of silently falling back to slow
    # object mode.
    init_string = """
    from math import pow
    import numpy as np
    from numba import jit, float32

    np.random.seed(0)
    n = 100000
    Tm = np.cumsum(np.random.uniform(0.1, 1, size=n).astype('float32'))
    tau = np.random.uniform(-1, 0, size=n).astype('float32')

    def calc_python(Tm_, tau_):
        tt = np.empty(len(Tm_))
        tt[0] = Tm_[0]
        for i in range(1, len(Tm_)):
            tt[i] = Tm_[i] - pow(tt[i-1] + Tm_[i], -tau_[i])
        return tt

    @jit(float32[:](float32[:], float32[:]), nopython=True, nogil=True)
    def calc_numba(Tm_, tau_):
        tt = np.empty(len(Tm_), dtype=np.float32)
        tt[0] = Tm_[0]
        for i in range(1, len(Tm_)):
            tt[i] = Tm_[i] - pow(tt[i-1] + Tm_[i], -tau_[i])
        return tt
    """

    import timeit
    # Each call executes its statement 100 times after running init_string once.
    py_time = timeit.timeit('calc_python(Tm, tau)', init_string, number=100)
    numba_time = timeit.timeit('calc_numba(Tm, tau)', init_string, number=100)
    print("Python Solution: {}".format(py_time))
    print("Numba Soltution: {}".format(numba_time))
    

    Timeit comparison of the Python and Numba functions:

    Python Solution: 54.58057559299999
    Numba Soltution: 1.1389029540000024
    
    0 讨论(0)
  • 2020-11-27 05:23

    You might think this would work:

    import numpy as np
    n = len(Tm)
    t = np.empty(n)
    
    t[0] = 0  # or whatever the initial condition is 
    # NumPy evaluates the whole right-hand side before assigning anything, so
    # t[0:n-1] still holds the uninitialized np.empty contents (apart from
    # t[0]) rather than the results of earlier steps.
    t[1:] = Tm[1:] + (t[0:n-1] - Tm[1:])**(-tau[1:])
    

    but it doesn't: you can't actually do recursion in numpy this way (since numpy calculates the whole RHS and then assigns it to the LHS).

    So unless you can come up with a non-recursive version of this formula, you're stuck with an explicit loop:

    tt = np.empty(n)
    tt[0] = 0.  # initial condition
    # Each tt[i] is computed from the already stored tt[i-1], so the steps
    # have to run in order.
    # NOTE(review): a negative (tt[i-1] - Tm[i]) raised to a non-integer power
    # gives nan for NumPy floats -- confirm the inputs avoid that.
    for i in range(1,n):
        tt[i] = Tm[i] + (tt[i-1] - Tm[i])**(-tau[i])
    
    0 讨论(0)
  • 2020-11-27 05:27

    To build on NPE's answer, I agree that there has to be a loop somewhere. Perhaps your goal is to avoid the overhead associated with a Python for loop? In that case, numpy.fromiter does beat out a for loop, but only by a little:

    Using the very simple recursion relation,

    x[i+1] = x[i] + 0.1
    

    I get

    #FOR LOOP
    def loopit(n):
        """Build x[i+1] = x[i] + 0.1 starting from 0.0 by appending to a list.

        Returns a NumPy array with n values (a single 0.0 when n <= 1).
        """
        values = [0.0]
        while len(values) < n:
            values.append(values[-1] + 0.1)
        return np.array(values)
    
    #FROMITER
    #define an iterator (a better way probably exists -- I'm a novice)
    def it():
        """Yield 0.0, 0.1, 0.2, ... indefinitely by repeatedly adding 0.1."""
        x = 0.0
        while True:
            yield x
            x += 0.1

    #use the iterator with np.fromiter
    def fi_it(n):
        """Return the first n values produced by it() as a float64 array."""
        # The builtin float stands in for the np.float alias, which NumPy
        # deprecated in 1.20 and removed in 1.24; both select float64.
        return np.fromiter(it(), float, n)
    
    %timeit -n 100 loopit(100000)
    #100 loops, best of 3: 31.7 ms per loop
    
    %timeit -n 100 fi_it(100000)
    #100 loops, best of 3: 18.6 ms per loop
    

    Interestingly, pre-allocating a numpy array results in a substantial loss in performance. This is a mystery to me, though I would guess that there must be more overhead associated with accessing an array element than with appending to a list.

    def loopit(n):
        """Fill a preallocated zero array with x[i+1] = x[i] + 0.1, x[0] = 0."""
        out = np.zeros(n)
        for nxt in range(1, n):
            out[nxt] = out[nxt - 1] + 0.1
        return out
    
    %timeit -n 100 loopit(100000)
    #100 loops, best of 3: 50.1 ms per loop
    
    0 讨论(0)
  • 2020-11-27 05:29

    2019 Update. The Numba code broke with the new version of numba. Changing dtype="float32" to dtype=np.float32 solved it.

    I performed some benchmarks, and in 2019 Numba is the first option people should try to accelerate recursive functions in NumPy (adjusted proposal of Aronstef). Numba is already preinstalled in the Anaconda package and has one of the fastest times (about 20 times faster than any pure-Python variant). In 2019 the Numba decorators work without additional steps (at least on Python 3.6, 3.7, and 3.8). Here are three benchmarks, performed on 2019-12-05, 2018-10-20 and 2016-05-18.

    And, as mentioned by Jaffe, in 2018 it is still not possible to vectorize recursive functions. I checked the vectorization by Aronstef and it does NOT work.

    Benchmarks sorted by execution time:

    -------------------------------------------
    |Variant        |2019-12 |2018-10 |2016-05 |
    -------------------------------------------
    |Pure C         |   na   |   na   | 2.75 ms|
    |C extension    |   na   |   na   | 6.22 ms|
    |Cython float32 | 0.55 ms| 1.01 ms|   na   |
    |Cython float64 | 0.54 ms| 1.05 ms| 6.26 ms|
    |Fortran f2py   | 4.65 ms|   na   | 6.78 ms|
    |Numba float32  |73.0  ms| 2.81 ms|   na   |
    |(Aronstef)     |        |        |        |
    |Numba float32v2| 1.82 ms| 2.81 ms|   na   |
    |Numba float64  |78.9  ms| 5.28 ms|   na   |
    |Numba float64v2| 4.49 ms| 5.28 ms|   na   |
    |Append to list |73.3  ms|48.2  ms|91.0  ms|
    |Using a.item() |36.9  ms|58.3  ms|74.4  ms|
    |np.fromiter()  |60.8  ms|60.0  ms|78.1  ms|
    |Loop over Numpy|71.3  ms|71.9  ms|87.9  ms|
    |(Jaffe)        |        |        |        |
    |Loop over Numpy|74.6  ms|74.4  ms|   na   |
    |(Aronstef)     |        |        |        |
    -------------------------------------------
    

    Corresponding code is provided at the end of the answer.

    It seems that Numba and Cython times improve over time. Both are now faster than Fortran f2py: Cython is 8.6 times faster and 32-bit Numba (v2) is 2.5 times faster. Fortran was very hard to debug and compile in 2016, so now there is no reason to use Fortran at all.

    I did not check Pure C and C extension in 2019 and 2018, because it is not easy to compile them in Jupyter notebooks.

    I had the following setup in 2019:

    Processor: Intel i5-9600K 3.70GHz
    Versions:
    Python:  3.8.0
    Numba:  0.46.0
    Cython: 0.29.14
    Numpy:  1.17.4
    

    I had the following setup in 2018:

    Processor: Intel i7-7500U 2.7GHz
    Versions:
    Python:  3.7.0
    Numba:  0.39.0
    Cython: 0.28.5
    Numpy:  1.15.1
    

    The recommended Numba code using float32 (adjusted Aronstef):

    @numba.jit("float32[:](float32[:], float32[:])", nopython=True, nogil=True)
    def calc_py_jit32v2(Tm_, tau_):
        """Run the recurrence in float32 and return every value after the seed.

        tt[0] is seeded with Tm_[0]; each later entry is
        tt[i] = Tm_[i] - (tt[i-1] + Tm_[i]) ** (-tau_[i]).
        The returned slice tt[1:] drops the seed element.
        """
        # Explicit float32 buffer so the result matches the declared signature.
        tt = np.empty(len(Tm_),dtype=np.float32)
        tt[0] = Tm_[0]
        for i in range(1, len(Tm_)):
            tt[i] = Tm_[i] - (tt[i-1] + Tm_[i])**(-tau_[i])
        return tt[1:]
    

    All the other code:

    Data creation (like Aronstef + Mike T comment):

    np.random.seed(0)  # fixed seed so every run gets the same data
    n = 100000
    # Tm: running sum of uniform(0.1, 1) draws, so it increases monotonically.
    Tm = np.cumsum(np.random.uniform(0.1, 1, size=n).astype('float64'))
    # tau: uniform draws between -1 and 0.
    tau = np.random.uniform(-1, 0, size=n).astype('float64')
    # Two-column array with Tm in column 0 and tau in column 1.
    ar = np.column_stack([Tm,tau])
    # float32 copies -- presumably inputs for the float32 variants.
    Tm32 = Tm.astype('float32')
    tau32 = tau.astype('float32')
    # Plain Python list copies of the float64 data.
    Tm_l = list(Tm)
    tau_l = list(tau)
    

    The code in 2016 was slightly different as I used abs() function to prevent nans and not the variant of Mike T. In 2018 the function is exactly the same as OP (Original Poster) wrote.

    Cython float32 using Jupyter %% magic. The function can be used directly in Python. Cython needs a C/C++ compiler compatible with the one that was used to build Python. Installing the right version of the Visual C++ compiler (on Windows) can be problematic:

    %%cython
    
    import cython
    import numpy as np
    cimport numpy as np
    from numpy cimport ndarray
    
    cdef extern from "math.h":
        np.float32_t exp(np.float32_t m)
    
    # Bounds, wraparound and initialization checks are switched off, so the
    # indexing below is unchecked: alen must be at least 1 and no larger than
    # the lengths of Tm and tau.
    @cython.boundscheck(False)
    @cython.wraparound(False)
    @cython.infer_types(True)
    @cython.initializedcheck(False)
    
    # float32 recurrence: T[0] = 0 and
    # T[i] = Tm[i] + (T[i-1] - Tm[i]) ** (-tau[i]) for i = 1 .. alen-1.
    # Returns the float32 memoryview T of length alen.
    def cy_loop32(np.float32_t[:] Tm,np.float32_t[:] tau,int alen):
        # Typed memoryview over a freshly allocated float32 NumPy array.
        cdef np.float32_t[:] T=np.empty(alen, dtype=np.float32)
        cdef int i
        T[0]=0.0  # initial condition
        for i in range(1,alen):
            T[i] = Tm[i] + (T[i-1] - Tm[i])**(-tau[i])
        return T
    

    Cython float64 using Jupyter %% magic. The function can be used directly in Python:

    %%cython
    
    cdef extern from "math.h":
        double exp(double m)
    import cython
    import numpy as np
    cimport numpy as np
    from numpy cimport ndarray
    
    # Bounds, wraparound and initialization checks are switched off, so the
    # indexing below is unchecked: alen must be at least 1 and no larger than
    # the lengths of Tm and tau.
    @cython.boundscheck(False)
    @cython.wraparound(False)
    @cython.infer_types(True)
    @cython.initializedcheck(False)
    
    # float64 recurrence: T[0] = 0 and
    # T[i] = Tm[i] + (T[i-1] - Tm[i]) ** (-tau[i]) for i = 1 .. alen-1.
    # Returns the double memoryview T of length alen.
    def cy_loop(double[:] Tm,double[:] tau,int alen):
        # np.empty defaults to float64, which matches the double[:] view.
        cdef double[:] T=np.empty(alen)
        cdef int i
        T[0]=0.0  # initial condition
        for i in range(1,alen):
            T[i] = Tm[i] + (T[i-1] - Tm[i])**(-tau[i])
        return T
    

    Numba float64:

    # nopython=True (as in the float32 variant) makes a typing failure raise
    # an error instead of silently compiling a slow object-mode fallback.
    @numba.jit("float64[:](float64[:], float64[:])", nopython=True, nogil=True)
    def calc_py_jitv2(Tm_, tau_):
        """Run the recurrence in float64 and return every value after the seed.

        tt[0] is seeded with Tm_[0]; each later entry is
        tt[i] = Tm_[i] - (tt[i-1] + Tm_[i]) ** (-tau_[i]).
        The returned slice tt[1:] drops the seed element.
        """
        # Explicit float64 buffer so the result matches the declared signature.
        tt = np.empty(len(Tm_), dtype=np.float64)
        tt[0] = Tm_[0]
        for i in range(1, len(Tm_)):
            tt[i] = Tm_[i] - (tt[i-1] + Tm_[i])**(-tau_[i])
        return tt[1:]
    

    Append to list. Fastest non-compiled solution:

    def rec_py_loop(Tm,tau,alen):
        """Evaluate the recurrence by appending to a Python list.

        Starts from Tm[0] and applies
        T[i] = Tm[i] - (T[i-1] + Tm[i]) ** (-tau[i]) for i = 1 .. alen-1.
        Returns all values, including the seed, as a NumPy array.
        """
        previous = Tm[0]
        history = [previous]
        for step in range(1, alen):
            tm_step = Tm[step]
            previous = tm_step - (previous + tm_step) ** (-tau[step])
            history.append(previous)
        return np.array(history)
    

    Using a.item():

    def rec_numpy_loop_item(Tm_, tau_):
        """Evaluate the recurrence reading elements through ndarray.item().

        tt[0] is seeded with Tm_[0]; each later entry is
        tt[i] = Tm_[i] - (tt[i-1] + Tm_[i]) ** (-tau_[i]).
        Returns tt[1:], i.e. every value after the seed.
        """
        n_ = len(Tm_)
        tt = np.empty(n_)
        # Bind the .item accessors once; they return plain Python scalars.
        Ti = tt.item
        Tmi = Tm_.item
        taui = tau_.item
        # Writes use plain indexing: ndarray.itemset was removed in NumPy 2.0.
        tt[0] = Tm_[0]
        for i in range(1, n_):
            tm_i = Tmi(i)
            tt[i] = tm_i - (Ti(i - 1) + tm_i) ** (-taui(i))
        return tt[1:]
    

    np.fromiter():

    def it(Tm,tau):
        """Yield Tm[0], then successive values of the recurrence.

        Each new value is Tm[i] - (previous + Tm[i]) ** (-tau[i]). The
        generator has no stop condition of its own; the consumer decides how
        many items to take.
        """
        T = Tm[0]
        yield T
        i = 1
        while True:
            T = Tm[i] - (T + Tm[i]) ** (-tau[i])
            yield T
            i += 1

    def rec_numpy_iter(Tm,tau,alen):
        """Collect alen generator values into a float64 array and drop the seed."""
        collected = np.fromiter(it(Tm, tau), np.float64, alen)
        return collected[1:]
    

    Loop over Numpy (based on the Jaffe's idea):

    def rec_numpy_loop(Tm,tau,alen):
        """Evaluate the recurrence in a preallocated NumPy array.

        out[0] is seeded with Tm[0]; each later entry is
        out[k] = Tm[k] - (out[k-1] + Tm[k]) ** (-tau[k]).
        Returns out[1:], i.e. every value after the seed.
        """
        out = np.empty(alen)
        out[0] = Tm[0]
        for k in range(1, alen):
            prev, tm_k = out[k - 1], Tm[k]
            out[k] = tm_k - (prev + tm_k) ** (-tau[k])
        return out[1:]
    

    Loop over Numpy (Aronstef's code). On my computer float64 is the default type for np.empty.

    def calc_py(Tm_, tau_):
        """Evaluate the recurrence with a plain loop over a float64 array.

        result[0] is seeded with Tm_[0]; each later entry is
        result[i] = Tm_[i] - (result[i-1] + Tm_[i]) ** (-tau_[i]).
        Returns result[1:], i.e. every value after the seed.
        """
        size = len(Tm_)
        result = np.empty(size, dtype="float64")
        result[0] = Tm_[0]
        idx = 1
        while idx < size:
            base = result[idx - 1] + Tm_[idx]
            result[idx] = Tm_[idx] - base ** (-tau_[idx])
            idx += 1
        return result[1:]
    

    Pure C without using Python at all. Version from year 2016 (with fabs() function):

    #include <stdio.h>
    #include <math.h>
    #include <stdlib.h>
    #include <windows.h>
    #include <sys\timeb.h> 
    
    double randn() {
        double u = rand();
        if (u > 0.5) {
            return sqrt(-1.57079632679*log(1.0 - pow(2.0 * u - 1, 2)));
        }
        else {
            return -sqrt(-1.57079632679*log(1.0 - pow(1 - 2.0 * u,2)));
        }
    }
    void rec_pure_c(double *Tm, double *tau, int alen, double *T)
    {
    
        for (int i = 1; i < alen; i++)
        {
            T[i] = Tm[i] + pow(fabs(T[i - 1] - Tm[i]), (-tau[i]));
        }
    }
    
    /*
     * Benchmark driver: fills Tm and tau with randn() values, then times
     * rec_pure_c with the Windows high-resolution performance counter.
     * For each of 1000 rounds it keeps the fastest of 3 runs and finally
     * prints the mean of those best times in milliseconds.
     */
    int main() {
        int N = 100000;
        /* calloc zero-fills, so T[0] (the recurrence seed) starts at 0. */
        double *Tm= calloc(N, sizeof *Tm);
        double *tau = calloc(N, sizeof *tau);
        double *T = calloc(N, sizeof *T);
        double time = 0;     /* best time of the current round, in ms */
        double sumtime = 0;  /* sum of the per-round best times, in ms */
        for (int i = 0; i < N; i++)
        {
            Tm[i] = randn();
            tau[i] = randn();
        }
    
        LARGE_INTEGER StartingTime, EndingTime, ElapsedMicroseconds;
        LARGE_INTEGER Frequency;
        for (int j = 0; j < 1000; j++)
        {
            for (int i = 0; i < 3; i++)
            {
                QueryPerformanceFrequency(&Frequency);
                QueryPerformanceCounter(&StartingTime);
    
                rec_pure_c(Tm, tau, N, T);
    
                QueryPerformanceCounter(&EndingTime);
                /* Convert counter ticks to microseconds; multiply before
                   dividing to keep precision in integer arithmetic. */
                ElapsedMicroseconds.QuadPart = EndingTime.QuadPart - StartingTime.QuadPart;
                ElapsedMicroseconds.QuadPart *= 1000000;
                ElapsedMicroseconds.QuadPart /= Frequency.QuadPart;
                /* Keep the minimum of the three runs (microseconds -> ms). */
                if (i == 0)
                    time = (double)ElapsedMicroseconds.QuadPart / 1000;
                else {
                    if (time > (double)ElapsedMicroseconds.QuadPart / 1000)
                        time = (double)ElapsedMicroseconds.QuadPart / 1000;
                }
            }
            sumtime += time;
        }
        printf("1000 loops,best of 3: %.3f ms per loop\n",sumtime/1000);
    
        free(Tm);
        free(tau);
        free(T);
    }
    

    Fortran f2py. Function can be used from Python. Version from year 2016 (with abs() function):

    ! Evaluate the recurrence
    !     res(i) = tm(i) + abs(res(i-1) - tm(i)) ** (-tau(i)),  res(1) = 0
    ! for i = 2 .. alen and return it in result.
    !   tm, tau : input arrays of length alen
    !   alen    : number of elements
    !   result  : output array of length alen
    subroutine rec_fortran(tm,tau,alen,result)
        integer*8, intent(in) :: alen
        real*8, dimension(alen), intent(in) :: tm
        real*8, dimension(alen), intent(in) :: tau
        ! Local work array; it is copied into result at the end.
        real*8, dimension(alen) :: res
        real*8, dimension(alen), intent(out) :: result
    
        ! Initial condition. The loop index i is not declared, so it takes
        ! the implicit integer type.
        res(1)=0
        do i=2,alen
            res(i) = tm(i) + (abs(res(i-1) - tm(i)))**(-tau(i))
        end do
        result=res    
    end subroutine rec_fortran
    
    0 讨论(0)
提交回复
热议问题