Fast logarithm calculation

后端 未结 1 866
挽巷
挽巷 2021-02-05 11:50

All the code was run on the same machine on linux.

In python:

import numpy as np
drr = abs(np.random.randn(100000,50))
%timeit np.log2(drr)
相关标签:
1条回答
  • 2021-02-05 12:46

    Note that ALL below are float32, not double precision.

    UPDATE: I've ditched gcc completely in favour of Intel's icc. It makes ALL the difference when performance is critical and when you don't have time to fine-tune your "compiler hints" to enforce gcc vectorization (see, e.g. here)

    log_omp.c,

    GCC: gcc -o log_omp.so -fopenmp log_omp.c -lm -O3 -fPIC -shared -std=c99

    ICC: icc -o log_omp.so -openmp loge_omp.c -lm -O3 -fPIC -shared -std=c99 -vec-report1 -xAVX -I/opt/intel/composer/mkl/include

    #include <math.h>
    #include "omp.h"
    #include "mkl_vml.h"
    
    #define restrict __restrict
    
    inline void log_omp(int m, float * restrict a, float * restrict c);
    
    void log_omp(int m, float * restrict a, float * restrict c)
    {
       int i;
    #pragma omp parallel for default(none) shared(m,a,c) private(i)
       for (i=0; i<m; i++) {
          a[i] = log(c[i]);
       }
    }
    
    // VML / icc only:
    void log_VML(int m, float * restrict a, float * restrict c)
    {
       int i;
       int split_to = 14;
       int iter = m / split_to;
       int additional = m % split_to;
    
    //   vsLn(m, c, a);
    #pragma omp parallel for default(none) shared(m,a,c, additional, iter) private(i) num_threads(split_to)
       for (i=0;i < (m-additional); i+=iter)
         vsLog10(iter,c+i,a+i);
         //vmsLn(iter,c+i,a+i, VML_HA);
    
       if (additional > 0)
         vsLog10(additional, c+m-additional, a+m-additional);
         //vmsLn(additional, c+m-additional, a+m-additional, VML_HA);
    }
    

    in python:

    from ctypes import CDLL, c_int, c_void_p
    def log_omp(xs, out):
        lib = CDLL('./log_omp.so')
        lib.log_omp.argtypes = [c_int, np.ctypeslib.ndpointer(dtype=np.float32), np.ctypeslib.ndpointer(dtype=np.float32)]
        lib.log_omp.restype  = c_void_p
        n = xs.shape[0]
        out = np.empty(n, np.float32)
        lib.log_omp(n, out, xs)
        return out
    

    Cython code (in ipython notebook, hence the %% magic):

    %%cython --compile-args=-fopenmp --link-args=-fopenmp
    import  numpy as np
    cimport numpy as np
    from libc.math cimport log
    
    from cython.parallel cimport prange
    import cython
    
    @cython.boundscheck(False)
    def cylog(np.ndarray[np.float32_t, ndim=1] a not None,
            np.ndarray[np.float32_t, ndim=1] out=None):
        if out is None:
            out = np.empty((a.shape[0]), dtype=a.dtype)
        cdef Py_ssize_t i
        with nogil:
            for i in prange(a.shape[0]):
                out[i] = log(a[i])
        return out
    

    Timings:

    numexpr.detect_number_of_cores() // 2
    28
    
    %env OMP_NUM_THREADS=28
    x = np.abs(np.random.randn(50000000).astype('float32'))
    y = x.copy()
    
    # GCC
    %timeit log_omp(x, y)
    10 loops, best of 3: 21.6 ms per loop
    # ICC
    %timeit log_omp(x, y)
    100 loops, best of 3: 9.6 ms per loop
    %timeit log_VML(x, y)
    100 loops, best of 3: 10 ms per loop
    
    %timeit cylog(x, out=y)
    10 loops, best of 3: 21.7 ms per loop
    
    numexpr.set_num_threads(28)
    %timeit out = numexpr.evaluate('log(x)')
    100 loops, best of 3: 13 ms per loop
    

    So numexpr seems to be doing a better job than (poorly) compiled gcc code, but icc wins.

    Some resources I found useful and shamefully used code from:

    http://people.duke.edu/~ccc14/sta-663/Optimization_Bakeoff.html

    https://gist.github.com/zed/2051661

    0 讨论(0)
提交回复
热议问题