SIMD optimization of cvtColor using ARM NEON intrinsics

前端 未结 1 1365
闹比i
闹比i 2021-02-14 07:49

I'm working on a SIMD optimization of BGR to grayscale conversion which is equivalent to OpenCV's cvtColor() function. There is an Intel SSE version of this function and I'm…

1条回答
  •  梦毁少年i
    2021-02-14 08:19

    Ok, below is a FULLY OPTIMIZED version of that function I just wrote (Beware that this function simply returns if size is smaller than 32.)

    /*
     *  Created on: 2014. 7. 27.
     *      Author: Jake Lee
     *      Project FANIC - Fastest ARM NEON Implementation Challenge
     *
     *  void fanicCvtBGR2GrayNEON(void *pDst, void *pSrc, unsigned int size);
     *
     *  Converts `size` interleaved 3-byte B,G,R pixels at pSrc into `size`
     *  8-bit gray values at pDst:
     *      Y = 0.114*B + 0.587*G + 0.299*R
     *  evaluated in 8.8 fixed point as (29*B + 150*G + 77*R + 128) >> 8.
     *
     *  Syntax: GNU as, ARM (A32) instruction set, NEON required.
     *  In:     r0 = pDst, r1 = pSrc, r2 = size (pixel count)
     *  Out:    none
     *  Clobb:  r0-r2, flags, d0-d2, d16-d31 (d8-d15 and the stack are untouched)
     *
     *  Notes:
     *    - Returns immediately, writing nothing, when size < 32.
     *    - Pixels are processed 32 at a time. When size is not a multiple
     *      of 32, the pointers are rewound so that one extra pass covers
     *      exactly the last 32 pixels; some output bytes are therefore
     *      computed and stored twice.
     */
        .text
        .arch   armv7-a
        .fpu    neon
        .arm
        .global fanicCvtBGR2GrayNEON
        .type   fanicCvtBGR2GrayNEON, %function     @ typed symbol, so the linker can interwork with Thumb callers
    
        pDst    .req    r0
        pSrc    .req    r1
        size    .req    r2
    
        .align 5
        .func   fanicCvtBGR2GrayNEON
    fanicCvtBGR2GrayNEON:
        pld     [pSrc]
        subs    size, size, #32             @ size = count - 32 (biased by one block)
        pld     [pSrc, #64]
        bxmi    lr                          @ fewer than 32 pixels: nothing is done
        pld     [pSrc, #64*2]
        vmov.i8     d0, #29                 @ B weight: 0.114 * 256
        vmov.i8     d1, #150                @ G weight: 0.587 * 256
        vmov.i8     d2, #77                 @ R weight: 0.299 * 256
    
        .align 5
    1:
        @ Load 4 x 8 pixels, de-interleaved into B / G / R planes.
        vld3.8      {d20, d21, d22}, [pSrc]!
        vld3.8      {d23, d24, d25}, [pSrc]!
        vld3.8      {d26, d27, d28}, [pSrc]!
        vld3.8      {d29, d30, d31}, [pSrc]!
    
        @ 16-bit weighted sums. q10/q11 alias d20-d23, which have already
        @ been consumed by the q8/q9 sums when they are overwritten.
        vmull.u8    q8, d20, d0
        vmlal.u8    q8, d21, d1
        vmlal.u8    q8, d22, d2
        vmull.u8    q9, d23, d0
        vmlal.u8    q9, d24, d1
        vmlal.u8    q9, d25, d2
        vmull.u8    q10, d26, d0
        vmlal.u8    q10, d27, d1
        vmlal.u8    q10, d28, d2
        vmull.u8    q11, d29, d0
        vmlal.u8    q11, d30, d1
        vmlal.u8    q11, d31, d2
    
        @ Round, shift right by 8 and narrow to bytes in q12:q13 (d24-d27).
        vrshrn.i16  d24, q8, #8
        vrshrn.i16  d25, q9, #8
        vrshrn.i16  d26, q10, #8
        vrshrn.i16  d27, q11, #8
    
        subs    size, size, #32             @ flags stay live until the bpl below
        pld     [pSrc, #64*3]
        pld     [pSrc, #64*4]
    
        vst1.8      {q12, q13}, [pDst]!
        bpl     1b                          @ at least one more full block remains
    
        @ First time here size is in [-32, -1]: -32 means the count was a
        @ multiple of 32. After the extra tail pass size is below -32, so
        @ the same test also ends the function then.
        cmp     size, #-32
        add     pSrc, pSrc, size            @ first third of the rewind (harmless if we return)
        bxle    lr
        add     pSrc, pSrc, size, lsl #1    @ pSrc += 3 * size in total (size is negative)
        add     pDst, pDst, size            @ pDst += size
        b       1b                          @ one pass over the last 32 pixels
    
        .size   fanicCvtBGR2GrayNEON, . - fanicCvtBGR2GrayNEON
        .endfunc
        .end
    

    As you can see, writing NEON code in assembly is much easier and shorter than writing it with intrinsics, despite the heavy unrolling.

    Have fun.

    0 讨论(0)
提交回复
热议问题