SIMD optimization of cvtColor using ARM NEON intrinsics

问题

I'm working on a SIMD optimization of BGR to grayscale conversion which is equivalent to OpenCV's cvtColor() function. There is an Intel SSE version of this function and I'm referring to it. (What I'm doing is basically converting SSE code to NEON code.)

I've almost finished writing the code, and can compile it with g++, but I can't get the proper output. Does anyone have any ideas what the error could be?

What I'm getting (incorrect):

What I should be getting:

Here's my code:

#include <opencv/cv.hpp>
#include <opencv/highgui.h>
#include <arm_neon.h>
//#include <iostream>

using namespace std;
//using namespace cv;

#define int8x16_to_8x8x2(v) ((int8x8x2_t) { vget_low_s8(v), vget_high_s8(v) })

void cvtBGR2GrayNEON(cv::Mat& src, cv::Mat& dest)
{
  const int size = src.size().area()*src.channels();
  uchar* s = src.ptr<uchar>(0);
  uchar* d = dest.ptr<uchar>(0);

  const int8x16_t mask1 = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
  const int8x16_t smask1 = {6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15};
  const int8x16_t ssmask1 = {11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10};

  const int8x16_t mask2 = {0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13};
  const int8x16_t ssmask2 = {0,1,2,3,4,11,12,13,14,15,5,6,7,8,9,10};

  const int8x16_t bmask1 = {255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0};
  const int8x16_t bmask2 = {255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0};
  const int8x16_t bmask3 = {255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0};
  const int8x16_t bmask4 = {255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0};

  const int shift = 8;
  const int amp = 1<<shift;

  const int16_t _R_ = (int16_t)(amp*0.299);
  const int16_t _G_ = (int16_t)(amp*0.587);
  const int16_t _B_ = (int16_t)(amp*0.114);
  const int16x8_t R = vdupq_n_s16(_R_);
  const int16x8_t G = vdupq_n_s16(_G_);
  const int16x8_t B = vdupq_n_s16(_B_);
  const int8x16_t zero = vdupq_n_s8(0);

  for(int i = 0; i < size; i += 48)
    {
      int8x16_t a = vld1q_s8((int8_t *) s + i);
      int8x16_t b = vld1q_s8((int8_t *) s + i + 16);
      int8x16_t c = vld1q_s8((int8_t *) s + i + 32);

      a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a),vget_low_s8(mask1)),vtbl2_s8(int8x16_to_8x8x2(a),vget_high_s8(mask1)));
      b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(mask2)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(mask2)));
      c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(mask2)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(mask2)));

      //BBBBBB
      const int8x16_t aaaa = vbslq_s8(c, vbslq_s8(b, a, bmask1), bmask2);

      a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(a), vget_high_s8(smask1)));
      b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(smask1)));
      c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(smask1)));

      //GGGGGG
      const int8x16_t bbbb = vbslq_s8(c, vbslq_s8(b, a, bmask3), bmask2);

      a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a), vget_low_s8(ssmask1)), vtbl2_s8(int8x16_to_8x8x2(a), vget_high_s8(ssmask1)));
      c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(ssmask1)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(ssmask1)));
      b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(ssmask2)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(ssmask2)));

      //RRRRRR
      const int8x16_t cccc = vbslq_s8(c, vbslq_s8(b, a, bmask3), bmask4);

      /*
      int8x8x2_t a1 = vzip_s8(vget_high_s8(aaaa), vget_high_s8(zero));
      int8x8x2_t a2 = vzip_s8(vget_low_s8(aaaa), vget_low_s8(zero));
      */

      int8x16_t a1 = aaaa;
      int8x16_t a2 = zero;
      int8x16x2_t temp1 =  vzipq_s8(a1, a2);
      a1 = temp1.val[0];
      a2 = temp1.val[1];
      int16x8_t aa1 = vmulq_s16((int16x8_t)a2, B);
      int16x8_t aa2 = vmulq_s16((int16x8_t)a1, B);

      int8x16_t b1 = bbbb;
      int8x16_t b2 = zero;
      int8x16x2_t temp2 =  vzipq_s8(b1, b2);
      b1 = temp2.val[0];
      b2 = temp2.val[1];
      int16x8_t bb1 = vmulq_s16((int16x8_t)b2, G);
      int16x8_t bb2 = vmulq_s16((int16x8_t)b1, G);

      int8x16_t c1 = cccc;
      int8x16_t c2 = zero;
      int8x16x2_t temp3 =  vzipq_s8(c1, c2);
      c1 = temp3.val[0];
      c2 = temp3.val[1];
      int16x8_t cc1 = vmulq_s16((int16x8_t)c2, R);
      int16x8_t cc2 = vmulq_s16((int16x8_t)c1, R);

      aa1 = vaddq_s16(aa1, bb1);
      aa1 = vaddq_s16(aa1, cc1);
      aa2 = vaddq_s16(aa2, bb2);
      aa2 = vaddq_s16(aa2, cc2);

      const int shift1 = 8;
      aa1 = vshrq_n_s16(aa1, shift1);
      aa2 = vshrq_n_s16(aa2, shift1);

      uint8x8_t aaa1 = vqmovun_s16(aa1);
      uint8x8_t aaa2 = vqmovun_s16(aa2);

      uint8x16_t result = vcombine_u8(aaa1, aaa2);

      vst1q_u8((uint8_t *)(d), result);

      d+=16;
    }    
}

int main() 
{
  cv::Mat src = cv::imread("Lenna.bmp");
  cv::Mat dest(src.rows, src.cols, CV_8UC1);

  cvtBGR2GrayNEON(src, dest);

  cv::imwrite("grey.jpg", dest);

  return 0;
}

Here is equivalent SSE code (from here):

void cvtBGR2GraySSEShort(Mat& src, Mat& dest)
{
    const int size = src.size().area()*src.channels();
    uchar* s = src.ptr<uchar>(0);
    uchar* d = dest.ptr<uchar>(0);

    //data structure
    //BGR BGR BGR BGR BGR B
    //GR BGR BGR BGR BGR BG
    //R BGR BGR BGR BGR BGR
    //shuffle to BBBBBBGGGGGRRRRR
    const __m128i mask1 = _mm_setr_epi8(0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14);
    const __m128i smask1 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15);
    const __m128i ssmask1 = _mm_setr_epi8(11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10);

    //shuffle to GGGGGGBBBBBRRRRR
    const __m128i mask2 = _mm_setr_epi8(0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13);
    //const __m128i smask2 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15);same as smask1
    const __m128i ssmask2 = _mm_setr_epi8(0,1,2,3,4,11,12,13,14,15,5,6,7,8,9,10);

    //shuffle to RRRRRRGGGGGBBBBB
    //__m128i mask3 = _mm_setr_epi8(0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13);//same as mask2
    //const __m128i smask3 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,6,7,8,9,10);//same as smask1
    //const __m128i ssmask3 = _mm_setr_epi8(11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10);//same as ssmask1

    //blend mask
    const __m128i bmask1 = _mm_setr_epi8
        (255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0);

    const __m128i bmask2 = _mm_setr_epi8
        (255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0);

    const __m128i bmask3 = _mm_setr_epi8
        (255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0);

    const __m128i bmask4 = _mm_setr_epi8
        (255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0);  

    const int shift = 8;
    const int amp = 1<<shift;
    const int _R_=(int)(amp*0.299);
    const int _G_=(int)(amp*0.587);
    const int _B_=(int)(amp*0.114);
    const __m128i R = _mm_set1_epi16(_R_);
    const __m128i G = _mm_set1_epi16(_G_);
    const __m128i B = _mm_set1_epi16(_B_);
    const __m128i zero = _mm_setzero_si128();   

    for(int i=0;i<size;i+=48)
    {
        __m128i a = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i)),mask1);
        __m128i b = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i+16)),mask2);
        __m128i c = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i+32)),mask2);
        const __m128i aaaa = _mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask1),bmask2);

        a = _mm_shuffle_epi8(a,smask1);
        b = _mm_shuffle_epi8(b,smask1);
        c = _mm_shuffle_epi8(c,smask1);
        const __m128i bbbb =_mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask3),bmask2);

        a = _mm_shuffle_epi8(a,ssmask1);
        c = _mm_shuffle_epi8(c,ssmask1);
        b = _mm_shuffle_epi8(b,ssmask2);
        const __m128i cccc =_mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask3),bmask4);

        __m128i a1 = _mm_unpackhi_epi8(aaaa,zero);
        __m128i a2 = _mm_unpacklo_epi8(aaaa,zero);
        a1 = _mm_mullo_epi16(a1,B);
        a2 = _mm_mullo_epi16(a2,B);
        __m128i b1 = _mm_unpackhi_epi8(bbbb,zero);
        __m128i b2 = _mm_unpacklo_epi8(bbbb,zero);
        b1 = _mm_mullo_epi16(b1,G);
        b2 = _mm_mullo_epi16(b2,G);

        __m128i c1 = _mm_unpackhi_epi8(cccc,zero);
        __m128i c2 = _mm_unpacklo_epi8(cccc,zero);
        c1 = _mm_mullo_epi16(c1,R);
        c2 = _mm_mullo_epi16(c2,R);

        a1 = _mm_add_epi16(a1,b1);
        a1 = _mm_add_epi16(a1,c1);
        a2 = _mm_add_epi16(a2,b2);
        a2 = _mm_add_epi16(a2,c2);

        a1 = _mm_srli_epi16(a1,8);
        a2 = _mm_srli_epi16(a2,8);

        a = _mm_packus_epi16(a1,a2);

        _mm_stream_si128((__m128i*)(d),a);
        d+=16;
    } 
}

回答1:

Ok, below is a FULLY OPTIMIZED version of that function I just wrote (Beware that this function simply returns if size is smaller than 32.)

/*
 *  Created on: 2014. 7. 27.
 *      Author: Jake Lee
 *      Project FANIC - Fastest ARM NEON Implementaion Challenge
 */

// void fanicCvtBGR2GrayNEON(void *pDst, void *pSrc, unsigned int size);
// Y = 0.114*B + 0.587*G + 0.299*R
    .text
    .arm
    .global fanicCvtBGR2GrayNEON

    pDst    .req    r0
    pSrc    .req    r1
    size    .req    r2

    .align 5
    .func
fanicCvtBGR2GrayNEON:
    pld     [pSrc]
    subs    size, size, #32
    pld     [pSrc, #64]
    bxmi    lr
    pld     [pSrc, #64*2]
    vmov.i8     d0, #29
    vmov.i8     d1, #150
    vmov.i8     d2, #77

    .align 5
1:
    vld3.8      {d20, d21, d22}, [pSrc]!
    vld3.8      {d23, d24, d25}, [pSrc]!
    vld3.8      {d26, d27, d28}, [pSrc]!
    vld3.8      {d29, d30, d31}, [pSrc]!

    vmull.u8    q8, d20, d0
    vmlal.u8    q8, d21, d1
    vmlal.u8    q8, d22, d2
    vmull.u8    q9, d23, d0
    vmlal.u8    q9, d24, d1
    vmlal.u8    q9, d25, d2
    vmull.u8    q10, d26, d0
    vmlal.u8    q10, d27, d1
    vmlal.u8    q10, d28, d2
    vmull.u8    q11, d29, d0
    vmlal.u8    q11, d30, d1
    vmlal.u8    q11, d31, d2

    vrshrn.u16  d24, q8, #8
    vrshrn.u16  d25, q9, #8
    vrshrn.u16  d26, q10, #8
    vrshrn.u16  d27, q11, #8

    subs    size, size, #32
    pld     [pSrc, #64*3]
    pld     [pSrc, #64*4]

    vst1.8      {q12, q13}, [pDst]!
    bpl     1b

    cmp     size, #-32
    add     pSrc, pSrc, size
    bxle    lr
    add     pSrc, pSrc, size, lsl #1
    add     pDst, pDst, size
    b       1b

    .endfunc
    .end

As you can see, it's so much easier and shorter writing NEON codes in assembly than in intrinsics despite the heavy unrolling.

Have fun.

来源：https://stackoverflow.com/questions/24977272/simd-optimization-of-cvtcolor-using-arm-neon-intrinsics

标签

c++

OpenCV

arm

sse

neon