I'm working on a SIMD optimization of BGR to grayscale conversion which is equivalent to OpenCV's cvtColor() function. There is an Intel SSE version of this function and I'm
OK, below is a fully optimized version of that function that I just wrote. (Beware: the function simply returns without converting anything if size is smaller than 32.)
/*
* Created on: 2014. 7. 27.
* Author: Jake Lee
* Project FANIC - Fastest ARM NEON Implementation Challenge
*/
/*
 * void fanicCvtBGR2GrayNEON(void *pDst, void *pSrc, unsigned int size);
 *
 * Converts packed 8-bit B,G,R triplets at pSrc into 8-bit gray at pDst:
 *     Y = 0.114*B + 0.587*G + 0.299*R
 * evaluated in fixed point as
 *     Y = (29*B + 150*G + 77*R + 128) >> 8
 * The weights sum to 256, so the 16-bit accumulators cannot overflow.
 *
 * Target: ARM (A32) state, NEON, GNU as syntax.
 * In:     r0 = pDst, r1 = pSrc, r2 = size (number of pixels)
 * Out:    nothing
 * Clobb:  r0-r2, d0-d2, d16-d31, flags. No stack use; d8-d15 are untouched.
 *
 * Behaviour to be aware of:
 *  - size < 32: returns immediately and writes nothing.
 *  - size is tested with signed conditions (mi/pl/le), so a count with
 *    bit 31 set is handled like size < 32.
 *  - size not a multiple of 32: the last 1..31 pixels are handled by
 *    stepping both pointers back and running one more full 32-pixel block
 *    that ends exactly on the last pixel. The overlapped output bytes are
 *    simply written a second time.
 *    NOTE(review): the overlapped pass re-reads source pixels, so it
 *    presumably requires that pDst does not alias pSrc - confirm with callers.
 */
    .text
    .arm
    .fpu    neon                            // assemble NEON mnemonics without relying on -mfpu=neon
    .global fanicCvtBGR2GrayNEON
    .type   fanicCvtBGR2GrayNEON, %function // typed as code so the linker can interwork calls from Thumb

pDst    .req    r0
pSrc    .req    r1
size    .req    r2

    .align  5
    .func   fanicCvtBGR2GrayNEON
fanicCvtBGR2GrayNEON:
    pld     [pSrc]
    subs    size, size, #32                 // size = pixels left after the first 32-pixel block
    pld     [pSrc, #64]                     // pld does not touch the flags set by subs
    bxmi    lr                              // fewer than 32 pixels: nothing is converted
    pld     [pSrc, #64*2]
    vmov.i8 d0, #29                         // B weight = round(0.114 * 256)
    vmov.i8 d1, #150                        // G weight = round(0.587 * 256)
    vmov.i8 d2, #77                         // R weight = round(0.299 * 256)

    .align  5
1:
    // Load 32 pixels (96 bytes), de-interleaved into B / G / R registers.
    vld3.8  {d20, d21, d22}, [pSrc]!        // pixels  0..7
    vld3.8  {d23, d24, d25}, [pSrc]!        // pixels  8..15
    vld3.8  {d26, d27, d28}, [pSrc]!        // pixels 16..23
    vld3.8  {d29, d30, d31}, [pSrc]!        // pixels 24..31

    // 16-bit weighted sums, one q register per group of 8 pixels.
    // q10 (d20:d21) and q11 (d22:d23) overwrite registers whose loaded
    // bytes have already been consumed, so this instruction order matters.
    vmull.u8    q8, d20, d0
    vmlal.u8    q8, d21, d1
    vmlal.u8    q8, d22, d2
    vmull.u8    q9, d23, d0
    vmlal.u8    q9, d24, d1
    vmlal.u8    q9, d25, d2
    vmull.u8    q10, d26, d0
    vmlal.u8    q10, d27, d1
    vmlal.u8    q10, d28, d2
    vmull.u8    q11, d29, d0
    vmlal.u8    q11, d30, d1
    vmlal.u8    q11, d31, d2

    // Rounding narrow (sum + 128) >> 8 into q12:q13 (d24..d27); those
    // registers held G/R/B input bytes that are no longer needed.
    vrshrn.u16  d24, q8, #8
    vrshrn.u16  d25, q9, #8
    vrshrn.u16  d26, q10, #8
    vrshrn.u16  d27, q11, #8

    subs    size, size, #32                 // flags for bpl; the pld/vst1 below leave them intact
    pld     [pSrc, #64*3]
    pld     [pSrc, #64*4]
    vst1.8  {q12, q13}, [pDst]!             // store 32 gray pixels
    bpl     1b                              // at least 32 more pixels remain

    // Here size is -32 if the count was a multiple of 32, -31..-1 if
    // 1..31 pixels remain, and below -32 once the overlapped pass has run.
    cmp     size, #-32
    add     pSrc, pSrc, size                // first third of the 3*size rewind; does not alter the cmp flags
    bxle    lr                              // nothing left, or tail already done
    add     pSrc, pSrc, size, lsl #1        // pSrc is now stepped back by 3*|size| bytes
    add     pDst, pDst, size                // pDst is stepped back by |size| bytes
    b       1b                              // one overlapped block ending on the last pixel

    .size   fanicCvtBGR2GrayNEON, . - fanicCvtBGR2GrayNEON
    .endfunc
    .end
As you can see, writing NEON code in assembly is much easier and shorter than writing it with intrinsics, despite the heavy unrolling.
Have fun.