问题
I need display a jpeg picture, and convert it to YUV420SP. First I use SkBitmap to parse jpeg and display it, then I use the code below to convert RGB565 to YUV420SP on android, but it spend 75ms to convert a 640*480 RGB565 picture, so anybody know the faster way to convert RGB565 to YUV420SP on android? or faster way to convert jpeg file to YUV420SP on android?
// Convert from RGB to YUV420
int RGB2YUV_YR[256], RGB2YUV_YG[256], RGB2YUV_YB[256];
int RGB2YUV_UR[256], RGB2YUV_UG[256], RGB2YUV_UBVR[256];
int RGB2YUV_VG[256], RGB2YUV_VB[256];
//
// Table used for RGB to YUV420 conversion
//
void InitLookupTable()
{
static bool hasInited = false;
if(hasInited)
return ;
hasInited = true;
int i;
for (i = 0; i < 256; i++)
RGB2YUV_YR[i] = (float) 65.481 * (i << 8);
for (i = 0; i < 256; i++)
RGB2YUV_YG[i] = (float) 128.553 * (i << 8);
for (i = 0; i < 256; i++)
RGB2YUV_YB[i] = (float) 24.966 * (i << 8);
for (i = 0; i < 256; i++)
RGB2YUV_UR[i] = (float) 37.797 * (i << 8);
for (i = 0; i < 256; i++)
RGB2YUV_UG[i] = (float) 74.203 * (i << 8);
for (i = 0; i < 256; i++)
RGB2YUV_VG[i] = (float) 93.786 * (i << 8);
for (i = 0; i < 256; i++)
RGB2YUV_VB[i] = (float) 18.214 * (i << 8);
for (i = 0; i < 256; i++)
RGB2YUV_UBVR[i] = (float) 112 * (i << 8);
}
int ConvertRGB5652YUV420SP(int w, int h, unsigned char *bmp, unsigned char *yuv)
{
unsigned char *u, *v, *y, *uu, *vv;
unsigned char *pu1, *pu2, *pu3, *pu4;
unsigned char *pv1, *pv2, *pv3, *pv4;
unsigned char rValue = 0, gValue = 0, bValue = 0;
uint16_t* bmpPtr;
int i, j;
printf("ConvertRGB5652YUV420SP begin,w=%d,h=%d,bmp=%p,yuv=%p\n", w, h, bmp, yuv);
struct timeval tpstart,tpend;
gettimeofday(&tpstart,NULL);
InitLookupTable();
gettimeofday(&tpend,NULL);
float timeuse=1000000*(tpend.tv_sec-tpstart.tv_sec)+tpend.tv_usec-tpstart.tv_usec;
timeuse/=1000;
printf("InitLookupTable used time=%f\n", timeuse);
gettimeofday(&tpstart,NULL);
uu = new unsigned char[w * h];
vv = new unsigned char[w * h];
if (uu == NULL || vv == NULL || yuv == NULL)
return 0;
y = yuv;
u = uu;
v = vv;
// Get r,g,b pointers from bmp image data....
bmpPtr = (uint16_t*)bmp;
//Get YUV values for rgb values...
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
uint16_t color = *bmpPtr;
unsigned int r = (color>>11) & 0x1f;
unsigned int g = (color>> 5) & 0x3f;
unsigned int b = (color ) & 0x1f;
rValue = (r<<3) | (r>>2);
gValue = (g<<2) | (g>>4);
bValue = (b<<3) | (b>>2);
*y++ = (RGB2YUV_YR[rValue] + RGB2YUV_YG[gValue] + RGB2YUV_YB[bValue] +
1048576) >> 16;
*u++ = (-RGB2YUV_UR[rValue] - RGB2YUV_UG[gValue] + RGB2YUV_UBVR[bValue] +
8388608) >> 16;
*v++ = (RGB2YUV_UBVR[rValue] - RGB2YUV_VG[gValue] - RGB2YUV_VB[bValue] +
8388608) >> 16;
bmpPtr++;
}
}
gettimeofday(&tpend,NULL);
timeuse=1000000*(tpend.tv_sec-tpstart.tv_sec)+tpend.tv_usec-tpstart.tv_usec;
timeuse/=1000;
printf("Get YUV values used time=%f\n", timeuse);
gettimeofday(&tpstart,NULL);
// Now sample the U & V to obtain YUV 4:2:0 format
// Get the right pointers...
u = yuv + w * h;
v = u + 1;
// For U
pu1 = uu;
pu2 = pu1 + 1;
pu3 = pu1 + w;
pu4 = pu3 + 1;
// For V
pv1 = vv;
pv2 = pv1 + 1;
pv3 = pv1 + w;
pv4 = pv3 + 1;
// Do sampling....
for (i = 0; i < h; i += 2) {
for (j = 0; j < w; j += 2) {
*u = (*pu1 + *pu2 + *pu3 + *pu4) >> 2;
u += 2;
*v = (*pv1 + *pv2 + *pv3 + *pv4) >> 2;
v += 2;
pu1 += 2;
pu2 += 2;
pu3 += 2;
pu4 += 2;
pv1 += 2;
pv2 += 2;
pv3 += 2;
pv4 += 2;
}
pu1 += w;
pu2 += w;
pu3 += w;
pu4 += w;
pv1 += w;
pv2 += w;
pv3 += w;
pv4 += w;
}
gettimeofday(&tpend,NULL);
timeuse=1000000*(tpend.tv_sec-tpstart.tv_sec)+tpend.tv_usec-tpstart.tv_usec;
timeuse/=1000;
printf("Do sampling used time=%f\n", timeuse);
gettimeofday(&tpstart,NULL);
delete uu;
delete vv;
return 1;
}
int main(int argc, char **argv) {
unsigned char bmp[640*480*2] = {0};
unsigned char yuv[(640*480*3)/2] = {0};
struct timeval tpstart,tpend;
gettimeofday(&tpstart,NULL);
ConvertRGB5652YUV420SP(640, 480, bmp, yuv);
gettimeofday(&tpend,NULL);
float timeuse=1000000*(tpend.tv_sec-tpstart.tv_sec)+tpend.tv_usec-tpstart.tv_usec;
timeuse/=1000;
printf("ConvertARGB2YUV420SP used time=%f\n", timeuse);
return 0;
}
output on android(armv6):
ConvertRGB5652YUV420SP begin,w=640,h=480,bmp=0xbe7314fc,yuv=0xbe7c74fc
InitLookupTable used time=0.383000
Get YUV values used time=61.394001
Do sampling used time=11.918000
ConvertARGB2YUV420SP used time=74.596001
cpu info:
$ cat /proc/cpuinfo
cat /proc/cpuinfo
Processor : ARMv6-compatible processor rev 5 (v6l)
BogoMIPS : 791.34
Features : swp half thumb fastmult vfp edsp java
CPU implementer : 0x41
CPU architecture: 6TEJ
CPU variant : 0x1
CPU part : 0xb36
CPU revision : 5
Hardware : IMAPX200
Revision : 0000
Serial : 0000000000000000
回答1:
On ARMv7, use NEON. It will do the job in less than 1ms. (VGA)
If you are stuck with ARMv6, optimize it in ARM assembly. (about 8ms on VGA)
Use fixed-point arithmetic instead of the lookup tables. Get rid of them.
make two masks :
- 0x001f001f : mask1
- 0x003f003f : mask2
then load two pixels at once into a 32bit register (which is a lot faster than 16bit read)
and red, mask1, pixel, lsr #11
and grn, mask2, pixel, lsr #5
and blu, mask1, pixel
now you have three registers, each containing two values - one in the lower, and the other in the upper 16 bits.
smulxy instructions will do some miracles from here on. (16bit multiply)
Good luck.
PS : your lookup table isn't that good either. Why are they all in length of 256? You could reduce them to 32 (r and b related) and 64 (g related) Which will increase the cache hit rate. Probably that will just do for the targeted 40ms without resorting to assembly. Yes, cache-misses are THAT painful.
回答2:
I have found a faster way in skia, it runs about 40ms.
#include "SkColorPriv.h"
#include "SkBitmap.h"
#include "SkCanvas.h"
#include "SkStream.h"
using namespace android;
// taken from jcolor.c in libjpeg
#if 0 // 16bit - precise but slow
#define CYR 19595 // 0.299
#define CYG 38470 // 0.587
#define CYB 7471 // 0.114
#define CUR -11059 // -0.16874
#define CUG -21709 // -0.33126
#define CUB 32768 // 0.5
#define CVR 32768 // 0.5
#define CVG -27439 // -0.41869
#define CVB -5329 // -0.08131
#define CSHIFT 16
#else // 8bit - fast, slightly less precise
#define CYR 77 // 0.299
#define CYG 150 // 0.587
#define CYB 29 // 0.114
#define CUR -43 // -0.16874
#define CUG -85 // -0.33126
#define CUB 128 // 0.5
#define CVR 128 // 0.5
#define CVG -107 // -0.41869
#define CVB -21 // -0.08131
#define CSHIFT 8
#endif
static void rgb2yuv_32(uint8_t dst[], SkPMColor c) {
int r = SkGetPackedR32(c);
int g = SkGetPackedG32(c);
int b = SkGetPackedB32(c);
int y = ( CYR*r + CYG*g + CYB*b ) >> CSHIFT;
int u = ( CUR*r + CUG*g + CUB*b ) >> CSHIFT;
int v = ( CVR*r + CVG*g + CVB*b ) >> CSHIFT;
dst[0] = SkToU8(y);
dst[1] = SkToU8(u + 128);
dst[2] = SkToU8(v + 128);
}
static void rgb2yuv_32_x(uint8_t *py, uint8_t *pu, uint8_t *pv, SkPMColor c) {
int r = SkGetPackedR32(c);
int g = SkGetPackedG32(c);
int b = SkGetPackedB32(c);
if(py != NULL){
int y = ( CYR*r + CYG*g + CYB*b ) >> CSHIFT;
*py = SkToU8(y);
}
if(pu != NULL){
int u = ( CUR*r + CUG*g + CUB*b ) >> CSHIFT;
*pu = SkToU8(u + 128);
}
if(pv != NULL){
int v = ( CVR*r + CVG*g + CVB*b ) >> CSHIFT;
*pv = SkToU8(v + 128);
}
}
static void rgb2yuv_4444(uint8_t dst[], U16CPU c) {
int r = SkGetPackedR4444(c);
int g = SkGetPackedG4444(c);
int b = SkGetPackedB4444(c);
int y = ( CYR*r + CYG*g + CYB*b ) >> (CSHIFT - 4);
int u = ( CUR*r + CUG*g + CUB*b ) >> (CSHIFT - 4);
int v = ( CVR*r + CVG*g + CVB*b ) >> (CSHIFT - 4);
dst[0] = SkToU8(y);
dst[1] = SkToU8(u + 128);
dst[2] = SkToU8(v + 128);
}
static void rgb2yuv_4444_x(uint8_t *py, uint8_t *pu, uint8_t *pv, U16CPU c) {
int r = SkGetPackedR4444(c);
int g = SkGetPackedG4444(c);
int b = SkGetPackedB4444(c);
if(py != NULL){
int y = ( CYR*r + CYG*g + CYB*b ) >> (CSHIFT - 4);
*py = SkToU8(y);
}
if(pu != NULL){
int u = ( CUR*r + CUG*g + CUB*b ) >> (CSHIFT - 4);
*pu = SkToU8(u + 128);
}
if(pv != NULL){
int v = ( CVR*r + CVG*g + CVB*b ) >> (CSHIFT - 4);
*pv = SkToU8(v + 128);
}
}
static void rgb2yuv_16(uint8_t dst[], U16CPU c) {
int r = SkGetPackedR16(c);
int g = SkGetPackedG16(c);
int b = SkGetPackedB16(c);
int y = ( 2*CYR*r + CYG*g + 2*CYB*b ) >> (CSHIFT - 2);
int u = ( 2*CUR*r + CUG*g + 2*CUB*b ) >> (CSHIFT - 2);
int v = ( 2*CVR*r + CVG*g + 2*CVB*b ) >> (CSHIFT - 2);
dst[0] = SkToU8(y);
dst[1] = SkToU8(u + 128);
dst[2] = SkToU8(v + 128);
}
static void rgb2yuv_16_x(uint8_t *py, uint8_t *pu, uint8_t *pv, U16CPU c) {
int r = SkGetPackedR16(c);
int g = SkGetPackedG16(c);
int b = SkGetPackedB16(c);
if(py != NULL){
int y = ( 2*CYR*r + CYG*g + 2*CYB*b ) >> (CSHIFT - 2);
*py = SkToU8(y);
}
if(pu != NULL){
int u = ( 2*CUR*r + CUG*g + 2*CUB*b ) >> (CSHIFT - 2);
*pu = SkToU8(u + 128);
}
if(pv != NULL){
int v = ( 2*CVR*r + CVG*g + 2*CVB*b ) >> (CSHIFT - 2);
*pv = SkToU8(v + 128);
}
}
int ConvertRGB5652YUV420SPBySkia(SkBitmap* bmp, unsigned char* dst) {
if(!bmp || !dst || bmp->getConfig() != SkBitmap::kRGB_565_Config)
return -1;
int width = bmp->width();
int height = bmp->height();
void *src = bmp->getPixels();
int src_rowbytes = bmp->rowBytes();
int stride = width;
int dstheight = height;
int i, j;
uint8_t *y_base = (uint8_t *)dst;
uint8_t *cb_base = (uint8_t *)((unsigned int)y_base + stride * dstheight);
uint8_t *cr_base = cb_base + 1;
uint8_t yuv[3];
uint8_t *y = NULL, *cb = NULL, *cr = NULL;
uint16_t *rgb = (uint16_t *)src;
for(i=0; i<height; i++){
rgb = (uint16_t *)((unsigned int)src + i * src_rowbytes);
y = (uint8_t *)((unsigned int)y_base + i * stride);
if((i & 0x1) == 0){
cb = (uint8_t *)((unsigned int)cb_base + ((i>>1) * stride));
cr = cb + 1;
}
for(j=0; j<width; j++){
if(i & 0x1){// valid y and cr
if(j & 0x01){ // only y
rgb2yuv_16_x(y++, NULL, NULL, *rgb++);
}else{ // both y and cr
rgb2yuv_16_x(y++, NULL, cr++, *rgb++);
cr++;
}
}else{// valid y and cb
if(j & 0x01){ // only y
rgb2yuv_16_x(y++, NULL, NULL, *rgb++);
}else{ // both y and cb
rgb2yuv_16_x(y++, cb++, NULL, *rgb++);
cb++;
}
}
}
}
return 0;
}
来源:https://stackoverflow.com/questions/8025621/how-to-convert-rgb565-to-yuv420sp-faster-on-android