How to convert RGB565 to YUV420SP faster on android?

一曲冷凌霜 提交于 2019-12-23 13:07:52

问题


I need to display a JPEG picture and also convert it to YUV420SP. First I use SkBitmap to decode the JPEG and display it, then I use the code below to convert RGB565 to YUV420SP on Android. However, it takes about 75 ms to convert a 640*480 RGB565 picture. Does anybody know a faster way to convert RGB565 to YUV420SP on Android, or a faster way to convert a JPEG file directly to YUV420SP on Android?

// Convert from RGB to YUV420
int RGB2YUV_YR[256], RGB2YUV_YG[256], RGB2YUV_YB[256];
int RGB2YUV_UR[256], RGB2YUV_UG[256], RGB2YUV_UBVR[256];
int RGB2YUV_VG[256], RGB2YUV_VB[256];

//
// Table used for RGB to YUV420 conversion
//
void InitLookupTable()
{
    static bool hasInited = false;
    if(hasInited)
        return ;
    hasInited = true;
    int i;
    for (i = 0; i < 256; i++)
        RGB2YUV_YR[i] = (float) 65.481 * (i << 8);
    for (i = 0; i < 256; i++)
        RGB2YUV_YG[i] = (float) 128.553 * (i << 8);
    for (i = 0; i < 256; i++)
        RGB2YUV_YB[i] = (float) 24.966 * (i << 8);
    for (i = 0; i < 256; i++)
        RGB2YUV_UR[i] = (float) 37.797 * (i << 8);
    for (i = 0; i < 256; i++)
        RGB2YUV_UG[i] = (float) 74.203 * (i << 8);
    for (i = 0; i < 256; i++)
        RGB2YUV_VG[i] = (float) 93.786 * (i << 8);
    for (i = 0; i < 256; i++)
        RGB2YUV_VB[i] = (float) 18.214 * (i << 8);
    for (i = 0; i < 256; i++)
        RGB2YUV_UBVR[i] = (float) 112 * (i << 8);
}

int ConvertRGB5652YUV420SP(int w, int h, unsigned char *bmp, unsigned char *yuv)
{
    unsigned char *u, *v, *y, *uu, *vv;
    unsigned char *pu1, *pu2, *pu3, *pu4;
    unsigned char *pv1, *pv2, *pv3, *pv4;
    unsigned char rValue = 0, gValue = 0, bValue = 0;
    uint16_t* bmpPtr;
    int i, j;
    printf("ConvertRGB5652YUV420SP begin,w=%d,h=%d,bmp=%p,yuv=%p\n", w, h, bmp, yuv);

    struct timeval tpstart,tpend;
    gettimeofday(&tpstart,NULL);

    InitLookupTable();

    gettimeofday(&tpend,NULL);
    float timeuse=1000000*(tpend.tv_sec-tpstart.tv_sec)+tpend.tv_usec-tpstart.tv_usec;
    timeuse/=1000;
    printf("InitLookupTable used time=%f\n", timeuse);
    gettimeofday(&tpstart,NULL);

    uu = new unsigned char[w * h];
    vv = new unsigned char[w * h];
    if (uu == NULL || vv == NULL || yuv == NULL)
        return 0;
    y = yuv;
    u = uu;
    v = vv;
    // Get r,g,b pointers from bmp image data....
    bmpPtr = (uint16_t*)bmp;

    //Get YUV values for rgb values...
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++) {
            uint16_t color = *bmpPtr;
            unsigned int r = (color>>11) & 0x1f;
            unsigned int g = (color>> 5) & 0x3f;
            unsigned int b = (color    ) & 0x1f;
            rValue = (r<<3) | (r>>2);      
            gValue = (g<<2) | (g>>4);   
            bValue = (b<<3) | (b>>2);

            *y++ = (RGB2YUV_YR[rValue] + RGB2YUV_YG[gValue] + RGB2YUV_YB[bValue] +
                1048576) >> 16;
            *u++ = (-RGB2YUV_UR[rValue] - RGB2YUV_UG[gValue] + RGB2YUV_UBVR[bValue] +
                8388608) >> 16;
            *v++ = (RGB2YUV_UBVR[rValue] - RGB2YUV_VG[gValue] - RGB2YUV_VB[bValue] +
                8388608) >> 16;
            bmpPtr++;
        }
    }

    gettimeofday(&tpend,NULL);
    timeuse=1000000*(tpend.tv_sec-tpstart.tv_sec)+tpend.tv_usec-tpstart.tv_usec;
    timeuse/=1000;
    printf("Get YUV values used  time=%f\n", timeuse);
    gettimeofday(&tpstart,NULL);

    // Now sample the U & V to obtain YUV 4:2:0 format
    // Get the right pointers...
    u = yuv + w * h;
    v = u + 1;
    // For U
    pu1 = uu;
    pu2 = pu1 + 1;
    pu3 = pu1 + w;
    pu4 = pu3 + 1;
    // For V
    pv1 = vv;
    pv2 = pv1 + 1;
    pv3 = pv1 + w;
    pv4 = pv3 + 1;
    // Do sampling....
    for (i = 0; i < h; i += 2) {
        for (j = 0; j < w; j += 2) {
            *u = (*pu1 + *pu2 + *pu3 + *pu4) >> 2;
            u += 2;
            *v = (*pv1 + *pv2 + *pv3 + *pv4) >> 2;
            v += 2;
            pu1 += 2;
            pu2 += 2;
            pu3 += 2;
            pu4 += 2;
            pv1 += 2;
            pv2 += 2;
            pv3 += 2;
            pv4 += 2;
        }

        pu1 += w;
        pu2 += w;
        pu3 += w;
        pu4 += w;
        pv1 += w;
        pv2 += w;
        pv3 += w;
        pv4 += w;
    }

    gettimeofday(&tpend,NULL);
    timeuse=1000000*(tpend.tv_sec-tpstart.tv_sec)+tpend.tv_usec-tpstart.tv_usec;
    timeuse/=1000;
    printf("Do sampling used time=%f\n", timeuse);
    gettimeofday(&tpstart,NULL);

    delete uu;
    delete vv;
    return 1;
}

int main(int argc, char **argv) {
    unsigned char bmp[640*480*2] = {0};
    unsigned char yuv[(640*480*3)/2] = {0};
    struct timeval tpstart,tpend;
    gettimeofday(&tpstart,NULL);

    ConvertRGB5652YUV420SP(640, 480, bmp, yuv);

    gettimeofday(&tpend,NULL);
    float timeuse=1000000*(tpend.tv_sec-tpstart.tv_sec)+tpend.tv_usec-tpstart.tv_usec;
    timeuse/=1000;
    printf("ConvertARGB2YUV420SP used time=%f\n", timeuse);
    return 0;
}

output on android(armv6):

ConvertRGB5652YUV420SP begin,w=640,h=480,bmp=0xbe7314fc,yuv=0xbe7c74fc
InitLookupTable used time=0.383000
Get YUV values used  time=61.394001
Do sampling used time=11.918000
ConvertARGB2YUV420SP used time=74.596001

cpu info:

$ cat /proc/cpuinfo
cat /proc/cpuinfo
Processor       : ARMv6-compatible processor rev 5 (v6l)
BogoMIPS        : 791.34
Features        : swp half thumb fastmult vfp edsp java
CPU implementer : 0x41
CPU architecture: 6TEJ
CPU variant     : 0x1
CPU part        : 0xb36
CPU revision    : 5

Hardware        : IMAPX200
Revision        : 0000
Serial          : 0000000000000000

回答1:


On ARMv7, use NEON. It will do the job in less than 1ms. (VGA)

If you are stuck with ARMv6, optimize it in ARM assembly. (about 8ms on VGA)

Use fixed-point arithmetic instead of the lookup tables. Get rid of them.

make two masks :

  • 0x001f001f : mask1
  • 0x003f003f : mask2

then load two pixels at once into a 32bit register (which is a lot faster than 16bit read)

and red, mask1, pixel, lsr #11
and grn, mask2, pixel, lsr #5
and blu, mask1, pixel

now you have three registers, each containing two values - one in the lower, and the other in the upper 16 bits.

smulxy instructions will do some miracles from here on. (16bit multiply)

Good luck.

PS: your lookup tables aren't ideal either. Why are they all 256 entries long? You could reduce them to 32 entries (for the R- and B-related tables) and 64 entries (for the G-related tables), which will increase the cache hit rate. That alone will probably be enough to reach the targeted 40 ms without resorting to assembly. Yes, cache misses are THAT painful.




回答2:


I have found a faster way using Skia; it runs in about 40 ms.

#include "SkColorPriv.h"
#include "SkBitmap.h"
#include "SkCanvas.h"
#include "SkStream.h"

using namespace android;

// taken from jcolor.c in libjpeg
#if 0   // 16bit - precise but slow
    #define CYR     19595   // 0.299
    #define CYG     38470   // 0.587
    #define CYB      7471   // 0.114

    #define CUR    -11059   // -0.16874
    #define CUG    -21709   // -0.33126
    #define CUB     32768   // 0.5

    #define CVR     32768   // 0.5
    #define CVG    -27439   // -0.41869
    #define CVB     -5329   // -0.08131

    #define CSHIFT  16
#else      // 8bit - fast, slightly less precise
    #define CYR     77    // 0.299
    #define CYG     150    // 0.587
    #define CYB      29    // 0.114

    #define CUR     -43    // -0.16874
    #define CUG    -85    // -0.33126
    #define CUB     128    // 0.5

    #define CVR      128   // 0.5
    #define CVG     -107   // -0.41869
    #define CVB      -21   // -0.08131

    #define CSHIFT  8
#endif

static void rgb2yuv_32(uint8_t dst[], SkPMColor c) {
    int r = SkGetPackedR32(c);
    int g = SkGetPackedG32(c);
    int b = SkGetPackedB32(c);

    int  y = ( CYR*r + CYG*g + CYB*b ) >> CSHIFT;
    int  u = ( CUR*r + CUG*g + CUB*b ) >> CSHIFT;
    int  v = ( CVR*r + CVG*g + CVB*b ) >> CSHIFT;

    dst[0] = SkToU8(y);
    dst[1] = SkToU8(u + 128);
    dst[2] = SkToU8(v + 128);
}

static void rgb2yuv_32_x(uint8_t *py, uint8_t *pu, uint8_t *pv, SkPMColor c) {
    int r = SkGetPackedR32(c);
    int g = SkGetPackedG32(c);
    int b = SkGetPackedB32(c);

    if(py != NULL){
         int y = ( CYR*r + CYG*g + CYB*b ) >> CSHIFT;
     *py = SkToU8(y);
    }
    if(pu != NULL){
        int  u = ( CUR*r + CUG*g + CUB*b ) >> CSHIFT;
    *pu = SkToU8(u + 128);
    }
    if(pv != NULL){
        int  v = ( CVR*r + CVG*g + CVB*b ) >> CSHIFT;
    *pv = SkToU8(v + 128);
    }
}

static void rgb2yuv_4444(uint8_t dst[], U16CPU c) {
    int r = SkGetPackedR4444(c);
    int g = SkGetPackedG4444(c);
    int b = SkGetPackedB4444(c);

    int  y = ( CYR*r + CYG*g + CYB*b ) >> (CSHIFT - 4);
    int  u = ( CUR*r + CUG*g + CUB*b ) >> (CSHIFT - 4);
    int  v = ( CVR*r + CVG*g + CVB*b ) >> (CSHIFT - 4);

    dst[0] = SkToU8(y);
    dst[1] = SkToU8(u + 128);
    dst[2] = SkToU8(v + 128);
}

static void rgb2yuv_4444_x(uint8_t *py, uint8_t *pu, uint8_t *pv, U16CPU c) {
    int r = SkGetPackedR4444(c);
    int g = SkGetPackedG4444(c);
    int b = SkGetPackedB4444(c);

    if(py != NULL){
        int  y = ( CYR*r + CYG*g + CYB*b ) >> (CSHIFT - 4);
    *py = SkToU8(y);
    }
    if(pu != NULL){
        int  u = ( CUR*r + CUG*g + CUB*b ) >> (CSHIFT - 4);
    *pu = SkToU8(u + 128);
    }
    if(pv != NULL){
        int  v = ( CVR*r + CVG*g + CVB*b ) >> (CSHIFT - 4);
    *pv = SkToU8(v + 128);
    }
}

static void rgb2yuv_16(uint8_t dst[], U16CPU c) {
    int r = SkGetPackedR16(c);
    int g = SkGetPackedG16(c);
    int b = SkGetPackedB16(c);

    int  y = ( 2*CYR*r + CYG*g + 2*CYB*b ) >> (CSHIFT - 2);
    int  u = ( 2*CUR*r + CUG*g + 2*CUB*b ) >> (CSHIFT - 2);
    int  v = ( 2*CVR*r + CVG*g + 2*CVB*b ) >> (CSHIFT - 2);

    dst[0] = SkToU8(y);
    dst[1] = SkToU8(u + 128);
    dst[2] = SkToU8(v + 128);
}

static void rgb2yuv_16_x(uint8_t *py, uint8_t *pu, uint8_t *pv, U16CPU c) {
    int r = SkGetPackedR16(c);
    int g = SkGetPackedG16(c);
    int b = SkGetPackedB16(c);

    if(py != NULL){
        int  y = ( 2*CYR*r + CYG*g + 2*CYB*b ) >> (CSHIFT - 2);
        *py = SkToU8(y);
    }
    if(pu != NULL){
        int  u = ( 2*CUR*r + CUG*g + 2*CUB*b ) >> (CSHIFT - 2);
        *pu = SkToU8(u + 128);
    }
    if(pv != NULL){
        int  v = ( 2*CVR*r + CVG*g + 2*CVB*b ) >> (CSHIFT - 2);
        *pv = SkToU8(v + 128);
    }
}


int ConvertRGB5652YUV420SPBySkia(SkBitmap* bmp, unsigned char* dst) {
    if(!bmp || !dst || bmp->getConfig() != SkBitmap::kRGB_565_Config)
         return -1;
    int width = bmp->width();
    int height = bmp->height();
    void *src = bmp->getPixels();
    int src_rowbytes = bmp->rowBytes();
    int stride = width;
    int dstheight = height;
    int i, j;
    uint8_t *y_base = (uint8_t *)dst;
    uint8_t *cb_base = (uint8_t *)((unsigned int)y_base + stride * dstheight);
    uint8_t *cr_base = cb_base + 1; 
    uint8_t yuv[3];
    uint8_t *y = NULL, *cb = NULL, *cr = NULL;
    uint16_t *rgb = (uint16_t *)src;
    for(i=0; i<height; i++){
        rgb = (uint16_t *)((unsigned int)src + i * src_rowbytes);
        y = (uint8_t *)((unsigned int)y_base + i * stride);
        if((i & 0x1) == 0){
            cb = (uint8_t *)((unsigned int)cb_base + ((i>>1) * stride));
            cr = cb +  1;
        }
        for(j=0; j<width; j++){
            if(i & 0x1){// valid y and cr
                if(j & 0x01){   // only y
                        rgb2yuv_16_x(y++, NULL, NULL, *rgb++);
                }else{  // both y and cr
                        rgb2yuv_16_x(y++, NULL, cr++, *rgb++);
                        cr++;
                }
            }else{// valid y and cb
                if(j & 0x01){   // only y
                        rgb2yuv_16_x(y++, NULL, NULL, *rgb++);
                }else{  // both y and cb
                        rgb2yuv_16_x(y++, cb++, NULL, *rgb++);
                        cb++;
                }
            }

        }
    }
    return 0;
}


来源:https://stackoverflow.com/questions/8025621/how-to-convert-rgb565-to-yuv420sp-faster-on-android

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!