Fast memory transpose with SSE, AVX, and OpenMP

前端 未结 3 351
清歌不尽
清歌不尽 2020-12-30 08:22

I need a fast memory transpose algorithm for my Gaussian convolution function in C/C++. What I do now is

convolute_1D
transpose
convolute_1D
transpose
         


        
3条回答
  •  生来不讨喜
    2020-12-30 09:16

    Consider this 4x4 transpose.

    // 4x4 single-precision matrix whose storage can be viewed three ways.
    // All union members overlay the same bytes:
    //   f - sixteen floats addressed as f[row][col] (row-major by array layout)
    //   m - four 128-bit vectors; m[i] overlays f[i][0..3]
    //   n - two 256-bit vectors; n[0] overlays f[0] and f[1], n[1] overlays f[2] and f[3]
    // NOTE(review): writing one member and reading another is type punning through
    // a union; this relies on compiler support rather than a strict C++ guarantee.
    struct MATRIX {
        union {
            float  f[4][4];   // scalar element view
            __m128 m[4];      // one SSE register per row
            __m256 n[2];      // one AVX register per pair of rows
        };
    };
    // Returns the transpose of the 4x4 float matrix `in`, computed on its two
    // 256-bit halves with AVX unpack and 128-bit-lane permute operations.
    //
    // With the input numbered row by row,
    //      1  2  3  4 |  5  6  7  8      (in.n[0])
    //      9 10 11 12 | 13 14 15 16      (in.n[1])
    // the result is
    //      1  5  9 13 |  2  6 10 14      (n[0])
    //      3  7 11 15 |  4  8 12 16      (n[1])
    // where `|` marks the boundary between the two 128-bit lanes of a register.
    //
    // The original author reports that this compiles to 15 instructions when not
    // inlined and outperforms XMTranspose; that claim has not been re-measured here.
    MATRIX myTranspose(MATRIX in) {
        const __m256 upper = in.n[0];                                        //  1  2  3  4 |  5  6  7  8
        const __m256 lower = in.n[1];                                        //  9 10 11 12 | 13 14 15 16

        // Unpack interleaves element pairs independently inside each 128-bit lane.
        const __m256 pairLo = _mm256_unpacklo_ps(upper, lower);              //  1  9  2 10 |  5 13  6 14
        const __m256 pairHi = _mm256_unpackhi_ps(upper, lower);              //  3 11  4 12 |  7 15  8 16

        // 0x20 joins the low lanes of both operands, 0x31 joins their high lanes.
        const __m256 mix02 = _mm256_permute2f128_ps(pairLo, pairHi, 0x20);   //  1  9  2 10 |  3 11  4 12
        const __m256 mix13 = _mm256_permute2f128_ps(pairLo, pairHi, 0x31);   //  5 13  6 14 |  7 15  8 16

        // A second interleave brings the four elements of each output row together.
        const __m256 colsEven = _mm256_unpacklo_ps(mix02, mix13);            //  1  5  9 13 |  3  7 11 15
        const __m256 colsOdd  = _mm256_unpackhi_ps(mix02, mix13);            //  2  6 10 14 |  4  8 12 16

        // The final lane regrouping puts the output rows in order.
        MATRIX transposed;
        transposed.n[0] = _mm256_permute2f128_ps(colsEven, colsOdd, 0x20);   //  1  5  9 13 |  2  6 10 14
        transposed.n[1] = _mm256_permute2f128_ps(colsEven, colsOdd, 0x31);   //  3  7 11 15 |  4  8 12 16
        return transposed;
    }
    

提交回复
热议问题