128-bit shifts using assembly language?

后端 未结 2 612
南笙
南笙 2021-01-05 01:08

What is the most efficient way to do 128 bit shift on a modern Intel CPU (core i7, sandy bridge).

Code similar to the following is in my innermost loop:

u128 a[N];         


        
相关标签:
2条回答
  • 2021-01-05 01:36

    In this particular case you could use a combination of x86 SHR and RCR instructions:

    ; 128-bit logical right shift by 1 bit, applied twice (32-bit x86).
    ; a0..a3 / b0..b3 / c0..c3 stand for the 32-bit memory operands that
    ; hold the four dwords of a[i], b[i] and c[i].
    ; NOTE(review): ebx is callee-saved in the common 32-bit calling
    ; conventions - save/restore it if this fragment lives in a function.
    ;
    ; a0 - bits 0-31 of a[i]
    ; a1 - bits 32-63 of a[i]
    ; a2 - bits 64-95 of a[i]
    ; a3 - bits 96-127 of a[i]
    mov eax, a0
    mov ebx, a1
    mov ecx, a2
    mov edx, a3                 ; most significant dword lives in edx
    
    ; A right shift has to run from the most significant dword downwards:
    ; SHR moves bit 0 of edx into CF, then each RCR rotates CF into bit 31
    ; of the next lower dword while moving that dword's bit 0 into CF.
    shr edx, 1
    rcr ecx, 1
    rcr ebx, 1
    rcr eax, 1
    
    ; b0 - bits 0-31 of b[i] := a[i] >> 1
    ; b1 - bits 32-63 of b[i] := a[i] >> 1
    ; b2 - bits 64-95 of b[i] := a[i] >> 1
    ; b3 - bits 96-127 of b[i] := a[i] >> 1
    mov b0, eax
    mov b1, ebx
    mov b2, ecx
    mov b3, edx
    
    ; The registers still hold b[i], so one more 1-bit shift gives a[i] >> 2.
    shr edx, 1
    rcr ecx, 1
    rcr ebx, 1
    rcr eax, 1
    
    ; c0 - bits 0-31 of c[i] := a[i] >> 2 = b[i] >> 1
    ; c1 - bits 32-63 of c[i] := a[i] >> 2 = b[i] >> 1
    ; c2 - bits 64-95 of c[i] := a[i] >> 2 = b[i] >> 1
    ; c3 - bits 96-127 of c[i] := a[i] >> 2 = b[i] >> 1
    mov c0, eax
    mov c1, ebx
    mov c2, ecx
    mov c3, edx
    

    If your target is x86-64 this simplifies to:

    ; 128-bit logical right shift by 1 bit, applied twice (x86-64).
    ; a0/a1, b0/b1, c0/c1 stand for the 64-bit memory operands that hold
    ; the two qwords of a[i], b[i] and c[i].
    ; NOTE(review): rbx is callee-saved in both the SysV and Microsoft x64
    ; ABIs - save/restore it if this fragment lives in a function.
    ;
    ; a0 - bits 0-63 of a[i]
    ; a1 - bits 64-127 of a[i]
    mov rax, a0
    mov rbx, a1
    
    ; Shift the high qword first: SHR moves bit 0 of rbx into CF and RCR
    ; rotates that carry into bit 63 of the low qword.
    shr rbx, 1
    rcr rax, 1
    
    ; b0 - bits 0-63 of b[i] := a[i] >> 1
    ; b1 - bits 64-127 of b[i] := a[i] >> 1
    mov b0, rax
    mov b1, rbx
    
    ; The registers still hold b[i], so one more 1-bit shift gives a[i] >> 2.
    shr rbx, 1
    rcr rax, 1
    
    ; c0 - bits 0-63 of c[i] := a[i] >> 2 = b[i] >> 1
    ; c1 - bits 64-127 of c[i] := a[i] >> 2 = b[i] >> 1
    mov c0, rax
    mov c1, rbx
    

    Update: corrected typos in 64-bit version

    0 讨论(0)
  • 2021-01-05 01:46

    Use the Shift Double instructions.

    That is, use the SHLD or SHRD instruction, because SSE isn't intended for this purpose. This is the classic method; below are test cases for a 128-bit left shift by 16 bits in 32-bit and 64-bit CPU modes.

    In this way you can shift a value of any width by up to 31 bits (32-bit mode) or 63 bits (64-bit mode) per pass. You can shift by an immediate number of bits or by the count held in the cl register. The first instruction operand can also be a variable in memory.

    128 bit left shift by 16 bits under 32 bit x86 CPU mode:

        ; Test case: 128-bit left shift by 16 bits using four 32-bit registers.
        ; The 128-bit value is edx:ecx:ebx:eax (eax = least significant dword),
        ; i.e. 0x100F0E0D_0C0B0A09_08070605_04030201.
        mov     eax, $04030201;
        mov     ebx, $08070605;
        mov     ecx, $0C0B0A09;
        mov     edx, $100F0E0D;
    
        ; Work from the most significant dword down: each SHLD shifts its
        ; destination left by 16 and fills the vacated low bits from the top
        ; 16 bits of the next lower dword, which has not been modified yet.
        ; The final SHL shifts zeros into the least significant dword.
        ; Result: edx:ecx:ebx:eax = 0x0E0D0C0B_0A090807_06050403_02010000.
        shld    edx, ecx, 16
        shld    ecx, ebx, 16
        shld    ebx, eax, 16
        shl     eax, 16
    

    And 128 bit left shift by 16 bits under 64 bit x86 CPU mode:

        ; Test case: 128-bit left shift by 16 bits using two 64-bit registers.
        ; The 128-bit value is rdx:rax (rax = least significant qword),
        ; i.e. 0x100F0E0D0C0B0A09_0807060504030201 - the same value as the
        ; 32-bit test case.
        mov    rax, $0807060504030201;
        mov    rdx, $100F0E0D0C0B0A09;
    
        ; Shift the high qword first so SHLD can pull its low 16 bits from the
        ; still-unmodified top of rax; then shift zeros into rax.
        ; Result: rdx:rax = 0x0E0D0C0B0A090807_0605040302010000.
        shld   rdx, rax, 16
        shl    rax, 16
    
    0 讨论(0)
提交回复
热议问题