Most performant way to subtract one array from another

后端 未结 5 962
别跟我提以往
别跟我提以往 2021-02-08 23:32

I have the following code which is the bottleneck in one part of my application. All I do is subtract on Array from another. Both of these arrays have more around 100000 element

5条回答
  •  面向向阳花
    2021-02-08 23:58

    I was very curious about speed optimisation in this simple case. So I have made 6 simple procedures and measure CPU tick and time at array size 100000;

    1. Pascal procedure with compiler option Range and Overflow Checking On
    2. Pascal procedure with compiler option Range and Overflow Checking off
    3. Classic x86 assembler procedure.
    4. Assembler procedure with SSE instructions and unaligned 16 byte move.
    5. Assembler procedure with SSE instructions and aligned 16 byte move.
    6. Assembler 8 times unrolled loop with SSE instructions and aligned 16 byte move.

    Check results on picture and code for more information. enter image description here

    To get 16 byte memory alignment first delite the dot in file 'FastMM4Options.inc' directive {$.define Align16Bytes} !

    program SubTest;
    
    {$APPTYPE CONSOLE}
    
    uses
    //In file 'FastMM4Options.inc' delite the dot in directive {$.define Align16Bytes}
    //to get 16 byte memory alignment!
      FastMM4,
      windows,
      SysUtils;
    
    var
      Ar1                   :array of integer;
      Ar2                   :array of integer;
      ArLength              :integer;
      StartTicks            :int64;
      EndTicks              :int64;
      TicksPerMicroSecond   :int64;
    
    function GetCpuTicks: int64;
    asm
      rdtsc
    end;
    {$R+}
    {$Q+}
    procedure SubArPasRangeOvfChkOn(length: integer);
    var
      n: integer;
    begin
      for n := 0 to length -1 do
        Ar1[n] := Ar1[n] - Ar2[n];
    end;
    {$R-}
    {$Q-}
    procedure SubArPas(length: integer);
    var
      n: integer;
    begin
      for n := 0 to length -1 do
        Ar1[n] := Ar1[n] - Ar2[n];
    end;
    
    procedure SubArAsm(var ar1, ar2; length: integer);
    asm
    //Length must be > than 0!
       push ebx
       lea  ar1, ar1 - 4
       lea  ar2, ar2 - 4
    @Loop:
       mov  ebx, [ar2 + length * 4]
       sub  [ar1 + length * 4], ebx
       dec  length
       jnz  @Loop
    @exit:
       pop  ebx
    end;
    
    procedure SubArAsmSimdU(var ar1, ar2; length: integer);
    asm
    //Prepare length
      push     length
      shr      length, 2
      jz       @Finish
    @Loop:
      movdqu   xmm1, [ar1]
      movdqu   xmm2, [ar2]
      psubd    xmm1, xmm2
      movdqu   [ar1], xmm1
      add      ar1, 16
      add      ar2, 16
      dec      length
      jnz      @Loop
    @Finish:
      pop      length
      push     ebx
      and      length, 3
      jz       @Exit
    //Do rest, up to 3 subtractions...
      mov      ebx, [ar2]
      sub      [ar1], ebx
      dec      length
      jz       @Exit
      mov      ebx, [ar2 + 4]
      sub      [ar1 + 4], ebx
      dec      length
      jz       @Exit
      mov      ebx, [ar2 + 8]
      sub      [ar1 + 8], ebx
    @Exit:
      pop      ebx
    end;
    
    procedure SubArAsmSimdA(var ar1, ar2; length: integer);
    asm
      push     ebx
    //Unfortunately delphi use first 8 bytes for dinamic array length and reference
    //counter, from that reason the dinamic array address should start with $xxxxxxx8
    //instead &xxxxxxx0. So we must first align ar1, ar2 pointers!
      mov      ebx, [ar2]
      sub      [ar1], ebx
      dec      length
      jz       @exit
      mov      ebx, [ar2 + 4]
      sub      [ar1 + 4], ebx
      dec      length
      jz       @exit
      add      ar1, 8
      add      ar2, 8
    //Prepare length for 16 byte data transfer
      push     length
      shr      length, 2
      jz       @Finish
    @Loop:
      movdqa   xmm1, [ar1]
      movdqa   xmm2, [ar2]
      psubd    xmm1, xmm2
      movdqa   [ar1], xmm1
      add      ar1, 16
      add      ar2, 16
      dec      length
      jnz      @Loop
    @Finish:
      pop      length
      and      length, 3
      jz       @Exit
    //Do rest, up to 3 subtractions...
      mov      ebx, [ar2]
      sub      [ar1], ebx
      dec      length
      jz       @Exit
      mov      ebx, [ar2 + 4]
      sub      [ar1 + 4], ebx
      dec      length
      jz       @Exit
      mov      ebx, [ar2 + 8]
      sub      [ar1 + 8], ebx
    @Exit:
      pop      ebx
    end;
    
    procedure SubArAsmSimdAUnrolled8(var ar1, ar2; length: integer);
    asm
      push     ebx
    //Unfortunately delphi use first 8 bytes for dinamic array length and reference
    //counter, from that reason the dinamic array address should start with $xxxxxxx8
    //instead &xxxxxxx0. So we must first align ar1, ar2 pointers!
      mov      ebx, [ar2]
      sub      [ar1], ebx
      dec      length
      jz       @exit
      mov      ebx, [ar2 + 4]
      sub      [ar1 + 4], ebx
      dec      length
      jz       @exit
      add      ar1, 8                       //Align pointer to 16 byte
      add      ar2, 8                       //Align pointer to 16 byte
    //Prepare length for 16 byte data transfer
      push     length
      shr      length, 5                    //8 * 4 subtructions per loop
      jz       @Finish                      //To small for LoopUnrolled
    @LoopUnrolled:
    //Unrolle 1, 2, 3, 4
      movdqa   xmm4, [ar2]
      movdqa   xmm5, [16 + ar2]
      movdqa   xmm6, [32 + ar2]
      movdqa   xmm7, [48 + ar2]
    //
      movdqa   xmm0, [ar1]
      movdqa   xmm1, [16 + ar1]
      movdqa   xmm2, [32 + ar1]
      movdqa   xmm3, [48 + ar1]
    //
      psubd    xmm0, xmm4
      psubd    xmm1, xmm5
      psubd    xmm2, xmm6
      psubd    xmm3, xmm7
    //
      movdqa   [48 + ar1], xmm3
      movdqa   [32 + ar1], xmm2
      movdqa   [16 + ar1], xmm1
      movdqa   [ar1], xmm0
    //Unrolle 5, 6, 7, 8
      movdqa   xmm4, [64 + ar2]
      movdqa   xmm5, [80 + ar2]
      movdqa   xmm6, [96 + ar2]
      movdqa   xmm7, [112 + ar2]
    //
      movdqa   xmm0, [64 + ar1]
      movdqa   xmm1, [80 + ar1]
      movdqa   xmm2, [96 + ar1]
      movdqa   xmm3, [112 + ar1]
    //
      psubd    xmm0, xmm4
      psubd    xmm1, xmm5
      psubd    xmm2, xmm6
      psubd    xmm3, xmm7
    //
      movdqa   [112 + ar1], xmm3
      movdqa   [96 + ar1], xmm2
      movdqa   [80 + ar1], xmm1
      movdqa   [64 + ar1], xmm0
    //
      add      ar1, 128
      add      ar2, 128
      dec      length
      jnz      @LoopUnrolled
    @FinishUnrolled:
      pop      length
      and      length, $1F
    //Do rest, up to 31 subtractions...
    @Finish:
      mov      ebx, [ar2]
      sub      [ar1], ebx
      add      ar1, 4
      add      ar2, 4
      dec      length
      jnz      @Finish
    @Exit:
      pop      ebx
    end;
    
    procedure WriteOut(EndTicks: Int64; Str: string);
    begin
      WriteLn(Str + IntToStr(EndTicks - StartTicks)
        + ' Time: ' + IntToStr((EndTicks - StartTicks) div TicksPerMicroSecond) + 'us');
      Sleep(5);
      SwitchToThread;
      StartTicks := GetCpuTicks;
    end;
    
    begin
      ArLength := 100000;
    //Set TicksPerMicroSecond
      QueryPerformanceFrequency(TicksPerMicroSecond);
      TicksPerMicroSecond := TicksPerMicroSecond div 1000000;
    //
      SetLength(Ar1, ArLength);
      SetLength(Ar2, ArLength);
    //Fill arrays
    //...
    //Tick time info
      WriteLn('CPU ticks per mikro second: ' + IntToStr(TicksPerMicroSecond));
      Sleep(5);
      SwitchToThread;
      StartTicks := GetCpuTicks;
    //Test 1
      SubArPasRangeOvfChkOn(ArLength);
      WriteOut(GetCpuTicks, 'SubAr Pas Range and Overflow Checking On, Ticks: ');
    //Test 2
      SubArPas(ArLength);
      WriteOut(GetCpuTicks, 'SubAr Pas, Ticks: ');
    //Test 3
      SubArAsm(Ar1[0], Ar2[0], ArLength);
      WriteOut(GetCpuTicks, 'SubAr Asm, Ticks: ');
    //Test 4
      SubArAsmSimdU(Ar1[0], Ar2[0], ArLength);
      WriteOut(GetCpuTicks, 'SubAr Asm SIMD mem unaligned, Ticks: ');
    //Test 5
      SubArAsmSimdA(Ar1[0], Ar2[0], ArLength);
      WriteOut(GetCpuTicks, 'SubAr Asm with SIMD mem aligned, Ticks: ');
    //Test 6
      SubArAsmSimdAUnrolled8(Ar1[0], Ar2[0], ArLength);
      WriteOut(GetCpuTicks, 'SubAr Asm with SIMD mem aligned 8*unrolled, Ticks: ');
    //
      ReadLn;
      Ar1 := nil;
      Ar2 := nil;
    end.
    

    ...

    The fastest asm procedure with 8 times unrolled SIMD instructions takes only 68us and is about 4 time faster than Pascal procedure.

    As we can see the Pascal loop procedure probably isn't critical, it takes only about 277us (Overflow and Range checking off) on 2,4GHz CPU at 100000 subtractions.

    So this code can't be bottleneck?

提交回复
热议问题