I have the following code which is the bottleneck in one part of my application. All I do is subtract on Array from another. Both of these arrays have more around 100000 element
I was very curious about speed optimisation in this simple case. So I have made 6 simple procedures and measure CPU tick and time at array size 100000;
Check results on picture and code for more information.
To get 16 byte memory alignment first delite the dot in file 'FastMM4Options.inc' directive {$.define Align16Bytes} !
program SubTest;
{$APPTYPE CONSOLE}
uses
//In file 'FastMM4Options.inc' delite the dot in directive {$.define Align16Bytes}
//to get 16 byte memory alignment!
FastMM4,
windows,
SysUtils;
var
Ar1 :array of integer;
Ar2 :array of integer;
ArLength :integer;
StartTicks :int64;
EndTicks :int64;
TicksPerMicroSecond :int64;
function GetCpuTicks: int64;
asm
rdtsc
end;
{$R+}
{$Q+}
procedure SubArPasRangeOvfChkOn(length: integer);
var
n: integer;
begin
for n := 0 to length -1 do
Ar1[n] := Ar1[n] - Ar2[n];
end;
{$R-}
{$Q-}
procedure SubArPas(length: integer);
var
n: integer;
begin
for n := 0 to length -1 do
Ar1[n] := Ar1[n] - Ar2[n];
end;
procedure SubArAsm(var ar1, ar2; length: integer);
asm
//Length must be > than 0!
push ebx
lea ar1, ar1 - 4
lea ar2, ar2 - 4
@Loop:
mov ebx, [ar2 + length * 4]
sub [ar1 + length * 4], ebx
dec length
jnz @Loop
@exit:
pop ebx
end;
procedure SubArAsmSimdU(var ar1, ar2; length: integer);
asm
//Prepare length
push length
shr length, 2
jz @Finish
@Loop:
movdqu xmm1, [ar1]
movdqu xmm2, [ar2]
psubd xmm1, xmm2
movdqu [ar1], xmm1
add ar1, 16
add ar2, 16
dec length
jnz @Loop
@Finish:
pop length
push ebx
and length, 3
jz @Exit
//Do rest, up to 3 subtractions...
mov ebx, [ar2]
sub [ar1], ebx
dec length
jz @Exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec length
jz @Exit
mov ebx, [ar2 + 8]
sub [ar1 + 8], ebx
@Exit:
pop ebx
end;
procedure SubArAsmSimdA(var ar1, ar2; length: integer);
asm
push ebx
//Unfortunately delphi use first 8 bytes for dinamic array length and reference
//counter, from that reason the dinamic array address should start with $xxxxxxx8
//instead &xxxxxxx0. So we must first align ar1, ar2 pointers!
mov ebx, [ar2]
sub [ar1], ebx
dec length
jz @exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec length
jz @exit
add ar1, 8
add ar2, 8
//Prepare length for 16 byte data transfer
push length
shr length, 2
jz @Finish
@Loop:
movdqa xmm1, [ar1]
movdqa xmm2, [ar2]
psubd xmm1, xmm2
movdqa [ar1], xmm1
add ar1, 16
add ar2, 16
dec length
jnz @Loop
@Finish:
pop length
and length, 3
jz @Exit
//Do rest, up to 3 subtractions...
mov ebx, [ar2]
sub [ar1], ebx
dec length
jz @Exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec length
jz @Exit
mov ebx, [ar2 + 8]
sub [ar1 + 8], ebx
@Exit:
pop ebx
end;
procedure SubArAsmSimdAUnrolled8(var ar1, ar2; length: integer);
asm
push ebx
//Unfortunately delphi use first 8 bytes for dinamic array length and reference
//counter, from that reason the dinamic array address should start with $xxxxxxx8
//instead &xxxxxxx0. So we must first align ar1, ar2 pointers!
mov ebx, [ar2]
sub [ar1], ebx
dec length
jz @exit
mov ebx, [ar2 + 4]
sub [ar1 + 4], ebx
dec length
jz @exit
add ar1, 8 //Align pointer to 16 byte
add ar2, 8 //Align pointer to 16 byte
//Prepare length for 16 byte data transfer
push length
shr length, 5 //8 * 4 subtructions per loop
jz @Finish //To small for LoopUnrolled
@LoopUnrolled:
//Unrolle 1, 2, 3, 4
movdqa xmm4, [ar2]
movdqa xmm5, [16 + ar2]
movdqa xmm6, [32 + ar2]
movdqa xmm7, [48 + ar2]
//
movdqa xmm0, [ar1]
movdqa xmm1, [16 + ar1]
movdqa xmm2, [32 + ar1]
movdqa xmm3, [48 + ar1]
//
psubd xmm0, xmm4
psubd xmm1, xmm5
psubd xmm2, xmm6
psubd xmm3, xmm7
//
movdqa [48 + ar1], xmm3
movdqa [32 + ar1], xmm2
movdqa [16 + ar1], xmm1
movdqa [ar1], xmm0
//Unrolle 5, 6, 7, 8
movdqa xmm4, [64 + ar2]
movdqa xmm5, [80 + ar2]
movdqa xmm6, [96 + ar2]
movdqa xmm7, [112 + ar2]
//
movdqa xmm0, [64 + ar1]
movdqa xmm1, [80 + ar1]
movdqa xmm2, [96 + ar1]
movdqa xmm3, [112 + ar1]
//
psubd xmm0, xmm4
psubd xmm1, xmm5
psubd xmm2, xmm6
psubd xmm3, xmm7
//
movdqa [112 + ar1], xmm3
movdqa [96 + ar1], xmm2
movdqa [80 + ar1], xmm1
movdqa [64 + ar1], xmm0
//
add ar1, 128
add ar2, 128
dec length
jnz @LoopUnrolled
@FinishUnrolled:
pop length
and length, $1F
//Do rest, up to 31 subtractions...
@Finish:
mov ebx, [ar2]
sub [ar1], ebx
add ar1, 4
add ar2, 4
dec length
jnz @Finish
@Exit:
pop ebx
end;
procedure WriteOut(EndTicks: Int64; Str: string);
begin
WriteLn(Str + IntToStr(EndTicks - StartTicks)
+ ' Time: ' + IntToStr((EndTicks - StartTicks) div TicksPerMicroSecond) + 'us');
Sleep(5);
SwitchToThread;
StartTicks := GetCpuTicks;
end;
begin
ArLength := 100000;
//Set TicksPerMicroSecond
QueryPerformanceFrequency(TicksPerMicroSecond);
TicksPerMicroSecond := TicksPerMicroSecond div 1000000;
//
SetLength(Ar1, ArLength);
SetLength(Ar2, ArLength);
//Fill arrays
//...
//Tick time info
WriteLn('CPU ticks per mikro second: ' + IntToStr(TicksPerMicroSecond));
Sleep(5);
SwitchToThread;
StartTicks := GetCpuTicks;
//Test 1
SubArPasRangeOvfChkOn(ArLength);
WriteOut(GetCpuTicks, 'SubAr Pas Range and Overflow Checking On, Ticks: ');
//Test 2
SubArPas(ArLength);
WriteOut(GetCpuTicks, 'SubAr Pas, Ticks: ');
//Test 3
SubArAsm(Ar1[0], Ar2[0], ArLength);
WriteOut(GetCpuTicks, 'SubAr Asm, Ticks: ');
//Test 4
SubArAsmSimdU(Ar1[0], Ar2[0], ArLength);
WriteOut(GetCpuTicks, 'SubAr Asm SIMD mem unaligned, Ticks: ');
//Test 5
SubArAsmSimdA(Ar1[0], Ar2[0], ArLength);
WriteOut(GetCpuTicks, 'SubAr Asm with SIMD mem aligned, Ticks: ');
//Test 6
SubArAsmSimdAUnrolled8(Ar1[0], Ar2[0], ArLength);
WriteOut(GetCpuTicks, 'SubAr Asm with SIMD mem aligned 8*unrolled, Ticks: ');
//
ReadLn;
Ar1 := nil;
Ar2 := nil;
end.
...
The fastest asm procedure with 8 times unrolled SIMD instructions takes only 68us and is about 4 time faster than Pascal procedure.
As we can see the Pascal loop procedure probably isn't critical, it takes only about 277us (Overflow and Range checking off) on 2,4GHz CPU at 100000 subtractions.
So this code can't be bottleneck?