Speed difference between using int and unsigned int when mixed with doubles

后端 未结 4 1991
鱼传尺愫
鱼传尺愫 2021-01-01 14:22

I have an application where part of the inner loop was basically:

double sum = 0;
for (int i = 0; i != N; ++i, ++data, ++x) sum += *data * x;
相关标签:
4条回答
  • 2021-01-01 14:29

    I ran this on gcc 4.7.0 on a 64-bit machine running Linux. I replaced the time calls with calls to clock_gettime.

    CPU: Intel X5680 @ 3.33 GHz

    GCC flags: -Wall -pedantic -O3 -std=c++11

    Results:

    With int time per operation in ns: 11996, total time sec: 1.57237
    Avg values: 1.06353e+09
    With unsigned int time per operation in ns: 11539, total time sec: 1.5125
    Avg values: 1.06353e+09
    With int time per operation in ns: 11994, total time sec: 1.57217
    Avg values: 1.06353e+09
    

    Evidently on my machine/compiler unsigned is faster.

    0 讨论(0)
  • 2021-01-01 14:38

    Output with Visual Studio 2010 on an Intel Q6600... (note: I increased the loop count from 128*1024 to 512*1024)

    release mode...

    With int: 4.23944e+009 in 9secs
    With unsigned int: 4.23944e+009 in 18secs
    With int: 4.23944e+009 in 9secs
    

    debug mode...

    With int: 4.23944e+009 in 34secs
    With unsigned int: 4.23944e+009 in 58secs
    With int: 4.23944e+009 in 34secs
    

    The ASM in release mode... (unsigned)

        for (int i = 0; i != Nr_Samples; ++i) { 
    011714A1  fldz  
    011714A3  mov         edx,dword ptr [esi+4]  
    011714A6  add         esp,4  
    011714A9  xor         edi,edi  
    011714AB  sub         edx,dword ptr [esi]  
            moments_results[i] = moments<T>(dataptr, data.size(), 128); 
    011714AD  mov         ecx,dword ptr [ebp-1388Ch]  
    011714B3  fld         st(0)  
    011714B5  xor         eax,eax  
    011714B7  test        edx,edx  
    011714B9  je          measure<unsigned int>+79h (11714E9h)  
    011714BB  mov         esi,edx  
    011714BD  movzx       ebx,byte ptr [ecx]  
    011714C0  imul        ebx,eax  
    011714C3  mov         dword ptr [ebp-138A4h],ebx  
    011714C9  fild        dword ptr [ebp-138A4h]  //only in unsigned
    011714CF  test        ebx,ebx  //only in unsigned
    011714D1  jns         measure<unsigned int>+69h (11714D9h)  //only in unsigned
    011714D3  fadd        qword ptr [__real@41f0000000000000 (11731C8h)]  //only in unsigned
    011714D9  inc         eax  
    011714DA  faddp       st(1),st  
    011714DC  cmp         eax,80h  
    011714E1  jne         measure<unsigned int>+75h (11714E5h)  
    011714E3  xor         eax,eax  
    011714E5  inc         ecx  
    011714E6  dec         esi  
    011714E7  jne         measure<unsigned int>+4Dh (11714BDh)  
    011714E9  fstp        qword ptr [ebp+edi*8-13888h]  
    011714F0  inc         edi  
    011714F1  cmp         edi,2710h  
    011714F7  jne         measure<unsigned int>+3Dh (11714ADh)  
        } 
    

    The ASM in release mode... (signed)

        for (int i = 0; i != Nr_Samples; ++i) { 
    012A1351  fldz  
    012A1353  mov         edx,dword ptr [esi+4]  
    012A1356  add         esp,4  
    012A1359  xor         edi,edi  
    012A135B  sub         edx,dword ptr [esi]  
            moments_results[i] = moments<T>(dataptr, data.size(), 128); 
    012A135D  mov         ecx,dword ptr [ebp-13890h]  
    012A1363  fld         st(0)  
    012A1365  xor         eax,eax  
    012A1367  test        edx,edx  
    012A1369  je          measure<int>+6Fh (12A138Fh)  
    012A136B  mov         esi,edx  
    012A136D  movzx       ebx,byte ptr [ecx]  
    012A1370  imul        ebx,eax  
    012A1373  mov         dword ptr [ebp-1388Ch],ebx  
    012A1379  inc         eax  
    012A137A  fild        dword ptr [ebp-1388Ch]  //only in signed
    012A1380  faddp       st(1),st  
    012A1382  cmp         eax,80h  
    012A1387  jne         measure<int>+6Bh (12A138Bh)  
    012A1389  xor         eax,eax  
    012A138B  inc         ecx  
    012A138C  dec         esi  
    012A138D  jne         measure<int>+4Dh (12A136Dh)  
    012A138F  fstp        qword ptr [ebp+edi*8-13888h]  
    012A1396  inc         edi  
    012A1397  cmp         edi,2710h  
    012A139D  jne         measure<int>+3Dh (12A135Dh)  
        } 
    

    Interesting... with release mode and SSE enabled (the fldz and fld st(0) instructions are removed, but five instructions are added):

    With int: 4.23944e+009 in 8secs
    With unsigned int: 4.23944e+009 in 10secs
    With int: 4.23944e+009 in 8secs
    
    
        for (int i = 0; i != Nr_Samples; ++i) { 
    00F614C1  mov         edx,dword ptr [esi+4]  
    00F614C4  xorps       xmm0,xmm0  //added in sse version
    00F614C7  add         esp,4  
    00F614CA  xor         edi,edi  
    00F614CC  sub         edx,dword ptr [esi]  
            moments_results[i] = moments<T>(dataptr, data.size(), 128); 
    00F614CE  mov         ecx,dword ptr [ebp-13894h]  
    00F614D4  xor         eax,eax  
    00F614D6  movsd       mmword ptr [ebp-13890h],xmm0  //added in sse version
    00F614DE  test        edx,edx  
    00F614E0  je          measure<unsigned int>+8Ch (0F6151Ch)  
    00F614E2  fld         qword ptr [ebp-13890h]  //added in sse version
    00F614E8  mov         esi,edx  
    00F614EA  movzx       ebx,byte ptr [ecx]  
    00F614ED  imul        ebx,eax  
    00F614F0  mov         dword ptr [ebp-1388Ch],ebx  
    00F614F6  fild        dword ptr [ebp-1388Ch]  
    00F614FC  test        ebx,ebx  
    00F614FE  jns         measure<unsigned int>+76h (0F61506h)  
    00F61500  fadd        qword ptr [__real@41f0000000000000 (0F631C8h)]  
    00F61506  inc         eax  
    00F61507  faddp       st(1),st  
    00F61509  cmp         eax,80h  
    00F6150E  jne         measure<unsigned int>+82h (0F61512h)  
    00F61510  xor         eax,eax  
    00F61512  inc         ecx  
    00F61513  dec         esi  
    00F61514  jne         measure<unsigned int>+5Ah (0F614EAh)  
    00F61516  fstp        qword ptr [ebp-13890h]  
    00F6151C  movsd       xmm1,mmword ptr [ebp-13890h]  //added in sse version
    00F61524  movsd       mmword ptr [ebp+edi*8-13888h],xmm1  //added in sse version
    00F6152D  inc         edi  
    00F6152E  cmp         edi,2710h  
    00F61534  jne         measure<unsigned int>+3Eh (0F614CEh)  
        } 
    
    0 讨论(0)
  • 2021-01-01 14:41

    Here's some code produced by VC++ 6.0 - no optimisation:

    4:        int x = 12345;
    0040E6D8   mov         dword ptr [ebp-4],3039h
    5:        double d1 = x;
    0040E6DF   fild        dword ptr [ebp-4]
    0040E6E2   fstp        qword ptr [ebp-0Ch]
    6:        unsigned int y = 12345;
    0040E6E5   mov         dword ptr [ebp-10h],3039h
    7:        double d2 = y;
    0040E6EC   mov         eax,dword ptr [ebp-10h]
    0040E6EF   mov         dword ptr [ebp-20h],eax
    0040E6F2   mov         dword ptr [ebp-1Ch],0
    0040E6F9   fild        qword ptr [ebp-20h]
    0040E6FC   fstp        qword ptr [ebp-18h]
    

    As you can see, converting the unsigned does quite a bit more work.

    0 讨论(0)
  • 2021-01-01 14:50

    Here's why: many common architectures (including x86) have a hardware instruction to convert a signed int to a double, but do not have a hardware conversion from unsigned int to double, so the compiler needs to synthesize the conversion in software. Furthermore, the only unsigned multiply on Intel is a full-width multiply, whereas signed multiplies can use the signed multiply-low instruction.

    GCC's software conversion from unsigned int to double may very well be suboptimal (it almost certainly is, given the magnitude of the slowdown that you observed), but it is expected behavior for the code to be faster when using signed integers.

    Assuming a smart compiler, the difference should be much smaller on a 64-bit system, because a 64-bit signed integer -> double conversion can be used to efficiently do a 32-bit unsigned conversion.

    Edit: to illustrate, this:

    sum += *data * x;
    

    if the integer variables are signed, should compile into something along these lines:

    mov       (data),   %eax
    imul      %ecx,     %eax
    cvtsi2sd  %eax,     %xmm1
    addsd     %xmm1,    %xmm0
    

    on the other hand, if the integer variables are unsigned, cvtsi2sd can't be used to do the conversion, so a software workaround is required. I would expect to see something like this:

        mov       (data),   %eax
        mul       %ecx            // might be slower than imul
        cvtsi2sd  %eax,     %xmm1 // convert as though signed integer
        test      %eax,     %eax  // check if high bit was set
        jge       1f              // if it was, we need to adjust the converted
        addsd     (2^32),   %xmm1 // value by adding 2^32
    1:  addsd     %xmm1,    %xmm0
    

    That would be "acceptable" codegen for the unsigned -> double conversion; it could easily be worse.

    All of this is assuming floating-point code generation to SSE (I believe this is the default on the Ubuntu tools, but I could be wrong).

    0 讨论(0)
提交回复
热议问题