Why is this version of strcmp slower?

后端 未结 2 718
小蘑菇
小蘑菇 2021-02-05 04:34

I have been trying to experiment with improving the performance of strcmp under certain conditions. However, I unfortunately cannot even get an implementation of plain van…

2条回答
  •  盖世英雄少女心
    2021-02-05 05:13

    Excerpts from my disassembly of glibc v2.2.5, x86_64 linux:

    0000000000089cd0 <strcmp>:
       89cd0:   48 8b 15 99 a1 33 00    mov    0x33a199(%rip),%rdx        # 3c3e70 <_IO_file_jumps@@GLIBC_2.2.5+0x790>
       89cd7:   48 8d 05 92 58 01 00    lea    0x15892(%rip),%rax        # 9f570 
       89cde:   f7 82 b0 00 00 00 10    testl  $0x10,0xb0(%rdx)
       89ce5:   00 00 00 
       89ce8:   75 1a                   jne    89d04 
       89cea:   48 8d 05 9f 48 0c 00    lea    0xc489f(%rip),%rax        # 14e590 <__nss_passwd_lookup@@GLIBC_2.2.5+0x9c30>
       89cf1:   f7 82 80 00 00 00 00    testl  $0x200,0x80(%rdx)
       89cf8:   02 00 00 
       89cfb:   75 07                   jne    89d04 
       89cfd:   48 8d 05 0c 00 00 00    lea    0xc(%rip),%rax        # 89d10 
       89d04:   c3                      retq
       89d05:   90                      nop
       89d06:   66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)
       89d0d:   00 00 00 
       89d10:   89 f1                   mov    %esi,%ecx
       89d12:   89 f8                   mov    %edi,%eax
       89d14:   48 83 e1 3f             and    $0x3f,%rcx
       89d18:   48 83 e0 3f             and    $0x3f,%rax
       89d1c:   83 f9 30                cmp    $0x30,%ecx
       89d1f:   77 3f                   ja     89d60 
       89d21:   83 f8 30                cmp    $0x30,%eax
       89d24:   77 3a                   ja     89d60 
       89d26:   66 0f 12 0f             movlpd (%rdi),%xmm1
       89d2a:   66 0f 12 16             movlpd (%rsi),%xmm2
       89d2e:   66 0f 16 4f 08          movhpd 0x8(%rdi),%xmm1
       89d33:   66 0f 16 56 08          movhpd 0x8(%rsi),%xmm2
       89d38:   66 0f ef c0             pxor   %xmm0,%xmm0
       89d3c:   66 0f 74 c1             pcmpeqb %xmm1,%xmm0
       89d40:   66 0f 74 ca             pcmpeqb %xmm2,%xmm1
       89d44:   66 0f f8 c8             psubb  %xmm0,%xmm1
       89d48:   66 0f d7 d1             pmovmskb %xmm1,%edx
       89d4c:   81 ea ff ff 00 00       sub    $0xffff,%edx
    ...
    

    The real thing is 1183 lines of assembly, with lots of potential cleverness about detecting system features and vectorized instructions. libc maintainers know that they can get an edge by just optimizing some of the functions called thousands of times by applications.

    For comparison, your version at -O3:

    comparisons.o:     file format elf64-x86-64
    
    
    Disassembly of section .text:
    
    0000000000000000 <_Z13strcmp_customPKcS0_>:
    int strcmp_custom(const char* a, const char* b) {
        while (*b == *a) {
       0:   8a 0e                   mov    (%rsi),%cl
       2:   8a 07                   mov    (%rdi),%al
       4:   38 c1                   cmp    %al,%cl
       6:   75 1e                   jne    26 <_Z13strcmp_customPKcS0_+0x26>
            if (*a == '\0') return 0;
       8:   48 ff c6                inc    %rsi
       b:   48 ff c7                inc    %rdi
       e:   66 90                   xchg   %ax,%ax
      10:   31 c0                   xor    %eax,%eax
      12:   84 c9                   test   %cl,%cl
      14:   74 18                   je     2e <_Z13strcmp_customPKcS0_+0x2e>
    int strcmp_custom(const char* a, const char* b) {
        while (*b == *a) {
      16:   0f b6 0e                movzbl (%rsi),%ecx
      19:   0f b6 07                movzbl (%rdi),%eax
      1c:   48 ff c6                inc    %rsi
      1f:   48 ff c7                inc    %rdi
      22:   38 c1                   cmp    %al,%cl
      24:   74 ea                   je     10 <_Z13strcmp_customPKcS0_+0x10>
      26:   0f be d0                movsbl %al,%edx
      29:   0f be c1                movsbl %cl,%eax
            if (*a == '\0') return 0;
            a++;
            b++;
        }
        return *b - *a;
      2c:   29 d0                   sub    %edx,%eax
    }
      2e:   c3                      retq   
    

提交回复
热议问题