I have been trying to experiment with improving the performance of strcmp
under certain conditions. However, I unfortunately cannot even get an implementation of plain vanilla strcmp to perform as well as the library implementation.
Excerpts from my disassembly of glibc v2.2.5 on x86_64 Linux:
0000000000089cd0 <strcmp>:
89cd0: 48 8b 15 99 a1 33 00 mov 0x33a199(%rip),%rdx # 3c3e70 <_IO_file_jumps@@GLIBC_2.2.5+0x790>
89cd7: 48 8d 05 92 58 01 00 lea 0x15892(%rip),%rax # 9f570
89cde: f7 82 b0 00 00 00 10 testl $0x10,0xb0(%rdx)
89ce5: 00 00 00
89ce8: 75 1a jne 89d04
89cea: 48 8d 05 9f 48 0c 00 lea 0xc489f(%rip),%rax # 14e590 <__nss_passwd_lookup@@GLIBC_2.2.5+0x9c30>
89cf1: f7 82 80 00 00 00 00 testl $0x200,0x80(%rdx)
89cf8: 02 00 00
89cfb: 75 07 jne 89d04
89cfd: 48 8d 05 0c 00 00 00 lea 0xc(%rip),%rax # 89d10
89d04: c3 retq
89d05: 90 nop
89d06: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
89d0d: 00 00 00
89d10: 89 f1 mov %esi,%ecx
89d12: 89 f8 mov %edi,%eax
89d14: 48 83 e1 3f and $0x3f,%rcx
89d18: 48 83 e0 3f and $0x3f,%rax
89d1c: 83 f9 30 cmp $0x30,%ecx
89d1f: 77 3f ja 89d60
89d21: 83 f8 30 cmp $0x30,%eax
89d24: 77 3a ja 89d60
89d26: 66 0f 12 0f movlpd (%rdi),%xmm1
89d2a: 66 0f 12 16 movlpd (%rsi),%xmm2
89d2e: 66 0f 16 4f 08 movhpd 0x8(%rdi),%xmm1
89d33: 66 0f 16 56 08 movhpd 0x8(%rsi),%xmm2
89d38: 66 0f ef c0 pxor %xmm0,%xmm0
89d3c: 66 0f 74 c1 pcmpeqb %xmm1,%xmm0
89d40: 66 0f 74 ca pcmpeqb %xmm2,%xmm1
89d44: 66 0f f8 c8 psubb %xmm0,%xmm1
89d48: 66 0f d7 d1 pmovmskb %xmm1,%edx
89d4c: 81 ea ff ff 00 00 sub $0xffff,%edx
...
The real thing is 1183 lines of assembly, with a lot of cleverness around detecting system features and using vectorized instructions. The libc maintainers know that they can gain an edge simply by optimizing the few functions that applications call thousands of times.
For comparison, your version at -O3:
comparisons.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <_Z13strcmp_customPKcS0_>:
int strcmp_custom(const char* a, const char* b) {
while (*b == *a) {
0: 8a 0e mov (%rsi),%cl
2: 8a 07 mov (%rdi),%al
4: 38 c1 cmp %al,%cl
6: 75 1e jne 26 <_Z13strcmp_customPKcS0_+0x26>
if (*a == '\0') return 0;
8: 48 ff c6 inc %rsi
b: 48 ff c7 inc %rdi
e: 66 90 xchg %ax,%ax
10: 31 c0 xor %eax,%eax
12: 84 c9 test %cl,%cl
14: 74 18 je 2e <_Z13strcmp_customPKcS0_+0x2e>
int strcmp_custom(const char* a, const char* b) {
while (*b == *a) {
16: 0f b6 0e movzbl (%rsi),%ecx
19: 0f b6 07 movzbl (%rdi),%eax
1c: 48 ff c6 inc %rsi
1f: 48 ff c7 inc %rdi
22: 38 c1 cmp %al,%cl
24: 74 ea je 10 <_Z13strcmp_customPKcS0_+0x10>
26: 0f be d0 movsbl %al,%edx
29: 0f be c1 movsbl %cl,%eax
if (*a == '\0') return 0;
a++;
b++;
}
return *b - *a;
2c: 29 d0 sub %edx,%eax
}
2e: c3 retq