GCC has 128-bit integers. Using these I can get the compiler to use the `mul` (or `imul` with only one operand) instructions. For example
I found a solution with ICC 13.0.01 using the `_addcarry_u64` intrinsic:
/*
 * In-place multi-limb addition: x += y.
 *
 * The limbs are processed in the order x1, x2, x3, x4; the carry produced by
 * each _addcarry_u64 call is fed into the next one. The carry out of the x4
 * limb is discarded.
 */
void add256(uint256 *x, uint256 *y) {
    /* The first limb has no incoming carry. */
    unsigned char carry = _addcarry_u64(0, x->x1, y->x1, &x->x1);
    carry = _addcarry_u64(carry, x->x2, y->x2, &x->x2);
    carry = _addcarry_u64(carry, x->x3, y->x3, &x->x3);
    /* Last limb: the result is stored, the returned carry is dropped. */
    (void)_addcarry_u64(carry, x->x4, y->x4, &x->x4);
}
produces
# ICC output for add256 (AT&T syntax: source operand first, destination last).
# Loads come from (%rsi) and the read-modify-write adds target (%rdi), which
# matches x in %rdi and y in %rsi for add256(x, y).
L__routine_start_add256_0:
add256:
# Zero %r9d so that the setb below leaves a clean 0/1 carry byte in %r9b.
xorl %r9d, %r9d #25.9
# Limb at offset 0: a plain addq starts the carry chain (sets CF).
movq (%rsi), %rax #22.9
addq %rax, (%rdi) #22.9
# Limbs at offsets 8, 16 and 24: movq does not modify flags, so CF survives
# each load and is consumed (and regenerated) by the following adcq.
movq 8(%rsi), %rdx #23.9
adcq %rdx, 8(%rdi) #23.9
movq 16(%rsi), %rcx #24.9
adcq %rcx, 16(%rdi) #24.9
movq 24(%rsi), %r8 #25.9
adcq %r8, 24(%rdi) #25.9
# Capture the final carry in %r9b; nothing reads it before the ret.
setb %r9b #25.9
ret #26.1
I compiled with `-O3`. I don't know how to enable `adx` with ICC. Maybe I need ICC 14?
That's exactly one `addq` and three `adcq`, as I expected.
With Clang, the result using `-O3 -madx` is a mess:
# Clang output for add256 (AT&T syntax); x is addressed through %rdi and y
# through %rsi. Recurring idiom: `addb $-1, <byte>` sets CF exactly when the
# byte is nonzero, which turns a saved 0/1 carry byte back into the carry flag.
add256(uint256*, uint256*): # @add256(uint256*, uint256*)
# --- limb at offset 0 ---
movq (%rsi), %rax
xorl %ecx, %ecx
xorl %edx, %edx
# %dl is 0 here, so this leaves CF clear: the initial carry-in of 0.
addb $-1, %dl
# First addition: stores the sum back into the limb in memory.
adcq %rax, (%rdi)
# Re-create the same (clear) carry-in from %cl, then repeat the addition on a
# reloaded copy only to obtain a carry-out via setb.
# NOTE(review): the reload reads the limb after the adcq above has already
# stored its sum, so the carry appears to be derived from the updated value
# rather than from the original operand -- verify.
addb $-1, %cl
movq (%rdi), %rcx
adcxq %rax, %rcx
setb %al
# --- limb at offset 8: same pattern, carry-in taken from %al ---
movq 8(%rsi), %rcx
# addb modifies its destination, so the carry byte is copied to %dl first and
# %al stays intact for the second conversion below.
movb %al, %dl
addb $-1, %dl
adcq %rcx, 8(%rdi)
addb $-1, %al
movq 8(%rdi), %rax
adcxq %rcx, %rax
setb %al
# --- limb at offset 16: same pattern again ---
movq 16(%rsi), %rcx
movb %al, %dl
addb $-1, %dl
adcq %rcx, 16(%rdi)
addb $-1, %al
movq 16(%rdi), %rax
adcxq %rcx, %rax
setb %al
# --- limb at offset 24: single adcq; no carry-out is captured afterwards ---
movq 24(%rsi), %rcx
addb $-1, %al
adcq %rcx, 24(%rdi)
retq
Without enabling `-madx` in Clang, the result is not much better.
Edit:
Apparently MSVC already has `_addcarry_u64`. I tried it and it's as good as ICC (1x `add` and 3x `adc`).