I'm searching for an algorithm to multiply two integers that is better than the one below. Do you have a good idea about that? (The MCU - ATtiny84/85 or similar - wher…
A non-answer: tinyAVR assembler (web doc) instead of C++ or C. I modified a pretty generic multiply-by-squares-lookup for speed (< 50 cycles excluding call & return overhead), at the cost of only fitting into AVRs with no less than 1 KByte of RAM, using 512 aligned bytes for a table of the lower half of squares. At 20 MHz, that would nicely meet the 2 (max 3) µs time limit still not showing up in the question proper - but Sergio Formiggini wanted 16 MHz. As of 2015/04, there is just one ATtiny from Atmel with that much RAM, and that one is specified up to 8 MHz only… (Rolling your "own" (e.g., from OpenCores), your FPGA probably has a bunch of fast multipliers (18×18 bits seems popular), if not processor cores.)
For a stab at fast shift-and-add, have a look at shift and add, factor shifting left, unrolled 16×16→16 and/or improve on it (wiki post). (You might well create that community wiki answer begged for in the question.)
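(For orientation, here is a minimal C model of the quarter-square trick the routine below builds on - illustrative only: the names are invented, a single 16-bit table stands in for the separate low/high byte tables, and it does 8×8→16 rather than the full 16×16→16.)
#include <stdint.h>

static uint16_t qsq[512]; /* qsq[n] = n*n/4 for n = 0..511 */

static void initQsq(void)
{
    uint16_t n;
    for (n = 0; n < 512; ++n)
        qsq[n] = (uint16_t)(((uint32_t)n * n) / 4);
}

/* a*b = (a+b)^2/4 - (a-b)^2/4; a+b and a-b have the same parity,
 * so the truncation in n*n/4 cancels and the result is exact. */
static uint16_t mulQsq(uint8_t a, uint8_t b)
{
    uint16_t sum = (uint16_t)a + b;        /* at most 510: in range */
    uint8_t diff = a >= b ? a - b : b - a; /* |a - b| */
    return qsq[sum] - qsq[diff];
}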
.def a0 = r16 ; factor low byte
.def a1 = r17
#warning two warnings about preceding definitions of
#warning r16 and r17 are due and may as well be ignored
.def a = r16 ; 8-bit factor
.def b = r17 ; 8-bit factor ; or r18, rather?
.def b0 = r18 ; factor low byte
.def b1 = r19
.def p0 = r20 ; product low byte
.def p1 = r21
; "squares table" SqTab shall be two 512 Byte tables of
; squares of 9-bit natural numbers, divided by 4
; Idea: exploit p = a * b = Squares[a+b] - Squares[a-b]
init:
ldi r16, 0x73
ldi r17, 0xab
ldi r18, 23
ldi r19, 1
ldi r20, HIGH(SRAM_SIZE)
cpi r20, 2
brsh fillSqTable ; ATtiny 1634?
rjmp mpy16T16
fillSqTable:
ldi r20, SqTabH
subi r20, -2
ldi zh, SqTabH
clr zl
; generate squares by adding up odd numbers, starting at 1 += -1:
; r23:r22 holds the running square, r27:r26 the next odd increment
; ((n+1)^2 = n^2 + (2n+1)); the first pass brings the square to 0*0
ldi r22, 1
clr r23
ser r26
ser r27
fillLoop:
add r22, r26
adc r23, r27
adiw r26, 2
mov r21, r23
lsr r21 ; square bit 8 -> carry
mov r21, r22
ror r21 ; carry + bits 7:1 = square bits 8:1
lsr r21 ; square bits 8:2
bst r23, 1 ; square bit 9
bld r21, 7 ; r21 = square bits 9:2 = low byte of square/4
st z+, r21
cp zh, r20
brne fillLoop
rjmp mpy16F16
; assembly lines are marked up with cycle count
; and (latest) start cycle in block.
; If first line in code block, the (latest) block start cycle
; follows; else if last line, the (max) block cycle total
;**************************************************************
;*
;* "mpy16F16" - 16x16->16 Bit Unsigned Multiplication
;* using table lookup
;* Sergio Formiggini special edition
;* Multiplies two 16-bit register values a1:a0 and b1:b0.
;* The result is placed in p1:p0.
;*
;* Number of flash words: 318 + return =
;* (40 + 256(flash table) + 22(RAM init))
;* Number of cycles : 49 + return
;* Low registers used : None
;* High registers used : 7+2 (a1:a0, b1:b0, p1:p0, sq;
;* + Z(r31:r30))
;* RAM bytes used : 512 (squares table)
;*
;**************************************************************
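; Scheme (with Sq[n] = n*n/4, quarter squares):
;   p = Sq[a0+b0] - Sq[|a0-b0|]                  (full 16 bits)
;     + 256*( SqLow[a1+b0] - SqLow[|a1-b0|]
;           + SqLow[a0+b1] - SqLow[|a0-b1|] )    (low bytes only)
; the a1*b1 term falls outside 16 bits and is dropped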
mpy16F16:
ldi ZH, SqTabH>>1;1 0 0 squares table>>1
mov ZL, a0 ; 1 1
add ZL, b0 ; 1 2 a0+b0
rol ZH ; 1 3 9 bit offset
ld p0, Z ; 2 4 a0+b0l 1
lpm p1, Z ; 3 6 9 a0+b0h 2
ldi ZH, SqTabH ; 1 0 9 squares table
mov ZL, a1 ; 1 0 10
sub ZL, b0 ; 1 1 a1-b0
brcc noNegF10 ; 1 2
neg ZL ; 1 3
noNegF10:
ld sq, Z ; 2 4 a1-b0l 3
sub p1, sq ; 1 6 7
mov ZL, a0 ; 1 0 17
sub ZL, b1 ; 1 1 a0-b1
brcc noNegF01 ; 1 2
neg ZL ; 1 3
noNegF01:
ld sq, Z ; 2 4 a0-b1l 4
sub p1, sq ; 1 6 7
mov ZL, a0 ; 1 0 24
sub ZL, b0 ; 1 1 a0-b0
brcc noNegF00 ; 1 2
neg ZL ; 1 3
noNegF00:
ld sq, Z ; 2 4 a0-b0l 5
sub p0, sq ; 1 6
lpm sq, Z ; 3 7 a0-b0h 6*
sbc p1, sq ; 1 10 11
ldi ZH, SqTabH>>1;1 0 35
mov ZL, a1 ; 1 1
add ZL, b0 ; 1 2 a1+b0
rol ZH ; 1 3
ld sq, Z ; 2 4 a1+b0l 7
add p1, sq ; 1 6 7
ldi ZH, SqTabH>>1;1 0 42
mov ZL, a0 ; 1 1
add ZL, b1 ; 1 2 a0+b1
rol ZH ; 1 3
ld sq, Z ; 2 4 a0+b1l 8
add p1, sq ; 1 6 7
ret ; 49
.CSEG
.org 256; words?!
SqTableH:
.db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.db 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
.db 1, 1, 1, 1, 1, 1, 2, 2, 2, 2
.db 2, 2, 2, 2, 2, 2, 3, 3, 3, 3
.db 3, 3, 3, 3, 4, 4, 4, 4, 4, 4
.db 4, 4, 5, 5, 5, 5, 5, 5, 5, 6
.db 6, 6, 6, 6, 6, 7, 7, 7, 7, 7
.db 7, 8, 8, 8, 8, 8, 9, 9, 9, 9
.db 9, 9, 10, 10, 10, 10, 10, 11, 11, 11
.db 11, 12, 12, 12, 12, 12, 13, 13, 13, 13
.db 14, 14, 14, 14, 15, 15, 15, 15, 16, 16
.db 16, 16, 17, 17, 17, 17, 18, 18, 18, 18
.db 19, 19, 19, 19, 20, 20, 20, 21, 21, 21
.db 21, 22, 22, 22, 23, 23, 23, 24, 24, 24
.db 25, 25, 25, 25, 26, 26, 26, 27, 27, 27
.db 28, 28, 28, 29, 29, 29, 30, 30, 30, 31
.db 31, 31, 32, 32, 33, 33, 33, 34, 34, 34
.db 35, 35, 36, 36, 36, 37, 37, 37, 38, 38
.db 39, 39, 39, 40, 40, 41, 41, 41, 42, 42
.db 43, 43, 43, 44, 44, 45, 45, 45, 46, 46
.db 47, 47, 48, 48, 49, 49, 49, 50, 50, 51
.db 51, 52, 52, 53, 53, 53, 54, 54, 55, 55
.db 56, 56, 57, 57, 58, 58, 59, 59, 60, 60
.db 61, 61, 62, 62, 63, 63, 64, 64, 65, 65
.db 66, 66, 67, 67, 68, 68, 69, 69, 70, 70
.db 71, 71, 72, 72, 73, 73, 74, 74, 75, 76
.db 76, 77, 77, 78, 78, 79, 79, 80, 81, 81
.db 82, 82, 83, 83, 84, 84, 85, 86, 86, 87
.db 87, 88, 89, 89, 90, 90, 91, 92, 92, 93
.db 93, 94, 95, 95, 96, 96, 97, 98, 98, 99
.db 100, 100, 101, 101, 102, 103, 103, 104, 105, 105
.db 106, 106, 107, 108, 108, 109, 110, 110, 111, 112
.db 112, 113, 114, 114, 115, 116, 116, 117, 118, 118
.db 119, 120, 121, 121, 122, 123, 123, 124, 125, 125
.db 126, 127, 127, 128, 129, 130, 130, 131, 132, 132
.db 133, 134, 135, 135, 136, 137, 138, 138, 139, 140
.db 141, 141, 142, 143, 144, 144, 145, 146, 147, 147
.db 148, 149, 150, 150, 151, 152, 153, 153, 154, 155
.db 156, 157, 157, 158, 159, 160, 160, 161, 162, 163
.db 164, 164, 165, 166, 167, 168, 169, 169, 170, 171
.db 172, 173, 173, 174, 175, 176, 177, 178, 178, 179
.db 180, 181, 182, 183, 183, 184, 185, 186, 187, 188
.db 189, 189, 190, 191, 192, 193, 194, 195, 196, 196
.db 197, 198, 199, 200, 201, 202, 203, 203, 204, 205
.db 206, 207, 208, 209, 210, 211, 212, 212, 213, 214
.db 215, 216, 217, 218, 219, 220, 221, 222, 223, 224
.db 225, 225, 226, 227, 228, 229, 230, 231, 232, 233
.db 234, 235, 236, 237, 238, 239, 240, 241, 242, 243
.db 244, 245, 246, 247, 248, 249, 250, 251, 252, 253
.db 254, 255
; word addresses, again?!
.equ SqTabH = (high(SqTableH) << 1)
.DSEG
RAMTab .BYTE 512
One approach is to unroll the loop. I don't have a compiler for the platform you're using so I can't look at the generated code, but an approach like this could help.
The performance of this code is less data-dependent -- you go faster in the worst case by not checking to see if you're in the best case. Code size is a bit bigger but not the size of a lookup table.
(Note code untested, off the top of my head. I'm curious about what the generated code looks like!)
#define UMUL16_STEP(a, b, shift) \
if ((b) & (1U << (shift))) result += ((a) << (shift));
uint16_t umul16(uint16_t a, uint16_t b)
{
uint16_t result = 0;
UMUL16_STEP(a, b, 0);
UMUL16_STEP(a, b, 1);
UMUL16_STEP(a, b, 2);
UMUL16_STEP(a, b, 3);
UMUL16_STEP(a, b, 4);
UMUL16_STEP(a, b, 5);
UMUL16_STEP(a, b, 6);
UMUL16_STEP(a, b, 7);
UMUL16_STEP(a, b, 8);
UMUL16_STEP(a, b, 9);
UMUL16_STEP(a, b, 10);
UMUL16_STEP(a, b, 11);
UMUL16_STEP(a, b, 12);
UMUL16_STEP(a, b, 13);
UMUL16_STEP(a, b, 14);
UMUL16_STEP(a, b, 15);
return result;
}
Update:
Depending on what your compiler does, the UMUL16_STEP macro can change. An alternative might be:
#define UMUL16_STEP(a, b, shift) \
if ((b) & (1U << (shift))) result += (a); (a) <<= 1;
With this approach, the compiler might be able to use the sbrc instruction to avoid branches.
My guess for how the assembler should look, per bit; r0:r1 is the result, r2:r3 is a, and r4:r5 is b:
sbrc r4, 0
add r0, r2
sbrc r4, 0
adc r1, r3
lsl r2
rol r3
This should execute in constant time without a branch. Test the bits in r4, then test the bits in r5 for the higher eight bits. This should execute the multiplication in 96 cycles - six cycles per bit times 16 bits - based on my reading of the instruction set manual.
The compiler might be able to produce shorter code by using the ternary operator to choose whether to add 'a' to your accumulator; it depends on the cost of test and branch, and on how your compiler generates code.
Swap the arguments to reduce the loop count:
uint16_t umul16_(uint16_t op1, uint16_t op2)
{
uint16_t accum=0;
uint16_t a, b;
a=op1; b=op2;
if( op1<op2 ) { a=op2; b=op1; } //swap operands to loop on smaller
while (b) {
accum += (b&1)?a:0;
b>>=1;
a+=a;
}
return accum;
}
Many years ago, I wrote Forth, which promoted a compute-rather-than-branch approach, and that suggests picking which value to use instead of testing:
You can use an array to avoid the test completely, which your compiler can likely turn into a load from an offset. Define an array containing two values, 0 and a, index it with the low bit of b, and update the value for a at the end of the loop:
uint16_t umul16_(uint16_t op1, uint16_t op2)
{
uint16_t accum=0;
uint16_t pick[2];
uint16_t a, b;
a=op1; b=op2;
if( op1<op2 ) { a=op2; b=op1; } //swap operands to loop on smaller
pick[0]=0;
pick[1]=a;
while (b) {
accum += pick[(b&1)]; //avoid test completely
b>>=1;
pick[1] += pick[1]; //(a+=a);
}
return accum;
}
Yeah, evil. But I don't normally write code like that.
Edit - revised to add a swap so the loop runs on the smaller of op1 and op2 (fewer passes). That eliminates the usefulness of testing for an argument == 0.
Well, a mix of LUT and shift usually works.
Something along these lines, multiplying 8-bit entities. Let's consider them made up of two quads (nibbles):
uint4_t u1, l1, u2, l2;
uint8_t a = 16*u1 + l1;
uint8_t b = 16*u2 + l2;
product = 256*u1*u2 + 16*u1*l2 + 16*u2*l1 + l1*l2;
inline uint4_t hi( uint8_t v ) { return v >> 4; }
inline uint4_t lo( uint8_t v ) { return v & 15; }
inline uint8_t LUT( uint4_t x, uint4_t y ) {
    static uint8_t lut[256] = ...;
    return lut[x | y << 4];
}
uint16_t multiply(uint8_t a, uint8_t b) {
    return ((uint16_t)LUT(hi(a), hi(b)) << 8) +
           (((uint16_t)LUT(hi(a), lo(b)) + (uint16_t)LUT(lo(a), hi(b))) << 4) +
           (uint16_t)LUT(lo(a), lo(b));
}
Just fill lut[] with the results of multiplication. In your case, depending on memory, you could go with quads (a 256-entry LUT), with bytes (a 65536-entry LUT), or anything in between.
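For concreteness, this is how the table could be filled (a sketch; uint4_t above is notional, so plain bytes are used here, and lutInit is an invented name):
#include <stdint.h>

static uint8_t lut[256]; /* lut[x | y<<4] = x*y; max 15*15 = 225 fits a byte */

static void lutInit(void)
{
    uint8_t x, y;
    for (x = 0; x < 16; ++x)
        for (y = 0; y < 16; ++y)
            lut[x | (y << 4)] = x * y;
}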
Summary
Below, the original shift-and-add proposal for multiplying a and b is refined step by step.
One improvement would be to first compare a and b, and swap them if a < b: you should use as b the smaller of the two, so that you get the minimum number of loop iterations. Note that you can avoid the swap by duplicating the code (if a < b, jump to a mirrored code section), but I doubt it's worth it.
Try:
uint16_t umul16_(uint16_t a, uint16_t b)
{
///Here swap if necessary
uint16_t accum=0;
while (b) {
accum += ((b&1) * uint16_t(0xffff)) & a; //Hopefully this multiplication is optimized away
b>>=1;
a+=a;
}
return accum;
}
From Sergio's feedback, this didn't bring improvements.
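An equivalent way to build that all-ones/all-zeros mask, without relying on the multiplication being optimized away (my reformulation, equally untested on this target):
accum += (uint16_t)(-(b & 1)) & a; //-(0 or 1) is 0x0000 or 0xffff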
Considering that the target architecture has basically only 8-bit instructions, if you separate the upper and lower 8 bits of the input variables, you can write:
a = a1 * 0x100 + a0;
b = b1 * 0x100 + b0;
a * b = a1 * b1 * 0x10000 + a0 * b1 * 0x100 + a1 * b0 * 0x100 + a0 * b0
Now, the cool thing is that we can throw away the term a1 * b1 * 0x10000, because the 0x10000 sends it out of your 16-bit register:
(16bit) a * b = a0 * b1 * 0x100 + a1 * b0 * 0x100 + a0 * b0
Furthermore, the a0*b1 and a1*b0 terms can be treated as 8-bit multiplications, because of the 0x100: any part exceeding 8 bits is sent out of the 16-bit register.
So far so exciting! ... But here comes reality striking: a0 * b0 has to be treated as a full 16-bit multiplication, as you'll have to keep all the resulting bits. a0 will have to be kept on 16 bits to allow shift lefts. This multiplication has half of the iterations of a * b, and it is in part 8-bit (because of b0), but you still have to take into account the two 8-bit multiplications mentioned before, and the final result composition. We need further reshaping!
So now I collect b0:
(16bit) a * b = a0 * b1 * 0x100 + b0 * (a0 + a1 * 0x100)
But (a0 + a1 * 0x100) = a, so we get:
(16bit) a * b = a0 * b1 * 0x100 + b0 * a
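(A quick numeric check, with numbers of my choosing: for a = 0x0102 and b = 0x0304, a0 * b1 * 0x100 = 2 * 3 * 0x100 = 0x0600 and b0 * a = 4 * 0x0102 = 0x0408; the sum 0x0A08 is exactly the low 16 bits of 0x0102 * 0x0304 = 0x30A08.)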
If N is the number of cycles of the original a * b, the first term is now an 8-bit multiplication with N/2 cycles, and the second a 16-bit × 8-bit multiplication, again with N/2 cycles. Calling M the number of instructions per iteration in the original a*b, the 8bit*8bit iteration has half the instructions, and the 16bit*8bit about 80% of M (one shift instruction less for b0 compared to b). Putting it together, we have N/2*M/2 + N/2*M*0.8 = N*M*0.65 complexity, so an expected saving of ~35% with respect to the original N*M. Sounds promising.
This is the code:
uint16_t umul16_(uint16_t a, uint16_t b)
{
uint8_t res1 = 0;
uint8_t a0 = a & 0xff; //This effectively needs to copy the data
uint8_t b0 = b & 0xff; //This should be optimized away
uint8_t b1 = b >>8; //This should be optimized away
//Here a0 and b1 could be swapped (to have b1 < a0)
while (b1) {///Maximum 8 cycles
if ( (b1 & 1) )
res1+=a0;
b1>>=1;
a0+=a0;
}
uint16_t res = (uint16_t) res1 * 256; //Should be optimized away, it's not even a copy!
//Here swapping wouldn't make much sense
while (b0) {///Maximum 8 cycles
if ( (b0 & 1) )
res+=a;
b0>>=1;
a+=a;
}
return res;
}
Also, the splitting into two loops should, in theory, double the chance of skipping some iterations early: N/2 might be a slight overestimate.
A tiny further improvement consists in avoiding the last, unnecessary shift of the a variables. Small side note: if either b0 or b1 is zero, this costs 2 extra instructions. But it also saves the first check of b0 and b1, which is the most expensive one because it cannot reuse the zero flag status from the shift operation for the conditional jump of the loop.
uint16_t umul16_(uint16_t a, uint16_t b)
{
uint8_t res1 = 0;
uint8_t a0 = a & 0xff; //This effectively needs to copy the data
uint8_t b0 = b & 0xff; //This should be optimized away
uint8_t b1 = b >>8; //This should be optimized away
//Here a0 and b1 could be swapped (to have b1 < a0)
if ( (b1 & 1) )
res1+=a0;
b1>>=1;
while (b1) {///Maximum 7 cycles
a0+=a0;
if ( (b1 & 1) )
res1+=a0;
b1>>=1;
}
uint16_t res = (uint16_t) res1 * 256; //Should be optimized away, it's not even a copy!
//Here swapping wouldn't make much sense
if ( (b0 & 1) )
res+=a;
b0>>=1;
while (b0) {///Maximum 7 cycles
a+=a;
if ( (b0 & 1) )
res+=a;
b0>>=1;
}
return res;
}
Is there still space for improvement? Yes: for one, the low byte of a gets shifted two times. So there should be a benefit in combining the two loops. It might be a little bit tricky, though, to convince the compiler to do exactly what we want, especially with the result register.
So, we process b0 and b1 in the same loop. The first thing to handle is the loop exit condition. So far, exploiting the cleared state of b0/b1 has been convenient because it avoids using a counter. Furthermore, after a shift right, the zero flag is already set if the result of the operation is zero, and this flag allows a conditional jump without further evaluations.
Now, the loop exit condition could be the failure of (b0 || b1). However, this could require expensive computation. One solution is to compare b0 and b1 and jump to two different code sections: if b1 > b0, I test the condition on b1; else I test the condition on b0. I prefer another solution, with two loops: the first exits when b0 is zero, the second when b1 is zero. There will be cases in which I do zero iterations on b1. The point is that in the second loop I know b0 is zero, so I can reduce the number of operations performed.
Now, let's forget about the exit condition and try to join the 2 loops of the previous section.
uint16_t umul16_(uint16_t a, uint16_t b)
{
uint16_t res = 0;
uint8_t b0 = b & 0xff; //This should be optimized away
uint8_t b1 = b >>8; //This should be optimized away
//Swapping probably doesn't make much sense anymore
if ( (b1 & 1) )
res+=(uint16_t)((uint8_t)(a & 0xff))*256;
//Hopefully the compiler understands it has simply to add the low 8bit register of a to the high 8bit register of res
if ( (b0 & 1) )
res+=a;
b1>>=1;
b0>>=1;
while (b0) {///N cycles, maximum 7
a+=a;
if ( (b1 & 1) )
res+=(uint16_t)((uint8_t)(a & 0xff))*256;
if ( (b0 & 1) )
res+=a;
b1>>=1;
b0>>=1; //I try to put as last the one that will leave the carry flag in the desired state
}
uint8_t a0 = a & 0xff; //Again, not a real copy but a register selection
while (b1) {///P cycles, maximum 7 - N cycles
a0+=a0;
if ( (b1 & 1) )
res+=(uint16_t) a0 * 256;
b1>>=1;
}
return res;
}
Thanks Sergio for providing the assembly generated (with -Ofast). At first glance, considering the outrageous amount of mov instructions in the code, it seems the compiler did not interpret the hints I gave it about register usage as I intended.
Inputs are: r22,r23 and r24,r25.
AVR Instruction Set: Quick reference, Detailed documentation
sbrs //Tests a single bit in a register and skips the next instruction if the bit is set. Skip takes 2 clocks.
ldi // Load immediate, 1 clock
sbiw // Subtracts immediate from *word*, 2 clocks
00000010 <umul16_Antonio5>:
10: 70 ff sbrs r23, 0
12: 39 c0 rjmp .+114 ; 0x86 <__SREG__+0x47>
14: 41 e0 ldi r20, 0x01 ; 1
16: 00 97 sbiw r24, 0x00 ; 0
18: c9 f1 breq .+114 ; 0x8c <__SREG__+0x4d>
1a: 34 2f mov r19, r20
1c: 20 e0 ldi r18, 0x00 ; 0
1e: 60 ff sbrs r22, 0
20: 07 c0 rjmp .+14 ; 0x30 <umul16_Antonio5+0x20>
22: 28 0f add r18, r24
24: 39 1f adc r19, r25
26: 04 c0 rjmp .+8 ; 0x30 <umul16_Antonio5+0x20>
28: e4 2f mov r30, r20
2a: 45 2f mov r20, r21
2c: 2e 2f mov r18, r30
2e: 34 2f mov r19, r20
30: 76 95 lsr r23
32: 66 95 lsr r22
34: b9 f0 breq .+46 ; 0x64 <__SREG__+0x25>
36: 88 0f add r24, r24
38: 99 1f adc r25, r25
3a: 58 2f mov r21, r24
3c: 44 27 eor r20, r20
3e: 42 0f add r20, r18
40: 53 1f adc r21, r19
42: 70 ff sbrs r23, 0
44: 02 c0 rjmp .+4 ; 0x4a <__SREG__+0xb>
46: 24 2f mov r18, r20
48: 35 2f mov r19, r21
4a: 42 2f mov r20, r18
4c: 53 2f mov r21, r19
4e: 48 0f add r20, r24
50: 59 1f adc r21, r25
52: 60 fd sbrc r22, 0
54: e9 cf rjmp .-46 ; 0x28 <umul16_Antonio5+0x18>
56: e2 2f mov r30, r18
58: 43 2f mov r20, r19
5a: e8 cf rjmp .-48 ; 0x2c <umul16_Antonio5+0x1c>
5c: 95 2f mov r25, r21
5e: 24 2f mov r18, r20
60: 39 2f mov r19, r25
62: 76 95 lsr r23
64: 77 23 and r23, r23
66: 61 f0 breq .+24 ; 0x80 <__SREG__+0x41>
68: 88 0f add r24, r24
6a: 48 2f mov r20, r24
6c: 50 e0 ldi r21, 0x00 ; 0
6e: 54 2f mov r21, r20
70: 44 27 eor r20, r20
72: 42 0f add r20, r18
74: 53 1f adc r21, r19
76: 70 fd sbrc r23, 0
78: f1 cf rjmp .-30 ; 0x5c <__SREG__+0x1d>
7a: 42 2f mov r20, r18
7c: 93 2f mov r25, r19
7e: ef cf rjmp .-34 ; 0x5e <__SREG__+0x1f>
80: 82 2f mov r24, r18
82: 93 2f mov r25, r19
84: 08 95 ret
86: 20 e0 ldi r18, 0x00 ; 0
88: 30 e0 ldi r19, 0x00 ; 0
8a: c9 cf rjmp .-110 ; 0x1e <umul16_Antonio5+0xe>
8c: 40 e0 ldi r20, 0x00 ; 0
8e: c5 cf rjmp .-118 ; 0x1a <umul16_Antonio5+0xa>
With all this information, let's try to understand what the "optimal" solution would be, given the architecture constraints. "Optimal" is quoted because what is "optimal" depends a lot on the input data and on what we want to optimize. Let's assume we want to optimize the number of cycles in the worst case. If we go for the worst case, loop unrolling is a reasonable choice: we know we have 8 iterations, and we remove all the tests for whether we are finished (i.e. whether b0 and b1 are zero).

So far we used the trick "we shift, and we check the zero flag" to decide whether to exit a loop. With that requirement removed, we can use a different trick: we shift, and we check the carry bit (the bit we sent out of the register when shifting) to decide whether to update the result. Given the instruction set, in assembly "narrative" code the instructions become the following.
//Input: a = a1 * 256 + a0, b = b1 * 256 + b0
//Output: r = r1 * 256 + r0
Preliminary:
P0 r0 = 0 (CLR)
P1 r1 = 0 (CLR)
Main block:
0 Shift right b0 (LSR)
1 If carry is not set skip 2 instructions = jump to 4 (BRCC)
2 r0 = r0 + a0 (ADD)
3 r1 = r1 + a1 + carry from prev. (ADC)
4 Shift right b1 (LSR)
5 If carry is not set skip 1 instruction = jump to 7 (BRCC)
6 r1 = r1 + a0 (ADD)
7 a0 = a0 + a0 (ADD)
8 a1 = a1 + a1 + carry from prev. (ADC)
[Repeat same instructions for another 7 times]
Branching takes 2 cycles if the jump is taken, 1 otherwise. All other instructions are 1 cycle. So the state of b1 has no influence on the number of cycles, while we have 9 cycles per iteration if b0's bit is 1, and 8 cycles if it is 0. Counting the initialization, 8 iterations and skipping the last update of a0 and a1, in the worst case (b0 = 11111111b) we have a total of 8 * 9 + 2 - 2 = 72 cycles. I wouldn't know which C++ implementation would convince the compiler to generate it. Maybe:
void iterate(uint8_t& b0,uint8_t& b1,uint16_t& a, uint16_t& r) {
const uint8_t temp0 = b0;
b0 >>=1;
if (temp0 & 0x01) {//Will this convince it to use the carry flag?
r += a;
}
const uint8_t temp1 = b1;
b1 >>=1;
if (temp1 & 0x01) {
r+=(uint16_t)((uint8_t)(a & 0xff))*256;
}
a += a;
}
uint16_t umul16_(uint16_t a, uint16_t b) {
uint16_t r = 0;
uint8_t b0 = b & 0xff;
uint8_t b1 = b >>8;
iterate(b0,b1,a,r);
iterate(b0,b1,a,r);
iterate(b0,b1,a,r);
iterate(b0,b1,a,r);
iterate(b0,b1,a,r);
iterate(b0,b1,a,r);
iterate(b0,b1,a,r);
iterate(b0,b1,a,r); //Hopefully it understands it doesn't need the last update of a
return r;
}
But, given the previous result, to really obtain the desired code one should really switch to assembly!
Finally, one could also consider a more extreme interpretation of the loop unrolling: the sbrc/sbrs instructions allow testing a specific bit of a register. We can therefore avoid shifting b0 and b1 and, at each step, check a different bit. The only problem is that those instructions only allow skipping the next instruction, not a custom jump. So, in "narrative code", it will look like this:
Main block:
0 Test Nth bit of b0 (SBRS). If set jump to 2 (+ 1cycle) otherwise continue with 1
1 Jump to 4 (RJMP)
2 r0 = r0 + a0 (ADD)
3 r1 = r1 + a1 + carry from prev. (ADC)
4 Test Nth bit of b1 (SBRC). If cleared jump to 6 (+ 1 cycle), otherwise continue with 5
5 r1 = r1 + a0 (ADD)
6 a0 = a0 + a0 (ADD)
7 a1 = a1 + a1 + carry from prev. (ADC)
While the first substitution brings no clear advantage, the second one saves 1 cycle on the b1 test. However, I believe the C++ code might be easier for the compiler to interpret. Considering 8 iterations, the initialization and skipping the last update of a0 and a1, we now have 8 * 8 + 2 - 2 = 64 cycles.
C++ code:
template<uint8_t mask>
void iterateWithMask(const uint8_t& b0,const uint8_t& b1, uint16_t& a, uint16_t& r) {
if (b0 & mask)
r += a;
if (b1 & mask)
r+=(uint16_t)((uint8_t)(a & 0xff))*256;
a += a;
}
uint16_t umul16_(uint16_t a, const uint16_t b) {
uint16_t r = 0;
const uint8_t b0 = b & 0xff;
const uint8_t b1 = b >>8;
iterateWithMask<0x01>(b0,b1,a,r);
iterateWithMask<0x02>(b0,b1,a,r);
iterateWithMask<0x04>(b0,b1,a,r);
iterateWithMask<0x08>(b0,b1,a,r);
iterateWithMask<0x10>(b0,b1,a,r);
iterateWithMask<0x20>(b0,b1,a,r);
iterateWithMask<0x40>(b0,b1,a,r);
iterateWithMask<0x80>(b0,b1,a,r);
//Hopefully it understands it doesn't need the last update for a
return r;
}
Note that in this implementation 0x01, 0x02, ... are not real runtime values, but just hints to the compiler about which bit to test. Therefore, the mask cannot be obtained by shifting right: differently from all the other functions seen so far, this one really has no equivalent loop version.
One big problem is that
r+=(uint16_t)((uint8_t)(a & 0xff))*256;
does not get interpreted as I would like: it should be just a sum of the upper register of r with the lower register of a. Other option:
r+=(uint16_t) 256 *((uint8_t)(a & 0xff));
We can also keep a constant, and shift the result r instead. In this case we process b starting from the most significant bit. The complexity is equivalent, but it might be easier for the compiler to digest. Also, this time we have to be careful to write the last iteration explicitly, as it must not do a further shift of r.
template<uint8_t mask>
void inverseIterateWithMask(const uint8_t& b0,const uint8_t& b1,const uint16_t& a, const uint8_t& a0, uint16_t& r) {
if (b0 & mask)
r += a;
if (b1 & mask)
r+=(uint16_t)256*a0; //Hopefully easier to understand for the compiler?
r += r;
}
uint16_t umul16_(const uint16_t a, const uint16_t b) {
uint16_t r = 0;
const uint8_t b0 = b & 0xff;
const uint8_t b1 = b >>8;
const uint8_t a0 = a & 0xff;
inverseIterateWithMask<0x80>(b0,b1,a,a0,r);
inverseIterateWithMask<0x40>(b0,b1,a,a0,r);
inverseIterateWithMask<0x20>(b0,b1,a,a0,r);
inverseIterateWithMask<0x10>(b0,b1,a,a0,r);
inverseIterateWithMask<0x08>(b0,b1,a,a0,r);
inverseIterateWithMask<0x04>(b0,b1,a,a0,r);
inverseIterateWithMask<0x02>(b0,b1,a,a0,r);
//Last iteration:
if (b0 & 0x01)
r += a;
if (b1 & 0x01)
r+=(uint16_t)256*a0;
return r;
}
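(A tiny host-side harness to check any of the umul16_ variants above against the compiler's built-in multiply - my sketch, not part of the original answer; the stride constants are arbitrary:)
#include <stdint.h>
#include <stdio.h>

uint16_t umul16_(uint16_t a, uint16_t b); /* any one of the variants above */

int main(void)
{
    uint32_t a, b;
    for (a = 0; a <= 0xffff; a += 97)      /* sampled sweep to keep it quick */
        for (b = 0; b <= 0xffff; b += 101) {
            uint16_t expect = (uint16_t)(a * b); /* truncated 16-bit product */
            if (umul16_((uint16_t)a, (uint16_t)b) != expect) {
                printf("mismatch at %u * %u\n", (unsigned)a, (unsigned)b);
                return 1;
            }
        }
    return 0;
}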
At long last, an answer, if a cheeky one: I couldn't (yet) get GCC's AVR C compiler to fit it into 8K of code. (For an assembler rendition, see AVR multiplication: No Holds Barred.)
The approach is what everyone who has used Duff's device would try for a second attempt: use a switch. Using macros, the source code looks entirely harmless, if massaged:
// sets p0 = low byte of a0*b1; break (not return), so control reaches the b0 switch
#define low(mp) case mp: p0 = a0 * (uint8_t)(mp); break
#define low4(mp) low(mp); low(mp + 1); low(mp + 2); low(mp + 3)
#define low16(mp) low4(mp); low4(mp + 4); low4(mp + 8); low4(mp + 12)
#define low64(mp) low16(mp); low16(mp + 16); low16(mp + 32); low16(mp + 48)
#if preShift
# define CASE(mp) case mp: return p + a * (mp)
#else
# define CASE(mp) case mp: return (p0<<8) + a * (mp)
#endif
#define case4(mp) CASE(mp); CASE(mp + 1); CASE(mp + 2); CASE(mp + 3)
#define case16(mp) case4(mp); case4(mp + 4); case4(mp + 8); case4(mp + 12)
#define case64(mp) case16(mp); case16(mp + 16); case16(mp + 32); case16(mp + 48)
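// For illustration (my annotation, not in the original): with preShift set,
// case4(0) expands to
//   case 0: return p + a * (0); case 1: return p + a * (1);
//   case 2: return p + a * (2); case 3: return p + a * (3);
// so each switch below enumerates all 256 byte values.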
extern "C" __attribute__ ((noinline))
uint16_t mpy16NHB16(uint16_t a, uint16_t b)
{
uint16_t p = 0;
uint8_t b0 = (uint8_t)b, b1 = (uint8_t)(b>>8);
uint8_t a0 = (uint8_t)a, p0;
switch (b1) {
low64(0);
low64(64);
low64(128);
low64(192);
}
#if preShift
p = p0 << 8;
#endif
#if preliminaries
if (0 == b0) {
p = -a;
if (b & 0x8000)
p += a << 9;
if (b & 0x4000)
p += a << 8;
return p;
}
while (!(b0 & 1)) { // strip trailing zeros of b0; doubling a compensates
a <<= 1;
b0 >>= 1;
}
#endif
switch (b0) {
case64(0);
case64(64);
case64(128);
case64(192);
}
return ~0;
}
int main(int ac, char const *const av[])
{
char buf[22];
for (uint16_t a = 0 ; a < a+1 ; a++)
for (uint16_t m = 0 ; m <= a ; m++)
puts(itoa(//shift4(ac)+shift3MaskAdd((uint16_t)av[0], ac)
// +shift4Add(ac, (uint16_t)av[0])
// + mpy16NHB16(ac, (ac + 105))
mpy16NHB16(a, m), buf, 10));
}