tinyAVR: best known multiplication routines for 8-bit and 16-bit factors? [closed]

大城市里の小女人 提交于 2019-11-29 08:20:44
greybeard

Unsigned shift-and-add with the partial product shifting right, unrolled: 8×8→16 and 16×16→32. (This is what avr200b presents. To keep things from getting boring, I shaved off a cycle here and a cycle there, and threw in 16×16→16.)

;**************************************************************
;*
;* "mpy8u" - 8x8 Bit Unsigned Multiplication
;*
;* Multiplies the two register variables mp8u and mc8u.
;* The result is placed in registers m8uH, m8uL
;
;* Number of words  : 47 + return
;* Number of cycles : 33 + return (worst case)
;* Low registers used   : None
;* High registers used  : 3 (mc8u,mp8u/m8uL,m8uH)
;*
;* Note: Result and multiplier Low byte are the same register.
;* This causes the multiplier to be overwritten by the result.
;*
;* Method: shift and add, partial product (pp) shifting right,
;* fully unrolled. m8uH is preloaded with the multiplicand
;* instead of being cleared; that preload stands in for the
;* first addition, so the first shift of pp has to shift in a 0
;* (lsr), never the carry left over from shifting the m'plier.
;*
;**************************************************************

.def    mc8u    = r16   ; multiplicand
.def    mp8u    = r17   ; multiplier (mp, m'plier)
.def    m8uL    = r17   ; partial product (pp)/result Low byte
.def    m8uH    = r18   ; result High byte

.CSEG
; Head start for even multipliers (bit 0 clear).
; On entry: m8uH = m'cand, m8uL = m'plier >> 1, C = 0.
; The lowest set bit among bits 1 and 2 is handled here with
;  lsr m8uH / ror m8uL, then the main line is re-joined at a
;  point where the carry has the meaning the main line expects.
; deadlines: add82: 10, noad82: 11, done82: 12,
;            add83: 14, noad83: 15
even:               ;      4    do not shift pp, yet
    lsr   mp8u      ; 1    5    C = m'plier bit 1
    brcc  even00    ;1/2   6/7  if bit 1 set
    lsr   m8uH      ; 1    7        pp = m'cand >> 1, 0 shifted in
    ror   m8uL      ; 1    8        pp bit to res Low, C = bit 2
    brcs  add82     ;1/2   9/10     bit 2 set: go add
    rjmp  noad82    ; 2   11        bit 2 clear: C = 0, no add
even00:             ;      7    bits 0 and 1 clear
    lsr   mp8u      ; 1    8    C = m'plier bit 2
    brcs  even001   ;1/2   9/10 if bits 0-2 all clear
    clr   m8uH      ; 1   10        pp = 0 (clr leaves C = 0)
    rjmp  done82    ; 2   12        just in time
even001:            ;     10    bit 2 is the lowest bit set
    lsr   m8uH      ; 1   11    pp = m'cand >> 1, 0 shifted in
    ror   m8uL      ; 1   12    pp bit to res Low, C = bit 3
    brcs  add83     ;1/2  13/14 bit 3 set: go add
    rjmp  noad83    ; 2   15    bit 3 clear: C = 0, no add

mpy8u:
    mov   m8uH, mc8u; 1    1    move m'cand to res High byte
                    ;           (not yet knowing it to be used)

    lsr   mp8u      ; 1    2    shift m'plier to process bit 0
    brcc  even      ;1/2   3/4  if carry set
    lsr   m8uH      ; 1    4        shift right res High byte
    ror   m8uL      ; 1    5    rot right result Low and m'plier 1

    brcc  noad81    ;1/2   6/7  if carry set
    add   m8uH, mc8u; 1    7        add m'cand to result High byte
noad81:             ;      7
    ror   m8uH      ; 1    8    rotate right result High byte
    ror   m8uL      ; 1    9    ror result Low and m'plier 2

    brcc  noad82    ;1/2  10/11 if carry set
add82:              ;     10
    add   m8uH, mc8u; 1   11        add m'cand to result High byte
noad82:             ;     11
    ror   m8uH      ; 1   12    rotate right result High byte
done82:             ;     12
    ror   m8uL      ; 1   13    result Low byte and multiplier 3

    brcc  noad83    ;1/2  14/15 if carry set
add83:              ;     14
    add   m8uH, mc8u; 1   15        add m'cand to result High byte
noad83:
    ror   m8uH      ; 1   16    rotate right result High byte
    ror   m8uL      ; 1   17    result Low byte and multiplier

    brcc  noad84    ;1/2  18/19 if carry set
    add   m8uH, mc8u; 1   19        add m'cand to result High byte
noad84: ror m8uH    ; 1   20    rotate right result High byte
    ror   m8uL      ; 1   21    result Low byte and multiplier

    brcc  noad85    ;1/2  22/23 if carry set
    add   m8uH, mc8u; 1   23        add m'cand to result High byte
noad85: ror m8uH    ; 1   24    shift right result High byte
    ror   m8uL      ; 1   25    result Low byte and multiplier

    brcc  noad86    ;1/2  26/27 if carry set
    add   m8uH, mc8u; 1   27        add m'cand to result High byte
noad86: ror m8uH    ; 1   28    rotate right result High byte
    ror   m8uL      ; 1   29    result Low byte and multiplier

    brcc  noad87    ;1/2  30/31 if carry set
    add   m8uH, mc8u; 1   31        add m'cand to result High byte
noad87: ror m8uH    ; 1   32    rotate right result High byte
    ror   m8uL      ; 1   33    rotate right result Low byte

    ret

16×16→32

;***************************************************************
;*
;* "mpy16u" - 16x16->32 Bit Unsigned Multiplication
;*
;* This subroutine multiplies the two 16-bit register variables
;*  mp16uH:mp16uL (replaced by low product) and mc16uH:mc16uL.
; (madd16u would have added in the value in acc1:acc0 (lame-?))
;* The result is placed in m16u3:m16u2:m16u1:m16u0.
;*
;* Number of words  :135 + return (configuration dependent)
;* Number of cycles : 97 + return
;* (avr200b.asm mpy16u improved: 100, as-is: 116)
;* Low registers used   : None
;* High registers used  : 6 (mc16uL,mc16uH,
;*                           mp16uL/m16u0,mp16uH/m16u1,
;*                           m16u2 (acc0),m16u3 (acc1))
;*   (per the .def lines below, result bytes 0 and 1 share their
;*    registers with the multiplier, not with the multiplicand)
;*
;* Assembly-time switches tested with #if in this routine:
;*   OrderedMultiply, ShortHeadStart, NoCheckEarly, NoEarlyOut
;*
;***************************************************************

#if OrderedMultiply
; with ordering, the caller's multiplicand arrives in mcinH:mcinL
; and the routine works on a copy in mc16uH:mc16uL
.def    mcinL   = r16   ; multiplicand (m'cand) low byte
.def    mcinH   = r17   ; multiplicand high byte
.def    mc16uL  = r22   ; multiplicand low byte
.def    mc16uH  = r23   ; multiplicand high byte
.def    acc0    = r20   ; accumulator byte 0
.def    acc1    = r21   ; accumulator byte 1 (MSB)
# warning two warnings about preceding definitions of
# warning  r20 and r21 are due and may as well be ignored
#else
.def    mc16uL  = r16   ; multiplicand low byte
.def    mc16uH  = r17   ; multiplicand high byte
#endif
.def    mp16uL  = r18   ; multiplier (m'plier) low byte
.def    mp16uH  = r19   ; multiplier high byte
# warning two warnings about preceding definitions of
# warning  r18 and r19 are due and may as well be ignored
; r18/r19 are deliberately named twice: multiplier bits are
; shifted out of them while product bits are shifted in
.def    m16u0   = r18   ; result byte 0 (LSB)
.def    m16u1   = r19   ; result byte 1
.def    m16u2   = r20   ; result byte 2
.def    m16u3   = r21   ; result byte 3 (MSB)

.CSEG
code0: rjmp mpy16u

; Head start for even multipliers, entered from mpy16u with
;  m16u3:m16u2 = m'cand (the preloaded partial product, pp),
;  m16u0 = m'plier Low after one asr, C = 0 (bit 0 was clear).
; The code below looks for the lowest set bit of the m'plier.
; Once found, C is 1 (the bit just shifted out), but the main
;  line rotates C into bit 15 of pp - so every re-entry has to
;  get rid of that carry first. These stubs do just that; they
;  sit ahead of mp00 to keep all conditional branches in range.
; deadlines: done1: 11, noadd2: 15, later ones far off
first1:                 ;     7     bit 1 is the lowest bit set
    lsr   m16u3         ; 1   8     shift pp right, 0 into bit 15
    ror   m16u2         ; 1   9
    rjmp  done1         ; 2  11     just in time
#if !ShortHeadStart
first2:                 ;           bit 2 is the lowest bit set
    clc                 ; 1         pp = m'cand carries no 17th bit
    rjmp  noadd2        ; 2
first3:
    clc
    rjmp  noadd3
first4:
    clc
    rjmp  noadd4
first5:
    clc
    rjmp  noadd5
first6:
    clc
    rjmp  noadd6
first7:
    clc
    rjmp  noadd7
first8:                 ;    21     bit 8 set, bits 7..0 clear
    breq  times256      ;1/2        Z from lsr mp16uH: no more bits
    clc                 ; 1
    rjmp  noadd8        ; 2
times256:               ;           m'plier was 0x0100:
    mov   m16u1, m16u2  ; 1         product = m'cand << 8
    mov   m16u2, m16u3  ; 1         (m16u0 is 0 already)
    clr   m16u3         ; 1
    ret
#endif

mp00:                   ;     4     bit 0 of m'plier 0
    lsr   mp16uL        ; 1   5     bit 1 into C
    brcs  first1        ;1/2  6/7   if set, start accumulating
#if ShortHeadStart
; bits 0 and 1 clear: drop the preloaded m'cand and re-join the
;  main line; clr leaves C alone, and C is 0 from the lsr above
    clr   m16u2         ; 1   7     clear 2 highest result bytes
    clr   m16u3         ; 1   8
    rjmp  done1         ; 2  10     deadline is 11
#else
; instead of clearing the partial product when non-zero,
;  go on looking for that first bit set
    lsr   mp16uL        ; 1   7     bit 2 into C
    brcs  first2        ;1/2  8/9   if set, start accumulating
    lsr   mp16uL        ; 1   9     bit 3
    brcs  first3        ;1/2 10/11  if set, start accumulating
    lsr   mp16uL        ; 1  11     bit 4
    brcs  first4        ;1/2 12/13  if set, start accumulating
    lsr   mp16uL        ; 1  13     bit 5
    brcs  first5        ;1/2 14/15  if set, start accumulating
    lsr   mp16uL        ; 1  15     bit 6
    brcs  first6        ;1/2 16/17  if set, start accumulating
    lsr   mp16uL        ; 1  17     bit 7
    brcs  first7        ;1/2 18/19  if set, start accumulating
; weeeell, there's the zero in m16u0 and next will be mp16uH
    lsr   mp16uH        ; 1  19     bit 8 into C, Z if none left
    brcs  first8        ;1/2 20/21  if set, start accumulating
; bits 8..0 clear: pp has to be 0. ldi touches no flags, so both
;  C (= 0) and Z (from the lsr) survive for what follows
    ldi   m16u2, 0      ; 1  21     clear pp -
    ldi   m16u3, 0      ; 1  22     at long last
    brne  noadd8        ;1/2 23/24  some upper bits set. Duh!
    ret                 ;           m'plier was 0: product is 0

#endif
#if OrderedMultiply
; Optional entry point: takes the factors in mcinH:mcinL and
;  mp16uH:mp16uL, makes the unsigned-lower one the multiplier
;  and the other one the multiplicand, then falls through into
;  mpy16u. The cycle column counts backwards to mpy16u = 0.
ompy16u:    ; ordering may be faster on average (see *Early*),
            ; but adds 6 cycles and 7 words
    cp    mcinL, mp16uL ;-6     16-bit unsigned compare:
    cpc   mcinH, mp16uH ;-5      mcin - mp16u
    brlo  mcLower       ;-4     if m'cand not lower
    movw  mc16uL, mcinL ;-3         just use m'cand
    rjmp  ordered       ;-2
mcLower:                ;-3     if m'cand lower
    movw  mc16uL, mp16uL;-2         use m'plier as m'cand
    movw  mp16uL, mcinL ;-1         and vice-versa
ordered:
#endif

; mpy16u main line: shift and add, partial product (pp) shifting
;  right, fully unrolled.
; pp lives in m16u3:m16u2; its low bits are rotated into m16u0
;  for multiplier bits 0-7, then into m16u1 for bits 8-15, while
;  the multiplier bits are rotated out of these same registers.
; pp is preloaded with the multiplicand, which stands in for the
;  addition for bit 0.
mpy16u:                 ;     0     not knowing it will be used:
    movw  m16u2, mc16uL ; 1   1     set partial product to mc
; using asr sets overflow flag to b0^b7: any use?
    asr   mp16uL        ; 1   2     shift m'plier Low, keep bit7
    brcc  mp00          ;1/2  3/4   if m'plier low bit one
noadd0:
; C still holds m'plier bit 0 (= 1) here. The preload produced
;  no carry, so a 0 has to enter bit 15 of pp: lsr, not ror.
    lsr   m16u3         ; 1   4     shift right result byte 3
    ror   m16u2         ; 1   5     rotate right result byte 2
;   ror   m16u1         ; rotate res byte 1 and m'plier High
    ror   m16u0         ; 1   6     res byte 0 and m'plier Low
; in a seizure of coding exuberance,
;  one could check for consecutive one bits ...
    brcc  noadd1        ;1/2  7/8   if carry set
    add   m16u2, mc16uL ; 1   8         add mc Lo to res byte 2
    adc   m16u3, mc16uH ; 1   9         add mc hi to res byte 3
noadd1:
    ror   m16u3         ; 1  10     rotate right result byte 3
    ror   m16u2         ; 1  11     rotate right result byte 2
;   ror   m16u1         ; rotate res byte 1 and m'plier High
done1:                  ;    11
    ror   m16u0         ; 1  12     res byte 0 and m'plier Low

    brcc  noadd2        ;1/2 13/14  if carry set
    add   m16u2, mc16uL ; 1  +1         add mc Lo to res byte 2
    adc   m16u3, mc16uH ; 1  +2         add mc hi to res byte 3
noadd2: ror m16u3       ; 1  +3     rotate right result byte 3
    ror   m16u2         ; 1  +4     rotate right result byte 2
    ror   m16u0         ; 1  18     res byte 0 and m'plier Low

    brcc  noadd3        ;1/2 19/20  if carry set
    add   m16u2, mc16uL ; 1  +1         add mc Lo to res byte 2
    adc   m16u3, mc16uH ; 1  +2         add mc hi to res byte 3
noadd3: ror m16u3       ; 1  +3     rotate right result byte 3
    ror   m16u2         ; 1  +4     rotate right result byte 2
    ror   m16u0         ; 1  24     res byte 0 and m'plier Low

    brcc  noadd4        ;1/2 25/26  if carry set
    add   m16u2, mc16uL ; 1  +1         add mc Lo to res byte 2
    adc   m16u3, mc16uH ; 1  +2         add mc hi to res byte 3
noadd4: ror m16u3       ; 1  +3     rotate right result byte 3
    ror   m16u2         ; 1  +4     rotate right result byte 2
    ror   m16u0         ; 1  30     res byte 0 and m'plier Low

    brcc  noadd5        ;1/2 31/32  if carry set
    add   m16u2, mc16uL ; 1  +1         add mc Lo to res byte 2
    adc   m16u3, mc16uH ; 1  +2         add mc hi to res byte 3
noadd5: ror m16u3       ; 1  +3     rotate right result byte 3
    ror   m16u2         ; 1  +4     rotate right result byte 2
    ror   m16u0         ; 1  36     res byte 0 and m'plier Low

    brcc  noadd6        ;1/2 37/38  if carry set
    add   m16u2, mc16uL ; 1  +1         add mc Lo to res byte 2
    adc   m16u3, mc16uH ; 1  +2         add mc hi to res byte 3
noadd6: ror m16u3       ; 1  +3     rotate right result byte 3
    ror   m16u2         ; 1  +4     rotate right result byte 2
    ror   m16u0         ; 1  42     res byte 0 and m'plier Low

    brcc  noadd7        ;1/2 43/44  if carry set
    add   m16u2, mc16uL ; 1  +1         add mc Lo to res byte 2
    adc   m16u3, mc16uH ; 1  +2         add mc hi to res byte 3
noadd7: ror m16u3       ; 1  +3     rotate right result byte 3
    ror   m16u2         ; 1  +4     rotate right result byte 2
    ror   m16u0         ; 1  48     res byte 0 and m'plier Low

; instead of the least significant bit of the next byte,
;  which never got rotated into this one,
;  this shifted into C the highest bit from mp16uL - again.
; using asr mp16uH would set overflow flag to b8^b15,
;  ror mp16uH would set overflow flag to b8^b7: any use?
; A quick check suggested a "Booth subtraction" on bit 7 is
;  entirely possible. Booth gains from runs of identical bits
;  - two of which would be signified by overflow cleared.
; Denoting addition by +, subtractions by -
; (the one just done trailing) and "do nothings" (fast!) by 0,
; relevant sequences would be
; (left: before recoding, right: after)
; 000+   00+-   (-1+2        1)
; 00++   0+0-   (-1  +4      3)
; 0+0+   +0--   (-1-2  +8    5) 0++- (-1+2+4) fully equivalent
; 0+++   +00-   (-1    +8    7)
; +00+   +0+-   (-1+2  +8    9)
; +0++   ++0-   (-1  +4+8   11) (-1-4+16 looks worse)
; ++0+  +00--   (-1-2   +16 13) (-1+8+ 8 looks worse)
; ++++  +000-   (-1     +16 15)
; Deeming cases with few non-zeros (operations) less critical,
; what are the critical sequences? Only 0, 1, 3 look good,
;  7 sort of. Promises to get massive and messy - manana.

; So, just start over with the next byte of the m'plier:
    lsr   mp16uH        ; 1  49     shift multiplier High
;   brcc  noadd8        ;1/2 50/51  if carry set

; the switch name below is spelled the same way as in the guard
;  around checkEarly, so that code and branch come and go together
#if !NoCheckEarly
; (Now wouldn't it be fun to have an early out? Even more so
; if we knew the multiplier to (have a good chance to)
;  be less than multiplicand ...)
;   breq  earlyOut      ;1/2 50/51
; how not to hurt the worst case?
; zero after shift is easy: never come back!
;  "Worst" thing that might happen would be that ninth bit set
;  - just do that final addition and be done.
; hard part is non-zero, and C set: would get delayed 1 cycle
;   (where is that Branch-on-Zero-or-Carry-Clear instruction?!)
    brcc  checkEarly    ;1/2 50/51  if carry set
#else
; 'til i know how to
.define NoEarlyOut 1
    brcc  noadd8        ;1/2 50/51  if carry set
#endif
    add   m16u2, mc16uL ; 1  +1         add mc Lo to res byte 2
    adc   m16u3, mc16uH ; 1  +2         add mc hi to res byte 3
noadd8:
    ror   m16u3         ; 1  +3     rotate right result byte 3
    ror   m16u2         ; 1  +4     rotate right result byte 2
    ror   m16u1         ; 1  54     res byte 1 and m'plier High
;   ror   m16u0         ;rotate res byte 0 and m'plier Low

    brcc  noadd9        ;1/2 55/56  if carry set
add9:
    add   m16u2, mc16uL ; 1  +1         add mc Lo to res byte 2
    adc   m16u3, mc16uH ; 1  +2         add mc hi to res byte 3
noadd9:
    ror   m16u3         ; 1  +3     rotate right result byte 3
    ror   m16u2         ; 1  +4     rotate right result byte 2
mp9Done:
    ror   m16u1         ; 1  61     res byte 1 and m'plier High

    brcc  noadd10       ;1/2 62/63  if carry set
    add   m16u2, mc16uL ; 1  +1         add mc Lo to res byte 2
    adc   m16u3, mc16uH ; 1  +2         add mc hi to res byte 3
noadd10:ror m16u3       ; 1  +3     rotate right result byte 3
    ror   m16u2         ; 1  +4     rotate right result byte 2
    ror   m16u1         ; 1  67     res byte 1 and m'plier High

    brcc  noadd11       ;1/2 68/69  if carry set
    add   m16u2, mc16uL ; 1  +1         add mc Lo to res byte 2
    adc   m16u3, mc16uH ; 1  +2         add mc hi to res byte 3
noadd11:ror m16u3       ; 1  +3     rotate right result byte 3
    ror   m16u2         ; 1  +4     rotate right result byte 2
    ror   m16u1         ; 1  73     res byte 1 and m'plier High

    brcc  noadd12       ;1/2 74/75  if carry set
    add   m16u2, mc16uL ; 1  +1         add mc Lo to res byte 2
    adc   m16u3, mc16uH ; 1  +2         add mc hi to res byte 3
noadd12:ror m16u3       ; 1  +3     rotate right result byte 3
    ror   m16u2         ; 1  +4     rotate right result byte 2
    ror   m16u1         ; 1  79     res byte 1 and m'plier High

    brcc  noadd13       ;1/2 80/81  if carry set
    add   m16u2, mc16uL ; 1  +1         add mc Lo to res byte 2
    adc   m16u3, mc16uH ; 1  +2         add mc hi to res byte 3
noadd13:ror m16u3       ; 1  +3     rotate right result byte 3
    ror   m16u2         ; 1  +4     rotate right result byte 2
    ror   m16u1         ; 1  85     res byte 1 and m'plier High

    brcc  noadd14       ;1/2 86/87  if carry set
    add   m16u2, mc16uL ; 1  +1         add mc Lo to res byte 2
    adc   m16u3, mc16uH ; 1  +2         add mc hi to res byte 3
noadd14:ror m16u3       ; 1  +3     rotate right result byte 3
    ror   m16u2         ; 1  +4     rotate right result byte 2
    ror   m16u1         ; 1  91     res byte 1 and m'plier High

    brcc  noadd15       ;1/2 92/93  if carry set
    add   m16u2, mc16uL ; 1  +1         add mc Lo to res byte 2
    adc   m16u3, mc16uH ; 1  +2         add mc hi to res byte 3
noadd15:ror m16u3       ; 1  +3     rotate right result byte 3
    ror   m16u2         ; 1  +4     rotate right result byte 2
    ror   m16u1         ; 1  97     res byte 1 and m'plier High

    ret                 ; 3 100 :-/
#if !NoCheckEarly
; Reached from the main line when m'plier bit 8 is clear (C = 0).
; Does the shift for bit 8 itself, then decides from the flags of
;  "ror m16u1" whether anything is left to do:
;   C set      - bit 9 set: continue with the addition at add9
;   Z clear    - m16u1 not zero: continue at noadd9
;   otherwise  - m'plier bits 15..8 are all clear and the pp bit
;                just shifted into m16u1 is 0: finish early
; deadlines: add9 56, noadd9 58
checkEarly:             ;    51     adds 15 words for early out
; this would make the jump to add9 one cycle late, so:
; check after ror
; (which is a real pity for needing 8 zero bits instead of 7)
;   breq  highZero      ;1/2        if m'plier high byte <> 0
    ror   m16u3         ; 1  +1         rotate right res byte 3
    ror   m16u2         ; 1  +2         rotate right res byte 2
    ror   m16u1         ; 1  +3         res byte 1 and mp High
    brcs  add9          ;1/2 56     if carry clear
    brne  noadd9        ; 2  57     don't add this time
; partial product in m16u3:m16u2:m16u1
; - seven bits left of position needed
;   lsl   m16u1         ;           no need: all zero.
    lsl   m16u2         ; 1  57     roll back ...
    rol   m16u3         ; 1  58     (leaves C = 0: bit 7 was 0)
#endif
#if !NoCheckEarly || !NoEarlyOut
earlyOut:               ;    51
; multiplier high 7 bits all zero
; can't use movw here because of overlap, if not alignment
    mov   m16u1, m16u2  ; 1  59     place byte 1 of product
    mov   m16u2, m16u3  ; 1  60     place byte 2 of product
    clr   m16u3         ; 1  61     clear highest byte of result
; switch and label below are spelled exactly like their
;  definitions (NoEarlyOut, noFinalAdd), so they still match
;  where the tool treats names as case sensitive
# if !NoEarlyOut
; if reaching earlyOut with C (possibly) set from bit 8,
;     this would account for it
    brcc  noFinalAdd    ;1/2 62     if bit 8 of m'plier clear
    add   m16u1, mc16uL ; 1  63         add mc Lo to res byte 1
    adc   m16u2, mc16uH ; 1  64         add mc hi to res byte 2
    adc   m16u3, m16u3  ; 1  65         one bit might be pending
noFinalAdd:
# endif
    ret                     ; 3 68  pretend to be done
#endif

To be continued (16×16→16) - don't hold your breath.

greybeard

partial product look-up (for pairs of (bit-) quartets/nibbles)

/* multiply accumulating partial products looked up in a table,
 *  "product scanning, decreasing significance",
 *  non-aligned first (non-aligned partial products need
 *                     to be adjusted "bit-wise").
 * Aligned ones are "starred" below for the need to shift one
 *  of the operand nibbles for combination into a table index.
 * 78 cycles, 256 bytes RAM, 83(69) words  (caveat emptor)
 *(88 cycles, 197 words with table in flash)
 *
 * The table is indexed by a byte holding one nibble of each
 *  factor and yields the product of the two nibbles; it is
 *  built at run time by nibbleFiller.
 */
.equ    L   = 0x0f      ; mask: low nibble
.equ    H   = 0xf0      ; mask: high nibble
; if indexing is to work by just setting the low byte, this is the
.equ    PTable  = 1 ; only value possibly working with 512 bytes of RAM
;                     (high byte of the table address, loaded into ZH)
.def    a10 = r16   ; factor a, nibbles 1 and 0 (low byte)
.def    a32 = r17   ; factor a, nibbles 3 and 2 (high byte)
.def    b10 = r18   ; factor b, low byte
.def    b32 = r19   ; factor b, high byte
.def    p10 = r20   ; product, low byte
.def    p32 = r21   ; product, high byte
.def    p   = r25   ; pending bits on their way into p32
.def    t   = r24   ; scratch: value just read from the table
.def    a0  = r23   ; nibble 0 of a (later swapped to the high nibble)
.def    b0  = r22   ; nibble 0 of b

; build the table first; on return, execution continues
;  straight into mpy16n16 below
    rcall   nibbleFiller
; mpy16n16: 16x16->16 bit unsigned multiply by table lookup.
; in : a32:a10, b32:b10    out: p32:p10 (low 16 bits of a*b)
; clobbers p, t, a0, b0, ZH:ZL. Needs the nibble product table
;  at PTable*256 (see nibbleFiller).
; Each "ij" section below looks up the product of one nibble of
;  a and one nibble of b; only the ten pairs with i+j <= 3 can
;  reach the low 16 bits.
; Pairs with odd i+j sit half a byte off: they are summed first
;  and moved into place with swap. Pairs with even i+j are
;  byte aligned and simply added.
; Cycle columns: cycles within the section, then running total
;  (ld counted as 2 cycles).
mpy16n16:
    mov     b0, b10     ; 1
    andi    b0, L       ; 2     b0 = nibble 0 of b
    mov     a0, a10     ; 3
    andi    a0, L       ; 4     a0 = nibble 0 of a
    ldi     ZH, PTable  ; 5     Z addresses the table from now on
/* (values to the left of the gap shown for completeness, only)
         03
    13*    02*
  23     12  01
33* 22*    11* 00*
  32     21  10
    31*    20*
         30         */
;03     nibble 3 of b times nibble 0 of a
    mov     ZL, b32 ; 1
    andi    ZL, H   ; 2
    or      ZL, a0  ; 3
    ld      p32, Z  ; 5 10
;12     nibble 2 of b times nibble 1 of a
    mov     ZL, b32 ; 1
    andi    ZL, L   ; 2
    mov     t, a10  ; 3
    andi    t, H    ; 4
    or      ZL, t   ; 5
    ld      t, Z    ; 7
    add     p32, t  ; 8 18
;21     nibble 2 of a times nibble 1 of b
    mov     ZL, a32 ; 1
    andi    ZL, L   ; 2
    mov     t, b10  ; 3
    andi    t, H    ; 4
    or      ZL, t   ; 5
    ld      t, Z    ; 7
    add     p32, t  ; 8 26
;30     nibble 3 of a times nibble 0 of b, kept apart in p
    mov     ZL, a32 ; 1
    andi    ZL, H   ; 2
    or      ZL, b0  ; 3
    ld      p, Z    ; 5 31

;01     nibble 1 of b times nibble 0 of a
    mov     ZL, b10 ; 1
    andi    ZL, H   ; 2
    or      ZL, a0  ; 3
    ld      p10, Z  ; 5 36
;10     nibble 1 of a times nibble 0 of b
    mov     ZL, a10 ; 1
    andi    ZL, H   ; 2
    or      ZL, b0  ; 3
    ld      t, Z    ; 5
    add     p10, t  ; 6
    adc     p32, p  ; 7 43  adds in the "30" product and the carry
; align nibbles: the sums so far are half a byte too low
    swap    p10     ; 1
    swap    p32     ; 2
    mov     p, p10  ; 3
; separate nibbles
    andi    p10, H  ; 4
    andi    p32, H  ; 5
    andi    p, L    ; 6 49  postpone nibble addition

;00     nibble 0 of a times nibble 0 of b
    swap    a0      ; 1     a0 now holds its nibble in the high half
    mov     ZL, a0  ; 2
    or      ZL, b0  ; 3
    ld      t, Z    ; 5
    add     p10, t  ; 6
    adc     p32, p  ; 7 56  nibble addition here

;11     nibble 1 of a times nibble 1 of b
    mov     ZL, a10 ; 1
    andi    ZL, H   ; 2
    swap    ZL      ; 3
    mov     t, b10  ; 4
    andi    t, H    ; 5
    or      ZL, t   ; 6
    ld      t, Z    ; 8
    add     p32, t  ; 9 65
;02     nibble 0 of a (swapped above) times nibble 2 of b
    mov     ZL, b32 ; 1
    andi    ZL, L   ; 2
    or      ZL, a0  ; 3
    ld      t, Z    ; 5
    add     p32, t  ; 6 71
;20     nibble 2 of a times nibble 0 of b
    mov     ZL, a32 ; 1
    swap    ZL      ; 2
    andi    ZL, H   ; 3
    or      ZL, b0  ; 4
    ld      t, Z    ; 6
    add     p32, t  ; 7 78 - 256 bytes of RAM, add 10 for flash RAM

    ret

; nibbleFiller: fills the 256 bytes at PTable*256 with nibble
;  products: the entry at index (hi << 4 | lo) becomes hi * lo.
; Works row by row: within a row the high nibble (t) is fixed
;  and the entry grows by t from one column to the next.
; clobbers t, p, a0, ZH:ZL; returns once Z+ has moved ZH on to
;  a value with bit 0 clear, i.e. past the end of the table.
nibbleFiller:
    ser     t               ; 0xff: first inc makes this row 0
    clr     ZL              ; start at the first table byte
    ldi     ZH, PTable
nextRow:
    inc     t               ; high nibble of this row
    clr     p               ; column 0: t * 0
nextCell:
    st      Z+, p
    sbrs    ZH, 0           ; still inside the table?
    ret                     ;  no: all 256 entries written
    mov     a0, ZL
    andi    a0, L           ; column (low nibble) of next cell
    breq    nextRow         ; wrapped to column 0: new row
    add     p, t            ; next column: one more t
    rjmp    nextCell
    break
greybeard

unsigned 8×8→8, shifting factor left and add, unrolled.

; factors a0, b0 and product p0
; step8 n: handles bit n of b0 - adds the (already weighted)
;  factor a0 into p0 if the bit is set, then doubles a0 for the
;  next bit. 3 cycles either way (sbrc skipping takes 2).
.MACRO step8
    sbrc    b0, @0
    add     p0, a0
    add     a0, a0  ; +3
.EndM
; 8x8->8 bit unsigned multiply, factor shift, unrolled.
; in : a0, b0    out: p0 = low 8 bits of a0 * b0
; a0 is destroyed (shifted left seven times), b0 is kept.
; 24 cycles & words + return (caveat emptor)
mpy8U8:
    clr     p0      ; 1
    step8   0       ; 4
    step8   1       ; 7
    step8   2       ; 10
    step8   3       ; 13
    step8   4       ; 16
    step8   5       ; 19
    step8   6       ; 22
    sbrc    b0, 7   ; 23/24 last bit: no need to double a0 again
    add     p0, a0  ; 24
    ret
; (a "# endif" used to follow here; it had no "#if" to close)

unsigned 16×16→16, shifting factor left and add, unrolled.
Now showing macros; three flavours: plain and - well, not.

; mpy16A16: 16x16->16 bit unsigned multiply, shifting
; one factor bit-by-bit, testing same bits in
; different bytes of the other; idea due to Antonio
; (http://stackoverflow.com/users/2436175/antonio)
; in <http://stackoverflow.com/a/29812254/3789665>
; <= 62/61/60 cycles, 62/87/155 words + return (caveat emptor)
; (57.5, 56.75 and 55.75 expected _for a uniform distribution_)
; "middle" variant assembled with neither Plain nor Need4Speed
; defined, shown without separate "timing comments"
; ("without speed345", just add one to the Need4Speed timings)
;
; Why one shifted copy of "a" serves two bits of "b":
;  bit n of b0 contributes a << n to the product, and bit n of
;  b1 contributes a << (n+8), of which only the low byte of
;  a << n reaches the 16-bit result - in its high byte.

; some macros using factors a1:a0, b1:b0 and product p1:p0
.MACRO addA     ;   adds (weighted) factor "a" into product
    add     p0, a0  ; +1
    adc     p1, a1  ; +2
.EndM
.MACRO doubleA  ;   adds (shifts/weights) factor "a"
    add     a0, a0  ; +1    a1:a0 += a1:a0, i.e. shift left by one
    adc     a1, a1  ; +2
.EndM
.MACRO doHighB  ;   "does" bit in b1, bit number as a parameter
    sbrc    b1, @0  ; 1     bit set: add low byte of weighted a
    add     p1, a0  ; 2      into the high byte of the product
.EndM
; "do" 2 bits, bit numbers in b1 and b0 as parameters
;  @0: bit number to test in b1, @1: bit number to test in b0
;  (two parameters, because the caller may have shifted b0)
.MACRO stepS
    bst     b0, @1  ; +1    copy the b0 bit to T ...
    brtc    noadd   ; +2/3  ... to be able to branch on it
    addA            ; +4
noadd:  ;   gets decorated; almost as neat as "numeric labels"
    doHighB @0      ; +6
    doubleA         ; +8
.EndM
.MACRO step16; "do" 2 bits, bit# in b1 and b0 as a parameter
    stepS   @0, @0  ;       same bit number in both bytes
.EndM
; empty if no Need4Speed; speed345 is speed3do45, really:
;  handles bits 3, 4 and 5 of both b0 and b1.
; To be used after "lsl b0" only:
;  - that lsl left the original bit 3 of b0 in the H flag
;    (nothing in between may touch H)
;  - the original bits 4 and 5 of b0 now sit at bits 5 and 6,
;    while b1 is unshifted - hence the differing bit numbers
;    handed to stepS (bit in b1 first, bit in b0 second)
.MACRO speed345
#if Need4Speed
    brhc    noadd   ; 1/2
; kkbb1     (b starts with two Known bit, bit 3 just checked)
    addA            ; 3
noadd:              ;   2/3
    doHighB 3       ; 5
    doubleA         ; 7
    stepS   4, 5    ; 15    b1 bit 4, original b0 bit 4
    stepS   5, 6    ; 23    b1 bit 5, original b0 bit 5
#endif
.EndM

# if !Plain
; Tails of mpy16A16 for "bit 7 of b0 clear", entered after
;  "lsl b0" has put the original bit 7 of b0 into C and the
;  original bit 6 into N.
; Both tails end with bit 7 of b1: if it is clear, return at
;  once; if set, double the low byte of a once more and add it
;  to the high byte of the product.
; showing up here due to limited branch offset
no67:               ;       29
; 00    bits 7 and 6 of b0 clear: nothing to add for b0
    speed345        ; 23
    doHighB 6       ; 2     54
    sbrs    b1, 7   ; 1/2       doHighB 7 with early out
    ret             ;       55  last to start, first to finish
    add     a0, a0  ; 3
    add     p1, a0  ; 4     58
    ret

no7:                ;       27
; 0     bit 7 of b0 clear
    brpl    no67    ; 1/2   29  N clear: bit 6 clear, too
; 01    bit 6 of b0 set
    speed345        ; 23    51
    addA            ; 2         for bit 6 of b0
    doHighB 6       ; 4     55
    sbrs    b1, 7   ; 1/2       doHighB 7 with early out
    ret             ;       56
    add     a0, a0  ; 3
    add     p1, a0  ; 4     59
    ret
# endif

In a separate code block in an attempt to ease browsing:

; mpy16A16: multiplies a1:a0 by b1:b0 into p1:p0.
; The product is cleared, a1:a0 is copied in when bit 0 of b0 is set,
; and a1:a0 is doubled in place (add a0,a0 / adc a1,a1) before the
; later steps, so the multiplicand a1:a0 is destroyed.
; The assembly-time symbols Need4Speed and Plain select the tail:
;  Plain   - straight-line steps up to bit 7 with an early return,
;  !Plain  - "lsl b0" followed by branches on C and N (bits 7 and 6 of
;            b0), with separate code for each combination.
; doHighB, step16, speed345, addA and doubleA are macros defined
; outside this listing; the cycle notes are the original author's.
mpy16A16:
    clr     p0      ; 1
    clr     p1      ; 2 ; p1:p0 = 0

    sbrc    b0, 0   ; 3
; "fast-laning the trailing zeroes case" isn't as attractive as
; in a shift pp variant: no gain from avoiding "shift pp", here
    movw    p0, a0  ; 4 ; p1:p0 "+=" a1:a0
    doHighB 0       ; 6
    add     a0, a0  ; 7 ; breq a0zero for early out added 1
                            ; (+you'd have to handle the carry)
    adc     a1, a1  ; 8 ; breq a1zero for early out added 1
                    ;       8
    step16  1       ; +8
    step16  2       ; +8    24
# if !Need4Speed
    step16  3       ; +8
    step16  4       ; +8
    step16  5       ; +8    48
#  if Plain
    step16  6       ; +8
                    ;       56
    doHighB 7       ; +2
    sbrs    b0, 7   ; +3/4
    ret             ;       59  top for 1bbbbbbb01bbbbbb ;-)
    addA            ; +6    62 _worst case_!
    ret
#  endif
# endif
# if !Plain
    lsl     b0      ; 1     24  make bit 7, 6(&3) "branchable"
; takes one cycle, but each conditional branch takes one less
; than skip-over-rjmp or bst b0,i brtc - netting 1 cycle off
; (at the cost of multiplying code)
    brcc    no7     ; 2/3   27
; 1
    brpl    no6but7 ; 3/4   28
; 11
    speed345        ; 23    50
    addA            ; 2
    doHighB 6       ; 2     54
    doubleA         ; 2
    addA            ; 2
    doHighB 7       ; 2     60
    ret

no6but7:            ;       28
; 10
    speed345        ; 23    51
    doHighB 6       ; 2     53
    doubleA         ; 2
    addA            ; 2
    doHighB 7       ; 2     59
    ret
#endif
; for an analysis of the expected-case cycle count, assume half of
; bits b0:5-1 to be zero for 2.5 cycles less. b0:7 off needs
; 1 cycle less with b1:7 set(.5), another 3 if off(.75). b0:6
; off needs 1 cycle less(.5). _for a uniform distribution_, I'd
; _expect 55.75 cycles_. For a distribution with lower numbers
; more likely (upper bits more likely to be 0, remember b0 to
;  be the least significant eight), expect this to be
; _finished in less than 55 cycles_.

What's next? (modified Booth, and no holds barred (currently favouring computed goto).)

greybeard

Here's a shot at squares lookup, common preliminaries first:

;.def   ZL  = r30
;.def   ZH  = r31

; register assignment trying to follow "avr200b.asm";
;  gcc would use 25 down in stead of 16 up
; (and same registers for parameters and result, requiring
;  two or one movw for another four or two bytes & cycles)
; Note: the 8-bit factors a and b alias a0 and a1 (r16, r17), so the
;  8-bit and the 16-bit routines cannot hold live operands together.
.def    a0  = r16   ; factor low byte
.def    a1  = r17   ; factor high byte
.def    a   = r16   ; 8-bit factor
.def    b   = r17   ; 8-bit factor ; or r18, rather?
.def    b0  = r18   ; factor low byte
.def    b1  = r19   ; factor high byte
.def    p0  = r20   ; product low byte
.def    p1  = r21
.def    p2  = r22
.def    p3  = r23   ; product high byte (32-bit results)
.def    sq  = r25   ; tmp, might have used r0
;                          & parameterless LPM
.def    s0  = r0    ; second temporary pair, used by mpy16T
.def    s1  = r24

; "squares table" shall be a 1 KByte table of squares of
;  9-bit natural numbers, divided by 4;
;  aligned on a 1K border in program memory,
;  organised as 512 lower bytes followed by the high bytes.

; the idea is to exploit
;  p = a * b = Squares[a+b] - Squares[a-b]

; assembly lines are marked up with cycle count and
; (latest) start cycle in block.
;  If first line in code block, the (latest)
;  block start cycle follows;
;  else if last line, the (max) block cycle total

8×8→16 bits:

;**********************************************************
;*
;* "mpy8T" - 8x8->16 Bit Unsigned Multiplication
;*                                using table lookup
;* (mpy8u: 34 words/cycles (avr200b.asm))
;* Multiplies two 8-bit register values a and b.
;* The result is placed in p1:p0.
;*
;* Number of words  : 17 + 512(table)=529 + return
;* Number of cycles : 25 + return (table coming preset ...)
;* Low  registers used  : None
;* High registers used  : 5+2 (a, b, p1:p0, sq;
;*                             + Z(r31:r30))
;*
;* Table addressing used below (LPM byte addresses):
;*  ZH = 4/5 : low  bytes of Squares[0..511]
;*  ZH = 6/7 : high bytes of Squares[0..511]
;* a and b are preserved; Z and sq are clobbered.
;*
;*********************************************************
mpy8T:
; p = a * b = Squares[a+b] - Squares[a-b]
    ldi     ZH, 2       ; 1 0   0   squares table / 2
    mov     ZL, a       ; 1 1
    add     ZL, b       ; 1 2       a+b
    rol     ZH          ; 1 3       9 bit offset: ZH = 4 + carry
    lpm     p0, Z       ; 3 4       a+bl            1
; SBR takes a bit MASK, not a bit number: mask 2 sets bit 1 and
; moves Z from the low-byte half to the high-byte half (+0x200).
    sbr     ZH, 2       ; 1 7
    lpm     p1, Z       ; 3 8   11  a+bh            2*

    ldi     ZH, 4       ; 1 0   11  squares table

    mov     ZL, a       ; 1 0   12
    sub     ZL, b       ; 1 1       a-b
    brcc    pos         ;1/2 2      no borrow: difference is >= 0
    neg     ZL          ; 1 3       |a-b|
pos:
    lpm     sq, Z       ; 3 4       a-bl            3
    sub     p0, sq      ; 1 7
    sbr     ZH, 2       ; 1 8       (ldi ZH, 6); leaves carry alone
    lpm     sq, Z       ; 3 9       a-bh            4*
    sbc     p1, sq      ; 1 12  13

    ret                 ; 3 25

16×16→16/32 bits:

;**********************************************************
;*
;* "mpy16T" - 16x16->32 Bit Unsigned Multiplication
;*                                   using table lookup
;*
;* Multiplies two 16-bit register values a1:a0 and b1:b0.
;* The result is placed in p3:p2:p1:p0.
;*
;* Every 8x8 partial product x*y is taken as
;*  Squares[x+y] - Squares[|x-y|] and accumulated at its weight:
;*  a0*b0 into p1:p0, a1*b1 into p3:p2, a1*b0 and a0*b1 into p2:p1
;*  with the carry/borrow propagated into p3.
;*
;* Number of words  :  74 + 512(table) = 586
;*                        + return (+ push/pop)
;* Number of cycles : 106 + return (+ push/pop)
;*                         (table coming preset ...)
;* (avr200b.asm mpy16u improved: 100, as-is: 116)
;* Low  registers used  : 2 (s0, zero) (could use r26&r27)
;* High registers used  : 10+2 (a1:a0, b1:b0, p3:p2:p1:p0,
;*                              sq, s1; + Z(r31:r30))
;*
;* Table addressing (LPM byte addresses): ZH = 4/5 low bytes,
;*  ZH = 6/7 high bytes of Squares[0..511].
;*
;*********************************************************
mpy16T:
.def    zero = r2 ; (gcc's choice of r1 is funny given mul)
;   push    zero
    clr     zero        ; 1 0   0
; initialise p1:p0
    ldi     ZH, 2       ; 1 0   1   squares table / 2
    mov     ZL, a0      ; 1 1
    add     ZL, b0      ; 1 2       a0+b0
    rol     ZH          ; 1 3       9 bit offset
    lpm     p0, Z       ; 3 4       a0+b0l          1
; SBR takes a bit MASK: mask 2 sets bit 1 = +0x200 = high-byte half
    sbr     ZH, 2       ; 1 7       squares table 2nd half
    lpm     p1, Z       ; 3 8   11  a0+b0h          2
; initialise p3:p2
    ldi     ZH, 2       ; 1 0   12
    mov     ZL, a1      ; 1 1
    add     ZL, b1      ; 1 2       a1+b1
    rol     ZH          ; 1 3
    lpm     p2, Z       ; 3 4       a1+b1l          3
    sbr     ZH, 2       ; 1 7       squares table 2nd half
    lpm     p3, Z       ; 3 8   11  a1+b1h          4

; all differences are 8 bit abs: save index carry handling
    ldi     ZH, 6       ; 1 0   23  squares table 2nd half
; do highest square first for carry handling
    mov     ZL, a1      ; 1 0   24
    sub     ZL, b1      ; 1 1       a1-b1
    brcc    pos11       ;1/2 2
    neg     ZL          ; 1 3
pos11:
    lpm     s1, Z       ; 3 4       a1-b1h          5
    ldi     ZH, 4       ; 1 7       squares table 1st half
    lpm     s0, Z       ; 3 8   11  a1-b1l          6

    mov     ZL, a0      ; 1 0   35
    sub     ZL, b0      ; 1 1       a0-b0
    brcc    pos00       ;1/2 2
    neg     ZL          ; 1 3
pos00:
    lpm     sq, Z       ; 3 4       a0-b0l          7
    sub     p0, sq      ; 1 7
    ldi     ZH, 6       ; 1 8       squares table 2nd half
    lpm     sq, Z       ; 3 9       a0-b0h          8
    sbc     p1, sq      ; 1 12
    sbc     p2, s0      ; 1 13      borrow runs on into a1-b1
    sbc     p3, s1      ; 1 14  15

    mov     ZL, a1      ; 1 0   50
    sub     ZL, b0      ; 1 1       a1-b0
    brcc    pos10       ;1/2 2
    neg     ZL          ; 1 3
pos10:
    lpm     s1, Z       ; 3 4       a1-b0h          9
    ldi     ZH, 4       ; 1 7       squares table 1st half
    lpm     sq, Z       ; 3 8       a1-b0l          10
    sub     p1, sq      ; 1 11
    sbc     p2, s1      ; 1 12
    sbc     p3, zero    ; 1 13  14

    mov     ZL, a0      ; 1 0   64
    sub     ZL, b1      ; 1 1       a0-b1
    brcc    pos01       ;1/2 2
    neg     ZL          ; 1 3
pos01:
    lpm     sq, Z       ; 3 4       a0-b1l          11
    sub     p1, sq      ; 1 7
    ldi     ZH, 6       ; 1 8       squares table 2nd half
    lpm     sq, Z       ; 3 9       a0-b1h          12
    sbc     p2, sq      ; 1 12
    sbc     p3, zero    ; 1 13  14

    ldi     ZH, 2       ; 1 0   78
    mov     ZL, a1      ; 1 1
    add     ZL, b0      ; 1 2       a1+b0
    rol     ZH          ; 1 3
    lpm     sq, Z       ; 3 4       a1+b0l          13
    add     p1, sq      ; 1 7
    sbr     ZH, 2       ; 1 8       squares table 2nd half
    lpm     sq, Z       ; 3 9       a1+b0h          14
    adc     p2, sq      ; 1 12
    adc     p3, zero    ; 1 13  14

    ldi     ZH, 2       ; 1 0   92
    mov     ZL, a0      ; 1 1
    add     ZL, b1      ; 1 2       a0+b1
    rol     ZH          ; 1 3
    lpm     sq, Z       ; 3 4       a0+b1l          15
    add     p1, sq      ; 1 7
    sbr     ZH, 2       ; 1 8       squares table 2nd half
    lpm     sq, Z       ; 3 9       a0+b1h          16
    adc     p2, sq      ; 1 12
    adc     p3, zero    ; 1 13  14

;   pop     zero
    ret                 ;       106

16×16→16 bits:

;*********************************************************
;*
;* "mpy16T16" - 16x16->16 Bit Unsigned Multiplication
;*                                     using table lookup
;*
;* Multiplies  two 16-bit register values a1:a0 and b1:b0.
;* The result (low 16 bits of the product) is placed in p1:p0.
;*
;* a0*b0 is formed in full; of a1*b0 and a0*b1 only the low
;* bytes are needed, and they are added to p1.
;*
;* Number of words  :  41 + 512(table)=553 + return
;* Number of cycles :<=57 + return
;* Low  registers used  : None
;* High registers used  : 7+2 (a1:a0, b1:b0, p1:p0, sq;
;*                             + Z(r31:r30))
;*
;* Table addressing (LPM byte addresses): ZH = 4/5 low bytes,
;*  ZH = 6/7 high bytes of Squares[0..511].
;*
;*********************************************************
mpy16T16:
    ldi     ZH, 2       ; 1 0   0   squares table / 2
    mov     ZL, a0      ; 1 1
    add     ZL, b0      ; 1 2       a0+b0
    rol     ZH          ; 1 3       9 bit offset
    lpm     p0, Z       ; 3 4       a0+b0l          1
; SBR takes a bit MASK: mask 2 sets bit 1 = +0x200 = high-byte half
    sbr     ZH, 2       ; 1 7
    lpm     p1, Z       ; 3 8   11  a0+b0h          2*

    ldi     ZH, 4       ; 1 0   11  squares table

    mov     ZL, a1      ; 1 0   12
    sub     ZL, b0      ; 1 1       a1-b0
    brcc    noNeg10     ;1/2 2
    neg     ZL          ; 1 3
noNeg10:
    lpm     sq, Z       ; 3 4       a1-b0l          3
    sub     p1, sq      ; 1 7   8

    mov     ZL, a0      ; 1 0   20
    sub     ZL, b1      ; 1 1       a0-b1
    brcc    noNeg01     ;1/2 2
    neg     ZL          ; 1 3
noNeg01:
    lpm     sq, Z       ; 3 4       a0-b1l          4
    sub     p1, sq      ; 1 7   8

    mov     ZL, a0      ; 1 0   28
    sub     ZL, b0      ; 1 1       a0-b0
    brcc    noNeg00     ;1/2 2
    neg     ZL          ; 1 3
noNeg00:
    lpm     sq, Z       ; 3 4       a0-b0l          5
    sub     p0, sq      ; 1 7
    sbr     ZH, 2       ; 1 8       (ldi ZH, 6); leaves carry alone
    lpm     sq, Z       ; 3 9       a0-b0h          6*
    sbc     p1, sq      ; 1 12  13

    ldi     ZH, 2       ; 1 0   41
    mov     ZL, a1      ; 1 1
    add     ZL, b0      ; 1 2       a1+b0
    rol     ZH          ; 1 3
    lpm     sq, Z       ; 3 4       a1+b0l          7
    add     p1, sq      ; 1 7   8

    ldi     ZH, 2       ; 1 0   49
    mov     ZL, a0      ; 1 1
    add     ZL, b1      ; 1 2       a0+b1
    rol     ZH          ; 1 3
    lpm     sq, Z       ; 3 4       a0+b1l          8
    add     p1, sq      ; 1 7   8

    ret                 ;       57

Big-endian modified Booth-2, unrolled. Work in progress. To do: a decent test rig, a hard stare at the critical paths (58 cycles?!), lucid comments (and a good idea about how to keep the cycle bookkeeping), and shifting labels so that only one copy of the common instructions has to be kept.
16×16→16 bits (nothing to be gained for ×8bits):

.MACRO doubleP  ;   shifts the (partial) product p1:p0 left one bit
    lsl     p0      ; +1    same opcode as add p0, p0
    rol     p1      ; +2    same opcode as adc p1, p1
.EndM

; b_010: entered from b_01 (p1:p0 already holds a1:a0) when bit 6 of
; the shifted b0 is clear. If bit 7 of b1 is set, a0 is added to p1
; twice. The chain below then alternates doubleP / doHighB n for
; n = 5..0 and branches on the remaining bits of b0; each labelled
; stage (b20, b41, b60, ...) is also a join point for other paths.
; doHighB, addA and subA are macros defined outside this listing.
b_010:              ;       9 -1
    sbrs    b1, 7   ;1/2
    rjmp    nob20   ;2/3
    add     p1, a0  ; 3
    add     p1, a0  ; 4
nob20:              ;       13
    doHighB 6       ; 2     15  :-(( 14 if b1:7 off
b20:                ;       15
    doubleP         ; 2
    doHighB 5       ; 4
    sbrs    b0, 5   ;5/6
    rjmp    b200    ; 7     22
    sbrs    b0, 4   ;7/8
    rjmp    b2010   ; 9     24
b2011:              ;       23
    addA            ; 2
    doubleP         ; 4     2
    doHighB 4       ; 6 29
b41:                ;       29
    doubleP         ; 2
    doHighB 3       ; 4
    sbrs    b0, 3   ;5/6
    rjmp    b410    ; 7     36
b411:
    sbrc    b0, 2   ;7/8
    rjmp    b4111   ; 9     38
b4110:              ;       37
    doubleP         ; 2
    subA            ; 4
    doHighB 2       ; 6     43
b60:                ;       43
    doubleP         ; 2
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b600    ; 7     50
b601:               ;       49
    doubleP         ; 2
    addA            ; 4
    doHighB 0       ; 6     55 57
    ret

; mpy16BEB16: entry of the unrolled big-endian Booth-2 multiply;
; the test code in this listing compares its result in p1:p0.
; "lsl b0" puts the original bit 7 of b0 into C and bit 6 into N, so
; two conditional branches plus one sbrc/sbrs on (shifted) bit 6
; select one of eight entry sequences b_000 .. b_111. b0 is modified.
; From here on every bit test on b0 sees the shifted value.
mpy16BEB16:
    lsl     b0      ; 1
    brcc    b_0     ;2/3    3
b_1:                ;       2
    brpl    b_10    ;1/2    4
b_11:               ;       3
    sbrc    b0, 6   ;1/2
    rjmp    b_111   ; 3     6
b_110:              ;       5
    movw    p0, a0  ; 1
    doHighB 7       ; 3
    doubleP         ; 5
    addA            ; 7
    doHighB 6       ; 9     14  :-|
;b20:               ;       14      inlined copy of the b20 stage
    doubleP         ; 2
    doHighB 5       ; 4
    sbrs    b0, 5   ;5/6
    rjmp    b200    ; 7     21
    sbrs    b0, 4   ;7/8
    rjmp    b2010   ; 9     23
;b2011:             ;       22
    addA            ; 2
    doubleP         ; 4
    doHighB 4       ; 6     28
    rjmp    b41     ; 8     30

; b_0: original bit 7 of b0 clear (C clear after the entry lsl).
; b_000 starts from a zero product; "add p1, p1" doubles only the
; high byte, since p0 is still zero at that point.
b_0:                ;       3
    brmi    b_01    ;4/5    5
b_00:               ;
    sbrc    b0, 6   ;5/6
    rjmp    b_001   ; 7     7
b_000:
    ldi     p0, 0   ; 1     6
    ldi     p1, 0   ; 2
    doHighB 7       ; 4
    add     p1, p1  ; 5
    rjmp    nob20   ; 7     13  :-/ -> 15

; b_01: original bits 7,6 of b0 are 0,1. The product starts as a1:a0.
; b_011 continues into the b21 stage.
; Fix: the b2110 step used "doHighB 5" a second time, so bit 4 of b1
; was never applied on this path. After the doubleP following the b21
; stage's doHighB 5 the next high-byte bit due is 4, as in the sibling
; steps b2000/b2010/b2100/b2111.
b_01:               ;       5
    movw    p0, a0  ; 1         useful to both b0:6 branches
    sbrs    b0, 6   ;2/3
    rjmp    b_010   ; 4     9 -1
b_011:              ;       8
    doHighB 7       ; 2         too lazy for more labels
    doubleP         ; 4
    doHighB 6       ; 6     14  :-|
b21:                ;       14
    doubleP         ; 2
    doHighB 5       ; 4
    sbrs    b0, 5   ;5/6
    rjmp    b210    ; 7     21
b211:               ;       20
    sbrc    b0, 4   ;7/8
    rjmp    b2111   ; 9     23
b2110:              ;       22
    doubleP         ; 2
    doHighB 4       ; 4         bit 4 of b1 (was 5: applied twice)
    subA            ; 6     28
    rjmp    b40     ; 8     30  following two skips - ?

; b_10: original bits 7,6 of b0 are 1,0. b_101 builds its start value
; (movw, doubleP, addA) and then runs inlined copies of the
; b21/b2110/b40/b4011 stages down to the shared b61/b611 tail.
; Fix: the inlined b2110 step used "doHighB 5" a second time, so bit 4
; of b1 was never applied on this path; it has to be "doHighB 4" as in
; the sibling steps b2000/b2010/b2100/b2111.
b_10:               ;       4
    sbrs    b0, 6   ;1/2
    rjmp    b_100   ; 3     7
b_101:              ;       6
    movw    p0, a0  ; 1
    doHighB 7       ; 3
    doubleP         ; 5
    addA            ; 7
    doHighB 6       ; 9     15  :-(
;b21:               ;       15
    doubleP         ; 2
    doHighB 5       ; 4
    sbrs    b0, 5   ;5/6
    rjmp    b210    ; 7     22
    sbrc    b0, 4   ;7/8
    rjmp    b2111   ; 9     24
;b2110:             ;       23
    doubleP         ; 2
    doHighB 4       ; 4         bit 4 of b1 (was 5: applied twice)
    subA            ; 6     29  two skips, but
;b40:               ;       29
    doubleP         ; 2
    doHighB 3       ; 4
    sbrs    b0, 3   ;5/6
    rjmp    b400    ; 7     36      21
;b401:              ;       35  21
    sbrs    b0, 2   ;1/2
    rjmp    b4010   ; 3     38      18
;b4011:             ;       37  19
    addA            ; 2
    doubleP         ; 4
    doHighB 2       ; 6     43
b61:
    doubleP         ; 2     43  13
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b610    ; 7     50      6
b611:               ;       52  6
    doubleP         ; 2
    subA            ; 4
    doHighB 0       ; 6     58      :-((
    ret

; b_001: original bits 7,6 of b0 are 0,0 and bit 6 of the shifted b0
; is set. The product starts as a1:a0; if bit 7 of b1 is set, a0 is
; added to p1 twice. Inlined copies of the later stages follow.
; Fix: the inlined b2110 step used "doHighB 5" a second time, so bit 4
; of b1 was never applied on this path; it has to be "doHighB 4" as in
; the sibling steps b2000/b2010/b2100/b2111.
b_001:              ;       7
    movw    p0, a0  ; 1
    sbrs    b1, 7   ;2/3
    rjmp    nob001  ;3/4
    add     p1, a0  ; 4
    add     p1, a0  ; 5
nob001:
    doHighB 6       ; 7     14  :-|
;b21:               ;       14
    doubleP         ; 2
    doHighB 5       ; 4
    sbrs    b0, 5   ;5/6
    rjmp    b210    ; 7     21
;b211:              ;       20
    sbrc    b0, 4   ;1/2
    rjmp    b2111   ; 3     23
;b2110:             ;       22
    doubleP         ; 2
    doHighB 4       ; 4         bit 4 of b1 (was 5: applied twice)
    subA            ; 6     28
;b40:               ;       28
    doubleP         ; 2
    doHighB 3       ; 4
    sbrs    b0, 3   ;5/6
    rjmp    b400    ; 7     35
;b401:              ;       34
    sbrs    b0, 2   ;1/2
    rjmp    b4010   ; 3     37      18
;b4011:             ;       36
    addA            ; 2
    doubleP         ; 4
    doHighB 2       ; 6     42
;b61:
    doubleP         ; 2     42
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b610    ; 7     49
;b611:              ;       48      6
    doubleP         ; 8
    subA            ; 10
    doHighB 0       ; 12    54
    ret

; b_100: original bits 7,6 of b0 are 1,0 and bit 6 of the shifted b0
; is clear. The product starts as a1:a0, is doubled once, and the
; path continues through an inlined b20 stage to the shared b41.
b_100:              ;       7
    movw    p0, a0  ; 1
    doHighB 7       ; 3
    doubleP         ; 5
    doHighB 6       ; 7     14  :-|
;b20:               ;       14
    doubleP         ; 2
    doHighB 5       ; 4
    sbrs    b0, 5   ;5/6
    rjmp    b200    ; 7     21
    sbrs    b0, 4   ;7/8
    rjmp    b2010   ; 9     23
;b2011:             ;       22
    addA            ; 2
    doubleP         ; 4
    doHighB 4       ; 6     28
    rjmp    b41     ;       30
; b_111: original bits 7,6 of b0 are 1,1 and bit 6 of the shifted b0
; is set. b1 is incremented and the product restarts from zero, with
; p1 seeded from a0 when bit 7 of the incremented b1 is set.
; The assembly-time symbol "expected" selects between a compact
; variant (sbrc/mov) and one that branches on the N flag left by
; "inc b1".
; Fix: both copies of the inlined b2110 step used "doHighB 5" a second
; time, so bit 4 of b1 was never applied; they have to be "doHighB 4"
; as in the sibling steps b2000/b2010/b2100/b2111.
#if !expected ; favouring space over expected cycles
b_111:              ;       6 
    clr     p0      ; 1
    inc     b1      ; 2
    clr     p1      ; 3
    sbrc    b1, 7   ;4/5
    mov     p1, a0  ; 5
    add     p1, p1  ; 6
    doHighB 6       ; 8     14  :-|
#else
noB111:             ;       10
    clr     p1      ; 1
    doHighB 6       ; 3     13  :-/
;b21:               ;       13
    doubleP         ; 2
    doHighB 5       ; 4
    sbrs    b0, 5   ;5/6
    rjmp    b210    ; 7     20
;b211:              ;       19
    sbrc    b0, 4   ;7/8
    rjmp    b2111   ; 9     22
;b2110:             ;       21
    doubleP         ; 2
    doHighB 4       ; 4         bit 4 of b1 (was 5: applied twice)
    subA            ; 6     27
    rjmp    b40     ; 8     29
b_111:              ;       6 
    clr     p0      ; 1
    inc     b1      ; 2
    brpl    noB111  ;3/4
    mov     p1, a0  ; 4
    add     p1, p1  ; 5
    doHighB 6       ; 7     13  :-/
#endif
;b21:               ;       14
    doubleP         ; 2
    doHighB 5       ; 4
    sbrs    b0, 5   ;5/6
    rjmp    b210    ; 7     21
;b211:              ;       20
    sbrc    b0, 4   ;7/8
    rjmp    b2111   ; 9     23
;b2110:             ;       22
    doubleP         ; 2
    doHighB 4       ; 4         bit 4 of b1 (was 5: applied twice)
    subA            ; 6     28
    rjmp    b40     ; 8     30

; b200: reached from a b20 stage when bit 5 of the shifted b0 is
; clear. b2001 adds a1:a0 once, then runs inlined copies of the
; b41/b4110/b60/b601 stages through to the return.
b200:               ;       22
    sbrs    b0, 4   ;1/2
    rjmp    b2000   ; 3     25
b2001:              ;       24
    doubleP         ; 4
    doHighB 4       ; 6
    addA            ; 8     30
;b41:               ;       30
    doubleP         ; 2
    doHighB 3       ; 4
    sbrs    b0, 3   ;5/6
    rjmp    b410    ; 7     37
;b411:              ;
    sbrc    b0, 2   ;7/8
    rjmp    b4111   ; 9     39
;b4110:             ;       38
    doubleP         ; 2
    subA            ; 4
    doHighB 2       ; 6     44
;b60:               ;       44
    doubleP         ; 2
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b600    ; 7     51
;b601:              ;       50
    doubleP         ; 2
    addA            ; 4
    doHighB 0       ; 6     56
    ret

; b2000: bits 5 and 4 of the shifted b0 are clear - only a doubling
; and the high-byte step are due. Falls into b40, the join point that
; several other paths reach by rjmp.
b2000:              ;       25
    doubleP         ; 2
    doHighB 4       ; 4     29
b40:                ;       29 31
    doubleP         ; 2
    doHighB 3       ; 4
    sbrs    b0, 3   ;5/6
    rjmp    b400    ; 7     36
b401:               ;       35
    sbrs    b0, 2   ;1/2
    rjmp    b4010   ; 3     38      18
b4011:              ;       37
    addA            ; 2
    doubleP         ; 4
    doHighB 2       ; 6     43
;b61:
    doubleP         ; 2     43
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b610    ; 7     50
;b611:              ;       49      6
    doubleP         ; 8
    subA            ; 10
    doHighB 0       ; 12    55
    ret

; b2010: reached from a b20 stage with bit 5 of the shifted b0 set
; and bit 4 clear. Adds a1:a0 after the doubling, then runs inlined
; copies of the b40/b4011/b61/b611 stages.
b2010:              ;       24
    doubleP         ; 2
    addA            ; 4
    doHighB 4       ; 6     30
;b40:               ;       30
    doubleP         ; 8
    doHighB 3       ; 10
    sbrs    b0, 3   ; 11            XXX
    rjmp    b400    ; 13    37
;b401:              ;       36
    sbrs    b0, 2   ;1/2
    rjmp    b4010   ; 3     39      18
;b4011:             ;       38
    addA            ; 2
    doubleP         ; 4
    doHighB 2       ; 6     44
;b61:
    doubleP         ; 2     44
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b610    ; 7     51
;b611:              ;       50      6
    doubleP         ; 8
    subA            ; 10
    doHighB 0       ; 12    56
    ret

; b210: reached from a b21 stage when bit 5 of the shifted b0 is
; clear. b2101 subtracts a1:a0 once, then runs inlined copies of the
; b41/b4110/b60/b601 stages through to the return.
; Fix: the b2101 step used "doHighB 5" although every b21 stage has
; already applied bit 5 of b1 before branching here; bit 4 was never
; applied. It has to be "doHighB 4" as in b2100 and b2111.
b210:               ;       21 ? 22
    sbrs    b0, 4   ;1/2
    rjmp    b2100   ; 3     24
;b2101:             ;       24
    doubleP         ; 2
    doHighB 4       ; 4         bit 4 of b1 (was 5: applied twice)
    subA            ; 6     30
;b41:               ;       30
    doubleP         ; 2
    doHighB 3       ; 4
    sbrs    b0, 3   ;5/6
    rjmp    b410    ; 7     37
;b411:              ;       36
    sbrc    b0, 2   ;1/2
    rjmp    b4111   ; 3     39
;b4110:             ;       38
    doubleP         ; 2
    subA            ; 4
    doHighB 2       ; 6     44
;b60:
    doubleP         ; 2     44
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b600    ; 7     51
;b601:              ;       50
    doubleP         ; 8
    addA            ; 10
    doHighB 0       ; 12    56
    ret

; b2100: reached from b210 when bit 4 of the shifted b0 is clear as
; well. Subtracts a1:a0 BEFORE the doubling (so it counts twice), then
; runs inlined copies of the b40/b4011/b61/b611 stages.
b2100:              ;       24
    subA            ; 2
    doubleP         ; 4
    doHighB 4       ; 6     30
;b40:               ;       30
    doubleP         ; 8
    doHighB 3       ; 10
    sbrs    b0, 3   ; 11
    rjmp    b400    ; 13    37
;b401:              ;       36  21
    sbrs    b0, 2   ;1/2
    rjmp    b4010   ; 3     39      18
;b4011:             ;       38  18
    addA            ; 2
    doubleP         ; 4
    doHighB 2       ; 6     44
;b61:
    doubleP         ; 2     44  12
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b610    ; 7     51
;b611:              ;       50  6
    doubleP         ; 8
    subA            ; 10
    doHighB 0       ; 12    56
    ret

; b2111: reached from a b21 stage with bits 5 and 4 of the shifted b0
; both set - neither add nor subtract is due, only the doubling and
; the high-byte step; continues at the shared b41 stage.
b2111:              ;       23
    doubleP         ; 2
    doHighB 4       ; 4     27
    rjmp    b41     ; 6     29


; b400: reached from a b40 stage when bit 3 of the shifted b0 is
; clear. With bit 2 set (b4001) a1:a0 is added after the doubling and
; an inlined b61 stage follows; its "set" case jumps to the shared
; b611, its "clear" case (b610) is inlined here.
b400:               ;       37  21
    sbrs    b0, 2   ;1/2
    rjmp    b4000   ; 3     40      16
;b4001:             ;       39  19
    doubleP         ; 2
    addA            ; 4
    doHighB 2       ; 6     45
;b61:
    doubleP         ; 2     45  13
    doHighB 1       ; 4
    sbrc    b0, 1   ;5/6
    rjmp    b611    ; 7     52      6
;b610:              ;       51  6
    subA            ; 2
    doubleP         ; 4
    doHighB 0       ; 6     57      :-(
    ret

; b4000: bits 3 and 2 of the shifted b0 are clear - doubling and
; high-byte step only, followed by an inlined b60 stage.
b4000:              ;       40  16
    doubleP         ; 2
    doHighB 2       ; 4     44
;b60:
    doubleP         ; 2     44  12
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b600    ; 7     51
;b601:              ;       50  6
    doubleP         ; 2
    addA            ; 4
    doHighB 0       ; 6     56
    ret

; b4010: reached from a b401 test when bit 2 of the shifted b0 is
; clear. Adds a1:a0 after the doubling, then an inlined b60 stage.
b4010:              ;       39  18
    doubleP         ; 2
    addA            ; 4
    doHighB 2       ; 6     45
;b60:               ;       45  12
    doubleP         ; 2
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b600    ; 7     52
;b601:              ;       51  6
    doubleP         ; 2
    addA            ; 4
    doHighB 0       ; 6     57
    ret

; b410: reached from a b41 stage when bit 3 of the shifted b0 is
; clear. With bit 2 set (b4101) a1:a0 is subtracted after the
; doubling and an inlined b61 stage follows; its "set" case jumps to
; the shared b611, its "clear" case (b610) is inlined here.
b410:               ;       37  21
    sbrs    b0, 2   ;1/2
    rjmp    b4100   ; 3     40      18
;b4101:             ;       39  18
    doubleP         ; 2
    subA            ; 4
    doHighB 2       ; 6     45
;b61:
    doubleP         ; 2     45  12
    doHighB 1       ; 4
    sbrc    b0, 1   ;5/6
    rjmp    b611    ; 7     52
;b610:              ;       51  6
    subA            ; 2
    doubleP         ; 4
    doHighB 0       ; 6     57      :-(
    ret

; b4100: reached from b410 when bit 2 of the shifted b0 is clear as
; well. Subtracts a1:a0 BEFORE the doubling (so it counts twice),
; followed by an inlined b60 stage.
b4100:              ;       40  18
    subA            ; 2
    doubleP         ; 4
    doHighB 2       ; 6     46
;b60:
    doubleP         ; 2     46  12
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b600    ; 7     53      4
;b601:              ;       52  6
    doubleP         ; 2
    addA            ; 4
    doHighB 0       ; 6     58      :-((
    ret

; b4111: reached from a b411 test when bit 2 of the shifted b0 is
; set - doubling and high-byte step only, followed by an inlined
; b61 stage whose "set" case (b611) is inlined as well.
b4111:              ;       39  17
    doubleP         ; 2
    doHighB 2       ; 4     43
;b61:
    doubleP         ; 2     43  13
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b610    ; 7     50      6
;b611:              ;       49
    doubleP         ; 2         6
    subA            ; 4
    doHighB 0       ; 6     55
    ret

; b600: final step reached from a b60 stage when bit 1 of the shifted
; b0 is clear - last doubling and last high-byte step, then return.
b600:               ;       51  4
    doubleP         ; 2
    doHighB 0       ; 4     55  ;-)
    ret

; b610: final step reached from a b61 stage when bit 1 of the shifted
; b0 is clear - a1:a0 is subtracted BEFORE the last doubling (so it
; counts twice), then the last high-byte step and return.
b610:               ;       51  6
    subA            ; 2
    doubleP         ; 4
    doHighB 0       ; 6     57
    ret

; theEnd/stuck: parking loop - break, sleep, repeat forever.
; empty: a routine consisting of a bare return.
theEnd:
stuck:
    break
    sleep
    rjmp    stuck
empty: ret
; Test driver for mpy16BEB16 (work in progress).
; Register pairs for the driver's own copies of the factors and the
; expected value. Note: "a" and "b" were defined earlier in this
; listing as r16/r17; these lines redefine them.
.def    a   = r22
.def    b   = r26
.def    ah  = r23
.def    bh  = r27
.def    p   = r24
.def    ph  = r25
; testTest never returns: the outer loop (nextA) advances ah:a, the
; inner loop (nextB) advances bh:b by 31 per pass, calls the multiply
; with copies of both in a1:a0 / b1:b0 and compares p1:p0 with ph:p.
; NOTE(review): "rcall bad" is unconditional - the result of the
;  cp/cpc compare is not used; presumably a conditional branch was
;  intended. TODO confirm.
; NOTE(review): ph:p advances by ah:a per pass while bh:b advances by
;  31 - verify that the expected value tracks the product.
testTest:
    ldi     a, 15
    ldi     ah, 1
    movw    b, a
nextA:
    subi    a, -1       ; a += 1 ...
    sbci    ah, -2      ; ... with borrow into ah, ah += 2 (- borrow)
    ldi     b, 13
    ldi     bh, 128
    movw    p, a        ; expected value starts out as ah:a
nextB:
    adiw    b, 31
    movw    a0, a       ; the multiply gets copies of both factors
    movw    b0, b
    rcall   mpy16BEB16
    cp      p0, p
    cpc     p1, ph
    rcall   bad
    add     p, a
    adc     ph, ah
    ldi     a0, 130     ; a0 doubles as scratch for the loop bound
    cpi     b, 3
    cpc     bh, a0
    brmi    nextB
    rjmp    nextA

; bad: hook called by the test driver after each comparison.
; As written it returns immediately; the break/sleep loop behind the
; ret is unreachable (presumably enabled by removing the ret while
; debugging - TODO confirm).
bad:
    ret
    break
    sleep
    rjmp    bad
greybeard

A list of pertaining algorithms and implementations for signed and unsigned 8×8→8/16, 16×16→16/32 and 8×16→16/24 bits as a starting point:

greybeard

Dismayed with not getting modified Booth below 58 cycles, a sketchy go at using precomputed multiples - 3*a. The "middle bit pairs" (predictably?) take exactly the same 15 cycles as the Booth-2 variants I tried, the first and last take too long. I've left it at 64:

; mpy16P316: sketch of a 16x16->16 multiply that handles the bits of
; b0 in pairs, using a precomputed triple of the multiplicand in
; a3h:a3 next to a1:a0. movP, addP, double, add3, addA, doubleP and
; doHighB are macros defined outside this listing.
; NOTE(review): this is a cycle-counting sketch, not assemblable
;  control flow - the alternatives (_3/_2/_0/_1, no5/no56/no6,
;  no1/no12/no2) fall through into each other without the jumps to a
;  common continuation, and the step for bits 4 and 3 is only hinted
;  at ("same for 43"). Presumably the joins were left out for
;  brevity; confirm before using this code.
mpy16P316:                  ; 0
;prepare a3h:a3 = 3 * a1:a0 ...
    movP    a3,a3h, a0,a1   ; 2
    double  a3, a3h         ; 4
;   addP    a3,a3h, a0,a1   ; 6         ... by half
    lsl     b0              ; 5         gains speed exactly once
    brcc    _               ;6/7
    brpl    _2              ;7/8
_3:                         ;       7
    addP    a3,a3h, a0,a1   ; 2         other half
    movP    p0,p1, a3,a3h   ; 4     11

_2:                         ;       8
    movP    p0,p1, a3,a3h   ; 2         reason for delay
    addP    a3,a3h, a0,a1   ; 4     12  other half

_:                          ;       7
    brmi    _1              ;1/2    9
_0:                         ;       8
    ldi     p0, 0           ; 1
    ldi     p1, 0           ; 2
    addP    a3,a3h, a0,a1   ; 4     12  other half

_1:                         ;       9
    movP    p0,p1, a0,a1    ; 2
    addP    a3,a3h, a0,a1   ; 4     13

    doHighB 7               ; 2     13
    doHighB 7               ; 4     17

    doHighB 6               ; 2     17
    doubleP                 ; 4         1
    doHighB 5               ; 6
    sbrs    b0, 6           ;7/8
    rjmp    no6             ; 9
    sbrs    b0, 5           ;9/10
    rjmp    no5             ; 11
    doubleP                 ; 12        2
    add3                    ; 14

no5:                        ;       "11"
    addA                    ; 13
no56:
    doubleP                 ; 15    32  2

no6:                        ;       "9"
    sbrs    b0, 5           ;10/11
    rjmp    no56            ; 12
    doubleP                 ; 13        2
    addA                    ; 15    32

; same for 43               ; 15

    doHighB 2               ; 2     47
    doubleP                 ; 4         5
    doHighB 1               ; 6
    sbrs    b0, 2           ;7/8
    rjmp    no2             ; 9
    sbrs    b0, 1           ;9/10
    rjmp    no1             ; 11
    doubleP                 ; 12        6
    add3                    ; 14

no1:                        ;       11
    addA                    ; 13
no12:
    doubleP                 ; 15        6

no2:                        ;       9
    sbrs    b0, 1           ;10/11
    rjmp    no12            ; 12
    doubleP                 ; 13        6
    addA                    ; 15
    doHighB 0               ; 17    64?!

    ret
greybeard

A first stab at computed goto, based on Booth-2 starting at the Little End. Slower than the "braided unrolled Big-Endian Booth" (59 vs. 57 cycles(?)), and smaller (~401 words + return).

.equ    base    =   256
; modified Booth from Little End; multiply proper at mpy16LEB16
; 16 snippets reached via computed goto, for 1 multiplier nibble
; Each snippet owns a 16-word slot at base + 16*k; the "; xx yy"
; label gives the nibble k in binary. In this variant the
; multiplicand is doubled (doubleA) instead of the product.
.org base
; 00 00        nothing to add: three doublings and the high-byte steps
    doubleA         ; 2
    doHighB 1       ; 4
    doubleA         ; 6
    doHighB 2       ; 8
    doubleA         ;10         low 4 bits done, 6 words to go

;   doHighB 3       ;12         could keep doing this all day ...
;   doubleA         ;14         for no conceivable gain
;;  doHighB 4       ;16         this would be the 1st too many
;   rjmp    hi0+4   ;16 -4
    rjmp    hi0     ;12
; b600 is parked in the unused remainder of this slot
b600:               ;       51  squeezing the shortest out and in 
    doubleP         ; 2         branches saves _five_ words
    doHighB 0       ; 4     55  ;-)
    ret
.org base+16
; 00 01        a1:a0 is added once, before the first doubling
    addA            ; 2
    doubleA         ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    doubleA         ;12         low 4 bits done, 4 words to go

    doHighB 3       ;14
    rjmp    hi0+2   ;16         enters hi0 two words in (presumably
                    ;           just past its own doHighB 3)
.org base+32
; 00 10        net +2*a, then continue in state "last bit 0"
; Fix: this slot subtracted 2*a (subA) and never added 4*a back, so
; nibble 0010 contributed -2*a. Adding 2*a directly needs one
; operation and keeps the slot's word and cycle count.
; (Assumes the slot index is the low nibble of b0, as the labels of
;  the neighbouring slots indicate.)
    doubleA         ; 2         not true Booth: add 2*a directly
    addA            ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    doubleA         ;12

    doHighB 3       ;14
    rjmp    hi0+2   ;16
.org base+48
; 00 11        net +3*a = -1*a + 4*a, continue in state "last bit 0"
; Fix: this slot only subtracted a and never added 4*a, so nibble
; 0011 contributed -1*a. The addA at weight 4 takes the place of the
; local doHighB 3, which hi0 itself performs when entered at its top;
; word and cycle count of the slot are unchanged.
; (Assumes the slot index is the low nibble of b0, as the labels of
;  the neighbouring slots indicate.)
    subA            ; 2         -1*a
    doubleA         ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    addA            ;12         +4*a
    doubleA         ;14
    rjmp    hi0     ;16
.org base+64
; 01 00        a1:a0 is added once, after two doublings (weight 4)
    doubleA         ; 2
    doHighB 1       ; 4
    doubleA         ; 6
    doHighB 2       ; 8
    addA            ;10
    doubleA         ;12

    doHighB 3       ;14
    rjmp    hi0+2   ;16
.org base+80
; 01 01        adds at weight 1 and at weight 4
    addA            ; 2
    doubleA         ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    addA            ;12
    doubleA         ;14
    rjmp    hi0     ;16
.org base+96
; 01 10        subtracts at weight 2, adds at weight 8
    doubleA         ; 2
    subA            ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    doubleA         ;12
    addA            ;14
    rjmp    hi0     ;16
.org base+112
; 01 11        net +7*a = -1*a + 8*a, continue in state "last bit 0"
; Fix: this slot added at weight 2 and at weight 8 (10*a). For a low
; bit pair of 11 one a has to be subtracted before the first
; doubling, as the 00 11 / 11 11 slots do; word and cycle count are
; unchanged.
; (Assumes the slot index is the low nibble of b0, as the label says.)
    subA            ; 2         -1*a
    doubleA         ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    doubleA         ;12
    addA            ;14         +8*a
    rjmp    hi0     ;16
.org base+128
; 10 00        subtracts at weight 8; continues in state "last bit 1"
    doubleA         ; 2
    doHighB 1       ; 4
    doubleA         ; 6
    doHighB 2       ; 8
    doubleA         ;10
    subA            ;12
    doHighB 3       ;14
    rjmp    hi1+2   ;16
.org base+144
; 10 01        adds at weight 1, subtracts at weight 8
    addA            ; 2
    doubleA         ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    doubleA         ;12
    subA            ;14
    rjmp    hi1     ;16
.org base+160
; 10 10        subtracts at weight 2 and at weight 4
    doubleA         ; 2
    subA            ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    subA            ;12
    doubleA         ;14
    rjmp    hi1     ;16
.org base+176
; 10 11        net -5*a = -1*a - 4*a; state "last bit 1" supplies +16*a
; Fix: this slot added at weight 2 (net -2*a, i.e. 14 instead of 11).
; For a low bit pair of 11 one a has to be subtracted before the
; first doubling, as the 00 11 / 11 11 slots do; word and cycle count
; are unchanged.
; (Assumes the slot index is the low nibble of b0, as the label says.)
    subA            ; 2         -1*a
    doubleA         ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    subA            ;12         -4*a
    doubleA         ;14
    rjmp    hi1     ;16
.org base+192
; 11 00        subtracts at weight 4; continues in state "last bit 1"
    doubleA         ; 2
    doHighB 1       ; 4
    doubleA         ; 6
    doHighB 2       ; 8
    subA            ;10
    doubleA         ;12
    doHighB 3       ;14
    rjmp    hi1+2   ;16
.org base+208
; 11 01        adds at weight 1, subtracts at weight 4
    addA            ; 2
    doubleA         ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    subA            ;12
    doubleA         ;14
    rjmp    hi1     ;16
.org base+224
; 11 10        subtracts at weight 2; continues in state "last bit 1"
    doubleA         ; 2
    subA            ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    doubleA         ;12
    doHighB 3       ;14
    rjmp    hi1+2   ;16
.org base+240
; 11 11        subtracts at weight 1 and falls through into hi1
    subA            ; 2
    doubleA         ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    doubleA         ;12         making the fast cases symmetrical
; hi1: upper nibble of b0, entered when bit 3 of b0 was 1.
; The digit for bits 5,4 with bit 3 set is -2*b5 + b4 + 1:
;  b5 set            -> sub4_1
;  b5 clear, b4 clear -> +1 (add41: addA, doubleA)
;  b5 clear, b4 set   -> +2 (add42: doubleA, addA)
; Fix: the test on bit 4 was "sbrc", which sent b4 set to add41 and
; b4 clear to add42, i.e. the two cases were swapped.
hi1:
    doHighB 3       ; 2     28
    doubleA         ; 4
    doHighB 4       ; 6
; four bits to go, last known to have been 1
    sbrc    b0, 5   ;7/8
    rjmp    sub4_1  ; 9     37
;add4_1
    sbrs    b0, 4   ;9/10       bit 4 clear: digit +1
    rjmp    add41   ; 11
;add42                      38  bit 4 set: digit +2
    doubleA         ; 2
    addA            ; 4
;b20                        42
    doHighB 5       ; 2
    doubleA         ; 4
    doHighB 6       ; 6
    lsl     b0      ; 7         C = bit 7, N = bit 6 of b0
    brcc    add6_0  ;8/9    51
;sub6_0:    1b0
    brmi    sub61   ;9/10   52
;sub62                      51
    doubleA         ; 2
    subA            ; 4
    doHighB 7       ; 6     57  _not_ funny
    ret
; sub61: bits 7,6 of b0 are 1,1 and bit 5 was 0: digit -2 + 1 + 0 = -1,
; so a is subtracted once BEFORE the last doubling.
; Fix: the body was a copy of add62 (doubleA, addA = +2); the
; neighbouring cases sub62 (-2) and sub6_1 (-1 / 0) show that bit 7
; is treated as a negative weight here.
; NOTE(review): the +256*a that makes b0 unsigned is presumably
;  applied in mpy16LEB16, outside this listing - confirm.
sub61:              ;       52
    subA            ; 2
    doubleA         ; 4
    doHighB 7       ; 6     58  _not at all_
    ret

; add6_0: bit 7 of b0 clear (C clear after lsl), previous bit 0.
; N still holds bit 6: set -> add a1:a0 once before the last
; doubling; clear (noAS6) -> last doubling and high-byte step only.
add6_0:             ;       51
    brpl    noAS6   ;1/2    53
    addA            ; 3
    doubleA         ; 5
    doHighB 7       ; 7     58  _not at all_
    ret
noAS6:              ;       53
    doubleA         ; 2
    doHighB 7       ; 4     57  _not_ funny
    ret

; sub4_1: bit 5 of b0 set, previous bit (3) was 1.
; Bit 4 set -> nothing to add or subtract (sub40); bit 4 clear ->
; subtract a1:a0 once (sub41). Then the last bit pair is handled with
; "previous bit 1" (b21): lsl b0 exposes bit 7 in C and bit 6 in N.
sub4_1:; bb1b1              37  
    sbrc    b0, 4   ;1/2
    rjmp    sub40   ; 3     40
sub41:              ;       39
    subA            ; 2
sub40:              ;       41
    doubleA         ; 4     43
;b21                        43
    doHighB 5       ; 2
    doubleA         ; 4
    doHighB 6       ; 6
    lsl     b0      ; 7
    brcs    sub6_1  ;8/9    52
;add6_1:    0b1
    brpl    add61   ;9/10   53
;add62                      52
    doubleA         ; 2
    addA            ; 4
    doHighB 7       ; 6     58  _not_ funny
    ret
add61:              ;       53
    addA            ; 2
    doubleA         ; 4
    doHighB 7       ; 6     59  _not at all_
    ret

add4_0:; bb0b0              37  
; Reached from hi0 with bit 5 of b0 clear and the previous bit known to be 0.
; Booth digit for bits 5:4 is then 0 (bit 4 clear -> add40, shift only) or
; +1 (bit 4 set -> add41, add 16*a).
; add41 is also a jump target from hi1; sub61, add6_0 and noAS6 are defined
; with the hi1 code.
    sbrs    b0, 4   ;1/2
    rjmp    add40   ; 3     40
add41:              ;       39
    addA            ; 2
add40:              ;       41
    doubleA         ; 4     43
;b20                        43  (bit 5 of b0 was 0)
    doHighB 5       ; 2
    doubleA         ; 4
    doHighB 6       ; 6
    lsl     b0      ; 7         C = bit 7, N = bit 6 of the multiplier
    brcc    add6_0  ;8/9    52
;sub6_0:    1b0
    brmi    sub61   ;9/10   53
;sub62                      52  bits 7..5 = 100: digit -2
    doubleA         ; 2
    subA            ; 4
    doHighB 7       ; 6     57  _not_ funny
    ret
; sub6_1: bit 7 = 1, bit 5 = 1: digit is 0 (bit 6 set) or -1 (bit 6 clear).
sub6_1:             ;       51
    brmi    noAS6   ;1/2    53
    subA            ; 3
    doubleA         ; 5
    doHighB 7       ; 7     58  _not at all_
    ret 

; hi0: common continuation for dispatch slots whose multiplier bit 3 was 0.
; On entry a1:a0 holds the factor shifted left by 3; bits 4..7 of b0 and
; bits 3..7 of b1 are still to be processed.
; Modified Booth recoding: the digit at bit position i is
;     d = b[i] + b[i-1] - 2*b[i+1]          (weight 2^i)
; For i = 4 with b3 = 0:  b5=1,b4=1 -> -1 (sub41), b5=1,b4=0 -> -2 (sub42),
;                         b5=0      -> handled by add4_0.
; The skip instruction on bit 4 therefore has to be sbrc: sub41 (-16*a) is
; taken when bit 4 is set, sub42 (-32*a) when it is clear.
hi0:
    doHighB 3       ; 2     28
    doubleA         ; 4
    doHighB 4       ; 6
; four bits to go, last known to have been 0
    sbrs    b0, 5   ;7/8
    rjmp    add4_0  ; 9
;sub4_0
    sbrc    b0, 4   ;9/10       bit 4 set: digit -1 -> sub41
    rjmp    sub41   ; 11    39
;sub42                      38  bit 4 clear: digit -2
    doubleA         ; 2
    subA            ; 4
;b21                        42  (bit 5 of b0 was 1)
    doHighB 5       ; 2
    doubleA         ; 4
    doHighB 6       ; 6
    lsl     b0      ; 7         C = bit 7, N = bit 6 of the multiplier
    brcs    sub6_1  ;8/9    51
;add6_1:    0b1
    brpl    add61   ;9/10   52
;add62                      51  bits 7..5 = 011: digit +2
    doubleA         ; 2
    addA            ; 4
    doHighB 7       ; 6     57  _not_ funny
    ret

; mpy16LEB16: entry point of the 16x16->16 multiply using modified Booth
; recoding, starting from the little end of the multiplier b1:b0.
; Clears the product p1:p0, then dispatches through a computed jump:
; Z = (high(base) << 8) | ((b0 & 15) << 4), i.e. one 16-word slot per value
; of the multiplier's low nibble.
; NOTE(review): this only lands on the slots if the low byte of base is
; zero - confirm against the definition of base.
.equ    code    =   high(base)
mpy16LEB16:         ;       0   modified Booth from Little End
    mov     ZL, b0  ; 1
    andi    ZL, 15  ; 2
    swap    ZL      ; 3         ZL = low nibble of b0 times 16
    ldi     ZH, code; 4
    ldi     p0, 0   ; 5
    ldi     p1, 0   ; 6
    sbrc    b0, 7   ;7/8        Booth treats b0 as signed: if bit 7 is set,
    add     p1, a0  ; 8         compensate by adding 256*a up front
    doHighB 0       ;10         bit 0 of b1: p1 += a0
    ijmp            ;12
greybeard

Finally, the No Holds Barred version, even if not in its final state. (An answer body is not supposed to exceed 30000 characters and the unedited source is about 55K, so the rest will follow later.) It is a bit large at ~2900 words, but fast (<= 44 cycles, expected ~39).

; Start of the program: jump to the test code (testTest, defined elsewhere),
; then continue assembling at word address 0x20.
star_t:
    rjmp    testTest
.org    0x20

; Register assignment for the jump-table multiplier:
;   a1:a0  one factor (added/subtracted into the product)
;   m1:m0  multiplier (each byte selects an entry of a jump table)
;   p1:p0  (partial) product
; a0/a1, p0/p1 are even/odd pairs, as needed by movw.
.def    a0  = r16   ; addend low byte
.def    a1  = r17
.def    m0  = r18   ; multiplier low byte
.def    m1  = r19
.def    p0  = r20   ; (partial) product low byte
.def    p1  = r21
.def    _zero=r1    ; not referenced in the code shown here
.def    tmp = r0    ; only referenced in commented-out lines
; some macros using factors a1:a0, m1:m0 and product p1:p0
; Each macro is a 16-bit operation built from an 8-bit op on the low bytes
; followed by the with-carry op on the high bytes (2 words, 2 cycles).
.MACRO addA     ;   adds (weighted) factor "a" into product
    add     p0, a0  ; +1
    adc     p1, a1  ; +2
.EndM
.MACRO subA     ;   subtracts (weighted) factor "a" from product
    sub     p0, a0  ; +1
    sbc     p1, a1  ; +2
.EndM
.MACRO doubleP  ;   adds (shifts/weights)(partial) product
    add     p0, p0  ; +1        p = p + p, i.e. p shifted left by one
    adc     p1, p1  ; +2
.EndM

; settle / doM1: second stage of mpy1616, reached from the loNN routines.
; settle (via "owing") is used when a loNN routine produced (NN-256)*a
; instead of NN*a: incrementing m1 makes the high-byte stage add the
; missing 256*a.
; doM1 (via "done") dispatches on m1 through the 256-entry table "highs".
settle:
    inc     m1      ; 1         _looks_ smarter than  add p1, a0
doM1:
    clr     XH      ; 2         ?   (XH is not read by the hiNN code shown)
; mov a0, tmp
    ldi     ZH, high(highs); 3
    mov     ZL, m1  ; 4         Z = highs + m1 (highs is 256-word aligned)
    ijmp            ; 6
; mpy1616: 16x16->16 multiply p1:p0 = a1:a0 * m1:m0 (low 16 bits) using two
; 256-entry jump tables, one per multiplier byte.
; Preparation: X = 16*a (mod 65536), built with nibble swaps instead of four
; shifts, and p = a. Then jumps to entry m0 of table "jump" (the loNN
; routines), which continue with the high byte via doM1/settle.
; Uses X and Z as scratch; the hiNN routines may overwrite a0.
mpy1616:            ; 0
    movw    XL, a0  ; 1
    andi    XH, 15  ; 2
    eor     XH, XL  ; 3         XH = a0 ^ (a1 & 15)
    andi    XL, 15  ; 4
; swapping first could use -+ (dropping carry) in stead of ex-or
    swap    XL      ; 5         XL = a0 << 4
; _if_ XH was used in few worst loxx cases, do these "on demand"
    swap    XH      ; 6         XH = ((a1^a0)<<4)|(a0>>4)
    eor     XH, XL  ; 7         XH = (a1<<4)|(a0>>4)
; mov tmp, a0
    movw    p0, a0  ; 1         other way 'round with gcc ABI?
    ldi     ZH, high(jump); 1
    mov     ZL, m0  ; 2         Z = jump + m0 (jump is 256-word aligned)
trampoline:
    ijmp            ; 14 (12+2) + 15 + 8 + 7 - 44? really?

#define done    ret
; hiNN routines (reached through table "highs" with NN = m1):
; each adds NN * a0 to the product high byte p1 (mod 256) and returns.
; XL is expected to hold (a0 << 4) & 0xff, i.e. 16*a0, as set up by mpy1616.
; a0 and XL are scratch here and get overwritten by some chains.
; Labels fall through into each other; the number in the comment counts the
; instructions left before "done".
hi07:
    add     p1, a0  ; 5
hi06:
    add     a0, a0  ; 4         a0 = 2*a0
hi03:
    add     p1, a0  ; 3
hi02:
    add     p1, a0  ; 2
hi01:
    add     p1, a0  ; 1
;hi00:
    done
hi0a:
    add     a0, a0  ; 5
hi05:
    add     p1, a0  ; 4
hi04:
    add     a0, a0  ; 3
    add     a0, a0  ; 2         a0 = 4 * (a0 on entry to hi04)
    add     p1, a0  ; 1
    done
hi09:
    add     p1, a0  ; 5
hi08:
    add     a0, a0  ; 4
    add     a0, a0  ; 3
    add     a0, a0  ; 2         a0 = 8*a0
    add     p1, a0  ; 1
    done
hi0b:
    sub     p1, a0  ; 5
hi0c:
    add     a0, a0  ; 4
    add     a0, a0  ; 3
    sub     p1, a0  ; 2         12 = 16 - 4
    add     p1, XL  ; 1
    done

hi0d:
    sub     p1, a0  ; 4
hi0e:
    sub     p1, a0  ; 3
hi0f:
    sub     p1, a0  ; 2         15 = 16 - 1
    add     p1, XL  ; 1
    done

; hi10..hi1f: p1 += NN * a0 (mod 256), built around XL = 16*a0.
; a0 and XL are scratch; labels fall through into each other.
hi17:
    add     p1, a0  ; 6
hi16:
    add     a0, a0  ; 5
hi13:
    add     p1, a0  ; 4
hi12:
    add     p1, a0  ; 3
hi11:
    add     p1, a0  ; 2
hi10:
    add     p1, XL  ; 1
    done
;hi1a:
;   add     a0, a0  ; 6
hi15:
    add     p1, a0  ; 5
hi14:
    add     a0, a0  ; 4
    add     a0, a0  ; 3
    add     p1, a0  ; 2         20 = 4 + 16
    add     p1, XL  ; 1
    done
hi19:
    add     p1, a0  ; 6
hi18:
    add     p1, XL  ; 5         24 = 16 + 4 + 4
    add     a0, a0  ; 4
    add     a0, a0  ; 3
    add     p1, a0  ; 2
    add     p1, a0  ; 1
    done
hi1a:
    sub     p1, a0  ; 6
hi1b:
    sub     p1, a0  ; 5
hi1c:
    add     a0, a0  ; 4
    sub     XL, a0  ; 3         XL = (16 - 2) * a0
    add     XL, XL  ; 2         XL = 28 * a0
    add     p1, XL  ; 1
    done
hi1d:
    sub     p1, a0  ; 5
hi1e:
    sub     p1, a0  ; 4
hi1f:
    sub     p1, a0  ; 3         31 = 32 - 1
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done

; hi20..hi2a (those shown): p1 += NN * a0 (mod 256), XL = 16*a0 on entry.
; a0 and XL are scratch; labels fall through into each other.
hi27:
    add     p1, a0  ; 7
hi26:
    add     a0, a0  ; 6
hi23:
    add     p1, a0  ; 5
hi22:
    add     p1, a0  ; 4
hi21:
    add     p1, a0  ; 3
hi20:
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hi2a:
    add     a0, a0  ; 6
hi25:
    add     p1, a0  ; 5
hi24:
    add     XL, a0  ; 4
    add     XL, a0  ; 3         XL = 16*a + 2 * (a0 on entry to hi24)
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
; ...
; hi4d..hi5f (those shown): p1 += NN * a0 (mod 256), XL = 16*a0 on entry.
; XL is scratch; labels fall through into each other.
hi4d:
    sub     p1, a0  ; 7
hi4e:
    sub     p1, a0  ; 6
hi4f:
    sub     p1, a0  ; 5         79 = 16 + 64 - 1
    add     p1, XL  ; 4
    add     XL, XL  ; 3
    add     XL, XL  ; 2         XL = 64*a0
    add     p1, XL  ; 1
    done

hi53:;
    add     p1, a0  ; 7
hi52:;
    add     p1, a0  ; 6
hi51:;
    add     p1, a0  ; 5
hi50:;
    add     p1, XL  ; 4         80 = 16 + 64
    add     XL, XL  ; 3
    add     XL, XL  ; 2
    add     p1, XL  ; 1
    done
hi56:;
    add     p1, a0  ; 7
; ...
hi5d:;
    sub     p1, a0  ; 7
hi5e:;
    sub     p1, a0  ; 6
hi5f:;
    sub     p1, a0  ; 5         95 = 3*32 - 1
    add     XL, XL  ; 4         XL = 32*a0
    add     p1, XL  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done

; hi60..hi6f (those shown): p1 += NN * a0 (mod 256), XL = 16*a0 on entry.
; XL is scratch; labels fall through into each other.
hi63:;
    add     p1, a0  ; 7
hi62:;
    add     p1, a0  ; 6
hi61:;
    add     p1, a0  ; 5
hi60:;
    add     XL, XL  ; 4         96 = 3 * 32
    add     p1, XL  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
; ...
hi68:;
    add     p1, a0  ; 7
hi67:;
    add     p1, a0  ; 6         103 = 1 + 3 * 34
    add     XL, a0  ; 5         XL = 17*a0
    add     XL, XL  ; 4         XL = 34*a0
    add     p1, XL  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hi6a:;
    add     p1, a0  ; 7
hi69:;              ;           105 ~ 15 * 7
    sub     XL, a0  ; 6         XL = 15*a0
    add     p1, XL  ; 5
    add     XL, XL  ; 4         XL = 30*a0
    add     p1, XL  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hi6b:; no symmetry
    sub     p1, a0  ; 7
hi6c:; no symmetry
    add     XL, a0  ; 6
    add     XL, a0  ; 5         XL = 18*a0
    add     XL, XL  ; 4         XL = 36*a0; 108 = 3 * 36
    add     p1, XL  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hi6d:; no symmetry
;01101101
    sub     p1, a0  ; 6
hi6e:;
    sub     p1, a0  ; 5
hi6f:;
    sub     p1, a0  ; 4         111 = 128 - 16 - 1
    sub     p1, XL  ; 3
    sbrc    a0, 0   ; 2         128*a0 mod 256 is 128 if a0 is odd, else 0
    subi    p1, -128; 1
    done

; hi70..hi7f: p1 += NN * a0 (mod 256), XL = 16*a0 on entry.
; The term 128*a0 (mod 256) is 128 when a0 is odd and 0 otherwise; bit 4 of
; the unmodified XL is bit 0 of a0, hence "sbrc XL, 4 / subi p1, -128".
; a0 and XL are scratch; labels fall through into each other.
hi73:;
    add     p1, a0  ; 6
hi72:;
    add     p1, a0  ; 5
hi71:;
    add     p1, a0  ; 4
hi70:;
    sub     p1, XL  ; 3         112 = 128 - 16
    sbrc    XL, 4   ; 2
    subi    p1, -128; 1
    done
hi75:; not quite symmetrical
    add     p1, a0  ; 7
hi74:;
    add     a0, a0  ; 6         116 = 2 + 2 - 16 + 128
    add     p1, a0  ; 5
    add     p1, a0  ; 4
    sub     p1, XL  ; 3
    sbrc    XL, 4   ; 2
    subi    p1, -128; 1
    done
hi76:;
    sub     p1, a0  ; 7
hi77:;
    sub     p1, a0  ; 6
hi78:;              ;           120 ~ 15 * 8
    sub     XL, a0  ; 5         XL = 15*a0
    add     XL, XL  ; 4
    add     XL, XL  ; 3         XL = 60*a0
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hi7b:;
    sub     p1, a0  ; 6
hi7c:;
    add     XL, XL  ; 5
    sub     XL, a0  ; 4         XL = 31*a0
    add     XL, XL  ; 3
    add     XL, XL  ; 2         XL = 124*a0
    add     p1, XL  ; 1
    done
hi79:
    sub     p1, a0  ; 7
hi7a:
    add     a0, a0  ; 6         a0 = 2*a0: 122 = 128 - 3*2
hi7d:
    sub     p1, a0  ; 5
hi7e:
    sub     p1, a0  ; 4
hi7f:
    sub     p1, a0  ; 3
    sbrc    XL, 4   ; 2
    subi    p1, -128; 1
    done

; hi80..hi8f: p1 += NN * a0 (mod 256), XL = 16*a0 on entry.
; "sbrc XL, 4 / subi p1, -128" adds 128*a0 (mod 256): bit 4 of the
; unmodified XL is bit 0 of a0.
; XL is scratch; labels fall through into each other.
hi85:
    add     p1, a0  ; 7
hi84:
    add     p1, a0  ; 6
hi83:
    add     p1, a0  ; 5
hi82:
    add     p1, a0  ; 4
hi81:
    add     p1, a0  ; 3
hi80:
    sbrc    XL, 4   ; 2
    subi    p1, -128; 1
    done
hi86:
    sub     p1, a0  ; 7
hi87:; not quite symmetrical    135 ~ 15 * 9
    sub     XL, a0  ; 6         XL = 15*a0
    add     p1, XL  ; 5
    add     XL, XL  ; 4
    add     XL, XL  ; 3         XL = 60*a0
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hi8a:;
    add     p1, a0  ; 7
hi89:;
    add     p1, a0  ; 6
hi88:;
    add     XL, a0  ; 5         XL = 17*a0
    add     XL, XL  ; 4
    add     XL, XL  ; 3
    add     XL, XL  ; 2         XL = 136*a0
    add     p1, XL  ; 1
    done
hi8b:; not quite symmetrical
    sub     p1, a0  ; 7
hi8c:; not quite symmetrical
    add     p1, XL  ; 6         140 = 16 + 124
    add     XL, XL  ; 5
    sub     XL, a0  ; 4         XL = 31*a0
    add     XL, XL  ; 3
    add     XL, XL  ; 2         XL = 124*a0
    add     p1, XL  ; 1
    done
hi8d:
    sub     p1, a0  ; 6
hi8e:
    sub     p1, a0  ; 5
hi8f:
    sub     p1, a0  ; 4         143 = 128 + 16 - 1
    add     p1, XL  ; 3
    sbrc    XL, 4   ; 2
    subi    p1, -128; 1
    done

; hi90..hi9f: p1 += NN * a0 (mod 256), XL = 16*a0 on entry.
; Several chains subtract (256 - NN) * a0 instead, which is the same mod 256.
; XL is scratch; labels fall through into each other.
hi93:               ;           147 7*7*3
    add     p1, a0  ; 6
hi92:
    add     p1, a0  ; 5
hi91:
    add     p1, a0  ; 4
hi90:
    add     p1, XL  ; 3         144 = 128 + 16
    sbrc    XL, 4   ; 2         bit 4 of XL is bit 0 of a0
    subi    p1, -128; 1
    done
hi95:
    add     p1, a0  ; 7
hi94:; no symmetry
    add     p1, XL  ; 6         148 = 16 + 2 * 66
    add     XL, XL  ; 5
    add     XL, a0  ; 4         XL = 33*a0
    add     XL, XL  ; 3         XL = 66*a0
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hi96:;              ;           150 ~ 15 * 10   nananananaana
    sub     p1, a0  ; 7
hi97:
    sub     XL, a0  ; 6         151 ~ (256-)15*-7
    sub     p1, XL  ; 5
    add     XL, XL  ; 4         XL = 30*a0
    sub     p1, XL  ; 3
    sub     p1, XL  ; 2
    sub     p1, XL  ; 1
    done
hi98:;
    sub     p1, a0  ; 7
hi99:;              ;           153 ~ 17 * 9
    add     XL, a0  ; 6         XL = 17*a0
    add     p1, XL  ; 5
    add     XL, XL  ; 4
    add     XL, XL  ; 3         XL = 68*a0
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hi9c:;
    add     p1, a0  ; 7
hi9b:;
    add     p1, a0  ; 6
hi9a:;
    add     XL, a0  ; 5         154 = 256 - 3 * 34
    add     XL, XL  ; 4         XL = 34*a0
    sub     p1, XL  ; 3
    sub     p1, XL  ; 2
    sub     p1, XL  ; 1
    done
hi9d:;              ;           157
    sub     p1, a0  ; 7
hi9e:;
    sub     p1, a0  ; 6
hi9f:;
    sub     p1, a0  ; 5         159 = 256 - 3 * 32 - 1
    add     XL, XL  ; 4
    sub     p1, XL  ; 3
    sub     p1, XL  ; 2
    sub     p1, XL  ; 1
    done

; hia0..hia9: p1 += NN * a0 (mod 256), XL = 16*a0 on entry.
; Several chains subtract (256 - NN) * a0 instead, which is the same mod 256.
; XL is scratch; labels fall through into each other.
hia3:;
    add     p1, a0  ; 7
hia2:;
    add     p1, a0  ; 6
hia1:;
    add     p1, a0  ; 5
hia0:;
    add     XL, XL  ; 4         160 = 256 - 3 * 32
    sub     p1, XL  ; 3
    sub     p1, XL  ; 2
    sub     p1, XL  ; 1
    done
hia4:;
    sub     p1, a0  ; 7
hia5:;              ;           165 ~ 15 * 11
    add     XL, XL  ; 6
    add     XL, a0  ; 5         XL = 33*a0
    add     p1, XL  ; 4
    add     XL, XL  ; 3         XL = 66*a0
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hia7:;
    add     p1, a0  ; 6
hia6:;
    sub     XL, a0  ; 5         166 = 256 - 30 - 60
    add     XL, XL  ; 4         XL = 30*a0
    sub     p1, XL  ; 3
    add     XL, XL  ; 2         XL = 60*a0
    sub     p1, XL  ; 1
    done
hia9:;
    add     p1, a0  ; 7
hia8:;
    sub     p1, XL  ; 6         168 = 256 - 16 - 2 * 36
    add     XL, a0  ; 5
    add     XL, a0  ; 4         XL = 18*a0
    add     XL, XL  ; 3         XL = 36*a0
    sub     p1, XL  ; 2
    sub     p1, XL  ; 1
    done
; hiaa/hiab/hiac: p1 += NN * a0 (mod 256), XL = 16*a0 on entry.
; Default (greedy not set): hiaa and hiab each subtract a0 once and fall
; through into hiac, which subtracts 16*a0 + 68*a0 (172 = 256 - 84).
; With greedy set, hiaa is a self-contained 6-instruction chain:
; 170 = 34 + 136 with XL = 17*a0 doubled once resp. three times. It must
; end in "done" so that it does not fall through into hiab.
hiaa:;
#if greedy
    add     XL, a0  ; 6         XL = 17*a0
    add     XL, XL  ; 5         XL = 34*a0
    add     p1, XL  ; 4
    add     XL, XL  ; 3
    add     XL, XL  ; 2         XL = 136*a0
    add     p1, XL  ; 1         34 + 136 = 170
    done
#else
    sub     p1, a0  ; 7
#endif
hiab:;
    sub     p1, a0  ; 6
hiac:;
    sub     p1, XL  ; 5
    add     XL, a0  ; 4         XL = 17*a0
    add     XL, XL  ; 3
    add     XL, XL  ; 2         XL = 68*a0
    sub     p1, XL  ; 1
    done
; hiad..hiaf: p1 += NN * a0 (mod 256) by subtracting (256 - NN) * a0;
; XL = 16*a0 on entry and is used as scratch.
hiad:;
    sub     p1, a0  ; 7
hiae:;
    sub     p1, a0  ; 6
hiaf:;
    sub     p1, a0  ; 5         175 = 256 - 1 - 16 - 64
    sub     p1, XL  ; 4
    add     XL, XL  ; 3
    add     XL, XL  ; 2         XL = 64*a0
    sub     p1, XL  ; 1
    done

; hib0..hib5: p1 += NN * a0 (mod 256) by subtracting (256 - NN) * a0;
; XL = 16*a0 on entry and is used as scratch.
hib3:
    add     p1, a0  ; 7
hib2:
    add     p1, a0  ; 6
hib1:
    add     p1, a0  ; 5
hib0:;
    sub     p1, XL  ; 4         176 = 256 - 16 - 64
    add     XL, XL  ; 3
    add     XL, XL  ; 2
    sub     p1, XL  ; 1
    done
hib5:
    add     p1, a0  ; 6
hib4:
    sub     p1, XL  ; 5         180 = 256 - 16 - 60
    sub     XL, a0  ; 4         XL = 15*a0
    add     XL, XL  ; 3
    add     XL, XL  ; 2         XL = 60*a0
    sub     p1, XL  ; 1
    done
; ...
; hie9..hief: p1 += NN * a0 (mod 256) by subtracting (256 - NN) * a0;
; XL = 16*a0 on entry. a0 is scratch; labels fall through into each other.
hieb:               ;               ouch
    sub     p1, a0  ; 5
hiec:
    add     a0, a0  ; 4
    add     a0, a0  ; 3         a0 = 4*a0: 236 = 256 - 4 - 16
    sub     p1, a0  ; 2
    sub     p1, XL  ; 1
    done
hie9:
    sub     p1, a0  ; 6
hiea:
    add     a0, a0  ; 5         a0 = 2*a0: 234 = 256 - 3*2 - 16
hied:
    sub     p1, a0  ; 4
hiee:
    sub     p1, a0  ; 3
hief:
    sub     p1, a0  ; 2
    sub     p1, XL  ; 1
    done

; hif0..hif8: p1 += NN * a0 (mod 256) by subtracting (256 - NN) * a0;
; XL = 16*a0 on entry. a0 is scratch; labels fall through into each other.
; (hif9..hiff are coded in place at the end of table "highs".)
hif5:
    add     p1, a0  ; 6
hif4:
    add     p1, a0  ; 5
hif3:
    add     p1, a0  ; 4
hif2:
    add     p1, a0  ; 3
hif1:
    add     p1, a0  ; 2
hif0:
    sub     p1, XL  ; 1         240 = 256 - 16
    done
hif6:
    sub     p1, a0  ; 6
hif7:
    sub     p1, a0  ; 5
hif8:
    add     a0, a0  ; 4
    add     a0, a0  ; 3         a0 = 4*a0: 248 = 256 - 2*4
    sub     p1, a0  ; 2
    sub     p1, a0  ; 1
    done

; highs: jump table for the multiplier high byte. doM1 loads
; ZH = high(highs), ZL = m1 and executes ijmp, so the table has to start on
; a 256-word boundary and entry NN has to sit at highs + NN.
; Round PC up to the next boundary; adding 0xff (not 0x100) avoids skipping
; a whole 256-word page when PC is already aligned.
.org (PC + 0xff) & 0xffff00
highs:
;   rjmp    hi00
    done;-) to start code with a ret-insn, move this table first
    rjmp    hi01
    rjmp    hi02
    rjmp    hi03
    rjmp    hi04
    rjmp    hi05
    rjmp    hi06
    rjmp    hi07
    rjmp    hi08
    rjmp    hi09
    rjmp    hi0a
    rjmp    hi0b
    rjmp    hi0c
    rjmp    hi0d
    rjmp    hi0e
    rjmp    hi0f

    rjmp    hi10
    rjmp    hi11
    rjmp    hi12
    rjmp    hi13
; ...
    rjmp    hiee
    rjmp    hief

    rjmp    hif0
    rjmp    hif1
    rjmp    hif2
    rjmp    hif3
    rjmp    hif4
    rjmp    hif5
    rjmp    hif6
    rjmp    hif7
    rjmp    hif8
; The last seven entries are coded in place: entry 0xf9 subtracts a0 seven
; times (249 = 256 - 7), each later entry one time less, down to one
; subtraction for entry 0xff.
;   rjmp    hif9    ;           jmp + 4 adds + 1 sub
    sub     p1, a0  ; 7
;   rjmp    hifa    ;           jmp + 3 adds + 2 subs -lutin?
    sub     p1, a0  ; 6
;   rjmp    hifb    ;           jmp + 2 adds + 2 subs
    sub     p1, a0  ; 5
;   rjmp    hifc
    sub     p1, a0  ; 4
;   rjmp    hifd
    sub     p1, a0  ; 3
;   rjmp    hife
    sub     p1, a0  ; 2
;   rjmp    hiff
    sub     p1, a0  ; 1
    done

; From here on "done" continues with the multiplier high byte (doM1);
; "owing" does the same via settle, which first increments m1 because the
; low-byte routine produced (NN - 256) * a instead of NN * a.
#undef done
#define done    rjmp doM1
#define owing   rjmp settle

; jump: jump table for the multiplier low byte. mpy1616 loads
; ZH = high(jump), ZL = m0 and executes ijmp, so the table has to start on
; a 256-word boundary and entry NN has to sit at jump + NN.
; Round PC up to the next boundary; adding 0xff (not 0x100) avoids skipping
; a whole 256-word page when PC is already aligned.
.org (PC + 0xff) & 0xffff00
jump:
    rjmp    lo00
    done        ; rjmp  lo01    (p = a already holds 1 * a)
    rjmp    lo02

;(you know the drill)

    rjmp    lofe
;   rjmp    loff
; negP: p = -p (two's complement of the 16-bit product).
; neg sets carry unless p0 was zero; sbci p1, -1 then adds 1 - carry to the
; complemented high byte.
.Macro negP
    com     p1
    neg     p0
    sbci    p1, -1
.EndM
; Entry 0xff, coded in place as the last entry: p = -a, owing 256 * a.
    negP    ; 6
    owing

; Helper macros for the loNN routines. X (XH:XL) holds 16 * a as prepared
; by mpy1616; Z (ZH:ZL) serves as a temporary copy of the partial product.
.Macro add4     ; p += 16 * a
    add p0, XL
    adc p1, XH
.EndM
.Macro sub4     ; p -= 16 * a
    sub p0, XL
    sbc p1, XH
.EndM
.Macro set4     ; p = 16 * a
    movw    p0, XL
.EndM
.Macro pp2Z     ; Z = p (remember the partial product)
    movw    ZL, p0
.EndM
.Macro addZ     ; p += Z
    add p0, ZL
    adc p1, ZH
.EndM
.Macro subZ     ; p -= Z
    sub p0, ZL
    sbc p1, ZH
.EndM
.Macro clrP     ; p = 0
    clr p0
    clr p1
.EndM

; loNN routines (reached through table "jump" with NN = m0).
; On entry p = a (set by mpy1616) and X = 16 * a. Each routine leaves
; p = NN * a (mod 65536) and continues with "done" (rjmp doM1).
; The number in the comment counts the cycles left including the jump.
; do not tail merge to keep *-2/-1/0/1...16 fast
; (with 11, 13, 14, 17, 18 & 19 as collateral benefit, really)
lo00:
    clr p0  ; 4
    clr p1  ; 3
    done
lo07:
    addA    ;10
lo05:
    addA    ; 8
lo03:
    doubleP ; 6
lo02:
    addA    ; 4
;lo01:
    done    ; 2
lo08:
    addA    ; 8
lo04:
    doubleP ; 6
;lo02:
    doubleP ; 4
    done    ; 2
lo0a:
    doubleP ;10
lo06:
    doubleP ; 8
    addA    ; 6
    doubleP ; 4
    done    ; 2
lo09:
    doubleP ;10
    doubleP ; 8
    doubleP ; 6
    addA    ; 4
    done    ; 2
lo0b:
    doubleP ;12 d
    doubleP ;10 a0
    addA    ; 8 d
    doubleP ; 6 d
    addA    ; 4 s0
    done    ; 2
lo0c:
    doubleP ;10
    addA    ; 8
    doubleP ; 6
    doubleP ; 4
    done    ; 2
lo0d:
    addA    ;11
lo0e:
    addA    ; 9
lo0f:
    negP    ; 7     15 = 16 - 1
    add4    ; 4
    done    ; 2
; lo10..lo20: p = NN * a, starting from p = a and X = 16 * a.
; The pattern comments give the signed-digit form of NN
; (+ add, - subtract, . nothing, most significant digit first).
lo10:
    set4    ; 1
    done
lo15:
    addA    ;12
lo14:
    addA    ;10
lo13:
    addA    ; 8
lo12:
    addA    ; 6
lo11:
    add4    ; 4
    done
lo16:
    doubleP ;10
    addA    ; 8
    doubleP ; 6
    add4    ; 4
    done
lo17:
    doubleP ;12
    doubleP ;10
    doubleP ; 8
    subA    ; 6
    add4    ; 4
    done
lo18:
    doubleP ;10
    doubleP ; 8
    doubleP ; 6
    add4    ; 4
    done
lo19:
    doubleP ;12
    doubleP ;10
    doubleP ; 8
    addA    ; 6
    add4    ; 4
    done
lo1a:       ; ...++.+.
    doubleP ;12
    doubleP ;10
    addA    ; 8
    doubleP ; 6
    add4    ; 4
    done
lo1b:       ; ..+..-.-
    negP    ;13
    doubleP ;10
    add4    ; 8
    doubleP ; 6
    subA    ; 4
    done
lo1c:       ; ..+..-..
    negP    ;11
    doubleP ; 8
    add4    ; 6
    doubleP ; 4
    done
lo1d:
    subA    ;14
lo1e:
    subA    ;12
lo1f:
    subA    ;10
lo20:
    subA    ; 8     p = (1 - k) * a after k subA, then + 32 * a
    add4    ; 6
    add4    ; 4
    done

; lo21..lo25: p = NN * a = (NN - 32) * a + 2 * 16 * a, starting from p = a.
lo25:
    addA    ;14
lo24:
    addA    ;12
lo23:
    addA    ;10
lo22:
    addA    ; 8
lo21:
    add4    ; 6
    add4    ; 4
    done

; lo6a..lo6d: p = NN * a, starting from p = a and X = 16 * a.
; Only the first column is code; the further comment columns record
; alternative instruction sequences.
lo6a:
    doubleP ;14
    add4    ;12
    doubleP ;10
    addA    ; 8
    add4    ; 6
    doubleP ; 4
    done    ; 2
lo6b:; .++.++.- .+++.-.-   .++.++.-?+..+.-.-?
;01101011
;XXX wc, faster without preparational p = a
    addA    ;15 set4    ;15 addA 16 ?set4 15
    add4    ;13 subA    ;12 a4      ?d  ?
    pp2Z    ;11 doubleP ;10 d       ?s0 ?
    doubleP ;10 add4    ; 8 a0      ?d  ?
    addZ    ; 8 doubleP ; 6 a4      ?d  ?
    doubleP ; 6 add4    ; 8 d       ?s0 ?
    subA    ; 4 subA    ; 4 s0      ?a4 ?
    done    ; 2
lo6c:       ; .+++.-..
    set4    ;13 a0
    subA    ;12 a0
    doubleP ;10 a4
    add4    ; 8 d
    doubleP ; 6 a4
    add4    ; 4 d
    done    ; 2
lo6d:       ; .+++..--
    set4    ;14
    add4    ;13
    subA    ;11
    pp2Z    ; 9
    doubleP ; 8
    add4    ; 6
    addZ    ; 4
    done    ; 2

; ...

; lo9a..lo9d: p = NN * a, starting from p = a and X = 16 * a.
; Only the first column is code; the further comment columns record
; alternative instruction sequences.
lo9a:       ; +.
    add4    ;14 nP  14
    doubleP ;12 s4  12
    doubleP ;10 Z   10
    addA    ; 8 d   9
    doubleP ; 6 d   7
    add4    ; 4 aZ  5
    done    ; 2
lo9b:; +.+..-.- +.+..-.-    +.+.+.++    .-.-.-.-
    set4    ;14 negP    ;16 add4    ;16 negP15
    doubleP ;13 add4    ;13 doubleP ;14 s4  12
    subA    ;11 add4    ;11 doubleP ;12 Z   10
    pp2Z    ; 9 pp2Z    ; 9 add4    ;10 d   9
    doubleP ; 8 doubleP ; 8 addA    ; 8 d   7
    doubleP ; 6 doubleP ; 6 doubleP ; 6 aZ  5
    addZ    ; 4 addZ    ; 4 addA    ; 4 owi 3
    done    ; 2
lo9c:       ; +.+..-..
    set4    ;13
    doubleP ;12
    subA    ;10
    doubleP ; 8
    add4    ; 6
    doubleP ; 4
    done    ; 2
lo9d:; +..+++.+ .--...-- +.+...--
;10011101
;XXX wc, faster without preparational p = a
    add4    ;15 set4    ;15 negP    ;15
    pp2Z    ;13 doubleP ;14 sub4    ;12
    doubleP ;12 doubleP ;12 sub4    ;10
    addA    ;10 add4    ;10 pp2Z    ; 8
    doubleP ; 8 subA    ; 8 doubleP ; 7
    doubleP ; 6 doubleP ; 6 addZ    ; 5
    addZ    ; 4 subA    ; 4 owing   ; 3
    done

; ...

; loaa..loaf: starting from p = a and X = 16 * a, leave either p = NN * a
; (exit "done") or p = (NN - 256) * a (exit "owing", which has settle
; increment m1 to make up for the missing 256 * a).
; Only the first column is code; the further comment columns record
; alternative instruction sequences.
loaa:       ; +.+.+.+.
    add4    ;13 add4    ;14 d   14
    pp2Z    ;11 doubleP ;12 s4  12
    doubleP ;10 doubleP ;10 Z   10
    doubleP ; 8 add4    ; 8 d   9
    addZ    ; 6 doubleP ; 4 d   7
    doubleP ; 4 done    ; 2 aZ  5
    done    ; 2
loab:; .-.-.-.- +.+..-.-    +.+.+.++    .-.-.-.-
;10101011
;XXX wc, faster without preparational p = a
    negP    ;15 set4    ;15 add4    ;16 negP16
    sub4    ;12 doubleP ;14 doubleP ;14 s4  13
    pp2Z    ;10 subA    ;12 doubleP ;12 d   11
    doubleP ; 9 doubleP ;10 add4    ;10 a4  9
    doubleP ; 7 add4    ; 8 addA    ; 8 d   7
    addZ    ; 5 doubleP ; 6 doubleP ; 6 s0  5
    owing   ; 3 subA    ; 4 addA    ; 4 owi 3
loac:
    add4    ;14
    doubleP ;12
    addA    ;10
    doubleP ; 8
    add4    ; 6
    doubleP ; 4
    done    ; 2
load:       ; .-.-..--  .-.-.-.+    .--.++.+    0.9 1.8 0.8 (avg)
; WC    10101101
    negP    ;15 -1      negP    ;16 a4  a0  a0  17
    sub4    ;12 -16-1   sub4    ;13 d   s4  a0
    pp2Z    ;10 -16-1   doubleP ;11 a0  Z   s4
    sub4    ; 9 -32-1   doubleP ; 9 d   d   d
    doubleP ; 7 -64-2   sub4    ; 7 a4  aZ  d
    addZ    ; 5 -80-3   addA    ; 5 d   d   a0
    owing   ; 3 owing   ; 3 a0  a0  s4
loae:       ; .-.-..-.
    negP    ;14 clrP    ;15
    sub4    ;11 sub4    ;13
    sub4    ; 9 doubleP ;11
            ;   subA    ; 9
    doubleP ; 7
    sub4    ; 5
    owing   ; 3
loaf:       ; .-.-...-      nutritious
;10101111
;XXX wc, faster without preparational p = a
    clrP    ;15 s
    sub4    ;13 d
    doubleP ;11 d
    doubleP ; 9 a4
    sub4    ; 7 d
    subA    ; 5 a4
    owing   ; 3 s0

; lob0..lob6 (those shown): starting from p = a and X = 16 * a, leave
; p = (NN - 256) * a and exit via "owing" (settle increments m1 to make up
; for the missing 256 * a).
lob0:       ;               don't call me that
    clrP    ;13
    sub4    ;11
    doubleP ; 9
    doubleP ; 7
    sub4    ; 5     -80 = 176 - 256
    owing   ; 3
lob1:
;XXX wc, faster without preparational p = a
    clrP    ;15 s
    sub4    ;13 d
    doubleP ;11 d
    doubleP ; 9 a4
    sub4    ; 7 d
    addA    ; 5 a4
    owing   ; 3 a0
lob2:
;XXX wc, faster without preparational p = a
    clrP    ;15
    sub4    ;13
    doubleP ;11
    addA    ; 9
    doubleP ; 7
    sub4    ; 5
    owing   ; 3
lob3:
    sub4    ;13
    doubleP ;11
    doubleP ; 9
    sub4    ; 7
    subA    ; 5
    owing   ; 3
; ...
lob6:
    sub4    ;13
    doubleP ;11
    addA    ; 9
    doubleP ; 7
    sub4    ; 5
    owing   ; 3
; lob7..loc1: starting from p = a and X = 16 * a, leave p = (NN - 256) * a
; and exit via "owing" (settle increments m1 to make up for the missing
; 256 * a).
; Only the first column is code; the further comment columns record
; alternative instruction sequences.
lob7:; .-.-+.++ ++.-+..-    .-.-+..-   +.+++..- ++..-..- .-..-..-
    sub4    ;14 add4    ;15 doubleP ;15 a4  16  n   17   n  16  s4
    pp2Z    ;12 pp2Z    ;13 sub4    ;13 d       a4       d  13  z
    doubleP ;11 doubleP ;12 doubleP ;11 d       d        s4 11  d
    addA    ; 9 add4    ;10 doubleP ; 9 a4      a4       d  9   a0
    doubleP ; 7 doubleP ; 8 sub4    ; 7 d       d        d  7   d
    addZ    ; 5 doubleP ; 6 subA    ; 5 a4      d        s0 5   az
    owing   ; 3 subZ    ; 4 owing   ; 3 s0      s0       owi    owi
lob8:
    doubleP ;13
    sub4    ;11
    doubleP ; 9
    doubleP ; 7
    sub4    ; 5
    owing   ; 3
lob9:
    sub4    ;14
    pp2Z    ;12
    addA    ;11
    doubleP ; 9
    doubleP ; 7
    addZ    ; 5
    owing
loba:       ; .-...--.
    negP    ;14
    sub4    ;11
    doubleP ; 9
    subA    ; 7
    doubleP ; 5
    owing   ; 3
lobb:
    negP    ;14
    sub4    ;11
    doubleP ; 9
    doubleP ; 7
    subA    ; 5
    owing   ; 3
lobc:
    negP    ;12
    sub4    ; 9
    doubleP ; 7
    doubleP ; 5
    owing   ; 3
lobd:; .-...-.+
    negP    ;14 set4    ;15
    sub4    ;11 doubleP ;14
    doubleP ; 9 subA    ;12
    doubleP ; 7 doubleP ;10
    addA    ; 5 add4    ; 8
    owing   ; 3 doubleP ; 6
            ;   subA    ; 4
            ;   done    ; 2
lobe:       ;           Honni soit qui mal y pense !
    negP    ;12
    sub4    ; 9
    sub4    ; 7
    doubleP ; 5
    owing   ; 3
lobf:       ;
    subA    ;15 clrP sub4 d d suba owing 13
loc0:       ;               DON'T call me THAT!
    subA    ;13
loc1:
    sub4    ;11     p = (1 - k) * a after k subA, then - 64 * a
    sub4    ; 9
    sub4    ; 7
    sub4    ; 5
    owing

; loca..locc: starting from p = a and X = 16 * a, leave either p = NN * a
; (exit "done") or p = (NN - 256) * a (exit "owing").
; Only the first column is code; the further comment columns record
; alternative instruction sequences.
loca:
    add4    ;14
    doubleP ;12
    add4    ;10
    doubleP ; 8
    addA    ; 6
    doubleP ; 4
    done    ; 2
locb:; .-..+.++ ..--.-.- ++..++.-
; WC?!? 11001011
    doubleP ;15 n   16   a4 15  s   17  n   15
    sub4    ;13 d   13   z  13  d   16  s4  12
    doubleP ;11 s4  11   d  12  a4  14  z   10
    addA    ; 9 d   9    aZ 10  s0  12  s0  9
    doubleP ; 7 s4  7    d  8   d   10  d   7
    addA    ; 5 s0  5    d  6   d   8   az  5
    owing   ; 3 owi 3    s0 4   a4  6
            ;                   s0  4
locc:       ;           what comments?
    add4    ;13 n   14
    pp2Z    ;11 s4  11
    doubleP ;10 d   9
    addZ    ; 8 d   7
    doubleP ; 6 a4  5
    doubleP ; 4 owi 3
    done    ; 2

; lofa..lofe: starting from p = a and X = 16 * a, leave p = (NN - 256) * a
; and exit via "owing" (settle increments m1 to make up for the missing
; 256 * a).
lofa:
    doubleP ;13
    doubleP ;11
    addA    ; 9
    doubleP ; 7
    sub4    ; 5     10 - 16 = -6 = 250 - 256
    owing
lofb:
    addA    ;14
lofc:
    addA    ;12
lofd:
    addA    ;10
lofe:
    addA    ; 8     p = k * a after k - 1 addA, negated below
    negP    ; 6
    owing
greybeard

Implementations bordering on space conscious (for reference, if not sanity).
Resources used should probably be qualified (g: wild guess, G: guessed, e: educated guess, E: estimated, s: simulated, a: analysed, A: analysed & substantiated, if by simulation, m: measured). words×worstCaseCycleCount is a cost measure akin to Area×Delay in IC design (a single figure of "merit"?).

algorithm           bits    cycles words   regs  remarks
                            wc exp   ×wccc excl.
                                           a,b,p
shift factor left  16×16→16(61 56  87 5307       see other
                            62 57  62 3844       answer)
                            73 68  37 2701
                            81 77  24 1944       (see edit history)
                            85 70g 15 1275       w*expcc~1050
                           108 64g 18 1944       w*expcc~1150
(jump table, for reference  51E49g 888e 44K G   (almost done)
                            44E39g2888E127K e)

(I checked the identical "wordcycle entries" more than once.)
Macros, should conceivably be factored out

; Macros shared by the shift-and-add routines: factor a1:a0, multiplier
; b1:b0, product p1:p0 (only the low 16 bits of the product are kept, so a
; bit of b1 only ever contributes a0 to p1).
.MACRO doubleA  ;   adds (shifts/weights) factor "a"
    add     a0, a0  ; +1        a = a + a, i.e. a shifted left by one
    adc     a1, a1  ; +2
.EndM
.MACRO doHighB  ;   "does" bit in b1, bit number as a parameter
    sbrc    b1, @0  ; 1         skip the add if bit @0 of b1 is clear
    add     p1, a0  ; 2
.EndM
; condAdd hi, lo: handle bit "hi" of b1 (doHighB), then add the full
; 16-bit a into p if bit "lo" of b0 is set.
.MACRO condAdd
    doHighB @0      ; +2
    sbrs    b0, @1  ; +3
    rjmp    PC+3    ;+4/5       bit clear: jump over the two-word addA
    addA            ; +6
.EndM
.MACRO step16; "do" 2 bits, bit# in b1 and b0 as a parameter
    condAdd @0, @0  ; +6        same bit number in b1 and b0
    doubleA         ; +8        weight a for the next bit
.EndM

16×16→16 bits, 85/81 cycles, 15/24 words:

; mpy16x16: p1:p0 = low 16 bits of a1:a0 * b1:b0, shift-and-add loop.
; Each pass shifts one bit out of b0 and one out of b1 (little end first),
; adds a (resp. only a0 into p1) for set bits, then doubles a.
; The loop ends once b0 and b1 have both become zero, or (early-out code)
; once a has become zero. a and b are destroyed.
; NOTE(review): "#if 1||earlyOutA" is always true, so the early-out on a
; is always assembled (the 18-word variant of the cycle notes below).
mpy16x16:           ;       0
    clr     p0      ; 1
    clr     p1      ; 2
; wanting early out: shifting the factor; faster from Little End
    lsr     b0      ; 3
    brcc    shiftB1 ;4/5
addFull:
    addA            ; 2
shiftB1:            ;       due to handling this 2nd multiplier
    lsr     b1      ; 3     bit even if the multiplicand is zero
    brcc    pc+2    ;4/5    after the first shift, the earlyOutA
addHigh:            ;       variant is 3 cycles slower than 4.8
    add     p1, a0  ; 5     libgcc __mulhi3 - for * 0 or 0x8000
shiftA:
    doubleA         ; 7         why is adc zero-flag handling ...
#if 1||earlyOutA
    brne    shiftB0 ;+1/2   7   ... different from subc/sbci/cpc?
    tst     a0      ;+ 2        a1 is zero here: done if a0 is zero, too
    breq    done    ;+ 3/-1upto-69?
#endif
shiftB0:
    lsr     b0      ; 8
    brcs    addFull ;9/10
    sbci    b1, 0   ; 10    presume zero or high reg?
                    ;       carry is clear here, so b1 stays unchanged;
                    ;       sbci keeps Z set only if lsr left b0 zero, too
    brne    shiftB1 ;11/12-2
done:               ; wc:   8*10+5=85   @15+1 words (?!)
    ret             ; best: 14 (0=b&0xfffe) (none for a)
                    ;(earlyOutA: wc: 8*13+4=108 @18+1 words)

16×16→16 bits, 73 cycles, 37 words:

; mpy16x16: p1:p0 = low 16 bits of a1:a0 * b1:b0, four multiplier bits of
; each byte per pass. "nibble" handles bits 0..3 of b0 and b1; it is first
; called as a subroutine, then b0 and b1 get their nibbles swapped, a is
; doubled a fourth time and execution falls into nibble again, whose ret
; then returns to the caller of mpy16x16.
; a is destroyed; b0 and b1 are left nibble-swapped.
mpy16x16:           ;       0
    clr     p0      ; 1
    clr     p1      ; 2
    rcall   nibble  ; 9     incl. ret (>16bit PC AVRs have mul(?))
    swap    b0      ; 10
    swap    b1      ; 11
    doubleA         ; 13
nibble:
    step16  0       ; +8
    step16  1       ; +16
    step16  2       ; +24
    doHighB 3       ; +26
    sbrs    b0, 3   ;27/28      no final doubleA after bit 3:
    ret             ;       yikes
    addA            ; +30
    ret             ;       30 Hrrm *2+13 = 73 @ 4*8+5 = 37 words
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!