Faster 16bit multiplication algorithm for 8-bit MCU

前端 未结 6 1414
没有蜡笔的小新
没有蜡笔的小新 2021-02-12 11:27

I'm searching for an algorithm to multiply two integer numbers that is better than the one below. Do you have a good idea about that? (The MCU - ATtiny 84/85 or similar - wher

6条回答
  •  梦谈多话
    2021-02-12 11:44

    A non-answer: AVR (ATtiny) assembler (web doc) instead of C++ or C. I modified a fairly generic multiply-by-squares-lookup for speed (< 50 cycles, excluding call and return overhead), at the cost of fitting only into AVRs with at least 1 KByte of RAM: it uses 512 aligned bytes for a table of the low bytes of the squares (divided by 4). At 20 MHz, that would comfortably meet the time limit of 2 (at most 3) µs, which still does not show up in the question proper - but Sergio Formiggini wanted 16 MHz. As of 2015/04, there is just one ATtiny from Atmel with that much RAM, and it is specified only up to 8 MHz … (If you roll your "own" (e.g., from OpenCores), your FPGA probably has a bunch of fast multipliers (18×18 bits seems popular), if not processor cores.)
    For a stab at fast shift-and-add, have a look at "shift and add, factor shifting left, unrolled 16×16→16" and/or improve on it (wiki post). (You might well create the community wiki answer that the question begs for.)

    ; Symbolic register names shared by the multiply routines.
    .def    a0  = r16   ; 16-bit factor a, low byte
    .def    a1  = r17   ; 16-bit factor a, high byte
    #warning two warnings about preceding definitions of
    #warning  r16 and r17 are due and may as well be ignored
    .def    a   = r16   ; 8-bit factor (second name for r16)
    .def    b   = r17   ; 8-bit factor (second name for r17) ; or r18, rather?
    .def    b0  = r18   ; 16-bit factor b, low byte
    .def    b1  = r19   ; 16-bit factor b, high byte
    .def    p0  = r20   ; 16-bit product, low byte
    .def    p1  = r21   ; 16-bit product, high byte
    ; mpy16F16 loads table values into "sq", which had no definition.
    ;  r22 does not collide with a/b/p or Z; in the code shown it is otherwise
    ;  used only while the squares table is being filled, before mpy16F16 runs.
    .def    sq  = r22   ; scratch: value fetched from the squares tables
    
    ; "squares table" SqTab shall be two 512 Byte tables (low bytes and
    ;  high bytes) of the squares of 9-bit natural numbers, divided by 4
    
    ; Idea: exploit p = a * b = Squares[a+b] - Squares[|a-b|],
    ;  where Squares[n] = n*n/4
    
    ; init: load sample operands, then select a multiply routine.
    ;  The table-lookup routine needs 512 bytes of SRAM at data addresses
    ;  SqTabH<<8 .. ((SqTabH+2)<<8)-1 (the table fill starts at ZH = SqTabH,
    ;  ZL = 0). It is selected only when SRAM reaches that far; otherwise
    ;  control goes to mpy16T16.
    init:
        ldi     r16, 0x73           ; a0  (sample factor a = 0xab73)
        ldi     r17, 0xab           ; a1
        ldi     r18, 23             ; b0  (sample factor b = 0x0117)
        ldi     r19, 1              ; b1
        ; A test for SRAM_SIZE >= 512 also accepted parts whose SRAM ends
        ;  below the last table address, so the fill would write past RAMEND.
        ;  Compare the first address beyond SRAM with the first address
        ;  beyond the table instead.
        ; NOTE(review): assumes SRAM_START <= SqTabH<<8 - confirm for the
        ;  target device.
        ldi     r20, HIGH(RAMEND + 1)
        cpi     r20, SqTabH + 2
        brsh    fillSqTable ; ATtiny 1634?
        rjmp    mpy16T16
    ; fillSqTable: build the 512-entry SRAM table holding, for n = 0..511,
    ;  the low byte of n*n/4 (bits 9:2 of n*n), starting at data address
    ;  SqTabH<<8. Ends with a jump to mpy16F16.
    ;  Uses r20 (end marker), r21 (byte to store), r23:r22 (running
    ;  square), r27:r26 (next odd number) and Z (store pointer).
    fillSqTable:
        ldi     r20, SqTabH
        subi    r20, -2     ; r20 = SqTabH + 2: value of ZH after 512 stores
        ldi     zh, SqTabH
        clr     zl          ; Z = first byte of the table
    ; generate squares by adding up odd numbers: the sum starts at 1 and the
    ;  first addend is -1, so the first value produced is 0 = 0*0
        ldi     r22, 1
        clr     r23         ; r23:r22 = 1
        ser     r26
        ser     r27         ; r27:r26 = 0xFFFF = -1
    fillLoop:
        add     r22, r26
        adc     r23, r27    ; r23:r22 = n*n (low 16 bits)
        adiw    r26, 2      ; next odd number
        mov     r21, r23
        lsr     r21         ; get bits 9:2 - carry = bit 8 of the square
        mov     r21, r22    ; mov leaves the carry untouched
        ror     r21         ; r21 = bits 8:1
        lsr     r21         ; r21 = bits 8:2, bit 7 cleared
        bst     r23, 1
        bld     r21, 7      ; copy bit 9 of the square into bit 7
        st      z+, r21
        cp      zh, r20
        brne    fillLoop    ; until ZH reaches SqTabH + 2
        rjmp    mpy16F16
    
    ; Each assembly line is annotated with its cycle count and its
    ;  (latest) start cycle within its code block.
    ; On the first line of a code block, the (latest) start cycle of the
    ;  block follows; on the last line, the (maximum) cycle total of the block.
    
    ;**************************************************************
    ;*
    ;* "mpy16F16" - 16x16->16 Bit Unsigned Multiplication
    ;*                        using table lookup
    ;* Sergio Formiggini special edition
    ;* Multiplies  two 16-bit register values a1:a0 and b1:b0.
    ;* The result is placed in p1:p0.
    ;*
    ;* Method: with Sq[n] = n*n/4, x*y = Sq[x+y] - Sq[|x-y|].
    ;* Only the low 16 bits of the product are formed:
    ;*   p1:p0 = a0*b0 + ((a1*b0 + a0*b1) << 8)
    ;* so a0*b0 takes both bytes of its two squares, while the
    ;* cross products a1*b0 and a0*b1 take the low bytes only.
    ;* Low bytes of Sq are read from SRAM (LD), high bytes from
    ;* flash (LPM), both through the same Z value: the code relies
    ;* on both tables starting at byte address SqTabH<<8 of their
    ;* respective address space, and on SqTabH being even.
    ;*
    ;* In       : a1:a0, b1:b0 (left unchanged)
    ;* Out      : p1:p0
    ;* Clobbers : sq, Z, flags
    ;*
    ;* Number of flash words: 318 + return = 
    ;*                       (40 + 256(flash table) + 22(RAM init))
    ;* Number of cycles     : 49 + return
    ;* Low  registers used  : None
    ;* High registers used  : 7+2 (a1:a0, b1:b0, p1:p0, sq;
    ;*                             + Z(r31:r30))
    ;* RAM bytes used       : 512 (squares table)
    ;*
    ;**************************************************************
    mpy16F16:
    ; p1:p0 = Sq[a0+b0]; the carry of the 8-bit sum is rotated into ZH,
    ;  giving ZH = SqTabH or SqTabH+1 (9-bit table index)
        ldi     ZH, SqTabH>>1;1 0   0   squares table>>1
        mov     ZL, a0      ; 1 1
        add     ZL, b0      ; 1 2       a0+b0
        rol     ZH          ; 1 3       9 bit offset
        ld      p0, Z       ; 2 4       a0+b0l          1
        lpm     p1, Z       ; 3 6   9   a0+b0h          2
    
    ; differences are indexed by absolute value, which fits 8 bits
        ldi     ZH, SqTabH  ; 1 0   9   squares table
    
    ; p1 -= low byte of Sq[|a1-b0|]
        mov     ZL, a1      ; 1 0   10
        sub     ZL, b0      ; 1 1       a1-b0
        brcc    noNegF10    ; 1 2
        neg     ZL          ; 1 3
    noNegF10:
        ld      sq, Z       ; 2 4       a1-b0l          3
        sub     p1, sq      ; 1 6   7
    
    ; p1 -= low byte of Sq[|a0-b1|]
        mov     ZL, a0      ; 1 0   17
        sub     ZL, b1      ; 1 1       a0-b1
        brcc    noNegF01    ; 1 2
        neg     ZL          ; 1 3
    noNegF01:
        ld      sq, Z       ; 2 4       a0-b1l          4
        sub     p1, sq      ; 1 6   7
    
    ; p1:p0 -= Sq[|a0-b0|] (16 bits; LPM leaves the borrow of the
    ;  low-byte subtraction intact for SBC)
        mov     ZL, a0      ; 1 0   24
        sub     ZL, b0      ; 1 1       a0-b0
        brcc    noNegF00    ; 1 2
        neg     ZL          ; 1 3
    noNegF00:
        ld      sq, Z       ; 2 4       a0-b0l          5
        sub     p0, sq      ; 1 6
        lpm     sq, Z       ; 3 7       a0-b0h          6*
        sbc     p1, sq      ; 1 10  11
    
    ; p1 += low byte of Sq[a1+b0]
        ldi     ZH, SqTabH>>1;1 0   35
        mov     ZL, a1      ; 1 1
        add     ZL, b0      ; 1 2       a1+b0
        rol     ZH          ; 1 3
        ld      sq, Z       ; 2 4       a1+b0l          7
        add     p1, sq      ; 1 6   7
    
    ; p1 += low byte of Sq[a0+b1]
        ldi     ZH, SqTabH>>1;1 0   42
        mov     ZL, a0      ; 1 1
        add     ZL, b1      ; 1 2       a0+b1
        rol     ZH          ; 1 3
        ld      sq, Z       ; 2 4       a0+b1l          8
        add     p1, sq      ; 1 6   7
    
        ret                 ;       49
    
    ; High bytes of the squares table, kept in flash and read with LPM:
    ;  entry n (n = 0..511) is the high byte of n*n/4.
    ; The assembler counts code-segment addresses in words, so word address
    ;  256 is byte address 0x200 as seen through Z by LPM.
    .CSEG
    .org 256; words?!
    SqTableH:
    .db   0,   0,   0,   0,   0,   0,   0,   0,   0,   0
    .db   0,   0,   0,   0,   0,   0,   0,   0,   0,   0
    .db   0,   0,   0,   0,   0,   0,   0,   0,   0,   0
    .db   0,   0,   1,   1,   1,   1,   1,   1,   1,   1
    .db   1,   1,   1,   1,   1,   1,   2,   2,   2,   2
    .db   2,   2,   2,   2,   2,   2,   3,   3,   3,   3
    .db   3,   3,   3,   3,   4,   4,   4,   4,   4,   4
    .db   4,   4,   5,   5,   5,   5,   5,   5,   5,   6
    .db   6,   6,   6,   6,   6,   7,   7,   7,   7,   7
    .db   7,   8,   8,   8,   8,   8,   9,   9,   9,   9
    .db   9,   9,  10,  10,  10,  10,  10,  11,  11,  11
    .db  11,  12,  12,  12,  12,  12,  13,  13,  13,  13
    .db  14,  14,  14,  14,  15,  15,  15,  15,  16,  16
    .db  16,  16,  17,  17,  17,  17,  18,  18,  18,  18
    .db  19,  19,  19,  19,  20,  20,  20,  21,  21,  21
    .db  21,  22,  22,  22,  23,  23,  23,  24,  24,  24
    .db  25,  25,  25,  25,  26,  26,  26,  27,  27,  27
    .db  28,  28,  28,  29,  29,  29,  30,  30,  30,  31
    .db  31,  31,  32,  32,  33,  33,  33,  34,  34,  34
    .db  35,  35,  36,  36,  36,  37,  37,  37,  38,  38
    .db  39,  39,  39,  40,  40,  41,  41,  41,  42,  42
    .db  43,  43,  43,  44,  44,  45,  45,  45,  46,  46
    .db  47,  47,  48,  48,  49,  49,  49,  50,  50,  51
    .db  51,  52,  52,  53,  53,  53,  54,  54,  55,  55
    .db  56,  56,  57,  57,  58,  58,  59,  59,  60,  60
    .db  61,  61,  62,  62,  63,  63,  64,  64,  65,  65
    .db  66,  66,  67,  67,  68,  68,  69,  69,  70,  70
    .db  71,  71,  72,  72,  73,  73,  74,  74,  75,  76
    .db  76,  77,  77,  78,  78,  79,  79,  80,  81,  81
    .db  82,  82,  83,  83,  84,  84,  85,  86,  86,  87
    .db  87,  88,  89,  89,  90,  90,  91,  92,  92,  93
    .db  93,  94,  95,  95,  96,  96,  97,  98,  98,  99
    .db 100, 100, 101, 101, 102, 103, 103, 104, 105, 105
    .db 106, 106, 107, 108, 108, 109, 110, 110, 111, 112
    .db 112, 113, 114, 114, 115, 116, 116, 117, 118, 118
    .db 119, 120, 121, 121, 122, 123, 123, 124, 125, 125
    .db 126, 127, 127, 128, 129, 130, 130, 131, 132, 132
    .db 133, 134, 135, 135, 136, 137, 138, 138, 139, 140
    .db 141, 141, 142, 143, 144, 144, 145, 146, 147, 147
    .db 148, 149, 150, 150, 151, 152, 153, 153, 154, 155
    .db 156, 157, 157, 158, 159, 160, 160, 161, 162, 163
    .db 164, 164, 165, 166, 167, 168, 169, 169, 170, 171
    .db 172, 173, 173, 174, 175, 176, 177, 178, 178, 179
    .db 180, 181, 182, 183, 183, 184, 185, 186, 187, 188
    .db 189, 189, 190, 191, 192, 193, 194, 195, 196, 196
    .db 197, 198, 199, 200, 201, 202, 203, 203, 204, 205
    .db 206, 207, 208, 209, 210, 211, 212, 212, 213, 214
    .db 215, 216, 217, 218, 219, 220, 221, 222, 223, 224
    .db 225, 225, 226, 227, 228, 229, 230, 231, 232, 233
    .db 234, 235, 236, 237, 238, 239, 240, 241, 242, 243
    .db 244, 245, 246, 247, 248, 249, 250, 251, 252, 253
    .db 254, 255
    ; word addresses, again?!
    ; SqTableH is a word address, whereas LPM takes a byte address in Z:
    ;  doubling the high byte of the word address gives the high byte of the
    ;  byte address (valid because the table starts on a 256-word boundary).
    ; The same value is used as the high byte of the SRAM table address.
    .equ SqTabH = (high(SqTableH) << 1)
    
    ; SRAM table of the low bytes of the squares, filled at start-up.
    ;  The code reaches it as ZH = SqTabH, ZL = index, i.e. at data address
    ;  SqTabH<<8, so the reservation is placed there; otherwise later data
    ;  segment variables could be allocated on top of the table.
    ;  The label needs its colon to be accepted as a label.
    ;  Parts whose SRAM ends before the table would get no reservation,
    ;  which keeps them assembling.
    .DSEG
    .if (RAMEND + 1) >= ((SqTabH + 2) << 8)
    .org (SqTabH << 8)
    RAMTab: .BYTE 512
    .endif
    

提交回复
热议问题