NASM: How to create/handle basic bmp file using intel 64 bit assembly?

问题

How do I create/handle simple bmp file filling it with one color only using intel 64 bit assembly and nasm assembler?

回答1:

The steps involved in such an operation are:

Create bmp file header with fixed values (explanation of specific fields below)
Create buffer which contains enough space - three bytes per pixel (one color = red + green + blue)
Open/create file
Fill the buffer
Write header to file
Write buffer to file
Close file
Exit program

Ad. 2: This is a bit more tricky - if the number of bytes per row is not divisible by 4, the program has to pad each row up to the next multiple of 4 (the value of the padding bytes does not matter; this program uses 0xFF). Here I purposely created a 201x201 picture. In this example each row holds 3*201=603 bytes of pixel data, so we need one additional padding byte per row. Because of this, the size required for the picture buffer is 604*201=121404 bytes.

The source code that answers questions:

section     .text
global      _start                              ; exported so the linker (ld) can use it as the entry point

;#######################################################################
;### This program creates a single-colour 24-bit BMP file ##############
;### Target: x86-64 Linux, NASM syntax, raw syscalls (no libc) #########
;#######################################################################

; Linux x86-64 system call numbers
SYS_WRITE       equ 1
SYS_CLOSE       equ 3
SYS_EXIT        equ 60
SYS_CREAT       equ 85

FILE_MODE       equ 666o                        ; rw-rw-rw- before umask; an image needs no execute bits
HEADER_BYTES    equ 54                          ; size of the header stored at bmpheadline

; Image geometry.  These must agree with the hard-coded bmpheadline bytes
; (201 x 201) and with the 121404-byte empty_space buffer.
; %assign (not equ) is used so the values are usable in %rep below.
%assign IMG_WIDTH       201
%assign IMG_HEIGHT      201
%assign ROW_BYTES       (IMG_WIDTH * 3 + 3) & ~3        ; BMP rows are padded to a multiple of 4 bytes
%assign ROW_PAD         ROW_BYTES - IMG_WIDTH * 3       ; 0..3 padding bytes per row (1 for width 201)
%assign PIXEL_BYTES     ROW_BYTES * IMG_HEIGHT

; Fill colour, stored in file order blue, green, red
COLOR_BLUE      equ 0x00
COLOR_GREEN     equ 0xFF
COLOR_RED       equ 0xFF
PAD_BYTE        equ 0xFF                        ; value written into the row padding

;-----------------------------------------------------------------------
; _start - write bmpheadline followed by IMG_HEIGHT padded rows of one
;          colour to the file named by msg.
; ABI:    Linux x86-64 syscalls: nr in eax, args in rdi, rsi, rdx;
;         the kernel clobbers rax, rcx and r11 only.  Never returns.
; Regs:   r8d = file descriptor (survives every syscall)
;         rdx = store pointer into empty_space while filling
;         ebx = rows remaining, ecx = pixels remaining in the current row
; Exit:   status 0 on success, 1 if creat, write or close failed
;-----------------------------------------------------------------------
_start:
    ; fd = creat(msg, FILE_MODE)
    mov     eax, SYS_CREAT
    lea     rdi, [rel msg]
    mov     esi, FILE_MODE
    syscall
    test    rax, rax
    js      .fail                               ; rax = -errno, there is no fd to close
    mov     r8d, eax

    ; write(fd, bmpheadline, HEADER_BYTES)
    mov     eax, SYS_WRITE
    mov     edi, r8d
    lea     rsi, [rel bmpheadline]
    mov     edx, HEADER_BYTES
    syscall
    cmp     rax, rdx                            ; error (-errno) or short write?
    jne     .fail_close

    ; Fill the pixel buffer row by row.
    lea     rdx, [rel empty_space]
    mov     ebx, IMG_HEIGHT
.row:                                           ; do {
    mov     ecx, IMG_WIDTH
.pixel:                                         ;   do {
    mov     byte [rdx + 0], COLOR_BLUE
    mov     byte [rdx + 1], COLOR_GREEN
    mov     byte [rdx + 2], COLOR_RED
    add     rdx, 3                              ;     one pixel = 3 bytes
    dec     ecx
    jnz     .pixel                              ;   } while (--x != 0)

%rep ROW_PAD                                    ;   pad the row to a multiple of 4 bytes
    mov     byte [rdx], PAD_BYTE
    inc     rdx
%endrep

    dec     ebx
    jnz     .row                                ; } while (--y != 0)

    ; write(fd, empty_space, PIXEL_BYTES)
    mov     eax, SYS_WRITE
    mov     edi, r8d
    lea     rsi, [rel empty_space]
    mov     edx, PIXEL_BYTES
    syscall
    cmp     rax, rdx                            ; error (-errno) or short write?
    jne     .fail_close

    ; close(fd)
    mov     eax, SYS_CLOSE
    mov     edi, r8d
    syscall
    test    eax, eax
    js      .fail

    xor     edi, edi                            ; exit status 0
.exit:
    ; exit(edi)
    mov     eax, SYS_EXIT
    syscall

.fail_close:
    ; Best-effort close before reporting the failure.
    mov     eax, SYS_CLOSE
    mov     edi, r8d
    syscall
.fail:
    mov     edi, 1                              ; exit status 1
    jmp     .exit

section     .data

    ; Output file name, NUL-terminated as the kernel expects.
    msg         db  "filename.bmp", 0

    ; 54-byte BMP header for a 201 x 201, 24 bits-per-pixel, uncompressed image.
    ; BMP fields are little-endian, so dw/dd emit them in file order directly.
    bmpheadline:
                ; --- file header, 14 bytes ---
                db  "BM"                        ; bfType
                dd  121458                      ; bfSize      = 54 + 121404
                dw  0                           ; bfReserved1
                dw  0                           ; bfReserved2
                dd  54                          ; bfOffBits   = offset of the pixel data
                ; --- image header, 40 bytes ---
                dd  40                          ; biSize
                dd  201                         ; biWidth
                dd  201                         ; biHeight
                dw  1                           ; biPlanes
                dw  24                          ; biBitCount
                dd  0                           ; biCompression (0 = uncompressed)
                dd  121404                      ; biSizeImage = 201 rows * 604 bytes
                dd  0                           ; biXPelsPerMeter
                dd  0                           ; biYPelsPerMeter
                dd  0                           ; biClrUsed
                dd  0                           ; biClrImportant

section .bss                                    ; uninitialised storage of fixed size, reserved at load time

    ; Pixel buffer: 201 rows of 604 bytes each
    ; (603 bytes of blue/green/red data plus 1 padding byte per row).
    empty_space: resb 201 * 604                 ; = 121404 bytes

Here is the explanation of the bmp headline (more under this link: http://www.dragonwins.com/domains/getteched/bmp/bmpfileformat.htm )

;### File Header - 14 bytes
;#######################################################################
;### bfType,        2 bytes,    The characters "BM"
;### 0x42,0x4D = "B","M"
;### 
;### bfSize,        4 bytes,    The size of the file in bytes
;### 0x72,0xDA,0x01,0x00 => 0x00,0x01,0xDA,0x72 = 0x1DA72 = 121458 bytes
;### 121458 = 54 + 201 * (201 * 3 + 1)
;### 
;### Comment:
;### We want to create file 201x201, that means 201 rows and 201 columns
;### meaning each row will take 201*3 = 603 bytes
;### 
;### According to the BMP file specification each such row must be padded
;### so its size is divisible by 4; this gives us 1 extra byte for each
;### row.
;###
;###
;### bfReserved1,   2 bytes,    Unused - must be zero
;### 0x00,0x00
;### 
;### bfReserved2,   2 bytes,    Unused - must be zero
;### 0x00,0x00
;### 
;### bfOffBits,     4 bytes,    Offset to start of Pixel Data
;### 0x36,0x00,0x00,0x00 = 54 bytes
;### 

;### Image Header - 40 bytes
;#######################################################################
;### biSize             4   Header Size - Must be at least 40
;### 0x28,0x00,0x00,0x00 = 40
;### 
;### biWidth            4   Image width in pixels
;### 0xC9,0x00,0x00,0x00 = 201
;### 
;### biHeight           4   Image height in pixels
;### 0xC9,0x00,0x00,0x00 = 201
;### 
;### biPlanes           2   Must be 1
;### 0x01,0x00
;### 
;### biBitCount         2   Bits per pixel - 1, 4, 8, 16, 24, or 32
;### 0x18,0x00 = 24
;### 
;### biCompression      4   Compression type (0 = uncompressed)
;### 0x00,0x00,0x00,0x00
;### 
;### biSizeImage        4   Image Size - may be zero for uncompressed images
;### 0x3C,0xDA,0x01,0x00 => 0x00,0x01,0xDA,0x3C = 121404 bytes
;### 
;### biXPelsPerMeter    4   Preferred resolution in pixels per meter
;### 0x00,0x00,0x00,0x00
;### 
;### biYPelsPerMeter    4   Preferred resolution in pixels per meter
;### 0x00,0x00,0x00,0x00
;### 
;### biClrUsed          4   Number Color Map entries that are actually used
;### 0x00,0x00,0x00,0x00
;### 
;### biClrImportant     4   Number of significant colors
;### 0x00,0x00,0x00,0x00
;###

回答2:

Here's an improved version of rbraun's answer. This should really be a Q&A over on codereview.SE >.<

I decided to post a separate answer instead of an edit, but feel free to copy any of this back into that answer if you want. I've tested this for a few different row/column sizes, and it works.

I improved the comments, as well as optimizing a bit. Comments like "call kernel" are too obvious to bother writing; that's just noise. I changed the comments on the system calls to more clearly say what was going on. e.g. it looks like you're calling sys_open, but you're actually using sys_creat. That means there is no flags arg, even though you mention it in a comment.

I also parameterized the BMP header and the loops so they work for any assemble-time value of BMPcols and BMProws with no extra overhead at run-time. If the row width is a multiple of 4B without padding, it leaves out the store and increment instructions altogether.

For very large buffers, it would make a lot of sense to use multiple write() calls on a buffer that ends at the end of a line, so you can reuse it. e.g. any multiple of lcm(4096, row_bytes) would be good, since it holds a whole number of rows. Around 128kiB is maybe a good size, because L2 cache size in Intel CPUs since Nehalem is 256kiB, so the data can hopefully stay hot in L2 while the kernel memcpys it into the pagecache repeatedly. You definitely want the buffer to be significantly smaller than last-level cache size.

Changes from original:

Fixed file-creation mode: don't set the execute bits, just read/write. Use octal like a normal person.
Improve comments, as discussed above: be more explicit about what system calls we're making. Avoid re-stating what's already clear from the asm instructions.
Demonstrate RIP-relative addressing for static objects
Put static constant data in .rodata. We don't need a .data section/segment at all.
Used 32-bit operand size where possible, especially for putting small constants in registers. (And note that mov-immediate is not really a "load").
Improved loop idiom: dec / jnz with no separate CMP.
Parameterized on BMProws / BMPcols, and defined assemble-time constants for various sizes instead of hard-coding. The assembler can do the math for you, so take advantage of it.
Define the BMP header with separately named dd items, instead of a no-longer-meaningful block of bytes with db.
Make only one write() system call: copy the BMP header into the buffer first. A 54 byte memcpy is much faster than an extra syscall.
Save some instructions by not repeating the setup of args for system calls when they're already there.
Merged the three byte stores for pixel components into one dword store. These stores overlap, but that's fine.

DEFAULT REL            ; default to RIP-relative addressing for static data

;#######################################################################
;### This program creates empty bmp file - 64 bit version ##############

section     .rodata                  ; read-only data is the right place for these, not .data


    BMPcols   equ  2019
    BMProws   equ  2011

    ; 3 bytes per pixel, with each row's BYTE count padded to a multiple of 4B.
    ; The rounding must be applied to 3*BMPcols, not to BMPcols: rounding the
    ; pixel count first over-sizes every row whenever BMPcols is not a multiple
    ; of 4 (e.g. 201 columns would give 612 instead of 604 bytes per row).
    BMProwbytes equ (3 * BMPcols + 3) & ~0x3
    BMPpixbytes equ BMProws * BMProwbytes

    ALIGN 16   ; for efficient rep movs
bmpheader:
;; BMP is a little-endian format, so we can use dw / dd directly instead of encoding the bytes ourselves
;; --- file header, 14 bytes ---
bfType:  dw "BM"
bfSize:  dd BMPpixbytes + bmpheader_len   ; size of file in bytes
         dd 0                 ; bfReserved1 and bfReserved2, must be zero
bfOffBits: dd bmpheader_len   ; offset of the pixel data.  Yes, we can refer to stuff that's defined later.

;; --- image header, 40 bytes ---
biSize:   dd 40    ; header size, min = 40
biWidth:  dd BMPcols
biHeight: dd BMProws
biPlanes:       dw 1     ; must be 1
biBitCount:     dw 24    ; bits per pixel: 1, 4, 8, 16, 24, or 32
biCompression:  dd 0     ; uncompressed = 0
biSizeImage:    dd BMPpixbytes  ; Image Size - may be zero for uncompressed images

biXPelsPerMeter: dd 0   ;  Preferred resolution in pixels per meter
biYPelsPerMeter: dd 0   ;  Preferred resolution in pixels per meter
biClrUsed:       dd 0   ;  Number Color Map entries that are actually used
biClrImportant:  dd 0   ;  Number of significant colors

    bmpheader_len   equ   $ - bmpheader         ; Let the assembler calculate this for us.  Should be 54.  `$` is the current position

    ; output filename is hard-coded.  Checking argc / argv is left as an exercise for the reader.
    ; Of course it would be even easier to be more Unixy and just always write to stdout, so the user could redirect
    fname         db  'filename.bmp',0x00         ;name of out file, 0x00 = end of string


section .bss                                    ;this section is responsible for fixed size preallocated blocks

    bmpbuf: resb bmpheader_len + BMPpixbytes    ; static buffer big enough to hold the whole file (including header).
    bmpbuf_len  equ  $ - bmpbuf                 ; exact file size: this is what gets written out

                resb 4                          ; slack, not part of the file: the fill loop in _start uses
                                                ; overlapping dword stores that reach up to 3 bytes past the last row


section     .text
global      _start                              ;make the symbol externally visible

; Linux x86-64 system call numbers, from /usr/include/x86_64-linux-gnu/asm/unistd_64.h
SYS_write   equ 1
SYS_close   equ 3
SYS_exit    equ 60
SYS_creat   equ 85

; One pixel as a little-endian dword: byte 0 = BLUE (00), byte 1 = GREEN (FF), byte 2 = RED (FF).
; Byte 3 is a dummy that the next store overwrites.
PIXEL_DWORD equ (0xFF << 16) | (0xFF << 8) | 0x00

;#######################################################################
;### main ##############################################################
;
; _start - build a BMProws x BMPcols single-colour 24-bit BMP in bmpbuf
;          (header copied from bmpheader) and write it to fname with one
;          write() call.
; ABI:    Linux x86-64 syscalls: nr in eax, args in rdi, rsi, rdx;
;         the kernel clobbers rax, rcx and r11 only.  Never returns.
; Regs:   eax = fd from creat() until it is moved to edi for write()
;         rdi = store pointer into bmpbuf during the copy and the fill
;         ebx = rows remaining, then the exit status
;         ecx = rep count / pixels remaining in the current row
; Exit:   status 0 on success, 1 if creat, write or close failed.
;         Nothing is printed, so run under strace to see what happened.
_start:                                         ;The linker looks for this symbol to set the entry point

    ; fd = creat(fname, 0666)
    ; creat doesn't take flags.  It's equivalent to open(path, O_CREAT|O_WRONLY|O_TRUNC, mode).
    mov     eax, SYS_creat
    lea     rdi, [rel fname]                    ; RIP-relative, so this also links as a PIE
    mov     esi, 666o                           ; mode in octal, to be masked by the user's umask
    syscall                                     ; eax = fd or -ERRNO

    test    eax, eax
    js      .handle_error


    ;;; memcpy the BMP header to the start of our buffer.
    ;;; RIP-relative LEA instead of mov r32, imm32: absolute 32-bit addresses
    ;;; do not link in a position-independent executable.
    lea     rdi, [rel bmpbuf]
    lea     rsi, [rel bmpheader]
    mov     ecx, bmpheader_len
    rep movsb

    ; rdi now points to the first byte after the header, where pixels should be stored


;#########  main loop
    ; NOTE(review): the dword stores below deliberately overlap, so the last
    ; store of the last row reaches 1 to 3 bytes past the end of the pixel data.
    ; bmpbuf must have at least 3 writable bytes after the file image.
    mov     ebx, BMProws
.LOOPY:                                         ; do{

    mov     ecx, BMPcols

.LOOPX:                                         ;   do{
    mov     dword [rdi], PIXEL_DWORD            ;     stores one extra byte, but we overlap it with the next store
    add     rdi, 3                              ;     1 pixel = 3 bytes
    dec     ecx
    jnz     .LOOPX                              ;   } while(--x != 0)


%if    ((BMPcols * 3) % 4) != 0
    ; Pad the row to a multiple of 4B.  Only assembled when padding is needed.
    mov     dword [rdi], 0xFFFFFFFF             ; might only need a byte or word store; the excess is overwritten by the next row

    add     rdi, 4 - (BMPcols * 3) % 4          ; advance to a 4B boundary
%endif

    dec     ebx
    jnz     .LOOPY                              ; } while(--y != 0)


;##### Write out the buffer to the file

    ; fd is still where creat left it in RAX: nothing above touched it.
    ; write and close both take it as the first arg, and SYSCALL preserves RDI,
    ; so we put it in EDI once.
    mov     edi, eax                            ; fd

    ; write(fd, bmpbuf, bmpbuf_len)
    mov     eax, SYS_write
    lea     rsi, [rel bmpbuf]
    mov     edx, bmpbuf_len
    syscall                                     ; rax = bytes written or -ERRNO

    mov     ebx, 1                              ; assume failure (mov leaves the flags alone)
    cmp     rax, rdx                            ; anything but a complete write counts as an error
    jne     .close
    xor     ebx, ebx                            ; status 0
.close:
    ; close(fd): fd is still in edi
    mov     eax, SYS_close
    syscall
    test    eax, eax
    jns     .exit
    mov     ebx, 1                              ; close failed

.exit:
    ; exit(status)
    mov     edi, ebx
    mov     eax, SYS_exit
    syscall

.handle_error:
    ; creat failed: there is no fd to close, just report failure
    mov     ebx, 1
    jmp     .exit

I used RIP-relative LEA sometimes, and absolute addressing (mov r32, imm32) sometimes for the static data. This is silly; really I should have just picked one and used it everywhere. (And if I picked absolute non-PIC so I know the address is definitely in the low 31 bits of virtual address space, take advantage of that everywhere with stuff like add edi,3 instead of RDI.)

See my comments on the original answer for more optimization suggestions. I didn't implement anything more than the most basic thing of combining the three byte-stores into one dword store. Unrolling so you can use wider stores would help a lot, but this is left as an exercise for the reader.

来源：https://stackoverflow.com/questions/41024390/nasm-how-to-create-handle-basic-bmp-file-using-intel-64-bit-assembly

标签

Linux

assembly

x86

nasm

x86-64