nginx 地址对齐(ngx_align_ptr)

内存池要在大块连续内存上分配小块内存，指向小内存块的地址是否对齐，对系统性能有一定影响：因为 CPU 从主存读取数据很慢，合理的地址对齐可以减少访问次数，提高访问效率。看看 nginx 的内存池地址对齐操作：

// p is the pointer to align; a is the alignment in bytes and must be a power
// of two. The macro rounds p up to the nearest multiple of a: it adds (a - 1)
// and then clears the low bits with the mask ~(a - 1).
// Both arguments and the whole expansion are parenthesized so that expression
// arguments (e.g. `1 << 3`) and postfix uses of the result expand correctly.
#define ngx_align_ptr(p, a)                                                   \
    ((u_char *) (((uintptr_t) (p) + ((uintptr_t) (a) - 1))                    \
                 & ~((uintptr_t) (a) - 1)))

该宏的原理详细证明，请参考《高效算法的奥秘》（第二版）第三章 2 的幂边界

当 $a = 2^n$ 时，~((uintptr_t) a - 1) 的 64 位二进制数，最右边 $n$ 位都是 0。所以 x & ~((uintptr_t) a - 1) 能被 $2^n$ 整除。而宏里先给 p 加上 a - 1 再做按位与，得到的就是不小于 p 的最小对齐地址（向上取整）。

a 对齐字节数	2 的幂	64位二进制
1	$2^0$	1111111111111111111111111111111111111111111111111111111111111111
2	$2^1$	1111111111111111111111111111111111111111111111111111111111111110
4	$2^2$	1111111111111111111111111111111111111111111111111111111111111100
8	$2^3$	1111111111111111111111111111111111111111111111111111111111111000
16	$2^4$	1111111111111111111111111111111111111111111111111111111111110000
32	$2^5$	1111111111111111111111111111111111111111111111111111111111100000
64	$2^6$	1111111111111111111111111111111111111111111111111111111111000000

测试

测试源码

测试 ~((uintptr_t)a - 1)

// Print the binary form of the mask ~((uintptr_t)a - 1) for each alignment a.
void test_a() {
    const int alignments[] = {1, 2, 4, 8, 16, 32, 64};
    const int count = sizeof(alignments) / sizeof(alignments[0]);
    char bits[128];

    for (int k = 0; k < count; k++) {
        // Mask whose low bits (below the alignment) are all cleared.
        uintptr_t mask = ~((uintptr_t)alignments[k] - 1);
        char* text = i2bin(mask, bits, 128);
        printf("a: %2d,  d: %s\n", alignments[k], text);
    }
}

结果：

a:  1,  d: 1111111111111111111111111111111111111111111111111111111111111111
a:  2,  d: 1111111111111111111111111111111111111111111111111111111111111110
a:  4,  d: 1111111111111111111111111111111111111111111111111111111111111100
a:  8,  d: 1111111111111111111111111111111111111111111111111111111111111000
a: 16,  d: 1111111111111111111111111111111111111111111111111111111111110000
a: 32,  d: 1111111111111111111111111111111111111111111111111111111111100000
a: 64,  d: 1111111111111111111111111111111111111111111111111111111111000000

地址添加随机数，测试不同的对齐方式

// Check that ngx_align_ptr yields addresses that are multiples of the
// alignment. For each alignment a random offset in [0, 64) is added to the
// base pointer, the result is aligned, and the aligned address together with
// (address % alignment) is printed.
void test_align_mod() {
    char bin[128];
    u_char *p, *a, *r;
    int i, len, alignment;
    int aligns[] = {1, 2, 4, 8, 16, 32, 64};

    len = sizeof(aligns) / sizeof(int);
    srand(time(NULL));

    p = (u_char*)malloc(1024 * sizeof(u_char));
    if (p == NULL) {
        printf("malloc failed\n");
        return;
    }
    // %p expects a void* argument.
    printf("p: %p\n", (void*)p);

    for (i = 0; i < len; i++) {
        alignment = aligns[i];
        // Offset <= 63 and alignment <= 64, so the aligned address stays well
        // inside the 1024-byte buffer.
        r = p + rand() % 64;
        a = ngx_align_ptr(r, alignment);
        // Cast the remainder so that it matches the %lu conversion.
        printf("a: %2d, r: %p, align: %p, abin: %s, mod: %lu\n", alignment,
               (void*)r, (void*)a,
               i2bin((unsigned long long)(uintptr_t)a, bin, 128),
               (unsigned long)((uintptr_t)a % alignment));
    }
    free(p);
}

结果：

p: 0x7fd035800600
a:  1, r: 0x7fd03580062f, align: 0x7fd03580062f, abin: 11111111101000000110101100000000000011000101111, mod: 0
a:  2, r: 0x7fd03580061a, align: 0x7fd03580061a, abin: 11111111101000000110101100000000000011000011010, mod: 0
a:  4, r: 0x7fd035800635, align: 0x7fd035800638, abin: 11111111101000000110101100000000000011000111000, mod: 0
a:  8, r: 0x7fd035800613, align: 0x7fd035800618, abin: 11111111101000000110101100000000000011000011000, mod: 0
a: 16, r: 0x7fd035800633, align: 0x7fd035800640, abin: 11111111101000000110101100000000000011001000000, mod: 0
a: 32, r: 0x7fd035800602, align: 0x7fd035800620, abin: 11111111101000000110101100000000000011000100000, mod: 0
a: 64, r: 0x7fd03580061b, align: 0x7fd035800640, abin: 11111111101000000110101100000000000011001000000, mod: 0

测试对齐效率

在一块连续内存上，分配小块（一个范围内随机大小）内存，对齐地址和不对齐地址分别保存在不同的数组 aligns 和 ualigns，再对数组里指向的数据地址进行读写。

// Bound for random block sizes: sizes are drawn as rand() % (RAND_AREA - 1) + 1.
#define RAND_AREA 128
// Capacity of the array that records aligned blocks.
#define ALIGN_COUNT (1024 * 1024 * 4)
// Capacity of the array that records unaligned blocks (same as ALIGN_COUNT).
#define UN_ALIGN_COUNT ALIGN_COUNT
// Size in bytes of the backing buffer the blocks are carved from (1 GiB).
#define BLOCK_SIZE (1024 * 1024 * 1024)

// Descriptor of one block: its length and its start address.
typedef struct {
    size_t len;    // block length in bytes
    u_char* data;  // start address of the block
} ngx_str_t;

// Benchmark writes and reads through unaligned versus aligned block addresses.
//
// One large buffer is carved into randomly sized blocks (1 .. RAND_AREA - 1
// bytes). The first pass starts one byte past the buffer start and rejects any
// size that would make the next start address even; those blocks go into
// `ualigns`. The second pass rounds each start address up with ngx_align_ptr
// and stores the blocks in `aligns`. Each set is then written (memset) and
// read (strncpy), and the elapsed time reported by mstime() is printed.
//
// argv[1], when argc == 2, selects the alignment in bytes and must be a power
// of two; the default is 4.
void test_mem_alloc(int argc, char** argv) {
    u_char *p, *last, *end;
    int size, alignment, i, ualign_cnt, align_cnt;
    long long start, stop;
    char buf[256];
    ngx_str_t *s, *aligns, *ualigns;

    alignment = (argc == 2) ? atoi(argv[1]) : 4;

    // The mask trick in ngx_align_ptr is only valid for powers of two; an
    // alignment of 0 would even collapse every address to NULL.
    if (alignment <= 0 || (alignment & (alignment - 1)) != 0) {
        printf("alignment must be a power of two, got: %d\n", alignment);
        return;
    }

    size = BLOCK_SIZE * sizeof(char);
    p = (u_char*)malloc(size);

    // Arrays holding the aligned and unaligned block descriptors.
    aligns = (ngx_str_t*)malloc(ALIGN_COUNT * sizeof(ngx_str_t));
    ualigns = (ngx_str_t*)malloc(UN_ALIGN_COUNT * sizeof(ngx_str_t));

    if (p == NULL || aligns == NULL || ualigns == NULL) {
        printf("malloc failed\n");
        free(aligns);
        free(ualigns);
        free(p);
        return;
    }

    last = p;
    end = last + size;

    i = 0;
    srand(time(NULL));

    last += 1;

    // Unaligned pass.
    while (end > last) {
        // Pick a size that keeps the next start address odd.
        size = rand() % (RAND_AREA - 1) + 1;
        if (((uintptr_t)(last + size) % 2) == 0) {
            continue;
        }
        // Compare against the remaining space instead of forming a pointer
        // past the end of the buffer.
        if (size > end - last) {
            break;
        }

        ualigns[i].len = size;
        ualigns[i].data = last;

        last += size;
        if (++i >= UN_ALIGN_COUNT) {
            break;
        }
    }
    // Only the entries actually filled may be touched later.
    ualign_cnt = i;

    i = 0;

    // Aligned pass.
    while (end > last) {
        last = (u_char*)ngx_align_ptr(last, alignment);
        size = rand() % (RAND_AREA - 1) + 1;
        // Rounding up may have moved `last` to or beyond the buffer end.
        if (last >= end || size > end - last) {
            break;
        }

        aligns[i].len = size;
        aligns[i].data = last;

        last += size;

        if (++i >= ALIGN_COUNT) {
            break;
        }
    }
    align_cnt = i;

    // ------------------
    // Write and read through the recorded pointers.

    // Write.
    start = mstime();
    for (i = 0; i < ualign_cnt; i++) {
        s = &ualigns[i];
        // len >= 1, so len - 1 never underflows.
        memset(s->data, (char)(rand() % 255), s->len - 1);
        s->data[s->len - 1] = '\0';
    }
    stop = mstime();

    printf("ualign write, alignment: %d, count: %d, cost: %lld ms\n", alignment,
           i, stop - start);

    start = mstime();
    for (i = 0; i < align_cnt; i++) {
        s = &aligns[i];
        memset(s->data, (char)(rand() % 255), s->len - 1);
        s->data[s->len - 1] = '\0';
    }
    stop = mstime();

    printf("align  write, alignment: %d, count: %d, cost: %lld ms\n", alignment,
           i, stop - start);

    // Read. len < RAND_AREA <= sizeof(buf), so the copy always fits.
    start = mstime();
    for (i = 0; i < ualign_cnt; i++) {
        s = &ualigns[i];
        strncpy(buf, (char*)s->data, s->len);
    }
    stop = mstime();

    printf("ualign read, alignment: %d, count: %d, cost: %lld ms\n", alignment,
           i, stop - start);

    start = mstime();
    for (i = 0; i < align_cnt; i++) {
        s = &aligns[i];
        strncpy(buf, (char*)s->data, s->len);
    }
    stop = mstime();

    printf("align  read, alignment: %d, count: %d, cost: %lld ms\n", alignment,
           i, stop - start);

    free(aligns);
    free(ualigns);
    free(p);
}

结果：

# gcc -g -O0 align.cpp -o align && ./align 4
ualign write, alignment: 4, count: 4194304, cost: 256 ms
align  write, alignment: 4, count: 4194304, cost: 250 ms
ualign read, alignment: 4, count: 4194304, cost: 179 ms
align  read, alignment: 4, count: 4194304, cost: 185 ms
# gcc -g -O0 align.cpp -o align && ./align 8
ualign write, alignment: 8, count: 4194304, cost: 235 ms
align  write, alignment: 8, count: 4194304, cost: 255 ms
ualign read, alignment: 8, count: 4194304, cost: 183 ms
align  read, alignment: 8, count: 4194304, cost: 189 ms
# gcc -g -O0 align.cpp -o align && ./align 16
ualign write, alignment: 16, count: 4194304, cost: 256 ms
align  write, alignment: 16, count: 4194304, cost: 285 ms
ualign read, alignment: 16, count: 4194304, cost: 184 ms
align  read, alignment: 16, count: 4194304, cost: 182 ms
# gcc -g -O0 align.cpp -o align && ./align 32
ualign write, alignment: 32, count: 4194304, cost: 233 ms
align  write, alignment: 32, count: 4194304, cost: 263 ms
ualign read, alignment: 32, count: 4194304, cost: 175 ms
align  read, alignment: 32, count: 4194304, cost: 165 ms
# gcc -g -O0 align.cpp -o align && ./align 64
ualign write, alignment: 64, count: 4194304, cost: 238 ms
align  write, alignment: 64, count: 4194304, cost: 298 ms
ualign read, alignment: 64, count: 4194304, cost: 175 ms
align  read, alignment: 64, count: 4194304, cost: 169 ms

测试 nginx 内存池

把内存池源码抠出来改造了一下再进行测试，地址对齐与否，性能上貌似没有很大的区别。

// Benchmark the modified nginx memory pool with or without address alignment.
//
// Passing "1" as the only argument enables alignment; anything else disables
// it. ALLOC_COUNT randomly sized blocks (1 .. 1023 bytes) are allocated from
// the pool and filled, then every block is read back; both phases are timed.
// Returns 0 on success and 1 if an allocation fails.
int main(int argc, char **argv) {
    int i;
    size_t size, used;
    unsigned is_align;
    u_char *p;
    char buf[1024];
    ngx_pool_t *pool;
    long long start, stop;
    ngx_str_t *array, *s;

    srand(time(NULL));
    is_align = (argc == 2 && !strcasecmp(argv[1], "1")) ? 1 : 0;
    // Must start at zero: it is accumulated with `used += size` below.
    used = 0;

    array = (ngx_str_t *)malloc(ALLOC_COUNT * sizeof(ngx_str_t));
    if (array == NULL) {
        printf("malloc failed\n");
        return 1;
    }

    // Allocate and write.
    start = mstime();
    pool = ngx_create_pool(ngx_pagesize, is_align);
    // NOTE(review): presumably returns NULL on failure, as in upstream nginx.
    if (pool == NULL) {
        printf("create pool failed\n");
        free(array);
        return 1;
    }
    for (i = 0; i < ALLOC_COUNT; i++) {
        size = rand() % (1024 - 1) + 1;
        p = (u_char *)ngx_palloc(pool, size * sizeof(u_char));
        if (p == NULL) {
            printf("palloc failed\n");
            ngx_destroy_pool(pool);
            free(array);
            return 1;
        }
        // size >= 1, so size - 1 never underflows.
        memset(p, (u_char)(rand() % 255), size - 1);
        p[size - 1] = '\0';

        array[i].data = p;
        array[i].len = size;

        used += size;
    }
    stop = mstime();
    calc_test(pool, used, stop - start);

    // Read. len <= 1023 < sizeof(buf), so the copy always fits.
    start = mstime();
    for (i = 0; i < ALLOC_COUNT; i++) {
        s = &array[i];
        strncpy(buf, (char *)s->data, s->len);
    }
    stop = mstime();
    // stop - start is a signed long long, hence %lld.
    printf("read align: %d, blocks: %d, time: %lld ms\n", pool->is_align, i,
           stop - start);

    ngx_destroy_pool(pool);
    free(array);
    return 0;
}

结果：

$ gcc -g test_pool_align.cpp -o align && ./align 0
write align: 0, max fit: 8, blocks: 133089, total: 545132544, used: 536677369, use rate: 0.984490, time: 246 ms
read align: 0, blocks: 1048576, time: 71 ms
$ gcc -g test_pool_align.cpp -o align && ./align 1
write align: 1, max fit: 8, blocks: 133908, total: 548487168, used: 536746859, use rate: 0.978595, time: 254 ms
read align: 1, blocks: 1048576, time: 71 ms