Agner Fog's strstr() vs g++ built-in strstr() performance

不想你离开。 提交于 2019-12-09 03:45:24

问题


I attempted to compare the built-in strstr() in g++ (Debian 7.2.0-19) with the assembly implementation in Agner Fog's asmlib (http://www.agner.org/optimize/asmlib.zip). The target CPU is a Core i3 supporting SSE4.x and AVX.

To compile:

g++  -O2  -msse4  main.cpp libaelf64.a

(You will need the static library libaelf64.a from the asmlib distribution.)

The functions rdtsc64_start() and rdtsc64_end(), which are used to measure performance, follow Intel's white paper (https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf).

Provided the technique is correct, A_strstr() from asmlib is about 5 times slower than gcc's strstr(). I am making an effort to go through A_strstr()'s assembly to better understand what is going on; meanwhile, does anyone with (or without) experience with asmlib have an explanation? Thanks.

#include <iostream>
#include "asmlib.h"
#include <cstdint>
#include <unistd.h>
#include <string.h>

inline __attribute__ ((always_inline))
static uint64_t rdtsc64_start() {
    unsigned long cycles_high, cycles_low;
    asm volatile (
            "CPUID\n\t"
            "RDTSC\n\t"
            "mov %%rdx, %0\n\t"
            "mov %%rax, %1\n\t": "=r" (cycles_high), "=r"
            (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx");
    return ((uint64_t)cycles_high << 32) | cycles_low;
}

inline __attribute__ ((always_inline))
static uint64_t rdtsc64_end() {
    unsigned long cycles_high, cycles_low;
    asm volatile(
            "RDTSCP\n\t"
            "mov %%rdx, %0\n\t"
            "mov %%rax, %1\n\t"
            "CPUID\n\t": "=r" (cycles_high), "=r"
            (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx");
    return ((uint64_t)cycles_high << 32) | cycles_low;
}

static void rdtsc64_warmup() {
    unsigned long cycles_high, cycles_low;
    asm volatile (
            "CPUID\n\t"
            "RDTSC\n\t"
            "mov %%rdx, %0\n\t"
            "mov %%rax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low)::
            "%rax", "%rbx", "%rcx", "%rdx");
    asm volatile(
            "RDTSCP\n\t"
            "mov %%rdx, %0\n\t"
            "mov %%rax, %1\n\t"
            "CPUID\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax",
            "%rbx", "%rcx", "%rdx");
    asm volatile (
            "CPUID\n\t"
            "RDTSC\n\t"
            "mov %%rdx, %0\n\t"
            "mov %%rax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low)::
            "%rax", "%rbx", "%rcx", "%rdx");
    asm volatile(
            "RDTSCP\n\t"
            "mov %%rdx, %0\n\t"
            "mov %%rax, %1\n\t"
            "CPUID\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax",
            "%rbx", "%rcx", "%rdx");
}

int main(void) {
    char a[] = "This is a very long string with a WORD inside.";
    char b[] = "WORD";
    char *p = nullptr;

    rdtsc64_warmup();
    int64_t start = rdtsc64_start();
    for (volatile int64_t i = 0; i < 100000000; ++i) {
        char *p = ::strstr(a,b);
    }
    int64_t end = rdtsc64_end();
    std::cerr << "strstr, counter=  " << (end - start) << std::endl;

    rdtsc64_warmup();
    start = rdtsc64_start();
    for (volatile int64_t i = 0; i < 100000000; ++i) {
        char *p = A_strstr(a, b);
    }
    end = rdtsc64_end();
    std::cerr << "A_strstr, counter=" << (end - start) << std::endl;
    return 0;
}

来源:https://stackoverflow.com/questions/48601519/agner-fogs-strstr-vs-g-built-in-strstr-performance

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!