问题
Ran a benchmark and uint32_fast_t is 8 byte but slower than 4 byte uint32_t for nearly all operations. If this is the case why does uint32_fast_t not stay as 4 bytes?
OS info: 5.3.0-62-generic #56~18.04.1-Ubuntu SMP Wed Jun 24 16:17:03 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux
Cpu info:
cat /sys/devices/cpu/caps/pmu_name
skylake
model name : Intel(R) Core(TM) i7-8565U CPU @ 1.80GHz
Benchmark I used for testing:
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <cstdint>
#define TEST_SIZE (1u << 26)
#define ADD(X, Y) X += Y
#define SUB(X, Y) X -= Y
#define MULT(X, Y) X *= Y
#define DIV(X, Y) X /= Y
#define MOD(X, Y) X = X % Y
#define AND(X, Y) X &= Y
#define OR(X, Y) X |= Y
#define XOR(X, Y) X ^= Y
// if you compile this make sure to have -DOP=<Operation macro name here>
#define bench_flush_all_pending() asm volatile("" : : : "memory");
#define bench_do_not_optimize_out(X) asm volatile("" : : "r,m"(X) : "memory")
uint64_t inline __attribute__((always_inline)) __attribute__((const))
get_cycles() {
uint32_t hi, lo;
__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
return (((uint64_t)lo) | (((uint64_t)hi) << 32));
}
constexpr uint32_t
size_ratio() {
return sizeof(uint_fast32_t) / sizeof(uint32_t);
}
int
main(int argc, char ** argv) {
if (argc < 2) {
fprintf(stderr, "Usage: ./%s <A or B>\n", argv[0]);
}
uint_fast32_t * valsf32 =
(uint_fast32_t *)calloc(TEST_SIZE, sizeof(uint_fast32_t));
for (uint32_t i = 0; i < TEST_SIZE; ++i) {
valsf32[i] = rand();
}
uint32_t * vals32 = (uint32_t *)valsf32;
uint_fast32_t sinkf32 = rand();
uint32_t sink32 = rand();
uint64_t start, end;
if (argv[1][0] == 'A') {
start = get_cycles();
#ifndef DO_NOTHING
for (uint32_t i = 0; i < TEST_SIZE; ++i) {
OP(sinkf32, valsf32[i]);
}
bench_do_not_optimize_out(sinkf32);
bench_flush_all_pending();
#endif
end = get_cycles();
}
else if (argv[1][0] == 'B') {
start = get_cycles();
#ifndef DO_NOTHING
for (uint32_t i = 0; i < TEST_SIZE; ++i) {
OP(sink32, vals32[size_ratio() * i]);
}
bench_do_not_optimize_out(sink32);
bench_flush_all_pending();
#endif
end = get_cycles();
}
else {
bench_do_not_optimize_out(sinkf32);
bench_do_not_optimize_out(sink32);
}
fprintf(stderr,
"\t%s (%zu Bytes): %.2E \"Cycles\" Per operator\n",
argv[1][0] == 'A' ? "Fast u32" : "Norm u32",
argv[1][0] == 'A' ? sizeof(uint_fast32_t) : sizeof(uint32_t),
((double)(end - start)) / TEST_SIZE);
free(valsf32);
}
I compiled this with:
g++ -O3 test_operations.cc -o test_ops -DOP=<Operation macro name here>
i.e to test addition:
g++ -O3 test_operations.cc -o test_ops -DOP=ADD
To get a baseline just include -DDO_NOTHING
I'm hoping someone could either explain the advantage of having uint32_fast_t as 8 bytes or something I missed in this benchmark so that I can actually get a good comparison.
The same holds with -fno-unroll-loops
Here is a example run:
Running: test_ADD Fast u32
Fast u32 (8 Bytes): 8.52E-01 "Cycles" Per operator
Running: test_ADD Norm u32
Norm u32 (4 Bytes): 7.92E-01 "Cycles" Per operator
Running: test_SUB Fast u32
Fast u32 (8 Bytes): 8.34E-01 "Cycles" Per operator
Running: test_SUB Norm u32
Norm u32 (4 Bytes): 7.94E-01 "Cycles" Per operator
Running: test_MULT Fast u32
Fast u32 (8 Bytes): 2.54E+00 "Cycles" Per operator
Running: test_MULT Norm u32
Norm u32 (4 Bytes): 1.26E+00 "Cycles" Per operator
Running: test_DIV Fast u32
Fast u32 (8 Bytes): 1.48E+01 "Cycles" Per operator
Running: test_DIV Norm u32
Norm u32 (4 Bytes): 1.09E+01 "Cycles" Per operator
Running: test_MOD Fast u32
Fast u32 (8 Bytes): 1.52E+01 "Cycles" Per operator
Running: test_MOD Norm u32
Norm u32 (4 Bytes): 1.20E+01 "Cycles" Per operator
Running: test_AND Fast u32
Fast u32 (8 Bytes): 8.30E-01 "Cycles" Per operator
Running: test_AND Norm u32
Norm u32 (4 Bytes): 7.98E-01 "Cycles" Per operator
Running: test_OR Fast u32
Fast u32 (8 Bytes): 8.30E-01 "Cycles" Per operator
Running: test_OR Norm u32
Norm u32 (4 Bytes): 8.00E-01 "Cycles" Per operator
Running: test_XOR Fast u32
Fast u32 (8 Bytes): 8.29E-01 "Cycles" Per operator
Running: test_XOR Norm u32
Norm u32 (4 Bytes): 7.95E-01 "Cycles" Per operator
Running: test_NOTHING Fast u32
Fast u32 (8 Bytes): 2.09E-07 "Cycles" Per operator
Running: test_NOTHING Norm u32
Norm u32 (4 Bytes): 2.09E-07 "Cycles" Per operator
updated initialization function:
uint_fast32_t * valsf32 =
(uint_fast32_t *)calloc(TEST_SIZE, sizeof(uint_fast32_t));
for (uint32_t i = 0; i < TEST_SIZE; ++i) {
valsf32[i] = (uint32_t)rand(); // rand max is already an int by why not
valsf32[i] += (valsf32[i] == 0);
assert(valsf32[i]);
}
uint32_t * vals32 = (uint32_t *)calloc(TEST_SIZE, sizeof(uint_fast32_t));
for (uint32_t i = 0; i < TEST_SIZE; ++i) {
vals32[size_ratio() * i] = (uint32_t)valsf32[i];
vals32[size_ratio() * i] += (vals32[size_ratio() * i] == 0);
assert(vals32[size_ratio() * i]);
assert(vals32[size_ratio() * i] == valsf32[i]);
}
Results:
Running: test_ADD Fast u32
Fast u32 (8 Bytes): 8.27E-01 "Cycles" Per operator
Running: test_ADD Norm u32
Norm u32 (4 Bytes): 7.83E-01 "Cycles" Per operator
Running: test_SUB Fast u32
Fast u32 (8 Bytes): 8.29E-01 "Cycles" Per operator
Running: test_SUB Norm u32
Norm u32 (4 Bytes): 7.72E-01 "Cycles" Per operator
Running: test_MULT Fast u32
Fast u32 (8 Bytes): 2.55E+00 "Cycles" Per operator
Running: test_MULT Norm u32
Norm u32 (4 Bytes): 1.28E+00 "Cycles" Per operator
Running: test_DIV Fast u32
Fast u32 (8 Bytes): 1.49E+01 "Cycles" Per operator
Running: test_DIV Norm u32
Norm u32 (4 Bytes): 1.10E+01 "Cycles" Per operator
Running: test_MOD Fast u32
Fast u32 (8 Bytes): 1.53E+01 "Cycles" Per operator
Running: test_MOD Norm u32
Norm u32 (4 Bytes): 1.25E+01 "Cycles" Per operator
Running: test_AND Fast u32
Fast u32 (8 Bytes): 8.35E-01 "Cycles" Per operator
Running: test_AND Norm u32
Norm u32 (4 Bytes): 8.34E-01 "Cycles" Per operator
Running: test_OR Fast u32
Fast u32 (8 Bytes): 8.31E-01 "Cycles" Per operator
Running: test_OR Norm u32
Norm u32 (4 Bytes): 7.76E-01 "Cycles" Per operator
Running: test_XOR Fast u32
Fast u32 (8 Bytes): 8.34E-01 "Cycles" Per operator
Running: test_XOR Norm u32
Norm u32 (4 Bytes): 7.82E-01 "Cycles" Per operator
Running: test_NOTHING Fast u32
Fast u32 (8 Bytes): 1.79E-07 "Cycles" Per operator
Running: test_NOTHING Norm u32
Norm u32 (4 Bytes): 1.79E-07 "Cycles" Per operator
To test memory bandwidth added a TOUCH operator:
#define TOUCH(X, Y) X = Y
Results of TOUCH:
Fast u32 (8 Bytes): 1.05E+00 "Cycles" Per operator
Norm u32 (4 Bytes): 1.04E+00 "Cycles" Per operator
Added:
#define THREE_WAY ((A * vals8[i] + B) * vals8[i] + C)
#define TW(X, Y) X ^= THREE_WAY
// A, B, C, vals8, and i are all define
...
uint8_t * vals8 = (uint8_t *)calloc(TEST_SIZE, 1);
for (uint32_t i = 0; i < TEST_SIZE; ++i) {
vals8[i] = rand();
}
...
if (argv[1][0] == 'A') {
uint_fast32_t A = rand();
uint_fast32_t B = rand();
uint_fast32_t C = rand();
...
else if (argv[1][0] == 'B') {
uint32_t A = rand();
uint32_t B = rand();
uint32_t C = rand();
Result of OP=TW
Fast u32 (8 Bytes): 2.01E+00 "Cycles" Per operator
Norm u32 (4 Bytes): 9.48E-01 "Cycles" Per operator
Edit 6: Added CPU info.
来源:https://stackoverflow.com/questions/63045795/cpp-uint32-fast-t-resolves-to-uint64-t-but-is-slower-for-nearly-all-operations-t