首页 > 解决方案 > C++ 中 uint_fast32_t 解析为 uint64_t,但在 x86_64 上几乎所有操作都比 uint32_t 慢。为什么它解析为 uint64_t?

问题描述

我跑了一个基准测试:uint_fast32_t 是 8 字节,但几乎所有操作都比 4 字节的 uint32_t 慢。既然如此,为什么 uint_fast32_t 不保持为 4 个字节?

操作系统信息:5.3.0-62-generic #56~18.04.1-Ubuntu SMP Wed Jun 24 16:17:03 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux

处理器信息:

cat /sys/devices/cpu/caps/pmu_name
skylake

model name  : Intel(R) Core(TM) i7-8565U CPU @ 1.80GHz

我用于测试的基准:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <cstdint>

// Number of elements processed by each benchmark loop (2^26).
#define TEST_SIZE (1u << 26)

// Operation macros selectable with -DOP=<name>. Each one updates X in place
// using Y. Both arguments and the whole expansion are parenthesized so that an
// expression argument such as MOD(x, a + b) is evaluated as a single operand
// instead of being split by operator precedence.
#define ADD(X, Y)     ((X) += (Y))
#define SUB(X, Y)     ((X) -= (Y))
#define MULT(X, Y)    ((X) *= (Y))
#define DIV(X, Y)     ((X) /= (Y))
#define MOD(X, Y)     ((X) = (X) % (Y))
#define AND(X, Y)     ((X) &= (Y))
#define OR(X, Y)      ((X) |= (Y))
#define XOR(X, Y)     ((X) ^= (Y))

// if you compile this make sure to have -DOP=<Operation macro name here>


// Compiler-level barrier: an empty asm statement with a "memory" clobber.
// No trailing semicolon here; the call site supplies it.
#define bench_flush_all_pending()    __asm__ __volatile__("" : : : "memory")
// Feeds X to an empty asm statement as an input operand (register or memory),
// so the compiler has to materialize the value instead of discarding it.
#define bench_do_not_optimize_out(X) __asm__ __volatile__("" : : "r,m"(X) : "memory")

/*
 * Reads the x86 time-stamp counter with RDTSC and returns it as one 64-bit
 * value: the "a" output operand supplies the low 32 bits and the "d" output
 * operand supplies the high 32 bits.
 *
 * The function is deliberately NOT marked __attribute__((const)). Successive
 * calls return different values, and the const attribute would allow the
 * compiler to treat two calls as interchangeable and merge them, which would
 * collapse a start/end measurement into a single read.
 */
uint64_t inline __attribute__((always_inline))
get_cycles() {
    uint32_t hi, lo;
    __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
    return (((uint64_t)lo) | (((uint64_t)hi) << 32));
}


// How many uint32_t-sized slots fit in one uint_fast32_t. Used as the index
// stride when a uint_fast32_t buffer is walked through a uint32_t pointer.
constexpr uint32_t
size_ratio() {
    return static_cast<uint32_t>(sizeof(uint_fast32_t) / sizeof(uint32_t));
}

/*
 * Benchmark driver.
 *
 * argv[1] selects the variant by its first character:
 *   'A' - apply OP to a uint_fast32_t accumulator over uint_fast32_t elements
 *   'B' - apply OP to a uint32_t accumulator, reading the same buffer through
 *         a uint32_t pointer with a stride of size_ratio()
 * Any other value runs no loop and reports a zero cycle count.
 *
 * OP must be supplied at compile time (-DOP=<macro>), unless DO_NOTHING is
 * defined, in which case only the timer overhead is measured.
 *
 * Returns 0 on success, 1 on a usage error or allocation failure.
 */
int
main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: ./%s <A or B>\n", argv[0]);
        // argv[1] is read below; without a mode argument there is nothing
        // to run, so stop instead of dereferencing a null pointer.
        return 1;
    }
    uint_fast32_t * valsf32 =
        (uint_fast32_t *)calloc(TEST_SIZE, sizeof(uint_fast32_t));
    if (valsf32 == NULL) {
        fprintf(stderr, "Failed to allocate benchmark buffer\n");
        return 1;
    }
    for (uint32_t i = 0; i < TEST_SIZE; ++i) {
        valsf32[i] = rand();
        // DIV and MOD use these values as divisors, so keep them non-zero.
        valsf32[i] += (valsf32[i] == 0);
    }
    // NOTE(review): this views the uint_fast32_t storage through a uint32_t
    // pointer. When the two types differ, element [size_ratio() * i] is the
    // first four bytes of valsf32[i] (the low half on little-endian targets),
    // and the access is a strict-aliasing violation. A separately allocated
    // uint32_t buffer would avoid both issues.
    uint32_t * vals32 = (uint32_t *)valsf32;

    uint_fast32_t sinkf32   = rand();
    uint32_t      sink32    = rand();
    // Zero-initialized so the report below never reads indeterminate values
    // when the mode is neither 'A' nor 'B'.
    uint64_t      start = 0, end = 0;
    if (argv[1][0] == 'A') {
        start = get_cycles();
#ifndef DO_NOTHING
        for (uint32_t i = 0; i < TEST_SIZE; ++i) {
            OP(sinkf32, valsf32[i]);
        }
        // Keep the accumulated result alive so the loop is not removed.
        bench_do_not_optimize_out(sinkf32);
        bench_flush_all_pending();
#endif
        end = get_cycles();
    }
    else if (argv[1][0] == 'B') {
        start = get_cycles();
#ifndef DO_NOTHING
        for (uint32_t i = 0; i < TEST_SIZE; ++i) {
            OP(sink32, vals32[size_ratio() * i]);
        }
        // Keep the accumulated result alive so the loop is not removed.
        bench_do_not_optimize_out(sink32);
        bench_flush_all_pending();
#endif
        end = get_cycles();
    }
    else {
        bench_do_not_optimize_out(sinkf32);
        bench_do_not_optimize_out(sink32);
    }

    // Average time-stamp-counter ticks per element.
    fprintf(stderr,
            "\t%s (%zu Bytes): %.2E \"Cycles\" Per operator\n",
            argv[1][0] == 'A' ? "Fast u32" : "Norm u32",
            argv[1][0] == 'A' ? sizeof(uint_fast32_t) : sizeof(uint32_t),
            ((double)(end - start)) / TEST_SIZE);
    free(valsf32);
    return 0;
}

我用以下命令编译:

g++ -O3 test_operations.cc -o test_ops -DOP=<Operation macro name here> 

例如,测试加法:

g++ -O3 test_operations.cc -o test_ops -DOP=ADD

要获得基线,只需加上 -DDO_NOTHING。

我希望有人可以解释将 uint_fast32_t 设置为 8 个字节的优势,或者指出我在这个基准测试中遗漏了什么,以便我能得到一个有效的比较。

使用 -fno-unroll-loops 编译时,结果也是如此。

这是一个示例运行:

Running: test_ADD Fast u32
    Fast u32 (8 Bytes): 8.52E-01 "Cycles" Per operator
Running: test_ADD Norm u32
    Norm u32 (4 Bytes): 7.92E-01 "Cycles" Per operator
Running: test_SUB Fast u32
    Fast u32 (8 Bytes): 8.34E-01 "Cycles" Per operator
Running: test_SUB Norm u32
    Norm u32 (4 Bytes): 7.94E-01 "Cycles" Per operator
Running: test_MULT Fast u32
    Fast u32 (8 Bytes): 2.54E+00 "Cycles" Per operator
Running: test_MULT Norm u32
    Norm u32 (4 Bytes): 1.26E+00 "Cycles" Per operator
Running: test_DIV Fast u32
    Fast u32 (8 Bytes): 1.48E+01 "Cycles" Per operator
Running: test_DIV Norm u32
    Norm u32 (4 Bytes): 1.09E+01 "Cycles" Per operator
Running: test_MOD Fast u32
    Fast u32 (8 Bytes): 1.52E+01 "Cycles" Per operator
Running: test_MOD Norm u32
    Norm u32 (4 Bytes): 1.20E+01 "Cycles" Per operator
Running: test_AND Fast u32
    Fast u32 (8 Bytes): 8.30E-01 "Cycles" Per operator
Running: test_AND Norm u32
    Norm u32 (4 Bytes): 7.98E-01 "Cycles" Per operator
Running: test_OR Fast u32
    Fast u32 (8 Bytes): 8.30E-01 "Cycles" Per operator
Running: test_OR Norm u32
    Norm u32 (4 Bytes): 8.00E-01 "Cycles" Per operator
Running: test_XOR Fast u32
    Fast u32 (8 Bytes): 8.29E-01 "Cycles" Per operator
Running: test_XOR Norm u32
    Norm u32 (4 Bytes): 7.95E-01 "Cycles" Per operator
Running: test_NOTHING Fast u32
    Fast u32 (8 Bytes): 2.09E-07 "Cycles" Per operator
Running: test_NOTHING Norm u32
    Norm u32 (4 Bytes): 2.09E-07 "Cycles" Per operator 

更新初始化函数:

    // Revised initialization: the two variants get separate buffers, and every
    // element is forced to be non-zero.
    // NOTE(review): neither calloc result is checked before use.
    uint_fast32_t * valsf32 =
        (uint_fast32_t *)calloc(TEST_SIZE, sizeof(uint_fast32_t));
    for (uint32_t i = 0; i < TEST_SIZE; ++i) {
        valsf32[i] = (uint32_t)rand(); // rand() already returns an int; the cast is just being explicit
        // Bump a zero up to one.
        valsf32[i] += (valsf32[i] == 0);
        assert(valsf32[i]);
    }
    // Same byte size as valsf32; only every size_ratio()-th uint32_t slot is
    // written, so element i sits at the same byte offset in both buffers.
    uint32_t * vals32 = (uint32_t *)calloc(TEST_SIZE, sizeof(uint_fast32_t));
    for (uint32_t i = 0; i < TEST_SIZE; ++i) {
        vals32[size_ratio() * i] = (uint32_t)valsf32[i];
        // The source value is already non-zero after the loop above, so this
        // adjustment is purely defensive.
        vals32[size_ratio() * i] += (vals32[size_ratio() * i] == 0);
        assert(vals32[size_ratio() * i]);
        // Both buffers must hold the same value for element i.
        assert(vals32[size_ratio() * i] == valsf32[i]);
    }

结果:

Running: test_ADD Fast u32
    Fast u32 (8 Bytes): 8.27E-01 "Cycles" Per operator
Running: test_ADD Norm u32
    Norm u32 (4 Bytes): 7.83E-01 "Cycles" Per operator
Running: test_SUB Fast u32
    Fast u32 (8 Bytes): 8.29E-01 "Cycles" Per operator
Running: test_SUB Norm u32
    Norm u32 (4 Bytes): 7.72E-01 "Cycles" Per operator
Running: test_MULT Fast u32
    Fast u32 (8 Bytes): 2.55E+00 "Cycles" Per operator
Running: test_MULT Norm u32
    Norm u32 (4 Bytes): 1.28E+00 "Cycles" Per operator
Running: test_DIV Fast u32
    Fast u32 (8 Bytes): 1.49E+01 "Cycles" Per operator
Running: test_DIV Norm u32
    Norm u32 (4 Bytes): 1.10E+01 "Cycles" Per operator
Running: test_MOD Fast u32
    Fast u32 (8 Bytes): 1.53E+01 "Cycles" Per operator
Running: test_MOD Norm u32
    Norm u32 (4 Bytes): 1.25E+01 "Cycles" Per operator
Running: test_AND Fast u32
    Fast u32 (8 Bytes): 8.35E-01 "Cycles" Per operator
Running: test_AND Norm u32
    Norm u32 (4 Bytes): 8.34E-01 "Cycles" Per operator
Running: test_OR Fast u32
    Fast u32 (8 Bytes): 8.31E-01 "Cycles" Per operator
Running: test_OR Norm u32
    Norm u32 (4 Bytes): 7.76E-01 "Cycles" Per operator
Running: test_XOR Fast u32
    Fast u32 (8 Bytes): 8.34E-01 "Cycles" Per operator
Running: test_XOR Norm u32
    Norm u32 (4 Bytes): 7.82E-01 "Cycles" Per operator
Running: test_NOTHING Fast u32
    Fast u32 (8 Bytes): 1.79E-07 "Cycles" Per operator
Running: test_NOTHING Norm u32
    Norm u32 (4 Bytes): 1.79E-07 "Cycles" Per operator

为了测试内存带宽,添加了一个 TOUCH 运算符:

// Plain assignment of Y to X. Arguments and the whole expansion are
// parenthesized so the macro behaves as one expression wherever it is used.
#define TOUCH(X, Y) ((X) = (Y))

TOUCH 的结果:

    Fast u32 (8 Bytes): 1.05E+00 "Cycles" Per operator
    Norm u32 (4 Bytes): 1.04E+00 "Cycles" Per operator

添加:

// Evaluates (A * v + B) * v + C with v = vals8[i]. A, B, C, vals8 and i are
// taken from the scope where the macro is expanded.
#define THREE_WAY ((A * vals8[i] + B) * vals8[i] + C)
// XORs THREE_WAY into X. The Y argument is not used by the expansion
// (presumably it exists so TW can be passed as OP like the other operators).
#define TW(X, Y) X ^= THREE_WAY
// A, B, C, vals8, and i must all be defined at the point of use
...
    // One byte per element, zeroed by calloc and then filled from rand(); the
    // int result is truncated to uint8_t on assignment.
    // NOTE(review): the calloc result is not checked before use.
    uint8_t * vals8 = (uint8_t *)calloc(TEST_SIZE, 1);
    for (uint32_t i = 0; i < TEST_SIZE; ++i) {
        vals8[i] = rand();
    }
...
       if (argv[1][0] == 'A') {
        uint_fast32_t A = rand();
        uint_fast32_t B = rand();
        uint_fast32_t C = rand();
...
    else if (argv[1][0] == 'B') {
        uint32_t A = rand();
        uint32_t B = rand();
        uint32_t C = rand();

OP=TW 的结果

    Fast u32 (8 Bytes): 2.01E+00 "Cycles" Per operator
    Norm u32 (4 Bytes): 9.48E-01 "Cycles" Per operator

编辑 6:添加 CPU 信息。

标签: c++ types x86-64 abi microbenchmark

解决方案


推荐阅读