首页 > 解决方案 > 为什么 c++ 中的组合数据结构比独立数组的性能低

问题描述

下面是我的测试代码,在 Linux 下使用以下命令编译:

g++ main.cpp -O3 -o stest

我尝试了两种把数据组合在一起的方式(test2 和 test3),但它们都没有像我预期的那样表现得更好。在我看来,组合后的数据应该比独立数组性能更好,因为上层缓存是以块(缓存行)为单位从下层缓存加载数据的,所以组合在一起的数据更有可能在一次内存访问中被全部加载;而独立数组(test1)则需要三次内存访问。然而,测试结果表明 test1 的性能最好。这让我觉得很奇怪,也不明白原因。如果你知道,请告诉我。提前致谢。

#include <iostream>
#include <cstdlib>
#include <unistd.h>
#include <string.h>
#include <sstream>
#include <sys/times.h>
#include <cmath>
using namespace std;

// File-scope `struct tms` records (the type that times() fills in): one for
// the beginning and one for the end of a measured interval.
struct tms start;
struct tms tEnd;

/**
 * Benchmark 1: three independent arrays of n ints each.
 *
 * Runs three passes -- a[i] = b[i] = i, then c[i] = a[i] + b[i], then the sum
 * of all c[i] -- and prints the CPU time (user + system, as reported by
 * times()) consumed by those passes, followed by the sum.
 *
 * The arrays are allocated before the first times() call, so the allocation
 * calls themselves are outside the timed region.
 *
 * @param n  Number of elements per array. c[i] holds 2 * i in an int, so the
 *           result is only meaningful while 2 * (n - 1) fits in int.
 * @return   The sum of c[i] over all i, i.e. n * (n - 1) for in-range n.
 */
long long test1(int n) {
    int *a = new int[n];
    int *b = new int[n];
    int *c = new int[n];

    // Local snapshots keep the measurement independent of file-scope state.
    struct tms begin_ticks;
    struct tms end_ticks;

    times(&begin_ticks);
    for (int i = 0; i < n; i++) {
        a[i] = b[i] = i;
    }

    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }

    long long sum = 0;
    for (int i = 0; i < n; i++) {
        sum += c[i];
    }
    times(&end_ticks);

    // tms fields are in clock ticks; dividing by ticks-per-second yields
    // seconds, so the value is labelled "s" (it was mislabelled "ms").
    const clock_t cpu_ticks = (end_ticks.tms_utime - begin_ticks.tms_utime)
                            + (end_ticks.tms_stime - begin_ticks.tms_stime);
    const double elapsed_seconds = static_cast<double>(cpu_ticks) / sysconf(_SC_CLK_TCK);
    std::cout << "test1: " << elapsed_seconds << "s  result=" << sum << " " << std::endl;

    delete[] a;
    delete[] b;
    delete[] c;
    return sum;
}

/// One record of the interleaved (array-of-structs) layout: the two inputs
/// and their sum are stored next to each other.
struct D {
    int a;  ///< first input
    int b;  ///< second input
    int c;  ///< receives a + b
};

long long test2(int n) {
    struct D *d = new D[n];

    times(&start);
    for (int i = 0; i < n; i++) {
        struct D &di = d[i];
        di.a = di.b = i;
    }

    for (int i = 0; i < n; i++) {
        struct D &di = d[i];
        di.c = di.a + di.b;
    }
    long long sum = 0;
    for (int i = 0; i < n; i++) {
        sum += d[i].c;
    }

    times(&tEnd);
    double elap_time = double(tEnd.tms_utime - start.tms_utime + tEnd.tms_stime - start.tms_stime) / sysconf(_SC_CLK_TCK);
    cout << "test2: " << elap_time << "ms  result=" << sum << " " << endl;
    delete [] d;
    return sum;
}

/**
 * Benchmark 3: one flat int array of 3 * n elements, laid out as
 * [a0, b0, c0, a1, b1, c1, ...].
 *
 * Runs the same three passes as the other benchmarks -- set a and b to i,
 * store a + b into c, sum all c -- and prints the CPU time (user + system, as
 * reported by times()) consumed by those passes, followed by the sum.
 *
 * The array is allocated before the first times() call, so the allocation
 * call itself is outside the timed region.
 *
 * @param n  Number of (a, b, c) triples. c holds 2 * i in an int, so the
 *           result is only meaningful while 2 * (n - 1) fits in int.
 * @return   The sum of all c slots, i.e. n * (n - 1) for in-range n.
 */
long long test3(int n) {
    // Widen before multiplying: `3 * n` and `3 * i` evaluated in int overflow
    // (undefined behavior) once n exceeds INT_MAX / 3.
    int *abc = new int[3 * static_cast<std::size_t>(n)];

    // Local snapshots keep the measurement independent of file-scope state.
    struct tms begin_ticks;
    struct tms end_ticks;

    times(&begin_ticks);
    for (int i = 0; i < n; i++) {
        const std::size_t base = 3 * static_cast<std::size_t>(i);
        abc[base] = abc[base + 1] = i;
    }

    for (int i = 0; i < n; i++) {
        const std::size_t base = 3 * static_cast<std::size_t>(i);
        abc[base + 2] = abc[base] + abc[base + 1];
    }

    long long sum = 0;
    for (int i = 0; i < n; i++) {
        sum += abc[3 * static_cast<std::size_t>(i) + 2];
    }
    times(&end_ticks);

    // tms fields are in clock ticks; dividing by ticks-per-second yields
    // seconds, so the value is labelled "s" (it was mislabelled "ms").
    const clock_t cpu_ticks = (end_ticks.tms_utime - begin_ticks.tms_utime)
                            + (end_ticks.tms_stime - begin_ticks.tms_stime);
    const double elapsed_seconds = static_cast<double>(cpu_ticks) / sysconf(_SC_CLK_TCK);
    std::cout << "test3: " << elapsed_seconds << "s  result=" << sum << " " << std::endl;

    delete[] abc;
    return sum;
}


/**
 * Runs the three layout benchmarks in three different orders.
 *
 * Usage: stest [element-count]
 *
 * The element count defaults to 9999999 when no argument is given. An
 * argument that does not start with a non-negative integer fitting in int is
 * rejected with a usage message and exit status 1.
 */
int main(int argc, char *argv[]) {
    int n = 9999999;

    // argv[1] only exists when an argument was actually passed; without this
    // guard the program dereferenced a null pointer when run with no argument.
    if (argc > 1) {
        std::istringstream arg(argv[1]);
        int requested = 0;
        // Extraction fails on non-numeric or out-of-range text. Negative
        // counts are refused here because new[] would throw on them.
        if (!(arg >> requested) || requested < 0) {
            std::cerr << "usage: " << argv[0] << " [non-negative element count]" << std::endl;
            return 1;
        }
        n = requested;
    }

    test1(n);
    test2(n);
    test3(n);

    std::cout << "after changing order" << std::endl;

    test2(n);
    test3(n);
    test1(n);

    std::cout << "after changing order" << std::endl;

    test3(n);
    test1(n);
    test2(n);

    return 0;
}

我在一台配备四核 i5-4460 CPU 和 8GB 内存的计算机上测试了 stest。下面是我运行程序所用的命令;我确信使用参数 399999999 时,计算机不会出现内存不足:

q@q-lab:~/Desktop$ ./stest 399999999
test1: 1.61ms  result=159999998800000002 
test2: 2.38ms  result=159999998800000002 
test3: 2.37ms  result=159999998800000002 
after changing order
test2: 2.38ms  result=159999998800000002 
test3: 2.38ms  result=159999998800000002 
test1: 1.61ms  result=159999998800000002 
after changing order
test3: 2.38ms  result=159999998800000002 
test1: 1.61ms  result=159999998800000002 
test2: 2.39ms  result=159999998800000002

标签: c++ performance structure

解决方案


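把数据组合在一起之后,需要额外的时间来寻址或计算数据偏移。此外,CPU 缓存的行为难以预测,也难以针对性地优化,因此最好不要轻易尝试针对 CPU 缓存做优化。另外需要注意:上面三个测试中的每一趟循环都只用到部分字段,而在组合布局(test2、test3)中,每一趟循环都必须让 a、b、c 三个字段所在的全部内存经过缓存,三趟合计约 9n 个 int;独立数组(test1)三趟合计只需访问约 2n + 3n + n = 6n 个 int。9n / 6n = 1.5,与实测的 2.38 / 1.61 ≈ 1.48 基本一致。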


推荐阅读