首页 > 解决方案 > 为什么我需要很多线程才能将 memcpy() 推到系统的极限?

问题描述

当我使用 memcpy() 复制内存块(16MiB,页对齐)时,需要多个线程才能把机器推到极限。为什么呢?我原本以为访问外部内存(DRAM)的接口是整个环节中最慢的部分,因此即使只用一个线程也应该能达到这个上限。

测试程序:

#include <string.h>
#include <chrono>
#include <thread>
#include <string>
#include <vector>
#include <stdexcept>
#include <iostream>
#include <sys/mman.h>

/**
 * Owning handle for one anonymous, private, read/write memory mapping.
 *
 * A default-constructed block is empty (null buffer, size 0) until init()
 * is called. The mapping is released with munmap() when the block is
 * destroyed. Blocks are movable but not copyable: two objects must never
 * unmap the same region.
 */
class block {
    size_t blockSize = 0;
    void *buffer = nullptr;

    // Unmaps the current mapping, if any, and returns to the empty state.
    void release() noexcept {
        if (buffer != nullptr) {
            munmap(buffer, blockSize);
        }
        buffer = nullptr;
        blockSize = 0;
    }
public:
    /// Creates an empty block that owns no mapping.
    block() = default;

    /// Creates a block of aSize bytes; throws std::runtime_error on failure.
    block(size_t aSize) {init(aSize);}

    // Copying would make two owners unmap the same region.
    block(const block&) = delete;
    block& operator=(const block&) = delete;

    /// Takes over other's mapping and leaves other empty.
    block(block&& other) noexcept
        : blockSize(other.blockSize), buffer(other.buffer) {
        other.blockSize = 0;
        other.buffer = nullptr;
    }

    /// Releases the current mapping, then takes over other's and leaves other empty.
    block& operator=(block&& other) noexcept {
        if (this != &other) {
            release();
            blockSize = other.blockSize;
            buffer = other.buffer;
            other.blockSize = 0;
            other.buffer = nullptr;
        }
        return *this;
    }

    /**
     * Maps aSize bytes of anonymous memory and makes this block own them.
     * Any mapping the block already owned is released once the new one has
     * been obtained, so a failed call leaves the block unchanged.
     *
     * @throws std::runtime_error if mmap() fails.
     */
    void init(size_t aSize) {
        void *mapping = mmap(nullptr, aSize,
                             PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        // mmap() reports failure with MAP_FAILED, not with a null pointer.
        if (mapping == MAP_FAILED) {
            throw std::runtime_error("can't allocate block");
        }
        release();
        buffer = mapping;
        blockSize = aSize;
    }

    ~block() {
        release();
    }

    /// Pointer to the first byte of the mapping (null for an empty block).
    char *begin() const {
        return static_cast<char*>(buffer);
    }

    /// Pointer one past the last byte of the mapping.
    char *end() const {
        return static_cast<char*>(buffer) + blockSize;
    }

    /// Size of the mapping in bytes (0 for an empty block).
    size_t size() const {
        return blockSize;
    }
};

void copyThread(const block& src,
                const std::vector<block>& dst) {
    for (auto& b: dst) {
        memcpy(b.begin(), src.begin(), src.size()); 
    }
}

// Clock used to time the copy. steady_clock is monotonic, whereas system_clock
// is wall time and may be adjusted while the measurement is running.
using clock_type = std::chrono::steady_clock;

int main(int argc, const char *argv[]) {
    auto nThreads = argc > 1 ? std::stoul(argv[1]) : 1ul;
    auto nBlocks = argc > 2 ? std::stoul(argv[2]) : 10ul;
    auto blocksize = argc > 3 ? std::stoul(argv[3]) : 16*1024*1014ul;

    block src(blocksize);
    std::vector<std::vector<block>> dstBlocks;
    for (unsigned long i=0; i<nThreads; i++) {
        dstBlocks.emplace_back(nBlocks);
    }
    for (auto& v: dstBlocks) {
        for (auto& b: v) {
            b.init(blocksize);
        }
    }
    std::cerr << "blocks allocated\n"; 
    std::vector<std::thread> workers(nThreads);
    int i = 0;
    auto before = clock_type::now();
    for (auto& worker: workers) {
        worker = std::thread(copyThread, std::ref(src), std::ref(dstBlocks.at(i++)));
    }
    for (auto& worker: workers) {
        worker.join();
    }
    auto deltaT = std::chrono::duration_cast<std::chrono::duration<double>>(clock_type::now() - before).count();
    auto total = blocksize * nBlocks * nThreads;
    std::cout << nThreads
              << " " << blocksize
              << " " << nBlocks
              << " " << total
              << " " << deltaT
              << " " << total/deltaT
              << " " << total/deltaT/(1024*1014*1014)
              << "\n";
        return 0;
}

用下面的循环以 1 到 32 个线程依次运行该程序,得到以下输出:

for n in $(seq 32); do ./memcpy $n 2>/dev/null; done
1 16613376 10 166133760 0.0204955 8.10587e+09 7.69881
2 16613376 10 332267520 0.021766 1.52654e+10 14.4988
3 16613376 10 498401280 0.0227502 2.19075e+10 20.8074
4 16613376 10 664535040 0.0228769 2.90483e+10 27.5896
5 16613376 10 830668800 0.0238712 3.47979e+10 33.0504
6 16613376 10 996802560 0.025281 3.94289e+10 37.4489
7 16613376 10 1162936320 0.0266224 4.36827e+10 41.489
8 16613376 10 1329070080 0.0263878 5.03668e+10 47.8375
9 16613376 10 1495203840 0.0298019 5.01715e+10 47.652
10 16613376 10 1661337600 0.0312424 5.31757e+10 50.5053
11 16613376 10 1827471360 0.0335261 5.45089e+10 51.7716
12 16613376 10 1993605120 0.035536 5.6101e+10 53.2838
13 16613376 10 2159738880 0.0414056 5.21605e+10 49.5411
14 16613376 10 2325872640 0.0500519 4.64692e+10 44.1357
15 16613376 10 2492006400 0.0507584 4.90954e+10 46.63
16 16613376 10 2658140160 0.0529706 5.01814e+10 47.6614
17 16613376 10 2824273920 0.0538962 5.24021e+10 49.7706
18 16613376 10 2990407680 0.059596 5.0178e+10 47.6582
19 16613376 10 3156541440 0.0571108 5.52705e+10 52.4949
20 16613376 10 3322675200 0.0616152 5.39262e+10 51.2182
21 16613376 10 3488808960 0.0643704 5.4199e+10 51.4772
22 16613376 10 3654942720 0.0645592 5.66138e+10 53.7708
23 16613376 10 3821076480 0.0678021 5.63563e+10 53.5263
24 16613376 10 3987210240 0.0707682 5.63418e+10 53.5125
25 16613376 10 4153344000 0.0775049 5.35882e+10 50.8971
26 16613376 10 4319477760 0.0866202 4.98669e+10 47.3627
27 16613376 10 4485611520 0.0882388 5.08349e+10 48.2821
28 16613376 10 4651745280 0.0900769 5.1642e+10 49.0486
29 16613376 10 4817879040 0.0928807 5.18717e+10 49.2668
30 16613376 10 4984012800 0.0931539 5.3503e+10 50.8162
31 16613376 10 5150146560 0.0958964 5.37053e+10 51.0084
32 16613376 10 5316280320 0.100783 5.27498e+10 50.1008

所以大约需要 10 个线程才能达到全速(见最后一列)。测试平台是 AMD EPYC 7F72 24 核处理器。

标签: c++ memcpy

解决方案


推荐阅读