c++ - 异步比顺序执行花费更长的时间
问题描述
我正在尝试使用多线程在 C++ 中执行一项非常基本的任务,我注意到使用较少数量的线程运行此代码实际上比使用多线程更快。
我尝试切换到线程(...),添加和删除互斥锁,但它仍然是相同的或更慢。
struct threadResults
{
int64_t threadSize;
int64_t sum =0;
int32_t min_val = std::numeric_limits<int32_t>::max();
int32_t max_val = std::numeric_limits<int32_t>::min();
};
void doCalculation(int thread_num, std::vector<int> rand_values, int32_t numPerThread, threadResults* myThreadResult, int remainder)
{
if(remainder!=0&&thread_num==0)
{
for(int i=0;i<numPerThread+remainder;++i)
{
myThreadResult->sum += rand_values[i];
if(myThreadResult->min_val>rand_values[i])
{
myThreadResult->min_val = rand_values[i];
}
if(myThreadResult->max_val<rand_values[i])
{
myThreadResult->max_val = rand_values[i];
}
}
}
else if(remainder!=0&&thread_num!=0)
{
int start = (numPerThread*thread_num)+remainder;
for(int i=start;i<numPerThread+start;++i) {
myThreadResult->sum += rand_values[i];
if (myThreadResult->min_val > rand_values[i]) {
myThreadResult->min_val = rand_values[i];
}
if (myThreadResult->max_val < rand_values[i]) {
myThreadResult->max_val = rand_values[i];
}
}
}
else if(remainder==0)
{
int start = (numPerThread*thread_num);
for(int i=start;i<numPerThread+start;++i) {
myThreadResult->sum += rand_values[i];
if (myThreadResult->min_val > rand_values[i]) {
myThreadResult->min_val = rand_values[i];
}
if (myThreadResult->max_val < rand_values[i]) {
myThreadResult->max_val = rand_values[i];
}
}
}
}
std::future<void> myThreads[num_threads];
auto start = sc::high_resolution_clock::now();
//TODO: Implement multithreaded code to populate the above stat values
for(int i=0;i<num_threads;i++) {
myThreads[i] = async(std::launch::async, doCalculation, i, rand_values, numPerThread, &myThreadResults[i], remainder);
}
for(int i=0;i<num_threads;i++) {
myThreads[i].get();
}
for(threadResults t : myThreadResults)
{
sum += t.sum;
if(t.min_val<min_val)
{
min_val = t.min_val;
}
if(t.max_val>max_val)
{
max_val = t.max_val;
}
}
auto end = sc::high_resolution_clock::now();
我希望使用 1 个线程运行此代码比 2+ 个线程花费更多时间,但事实并非如此。
$ ./sumit 10000000 1 0
Elapsed Time: 101228us
Sum: 101555736
Min: -100000
Max: 100000
Avg: 10.1556
$ ./sumit 10000000 2 0
Elapsed Time: 142738us
Sum: 101555736
Min: -100000
Max: 100000
Avg: 10.1556
解决方案
更改此行:
void doCalculation(int thread_num, std::vector<int> rand_values, int32_t numPerThread, threadResults* myThreadResult, int remainder)
对此:
void doCalculation(int thread_num, std::vector<int> const &rand_values, int32_t numPerThread, threadResults* myThreadResult, int remainder)
我敢打赌事情会变得更快。
我写了一个程序来完成你所做的事情。我会将整个内容粘贴在这里,如果您需要更具建设性的帮助,这确实是您应该做的:
#include <fmt/core.h>
#include <future>
#include <cstdint>
#include <limits>
#include <vector>
#include <stdexcept>
#include <random>
#include <string>
#include <algorithm>
#include <chrono>
using ::std::int64_t;
using ::std::uint32_t;
using ::std::future;
struct threadResults
{
int64_t sum;
int32_t min_val;
int32_t max_val;
};
using randvec_t = ::std::vector<int>;
threadResults doCalculation(randvec_t::iterator begin, randvec_t::iterator end)
{
if (begin == end) {
throw ::std::range_error("Range to iterate over must not be empty!");
}
threadResults results{0, *begin, *begin};
for (auto i = begin; i != end; ++i) {
auto const cur = *i;
results.sum += cur;
if (results.min_val > cur) {
results.min_val = cur;
} else if (results.max_val < cur) {
results.max_val = cur;
}
}
return results;
}
int main(int argc, char const * const argv[])
{
if (argc != 3) {
fmt::print(stderr, "Usage: {} <num_to_sum> <threads_to_use>\n", argv[0]);
return 1;
}
int const size = ::std::stoi(argv[1]);
int const nthreads = ::std::stoi(argv[2]);
if (size <= nthreads || nthreads < 0) {
fmt::print(stderr, "You must have more elements than threads and neither may be negative.");
}
randvec_t elements;
elements.reserve(size);
{
//std::random_device rd;
std::mt19937 gen(5);
std::uniform_int_distribution<> dis(0, ::std::numeric_limits<randvec_t::value_type>::max());
for (int i = 0; i < size; ++i) {
elements.push_back(dis(gen));
}
}
namespace sc = ::std::chrono;
auto start = sc::high_resolution_clock::now();
::std::vector<::std::future<threadResults>> futures;
futures.reserve(nthreads);
for (unsigned int start = 0; start < elements.size();) {
unsigned int const end = start + (elements.size() - start) / (nthreads - futures.size());
futures.push_back(::std::async(::std::launch::async, doCalculation, elements.begin() + start, elements.begin() + end));
start = end;
}
threadResults aggregate{0, 0, 0};
bool first = true;
for (auto &future: futures) {
if (first) {
aggregate = future.get();
first = false;
} else {
auto this_result = future.get();
aggregate.sum += this_result.sum;
aggregate.min_val = ::std::min(this_result.min_val, aggregate.min_val);
aggregate.max_val = ::std::max(this_result.max_val, aggregate.max_val);
}
}
auto end = sc::high_resolution_clock::now();
auto duration = end - start;
fmt::print("Duration: {}us\n", duration / sc::microseconds(1));
fmt::print(" sum: {}\n", aggregate.sum);
fmt::print(" min: {}\n", aggregate.min_val);
fmt::print(" max: {}\n", aggregate.max_val);
return 0;
}
以下是在我漂亮的 16 个硬件线程 Ryzen 7 上运行该程序的结果(我使用takoshell,因此出现了奇怪的提示):
/tmp/so_problem
$ ./a.out 100000000 1
Duration: 76676us
sum: 107373414219986326
min: 11
max: 2147483633
/tmp/so_problem
$ ./a.out 100000000 4
Duration: 20534us
sum: 107373414219986326
min: 11
max: 2147483633
/tmp/so_problem
$ ./a.out 100000000 8
Duration: 17177us
sum: 107373414219986326
min: 11
max: 2147483633
我不确定为什么结果会在 4 到 8 个线程之间触底。现在......这将是一个有趣的问题。我强烈怀疑它与内存带宽有关。
请注意我的程序是如何缺少您的程序所具有的许多奇怪的无意义的绒毛。
推荐阅读
- javascript - 如何在不使用标准按钮的情况下让 Facebook 登录工作
- apache-kafka - 关于用 Kafka 实现 cqrs/saga 的问题
- modx - Modx Formit 简单联系方式
- java - 如果字符串对象是不可变的,那么为什么内容会发生变化?
- python - 运行循环后将垃圾值分配给列表元素
- typescript - 如何在电子中使用打字稿而无需编译?
- python-3.x - 如何使 OpenCV 捕获窗口显示在我的浏览器窗口上?
- javascript - JQuery ajaxSubmit 函数不适用于 AngularJS HTML 页面中的表单
- visual-studio-2019 - 有谁知道解决方案上下文菜单中“添加”子菜单组的 VSSDK id?
- reactjs - django rest + react js 如何直接访问react route的url?