c++ - 为什么 omp 版本比串行版本慢?
问题描述
这是这个问题的后续问题
现在我有了代码:
#include <iostream>
#include <cmath>
#include <omp.h>
#define max(a, b) (a)>(b)?(a):(b)
const int m = 2001;
const int n = 2000;
const int p = 4;
double v[m + 2][m + 2];
double x[m + 2];
double y[m + 2];
double _new[m + 2][m + 2];
double maxdiffA[p + 1];
int icol, jrow;
int main() {
omp_set_num_threads(p);
double h = 1.0 / (n + 1);
double start = omp_get_wtime();
#pragma omp parallel for private(icol) shared(x, y, v, _new)
for (icol = 0; icol <= n + 1; ++icol) {
x[icol] = y[icol] = icol * h;
_new[icol][0] = v[icol][0] = 6 - 2 * x[icol];
_new[n + 1][icol] = v[n + 1][icol] = 4 - 2 * y[icol];
_new[icol][n + 1] = v[icol][n + 1] = 3 - x[icol];
_new[0][icol] = v[0][icol] = 6 - 3 * y[icol];
}
const double eps = 0.01;
#pragma omp parallel private(icol, jrow) shared(_new, v, maxdiffA)
{
while (true) { //for [iters=1 to maxiters by 2]
#pragma omp single
for (int i = 0; i < p; i++) maxdiffA[i] = 0;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
_new[icol][jrow] =
(v[icol - 1][jrow] + v[icol + 1][jrow] + v[icol][jrow - 1] + v[icol][jrow + 1]) / 4;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
v[icol][jrow] = (_new[icol - 1][jrow] + _new[icol + 1][jrow] + _new[icol][jrow - 1] +
_new[icol][jrow + 1]) / 4;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
maxdiffA[omp_get_thread_num()] = max(maxdiffA[omp_get_thread_num()],
fabs(_new[icol][jrow] - v[icol][jrow]));
#pragma omp barrier
double maxdiff = 0.0;
for (int k = 0; k < p; ++k) {
maxdiff = max(maxdiff, maxdiffA[k]);
}
if (maxdiff < eps)
break;
#pragma omp barrier
//#pragma omp single
//std::cout << maxdiff << std::endl;
}
}
double end = omp_get_wtime();
printf("start = %.16lf\nend = %.16lf\ndiff = %.16lf\n", start, end, end - start);
return 0;
}
但为什么它比串行模拟慢 2-3 倍(32 秒对 18 秒):
#include <iostream>
#include <cmath>
#include <omp.h>
#define max(a,b) (a)>(b)?(a):(b)
const int m = 2001;
const int n = 2000;
double v[m + 2][m + 2];
double x[m + 2];
double y[m + 2];
double _new[m + 2][m + 2];
int main() {
double h = 1.0 / (n + 1);
double start = omp_get_wtime();
for (int i = 0; i <= n + 1; ++i) {
x[i] = y[i] = i * h;
_new[i][0]=v[i][0] = 6 - 2 * x[i];
_new[n + 1][i]=v[n + 1][i] = 4 - 2 * y[i];
_new[i][n + 1]=v[i][n + 1] = 3 - x[i];
_new[0][i]=v[0][i] = 6 - 3 * y[i];
}
const double eps=0.01;
while(true){ //for [iters=1 to maxiters by 2]
double maxdiff=0.0;
for (int i=1;i<=n;i++)
for (int j=1;j<=n;j++)
_new[i][j]=(v[i-1][j]+v[i+1][j]+v[i][j-1]+v[i][j+1])/4;
for (int i=1;i<=n;i++)
for (int j=1;j<=n;j++)
v[i][j]=(_new[i-1][j]+_new[i+1][j]+_new[i][j-1]+_new[i][j+1])/4;
for (int i=1;i<=n;i++)
for (int j=1;j<=n;j++)
maxdiff=max(maxdiff, fabs(_new[i][j]-v[i][j]));
if(maxdiff<eps) break;
std::cout << maxdiff<<std::endl;
}
double end = omp_get_wtime();
printf("start = %.16lf\nend = %.16lf\ndiff = %.16lf\n", start, end, end - start);
return 0;
}
同样有趣的是它与版本相同(如果你这么说,我可以在这里发布)看起来像这样
while(true){ //106 iteratins here!!!
#pragma omp paralell for
for(...)
#pragma omp paralell for
for(...)
#pragma omp paralell for
for(...)
}
但我认为使 omp 代码变慢的原因是在 while 循环中产生线程 106 次......但不!然后可能线程同时写入相同的数组单元。但它发生在哪里?没看到可以给我看看吗?
也许是因为太多的障碍?但是讲师告诉我像这样实现代码并“分析它”也许答案是“Jacobi 算法并不意味着并行运行良好”?或者这只是我的蹩脚编码?
解决方案
所以 vel 的根源是
max(maxdiffA[w],fabs(_new[icol][jrow] - v[icol][jrow]))
因为它是
#define max(a, b) (a)>(b)?(a):(b)
它可能创建了太多的分支('if's)没有这个东西并行版本的工作速度提高了 8 倍,并且加载 CPU 68% 而不是 99% .. 奇怪的事情:相同的“max”不会影响串行版本
推荐阅读
- hbase - 无法从 hbase shell 命令中删除对等点
- apache-flink - Apache Flink KeyedStream 后窗口操作员行为澄清
- reactjs - 在 ReactJS 中验证来自后端的数据的正确位置是什么?
- logistic-regression - 二元逻辑回归:总体百分比没有增加的显着性?
- python - 在 python 中加载 COM dll
- excel - 从行开始查找要粘贴的列
- git - 为什么git说我的文件在存储库之外?
- c# - 有什么方法可以在不使用继承的情况下将属性带入类?
- django - 使用 Django digitalocean Spaces 请求的资源上不存在“Access-Control-Allow-Origin”标头
- c++ - 从记录集中读取文本字段返回“?”