首页 > 解决方案 > 使用单线程时的 C++11 线程与 OpenMP

问题描述

我正在尝试并行化一段会被多次运行的代码(因此每一毫秒都很重要)。我想使用 C++11 线程,因为它可以在 Mac OS X 上原生运行,而不需要用户额外安装 OpenMP(例如通过 Homebrew)。但是,我发现 OpenMP 的性能明显优于 C++11 线程。

令人惊讶的是,即使 C++11 仅使用单个线程,它的开销也比 OpenMP 大得多。对于下面的简单循环,我们有以下时间(仅使用一个线程):

C++11 Multithreading: 57 ms
OpenMP Multithreading: 47 ms
Native for loop: 43 ms

当使用超过 1 个线程时,OpenMP 相对于 C++11 线程的速度优势更加显著。如何使用 C++11 线程获得接近 OpenMP 的速度?

这是一些经过改编的代码,它们或多或少地完成了我的实际代码正在尝试做的事情。

C++11 多线程

#include <stdlib.h>
#include <stdio.h>
#include <thread>
#include <vector>
#include <algorithm>
#include <time.h>

// Returns the time elapsed from `start` to `end`, expressed in nanoseconds.
// The result is negative when `end` is earlier than `start`.
long int diff(timespec start, timespec end)
{
    // Whole-second difference scaled to nanoseconds.
    const long int seconds_part_ns = 1000000000*(end.tv_sec-start.tv_sec);
    // Sub-second difference; may be negative when end.tv_nsec < start.tv_nsec.
    const long int nanos_part = end.tv_nsec - start.tv_nsec;
    return seconds_part_ns + nanos_part;
}
// Benchmark of an indexed gather implemented with std::thread.
//
// Allocates the source arrays (xs, ys, charges), the index array (iarr) and
// the destination arrays (arr1, arr2, arr3), fills the sources, and then times
// how long `nthreads` worker threads need to copy the sources through iarr
// into the destinations. The elapsed time is printed in milliseconds.
//
// NOTE(review): as in the original listing, the closing brace of main is not
// part of this span.
int main() {

    int n = 1E6;
    int ndim = 3;
    int nthreads = 1;
    double * arr1 =(double*) malloc(n* sizeof(double));
    double * arr2 =(double*) malloc(n* sizeof(double));
    double * xs=(double*) malloc(n* sizeof(double));
    double * ys =(double*) malloc(n* sizeof(double));
    int * iarr =(int*) malloc(n* sizeof(int));
    double * arr3 =(double*) malloc(n*ndim *sizeof(double));
    // `charges` is the source of the arr3 gather; it was used but never
    // declared in the original listing. Laid out as ndim rows of n values.
    double * charges =(double*) malloc(n*ndim *sizeof(double));
    if (!arr1 || !arr2 || !xs || !ys || !iarr || !arr3 || !charges) {
        fprintf(stderr, "memory allocation failed\n");
        return 1;
    }

    // Fill the inputs. The loop clauses are in (init; condition; increment)
    // order so the bodies actually run, and charges is indexed as [j*n + i]
    // to match the layout the timed loop reads (charges[idim*n + index]).
    for (int i = 0; i < n; i++) {
        iarr[i] = i;
        xs[i] = i;
        ys[i] = i;
        for (int j = 0; j < ndim; j++) {
            charges[j*n + i] = i*j;
        }
    }

    struct timespec start10,start20, end10, end20;
    clock_gettime(CLOCK_MONOTONIC, &start10);

    {
        std::vector<std::thread> threads;
        threads.reserve(nthreads);
        for (int t = 0; t < nthreads; t++) {
            // Thread t processes the half-open range [bi, ei); the last thread
            // takes whatever remains. The products are computed in long long
            // so t*n cannot overflow int for larger thread counts.
            const int bi = static_cast<int>(static_cast<long long>(t) * n / nthreads);
            const int ei = (t + 1 == nthreads)
                               ? n
                               : static_cast<int>(static_cast<long long>(t + 1) * n / nthreads);
            // The lambda is handed to std::thread directly (no std::bind, so
            // <functional> is not needed). The arrays are captured by
            // reference, which is safe because every thread is joined before
            // this scope ends; the range bounds are captured by value because
            // they are loop-local.
            threads.emplace_back([&, bi, ei]() {
                for (int i = bi; i < ei; i++) {
                    const int src = iarr[i];
                    arr1[i] = xs[src];
                    arr2[i] = ys[src];
                    for (int idim = 0; idim < ndim; idim++) {
                        arr3[idim*n + i] = charges[idim*n + src];
                    }
                }
            });
        }
        for (std::thread& worker : threads) {
            worker.join();
        }
    }

    clock_gettime(CLOCK_MONOTONIC, &end10);
    printf("Initializing and sorting (%d threads): %.2lf ms\n",nthreads, (diff(start10,end10))/(double)1E6);

OpenMP:

        // OpenMP variant of the timed gather section.
        // NOTE(review): this fragment begins mid-function. n, ndim, nthreads,
        // diff() and the arrays (arr1, arr2, arr3, xs, ys, iarr, charges) must
        // be provided by the enclosing code — presumably the same setup as the
        // std::thread listing; confirm. `charges` is indexed as
        // [idim*n + index], so it needs room for n*ndim doubles.
        // start20 and end20 are declared but not used in this fragment.
        struct timespec start10,start20, end10, end20;
        clock_gettime(CLOCK_MONOTONIC, &start10);

    // The team size is the literal 1 here; it is not taken from `nthreads`,
    // which is the value reported by the printf below.
    #pragma omp parallel num_threads(1)
    {
    // Worksharing loop: iterations of i are divided among the team's threads.
    #pragma omp for
    for (int i = 0; i<n; i++){

                                // Gather through the index array: element i of each
                                // destination is read from position iarr[i] of its source.
                                arr1[i] = xs[iarr[i]];
                                arr2[i] = ys[iarr[i]];
                                // arr3/charges are addressed as ndim consecutive rows of n values.
                                for (int idim=0; idim<ndim; idim++){
                                        arr3[idim*n+i] = charges[idim*n +iarr[i]];
                                }

                            }
    }
        // Stop the clock and print the elapsed time; diff() returns nanoseconds,
        // so dividing by 1E6 yields milliseconds.
        clock_gettime(CLOCK_MONOTONIC, &end10);
        printf("Initializing and sorting (%d threads): %.2lf ms\n",nthreads, (diff(start10,end10))/(double)1E6);
// NOTE(review): this brace presumably closes main(), whose header is outside
// this fragment — confirm.
}

标签: multithreading, c++11, openmp

解决方案


推荐阅读