首页 > 解决方案 > #pragma omp parallel 仅使用 1 个线程

问题描述

我正在尝试用 OpenMP 实现并行 GEMM,并通过 ctypes 从 Python 调用它。代码运行时,资源监视器显示整个运行期间只使用了 1 个线程。C++ 和 Python 代码如下:

#include <cstddef>
#include <iostream>
#include <omp.h>
extern "C"
{
/**
 * Accumulate the matrix product A * B into C, i.e. C += A * B.
 *
 * All matrices are dense, row-major arrays of doubles:
 *   a is arows x acols, b is acols x bcols, c is arows x bcols.
 * The product is added to whatever c already contains, so the caller must
 * zero-initialise c to obtain a plain product. a and b are assumed not to
 * overlap c.
 *
 * The loop over rows of A is divided among OpenMP threads. Each iteration
 * writes only row i of c, so threads never touch the same output element.
 *
 * NOTE: the OpenMP pragmas only take effect when this file is compiled with
 * OpenMP enabled (for example -fopenmp with GCC/Clang). Without that flag the
 * pragmas are ignored and the whole function runs on a single thread.
 *
 * @param a      pointer to the arows x acols left operand
 * @param b      pointer to the acols x bcols right operand
 * @param c      pointer to the arows x bcols accumulator (read and written)
 * @param arows  number of rows of a and c
 * @param acols  number of columns of a, equal to the number of rows of b
 * @param bcols  number of columns of b and c
 */
void parallelGemm(const double* a, const double* b, double* c, size_t arows, size_t acols, size_t bcols)
{
    // Use a pointer-width signed counter for the parallel loop instead of a
    // 32-bit int: it avoids the signed/unsigned comparison against size_t and
    // the overflow for very large row counts, while remaining acceptable to
    // OpenMP implementations that require a signed loop variable.
    const std::ptrdiff_t rowCount = static_cast<std::ptrdiff_t>(arows);

#pragma omp parallel for
    for (std::ptrdiff_t i = 0; i < rowCount; ++i)
    {
        // Base addresses of row i in a and c; invariant over k and j.
        const double* aRow = a + static_cast<size_t>(i) * acols;
        double* cRow = c + static_cast<size_t>(i) * bcols;

        for (size_t k = 0; k < acols; ++k)
        {
            // a[i][k] is constant across the innermost loop, so read it once.
            const double aik = aRow[k];
            const double* bRow = b + k * bcols;
#pragma omp simd
            for (size_t j = 0; j < bcols; ++j)
            {
                cRow[j] += aik * bRow[j];
            }
        }
    }
}

}

Python 代码如下:

import ctypes
import time
import numpy
from numpy.ctypeslib import ndpointer
import time

from numpy.ctypeslib import ndpointer
# Load the shared library and look up the exported GEMM routine.
gemm_lib = ctypes.cdll.LoadLibrary('libtest.so')
parallel_gemm = gemm_lib.parallelGemm

# Declare the C signature: three C-contiguous double arrays followed by
# three size_t dimensions, with no return value.
double_matrix = ndpointer(ctypes.c_double, flags="C_CONTIGUOUS")
parallel_gemm.restype = None
parallel_gemm.argtypes = [double_matrix] * 3 + [ctypes.c_size_t] * 3

# Operands filled with ones, plus a zeroed output buffer for the C routine.
A = numpy.ones((5000, 10000))
B = numpy.ones((10000, 5000))
C = numpy.zeros((5000, 5000))
print(A)
print('--------------------------------------')

# Reference product and timing using NumPy.
numpy_start = time.time()
result = numpy.dot(A, B)
numpy_end = time.time()
print(result)
print('NUMPY.DOT TIME = ', numpy_end - numpy_start)
print('---------------------------------------')

# Dimensions handed to the C routine.
n_rows_a, n_cols_a = A.shape[0], A.shape[1]
n_cols_b = B.shape[1]

# Time the ctypes call; the routine writes its output into C in place.
gemm_start = time.time()
parallel_gemm(A, B, C, n_rows_a, n_cols_a, n_cols_b)
gemm_end = time.time()

print(C)
print('PARALLEL_GEMM TIME =', gemm_end - gemm_start)
print('---------------------------------------')
# Element-wise difference between the two results.
print(C - result)

可能是什么原因导致了这个问题?先谢谢了(TIA)。

标签: c++ parallel-processing openmp vectorization

解决方案


推荐阅读