首页 > 解决方案 > 多个线程比单个进程花费更多时间

问题描述

我正在实现一个模式匹配算法:对每个旋转角度(-60° 到 60°),把模板的梯度信息在整幅目标梯度图像上滑动并进行匹配。每个旋转角度的模板信息都已预先计算并保存,即一共预处理并保存了 121 个模板。

但问题是,这会消耗大量时间(大约 110 毫秒),因此我决定把匹配按旋转角度分成 4 组(-60 到 -30、-30 到 0、0 到 30 和 30 到 60),分别交给 4 个线程处理。然而多线程版本反而比单线程版本花费更多时间(大约 115 毫秒到 120 毫秒)。

代码片段是...


#include <chrono>

#define MAXTARGETNUM    64
// Global result buffer with room for MAXTARGETNUM entries. match() appends
// to it at index totalResultsNum while holding mtx.
MatchResultA totalResultsTemp[MAXTARGETNUM];



// Scans the template rotations selected by SearchRegion over the search
// window of the source gradient image and appends the candidates found to the
// global totalResultsTemp array.
//
// NOTE: this is an excerpt. Code is elided at the "...." markers, the braces
// shown do not balance, and several locals (curX, curY, iTx, iSx, offSet,
// PartialScore, resultsNumPerDegree, minScoreTemp, totalResultsNum, mtx, ...)
// are declared in code that is not visible here.
//
// Parameters:
//   ShapeInfoVec  - array of per-rotation templates, indexed by k; the fields
//                   read here are AngleNum, NoOfCordinates, Coordinates,
//                   EdgeDerivativeX/Y, EdgeMagnitude and Angel.
//   SearchRegion  - passed by value; supplies StartX/StartY/EndX/EndY and
//                   AngleStart/AngleStop/AngleStep.
//   MinScore      - threshold that PartialScore must exceed to be recorded.
//   Greediness    - not used in the visible code (presumably used in the
//                   elided scoring code - TODO confirm).
//   width, height - image bounds; width is also the row stride used to index
//                   the gradient buffers.
//   pBufGradX, pBufGradY, pBufMag
//                 - source image X/Y derivatives and gradient magnitude, read
//                   at offset curY*width + curX.
//   corr          - not used in the visible code.
void CShapeMatch::match(ShapeInfo *ShapeInfoVec, search_region SearchRegion, float MinScore, float Greediness, int width,int height, int16_t  *pBufGradX ,int16_t  *pBufGradY,float  *pBufMag,  bool corr)
{
  // Candidates collected by this call before they are merged into the global array.
  MatchResultA resultsPerDeg[MAXTARGETNUM];
....
....
    // Pixel window over which the template origin (i, j) is moved.
    int startX =  SearchRegion.StartX;
    int startY =  SearchRegion.StartY;
    int endX   =  SearchRegion.EndX;
    int endY   =  SearchRegion.EndY;
    
    float AngleStep  = SearchRegion.AngleStep;
    float AngleStart = SearchRegion.AngleStart;
    float AngleStop = SearchRegion.AngleStop;

    // Convert the angle range into template indices, offset from the middle of
    // the template array (AngleNum/2, plus 1 when AngleNum is odd).
    // NOTE(review): the (int) cast binds to AngleStart/AngleStop only; the
    // division by AngleStep is then done in floating point and truncated when
    // stored into the int. Confirm this is the intended rounding.
    int startIndex = (int)(ShapeInfoVec[0].AngleNum/2) + ShapeInfoVec[0].AngleNum%2+(int)AngleStart/AngleStep;
    int stopIndex = (int)(ShapeInfoVec[0].AngleNum/2) + ShapeInfoVec[0].AngleNum%2+(int)AngleStop/AngleStep;

// One iteration per rotated template in [startIndex, stopIndex).
for (int k = startIndex; k < stopIndex ; k++){
         .... 
         // Slide the template origin over every (i, j) in the search window.
         for(int j = startY; j < endY; j++){
            for(int i = startX; i < endX; i++){
                
                    // Visit every edge point of template k.
                    for(int m = 0; m < ShapeInfoVec[k].NoOfCordinates; m++)
                    {
                        curX = i + (ShapeInfoVec[k].Coordinates + m)->x;        // template X coordinate
                        curY = j + (ShapeInfoVec[k].Coordinates + m)->y ;       // template Y coordinate
                        
                        iTx = *(ShapeInfoVec[k].EdgeDerivativeX + m);           // template X derivative
                        iTy = *(ShapeInfoVec[k].EdgeDerivativeY + m);           // template Y derivative
                        iTm   = *(ShapeInfoVec[k].EdgeMagnitude + m);           // template gradients magnitude
                        
                        // Skip template points that fall outside the image.
                        if(curX < 0 ||curY < 0||curX > width-1 ||curY > height-1)
                            continue;
                        // Row-major offset into the source gradient buffers.
                        offSet = curY*width + curX;
                        iSx = *(pBufGradX + offSet);            // get corresponding  X derivative from source image
                        iSy = *(pBufGradY + offSet);            // get corresponding  Y derivative from source image
                        iSm = *(pBufMag   + offSet);

                        // Record the position when its score exceeds MinScore.
                        // NOTE(review): the code that computes PartialScore is not
                        // shown in this excerpt.
                        if (PartialScore > MinScore)
                    {   
                    
                        float Angle = ShapeInfoVec[k].Angel;
                        bool hasFlag = false;
                        // Look for an already-recorded candidate less than 5 pixels
                        // away in both X and Y.
                        for(int n = 0; n < resultsNumPerDegree; n++)
                        {       
                            if(abs(resultsPerDeg[n].CenterLocX - i) < 5 && abs(resultsPerDeg[n].CenterLocY - j) < 5)
                            {   
                                hasFlag = true;
                                // Overwrite the nearby candidate only when the new
                                // score is higher, then stop scanning; otherwise the
                                // scan continues with the remaining candidates.
                                if(resultsPerDeg[n].ResultScore < PartialScore)
                                {   
                                    resultsPerDeg[n].Angel = Angle;
                                    resultsPerDeg[n].CenterLocX = i;
                                    resultsPerDeg[n].CenterLocY = j;
                                    resultsPerDeg[n].ResultScore = PartialScore;
                                    
                                    break;
                                }
                            }
                        }
                        // No nearby candidate: append a new one.
                        // NOTE(review): no check against MAXTARGETNUM is visible
                        // before this write - confirm the elided code guards it.
                        if(!hasFlag)
                        {   
                            resultsPerDeg[resultsNumPerDegree].Angel = Angle;
                            resultsPerDeg[resultsNumPerDegree].CenterLocX = i;
                            resultsPerDeg[resultsNumPerDegree].CenterLocY = j;
                            resultsPerDeg[resultsNumPerDegree].ResultScore = PartialScore;
    
                            resultsNumPerDegree ++;
                        }
                        // Keeps the larger of minScoreTemp and PartialScore.
                        minScoreTemp = minScoreTemp < PartialScore ? PartialScore : minScoreTemp;   
                    }
                }
            }
            
            
            
            // Append the collected candidates to the shared global array; mtx is
            // locked and unlocked once per element.
            // NOTE(review): no check of totalResultsNum against MAXTARGETNUM is
            // visible here.
            for(int i = 0; i < resultsNumPerDegree; i++)
                {
                    mtx.lock();
                    totalResultsTemp[totalResultsNum] = resultsPerDeg[i];
                    totalResultsNum++;
                    mtx.unlock();
                }
        
            // NOTE(review): no declaration of n is visible at this scope.
            n++;
}

void CallerFunction(){
            int16_t  *pBufGradX   = (int16_t *) malloc(bufferSize * sizeof(int16_t));
            int16_t  *pBufGradY   = (int16_t *) malloc(bufferSize * sizeof(int16_t));
            float    *pBufMag     = (float *) malloc(bufferSize * sizeof(float));
          
          clock_t start = clock();

          float temp_stop = SearchRegion->AngleStop;

            SearchRegion->AngleStop = -30;
            thread t1(&CShapeMatch::match, this, ShapeInfoVec, *SearchRegion, MinScore, Greediness,  width, height, pBufGradX ,pBufGradY,pBufMag, corr);

            SearchRegion->AngleStart = -30;
            SearchRegion->AngleStop=0;
            thread t2(&CShapeMatch::match, this, ShapeInfoVec, *SearchRegion, MinScore, Greediness,  width, height, pBufGradX ,pBufGradY,pBufMag, corr);            

            SearchRegion->AngleStart = 0;
            SearchRegion->AngleStop=30;
            thread t3(&CShapeMatch::match, this, ShapeInfoVec, *SearchRegion, MinScore, Greediness,width, height, pBufGradX ,pBufGradY,pBufMag, corr);          

            SearchRegion->AngleStart = 30;
            SearchRegion->AngleStop=temp_stop;
            thread t4(&CShapeMatch::match, this, ShapeInfoVec, *SearchRegion, MinScore, Greediness,width, height, pBufGradX ,pBufGradY,pBufMag, corr);

            t1.join();
            t2.join();
            t3.join();
            t4.join();
            
            clock_t end = clock();
            cout  << 1000*(double)(end-start)/CLOCKS_PER_SEC << endl;
}

正如我们所见,代码中有很多堆内存访问,但它们都是只读的。只有 totalResultsTemp 和 totalResultsNum 是会被写入的共享全局资源。

我的电脑配置是, i5-7200U CPU @ 2.50GHz 4 cores 4 Gig RAM Ubuntu 18

标签: c++linuxmultithreadingimage-processingoperating-system

解决方案


// Excerpt quoted from match(): copies the candidates collected by one call
// into the shared global array, locking and unlocking mtx once per element.
for(int i = 0; i < resultsNumPerDegree; i++)
                {
                    mtx.lock();
                    totalResultsTemp[totalResultsNum] = resultsPerDeg[i];
                    totalResultsNum++;
                    mtx.unlock();
                }

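您在向一个静态(全局)数组写入数据,而互斥锁的开销确实不小。与其加锁,不如尝试使用 std::atomic_int 作为写入下标;或者(在我看来更好的做法)直接把每个线程存放结果的确切位置传给函数,这样各线程互不干扰,也就不再需要同步了。另外请注意:在 Linux 上 clock() 统计的是整个进程(所有线程之和)消耗的 CPU 时间,而不是实际经过的时间,因此用它测量多线程代码时数值不会变小;应改用 std::chrono::steady_clock 来测量实际耗时。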


推荐阅读