c++ - 多个线程比单个进程花费更多时间
问题描述
我正在实现模式匹配算法,通过在整个目标的梯度图像上移动模板梯度信息,在每次旋转(-60 到 60)时也是如此。我已经保存了每次旋转的模板信息,即已经预处理和保存了 121 个模板。
但问题是,这会消耗大量时间(大约 110 毫秒),因此决定将一组旋转(-60 到 -30、-30 到 0、0 到 30 和 30 到 60)的匹配拆分为 4 个线程,但是线程处理单个进程需要更多时间(大约 115 毫秒到 120 毫秒)。
代码片段是...
#define MAXTARGETNUM 64
MatchResultA totalResultsTemp[MAXTARGETNUM];
void CShapeMatch::match(ShapeInfo *ShapeInfoVec, search_region SearchRegion, float MinScore, float Greediness, int width,int height, int16_t *pBufGradX ,int16_t *pBufGradY,float *pBufMag, bool corr)
{
MatchResultA resultsPerDeg[MAXTARGETNUM];
....
....
int startX = SearchRegion.StartX;
int startY = SearchRegion.StartY;
int endX = SearchRegion.EndX;
int endY = SearchRegion.EndY;
float AngleStep = SearchRegion.AngleStep;
float AngleStart = SearchRegion.AngleStart;
float AngleStop = SearchRegion.AngleStop;
int startIndex = (int)(ShapeInfoVec[0].AngleNum/2) + ShapeInfoVec[0].AngleNum%2+(int)AngleStart/AngleStep;
int stopIndex = (int)(ShapeInfoVec[0].AngleNum/2) + ShapeInfoVec[0].AngleNum%2+(int)AngleStop/AngleStep;
for (int k = startIndex; k < stopIndex ; k++){
....
for(int j = startY; j < endY; j++){
for(int i = startX; i < endX; i++){
for(int m = 0; m < ShapeInfoVec[k].NoOfCordinates; m++)
{
curX = i + (ShapeInfoVec[k].Coordinates + m)->x; // template X coordinate
curY = j + (ShapeInfoVec[k].Coordinates + m)->y ; // template Y coordinate
iTx = *(ShapeInfoVec[k].EdgeDerivativeX + m); // template X derivative
iTy = *(ShapeInfoVec[k].EdgeDerivativeY + m); // template Y derivative
iTm = *(ShapeInfoVec[k].EdgeMagnitude + m); // template gradients magnitude
if(curX < 0 ||curY < 0||curX > width-1 ||curY > height-1)
continue;
offSet = curY*width + curX;
iSx = *(pBufGradX + offSet); // get corresponding X derivative from source image
iSy = *(pBufGradY + offSet); // get corresponding Y derivative from source image
iSm = *(pBufMag + offSet);
if (PartialScore > MinScore)
{
float Angle = ShapeInfoVec[k].Angel;
bool hasFlag = false;
for(int n = 0; n < resultsNumPerDegree; n++)
{
if(abs(resultsPerDeg[n].CenterLocX - i) < 5 && abs(resultsPerDeg[n].CenterLocY - j) < 5)
{
hasFlag = true;
if(resultsPerDeg[n].ResultScore < PartialScore)
{
resultsPerDeg[n].Angel = Angle;
resultsPerDeg[n].CenterLocX = i;
resultsPerDeg[n].CenterLocY = j;
resultsPerDeg[n].ResultScore = PartialScore;
break;
}
}
}
if(!hasFlag)
{
resultsPerDeg[resultsNumPerDegree].Angel = Angle;
resultsPerDeg[resultsNumPerDegree].CenterLocX = i;
resultsPerDeg[resultsNumPerDegree].CenterLocY = j;
resultsPerDeg[resultsNumPerDegree].ResultScore = PartialScore;
resultsNumPerDegree ++;
}
minScoreTemp = minScoreTemp < PartialScore ? PartialScore : minScoreTemp;
}
}
}
for(int i = 0; i < resultsNumPerDegree; i++)
{
mtx.lock();
totalResultsTemp[totalResultsNum] = resultsPerDeg[i];
totalResultsNum++;
mtx.unlock();
}
n++;
}
void CallerFunction(){
int16_t *pBufGradX = (int16_t *) malloc(bufferSize * sizeof(int16_t));
int16_t *pBufGradY = (int16_t *) malloc(bufferSize * sizeof(int16_t));
float *pBufMag = (float *) malloc(bufferSize * sizeof(float));
clock_t start = clock();
float temp_stop = SearchRegion->AngleStop;
SearchRegion->AngleStop = -30;
thread t1(&CShapeMatch::match, this, ShapeInfoVec, *SearchRegion, MinScore, Greediness, width, height, pBufGradX ,pBufGradY,pBufMag, corr);
SearchRegion->AngleStart = -30;
SearchRegion->AngleStop=0;
thread t2(&CShapeMatch::match, this, ShapeInfoVec, *SearchRegion, MinScore, Greediness, width, height, pBufGradX ,pBufGradY,pBufMag, corr);
SearchRegion->AngleStart = 0;
SearchRegion->AngleStop=30;
thread t3(&CShapeMatch::match, this, ShapeInfoVec, *SearchRegion, MinScore, Greediness,width, height, pBufGradX ,pBufGradY,pBufMag, corr);
SearchRegion->AngleStart = 30;
SearchRegion->AngleStop=temp_stop;
thread t4(&CShapeMatch::match, this, ShapeInfoVec, *SearchRegion, MinScore, Greediness,width, height, pBufGradX ,pBufGradY,pBufMag, corr);
t1.join();
t2.join();
t3.join();
t4.join();
clock_t end = clock();
cout << 1000*(double)(end-start)/CLOCKS_PER_SEC << endl;
}
正如我们所见,有很多堆访问,但它们只是只读的。只有totalResultTemp
和totalResultNum
是在其上执行写入的共享全局资源。
我的电脑配置是,
i5-7200U CPU @ 2.50GHz 4 cores
4 Gig RAM
Ubuntu 18
解决方案
for(int i = 0; i < resultsNumPerDegree; i++)
{
mtx.lock();
totalResultsTemp[totalResultsNum] = resultsPerDeg[i];
totalResultsNum++;
mtx.unlock();
}
您写入静态数组,而互斥锁真的很耗时。不要创建锁,而是尝试使用std::atomic_int,或者在我看来更好,只需传递给函数存储结果的确切位置,因此同步问题不再是您的问题
推荐阅读
- tensorflow - 使用fit_generator()每次产量多次通过?
- apache-camel - 在 Openshift 上部署 de route 时,来自 Apache Camel SFTP 组件的 chmod 选项不起作用
- arduino - “数据”之前的预期主表达式
- javascript - 当浏览器窗口向下或向上时如何在导航中删除或添加类
- elasticsearch - 弹性搜索查询不仅可以识别结果,还可以突出显示满足特定属性的项目
- scrapy - 部署到 Scrapy Cloud 时出现“错误:部署失败:b''”
- ios - UIImagePickerViewController 相机重载标签栏控制器(调用viewDidLoad)
- python - 使用 dropout (TF2.0) 时,可变批量大小不适用于 tf.keras.layers.RNN?
- visio - 雪花的 Visio 模具?
- javascript - React TypeScript 从 Click 事件中获取数据属性