首页 > 解决方案 > 在 CUDA 中转换“for循环”的问题

问题描述

我试图通过像素移位/重叠从图像中并行提取补丁。我已经编写了代码的 CPU 版本。但是我无法转换具有像素移位增量的 for 循环。我已经给出了使用 for 循环的代码部分。 CreatePatchDataSet函数有一个“for loop”,它有一个像素移位的增量。请帮我把这个函数转换成 Cuda。我提供了以下代码。

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <iostream>
#include <fstream>
#include <sstream>
#include <random>
#include <vector>
#include <omp.h>


using namespace std;
using namespace cv;

#define PATCH_SIZE (5)                          
#define PIXEL_SHIFT (2)                         

void ConvertMat2DoubleArray(cv::Mat input, double* output)
{

    for (int i = 0; i < input.rows; i++)
    {
        double *src = input.ptr<double>(i);
        for (int j = 0; j < input.cols; j++)
        {
            output[input.cols * input.channels() * i + input.channels() * j + 0] = src[j];
        }
    }

}

void GetNumOfPatch(const int width, const int height, const int patch_size, const int pixel_shift, int* num_of_patch, int* num_of_patch_col, int* num_of_patch_row) {

    *num_of_patch_col = 0;
    int len_nb = 0;
    while (len_nb < width) {
        if (len_nb != 0) {
            len_nb += patch_size - (patch_size - pixel_shift);
        }
        else {
            len_nb += patch_size;
        }
        (*num_of_patch_col)++;
    }

    len_nb = 0;
    *num_of_patch_row = 0;
    while (len_nb < height) {
        if (len_nb != 0) {
            len_nb += patch_size - (patch_size - pixel_shift);
        }
        else {
            len_nb += patch_size;
        }
        (*num_of_patch_row)++;
    }
    *num_of_patch = (*num_of_patch_col) * (*num_of_patch_row);
}



void CreatePatchDataSet(double *original_data, double* patch_data, const int width, const int height, const int pixel_shift, const int patch_size, const int num_of_patch_col, const int num_of_patch_row) {
    int counter_row = 0;
    int num_of_patch_image = num_of_patch_row * num_of_patch_col;
 
 
    for (int i = 0; i < height; i += pixel_shift) {
        int counter_col = 0;
        for (int j = 0; j < width; j += pixel_shift) {
            //Get Low Resolution Image
            for (int ii = 0; ii < patch_size; ii++) {
                for (int jj = 0; jj < patch_size; jj++) {
                    if ((i + ii) < height && (j + jj) < width) {
                        patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = original_data[width*(i + ii) + (j + jj)];
                    }
                    else {
                        patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = 0.;
                    }
                }
            }
            counter_col++;
            if (counter_col == num_of_patch_col) {
                break;
            }
        }
        counter_row++;
        if (counter_row == num_of_patch_row) {
            break;
        }
    }
}











int main()
{
  int ratio=2;
  cv::Mat image = cv::imread("input_b2_128.tif", CV_LOAD_IMAGE_UNCHANGED);
 
    
  cv::Mat imageH = cv::Mat(image.rows * ratio, image.cols * ratio, CV_8UC1);
  cv::resize(image, imageH, cv::Size(imageH.cols, imageH.rows), 0, 0, 
  cv::INTER_LANCZOS4);
  
 
 
  double* orgimageH = (double*)calloc(imageH.cols*imageH.rows*image.channels(), sizeof(double));
    ConvertMat2DoubleArray(imageH, orgimageH);
 
  int widthH = imageH.cols;
    int heightH = imageH.rows;

    
 
    int dimH = (int)PATCH_SIZE *  (int)PATCH_SIZE* (int)image.channels();
    int dimL = (int)PATCH_SIZE/ratio*  (int)PATCH_SIZE/ratio  * (int)image.channels();
 
 
 //3. Create training data set=========================
    int num_of_patch_image = 0;
    int num_of_patch_col = 0;
    int num_of_patch_row = 0;
 
    GetNumOfPatch(widthH, heightH, (int)PATCH_SIZE, (int)PIXEL_SHIFT, &num_of_patch_image, &num_of_patch_col, &num_of_patch_row);
  cout<<"patch numbers: \n " << num_of_patch_image << endl;
  
    double* FY = (double*)calloc(dimH  * num_of_patch_image, sizeof(double));
 
  CreatePatchDataSet(orgimageH, FY, widthH, heightH, (int)PIXEL_SHIFT,  (int)PATCH_SIZE, num_of_patch_col, num_of_patch_row);
    free(orgimageH);
    free(FY);



return 0;
}

我在 CPU 版本中获得的前 10 个值的结果:补丁号:16129 238,240,240,235,237,230,227,229,228,227

我尝试使用 cuda: 将此函数转换为内核函数。但它进入了无限循环。由于我对这个 CUDA 领域非常陌生,请您帮我找出代码中的问题吗?

__global__ void CreatePatchDataSet(double *original_data, double* patch_data, const int width, const int height, const int pixel_shift, const int patch_size, const int num_of_patch_col, const int num_of_patch_row) {


    int num_of_patch_image = num_of_patch_row * num_of_patch_col;
    int i = threadIdx.x + (blockDim.x*blockIdx.x);
  int j = threadIdx.y + (blockDim.y*blockIdx.y);
    while (i<height && j< width)
    {
     int counter_row = 0;
     int counter_col = 0;
            //Get Low Resolution Image
            for (int ii = 0; ii < patch_size; ii++) {
                for (int jj = 0; jj < patch_size; jj++) {
                    if ((i + ii) < height && (j + jj) < width) {

                        patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = original_data[width*(i + ii) + (j + jj)];

                    }
                    else {
                        patch_data[num_of_patch_image * (patch_size * ii + jj) + num_of_patch_col*counter_row + counter_col] = 0.;
                    }

                }
            }

            counter_col++;
            if (counter_col == num_of_patch_col) {
                break;
            }

        
        counter_row++;
        if (counter_row == num_of_patch_row) {
            break;
        }
}
 i+= blockDim.x*gridDim.x;
 j+= blockDim.y*gridDim.y;

}


int main()
{
  int ratio=2;
  cv::Mat image = cv::imread("input_b2_128.tif", CV_LOAD_IMAGE_UNCHANGED);
 
    
  cv::Mat imageH = cv::Mat(image.rows * ratio, image.cols * ratio, CV_8UC1);
  cv::resize(image, imageH, cv::Size(imageH.cols, imageH.rows), 0, 0, cv::INTER_LANCZOS4);
  
  
 
 
  double *orgimageH = (double*)calloc(imageH.cols*imageH.rows*image.channels(), sizeof(double));
    ConvertMat2DoubleArray(imageH, orgimageH);
 
  int widthH = imageH.cols;
    int heightH = imageH.rows;
//  

    int dimH = (int)PATCH_SIZE *  (int)PATCH_SIZE* (int)image.channels();
    int dimL = (int)PATCH_SIZE/ratio*  (int)PATCH_SIZE/ratio  * (int)image.channels();
 
 
 //3. Create training data set=========================
    int num_of_patch_image = 0;
    int num_of_patch_col = 0;
    int num_of_patch_row = 0;
 
    GetNumOfPatch(widthH, heightH, (int)PATCH_SIZE, (int)PIXEL_SHIFT, &num_of_patch_image, &num_of_patch_col, &num_of_patch_row);
  cout<<"patch numbers: \n " << num_of_patch_image << endl;
  
    double* FY = (double*)calloc(dimH  * num_of_patch_image, sizeof(double));
 
 
  double *d_orgimageH;
  gpuErrchk(cudaMalloc ((void**)&d_orgimageH, sizeof(double)*widthH*heightH));

  double *d_FY;
  gpuErrchk(cudaMalloc ((void**)&d_FY, sizeof(double)* dimH  * num_of_patch_image));
  
  
  gpuErrchk(cudaMemcpy(d_orgimageH , orgimageH ,  sizeof(double)*widthH*heightH, cudaMemcpyHostToDevice));
  
  
  dim3 dimBlock(16, 16);
  dim3 dimGrid;
  dimGrid.x = (widthH + dimBlock.x - 1) / dimBlock.x;
  dimGrid.y = (heightH + dimBlock.y - 1) / dimBlock.y;
  CreatePatchDataSet<<<dimGrid,dimBlock>>>(d_orgimageH, d_FY, widthH, heightH, (int)PIXEL_SHIFT,  (int)PATCH_SIZE, num_of_patch_col, num_of_patch_row);
  
   gpuErrchk(cudaMemcpy(FY,d_FY, sizeof(double)*dimH  * num_of_patch_image, cudaMemcpyDeviceToHost));
//  cout<<"Hello world";
    free(orgimageH);
    free(FY);

 cudaFree(d_FY);
 cudaFree(d_orgimageH);

return 0;
}

我使用过的图片:[1]:https ://i.stack.imgur.com/Ywg7p.png

标签: c++cudagpu

解决方案


i+= blockDim.x*gridDim.x;
j+= blockDim.y*gridDim.y;

位于内核的 while 循环之外。因为i并且j永远不会在 while 循环内改变,所以它不会停止。这里可能还有更多问题,但这是最突出的问题。

编辑:我发现的另一个问题是,您在两者上都只有一个时间,ij不是每个时间都有一个。您可能应该在 CPU 代码中使用 for 循环:

for (i = pixel_shift * (threadIdx.x + (blockDim.x*blockIdx.x)); 
    i < height;
    i += pixel_shift * blockDim.x * gridDim.x) {
  for (j = ...; j < ...; j += ...) {
    /* ... */
  }
}

编辑2:我可以想象这是一个好主意:

for (counter_row = threadIdx.y + blockDim.y * blockIdx.y; 
     counter_row < num_of_patch_row;
     counter_row += blockDim.y * gridDim.y) {
  i = counter_row * pixel_shift;
  if (i > height)
    break;
  for (counter_col = threadIdx.x + blockDim.x * blockIdx.x; 
       counter_col < num_of_patch_col; 
       counter_col += blockDim.x * gridDim.x) {
    j = counter_col * pixel_shift;
    if (j > width)
      break;
    /* ... */
  }
}

我还在内部和外部循环之间交换了执行参数的 x/y 字段,因为考虑到 x 字段在 warp 中是连续的(内存访问优势),这似乎更合适。


推荐阅读