首页 > 解决方案 > 在 Cuda 中升级数据

问题描述

实现将矩阵在水平和垂直方向上放大。例如,3 通道数据放大 4 倍,如下所示。

在此处输入图像描述

在此处输入图像描述

我的 cuda 代码正在为此工作。

#include <iostream>
#include <fstream>
#include "ResizeAreaKernel.h"
#define DEBUG
using namespace std;
// Intended: replicate each pixel of an interleaved h x w x c image into an
// upscale x upscale patch of the output (nearest-neighbor upscale).
//
// NOTE(review): this kernel's index math treats the loop variable `i` as a
// PIXEL index in [0, w*h) — see `i/w` (row) and `i*c` (pixel base offset)
// below — but the caller passes `total` = w*h*c (see ResizeAreaInference).
// The grid-stride loop therefore iterates over c times too many "pixels",
// and the writes through `output` run far past the intended
// h*upscale*w*upscale*c buffer. This is what forces the 10x over-allocation
// and causes the bus error when the buffer is sized correctly.
__global__ void ResizeAreaKernel(float *input, float *output, int upscale, int w, int h, int c, int total)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;//there are w*h number of threads
    int stride = blockDim.x * gridDim.x;
    // Grid-stride loop; correct only if `total` were the pixel count w*h.
    for (int i = tid; i < total; i += stride){
       // Offset that skips the (upscale-1) replicated rows already emitted
       // for the rows above this one.
       int rowadd= ((int)(i/w)*c*w*upscale*upscale)-((int)(i/w)*w*c*upscale);//(j*3*5*4*4) - (j*5*3*4)
       for(int y = 0; y < upscale; y++){
          // s..e spans the horizontal replication of pixel i in output row y.
          int s=i*c*upscale+rowadd;
          int e=s+upscale*c;
          for(int x = s; x < e; x=x+c){
             for(int c_ = 0; c_ < c; c_++){
                output[x+c_+y*c*w*upscale] = input[i*c+c_];
             } 
          }       
       }
    }
    return;
}


// Host wrapper: launches ResizeAreaKernel over the whole image.
//
// NOTE(review): N = w*h*c is passed to the kernel as `total`, but the kernel
// uses its loop index as a per-PIXEL index, so `total` should be w*h. With
// w*h*c the kernel processes c times too many pixels and writes out of
// bounds — the root cause of the "needs 10x output buffer / bus error"
// problem described above. Also: no cudaGetLastError() after the launch, so
// launch failures are silently dropped.
int ResizeAreaInference(float *input, float *output, int upscale, int w, int h, int c)
{  
    int N = w*h*c;    
    const int THREADS_PER_BLOCK = 256;
    const int NUMBLOCKS = (int)((float)(N+THREADS_PER_BLOCK-1)/THREADS_PER_BLOCK); 
    ResizeAreaKernel<<<NUMBLOCKS, THREADS_PER_BLOCK>>>(input, output, upscale, w, h, c, N);
    cudaDeviceSynchronize();
    return 0;
}

#ifdef DEBUG
// Dump `size` floats starting at `ptr` into a text file `name`,
// comma-separated, inserting a newline before every `stride`-th value
// (so each output line holds one image row when stride = width*channels).
void printdata(float *ptr, int size, const char* name, int stride)
{
   ofstream myfile;
   myfile.open (name);
   for(int i=0; i < size; i++){
      if(i % stride == 0 && i!=0)
         myfile << "\n";
      myfile << *(ptr+i) << ",";


   }
   myfile.close();
   return;
}

// Test driver: fills a 3x4 3-channel image with random values, upscales it
// 4x, and dumps input/output to text files for inspection.
int main(void)
{
   int w = 4;
   int h = 3;
   int c = 3;
   int upscale = 4;
   float *in, *out;

  // Allocate Unified Memory – accessible from CPU or GPU
  cudaMallocManaged(&in, h*w*c*sizeof(float));
  // NOTE(review): the 10x factor is a workaround, not a requirement — it
  // only masks the kernel's out-of-bounds writes (see ResizeAreaKernel).
  // The correct size is h*upscale*w*upscale*c floats.
  cudaMallocManaged(&out, 10*h*upscale*w*upscale*c*sizeof(float));

  // initialize x and y arrays on the host
  // NOTE(review): random values make index bugs hard to spot in the dumps;
  // a known per-pixel pattern would be easier to verify.
  for (int i = 0; i < h*w*c; i++) {
    in[i] = rand() % 100;
  }
  printdata(in, w*h*c, "input.txt",w*c);
  ResizeAreaInference(in, out, upscale, w, h, c);

  // Dump the upscaled result (only the first h*upscale*w*upscale*c floats
  // of the over-sized buffer are meaningful output).
  printdata(out, w*upscale*h*upscale*c, "output.txt", w*upscale*c);

  // Free memory
  cudaFree(in);
  cudaFree(out);

  return 0;
}

#endif

但我需要在内存中为输出缓冲区占用更多空间(现在是 10 倍)

cudaMallocManaged(&out, 10*h*upscale*w*upscale*c*sizeof(float));

输出实际只需要 h*upscale*w*upscale*c*sizeof(float) 这么多内存,但如果我不多分配这些额外空间,就会得到

Bus error (core dumped)

可能是什么问题?

标签: cuda

解决方案


坦率地说,反复看你发布这段代码的新版本(见 这里 和 这里)有点乏味:它们在内核代码中都存在相同或相关的索引问题,却被宣布为“现在能用”或“不能用”。

所以为了让你摆脱痛苦,我会这样做:

#include <iostream>
#include <fstream>

using namespace std;

// Nearest-neighbor ("area") upscale kernel.
// Each thread owns one source pixel (row, col) of an interleaved h x w x c
// image and replicates its c channels into the matching upscale x upscale
// patch of the (h*upscale) x (w*upscale) x c output.
// Launch with a 2D grid covering at least h x w threads; the channel count
// is a template parameter so the per-pixel register array can be unrolled.
template<int c>
__global__ void ResizeAreaKernel(float *input, float *output, int upscale, int w, int h)
{
    const int row = threadIdx.x + blockIdx.x * blockDim.x;  // source row
    const int col = threadIdx.y + blockIdx.y * blockDim.y;  // source column

    // Guard the grid tail: blocks rarely divide h and w evenly.
    if (row >= h || col >= w)
        return;

    const int inPitch  = w * c;             // floats per input row
    const int outPitch = inPitch * upscale; // floats per output row

    // Read the pixel's channels into registers once; they are reused
    // upscale*upscale times below.
    const float *src = input + row * inPitch + col * c;
    float pixel[c];
#pragma unroll
    for (int ch = 0; ch < c; ch++)
        pixel[ch] = src[ch];

    // Write the replicated patch, one destination row at a time.
    for (int dy = 0; dy < upscale; dy++) {
        float *dstRow = output + (row * upscale + dy) * outPitch
                               + col * upscale * c;
        for (int dx = 0; dx < upscale; dx++) {
#pragma unroll
            for (int ch = 0; ch < c; ch++)
                dstRow[dx * c + ch] = pixel[ch];
        }
    }
}


// Host wrapper: launches the templated upscale kernel with a 2D grid
// (x covers rows, y covers columns, matching the kernel's indexing).
//
// Parameters: device pointers `input` (h*w*c floats) and `output`
// (h*upscale*w*upscale*c floats), the integer `upscale` factor, image
// dimensions w/h, and channel count c (must be 1, 3 or 4).
// Returns 0 on success, otherwise a cudaError_t value (or
// cudaErrorInvalidValue for an unsupported channel count).
int ResizeAreaInference(float *input, float *output, int upscale, int w, int h, int c)
{  
    dim3 bdim(16,16);
    // Ceil-division so the grid covers every pixel even when h and w are
    // not multiples of the block dimensions.
    int gx = ((h + bdim.x - 1) / bdim.x); 
    int gy = ((w + bdim.y - 1) / bdim.y); 
    dim3 gdim(gx,gy);

    switch(c) {

        case 1:
            ResizeAreaKernel<1><<<gdim, bdim>>>(input, output, upscale, w, h);
            break;

        case 3:
            ResizeAreaKernel<3><<<gdim, bdim>>>(input, output, upscale, w, h);
            break;

        case 4:
            ResizeAreaKernel<4><<<gdim, bdim>>>(input, output, upscale, w, h);
            break;

        default:
            // Previously an unsupported c silently launched nothing.
            std::cerr << "ResizeAreaInference: unsupported channel count "
                      << c << std::endl;
            return (int)cudaErrorInvalidValue;
    }

    // Kernel launches do not return errors directly: pick up
    // launch-configuration errors first, then asynchronous execution errors.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        std::cerr << "ResizeAreaInference: " << cudaGetErrorString(err)
                  << std::endl;
    return (int)err;
}

// Write `size` floats from `ptr` to the text file `name`, comma-separated,
// starting a new line before every `stride`-th value (one image row per
// line when stride = width*channels).
void printdata(float *ptr, int size, const char* name, int stride)
{
    std::ofstream out(name);
    for (int i = 0; i < size; i++) {
        if (i != 0 && i % stride == 0)
            out << "\n";
        out << ptr[i] << ",";
    }
    out.close();
}

// Test driver: upscale a 41x31 3-channel image by 4x and dump the input and
// output buffers to text files for visual verification.
int main(void)
{
    const int w = 41;
    const int h = 31;
    const int c = 3;
    const int upscale = 4;
    float *in = NULL;
    float *out = NULL;

    // Unified memory lets both the host init loop and the kernel touch the
    // buffers without explicit copies.
    cudaMallocManaged(&in, h * w * c * sizeof(float));
    cudaMallocManaged(&out, h * upscale * w * upscale * c * sizeof(float));

    // Fill every channel of pixel p with the value p+1 so the replication
    // pattern is easy to eyeball in the dump files.
    for (int p = 0; p < h * w; p++) {
        for (int ch = 0; ch < c; ch++) {
            in[p * c + ch] = p + 1.0f;
        }
    }
    printdata(in, w * h * c, "input.txt", w * c);

    ResizeAreaInference(in, out, upscale, w, h, c);
    printdata(out, w * upscale * h * upscale * c, "output.txt", w * upscale * c);

    // Release unified-memory buffers.
    cudaFree(in);
    cudaFree(out);

    return 0;
}

[警告 - 代码测试非常简单,使用风险自负]

此代码接受这样的输入(对于 41 x 31,三通道情况):

$ nvcc -std=c++11 -arch=sm_52 -o batu4 batu4.cu

$ cuda-memcheck ./batu4
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors

在此处输入图像描述

并发出:

在此处输入图像描述

几点:

  1. 对这种内核使用二维网格更有意义。代码更简单更容易理解,性能也可能会更好
  2. 通道数可能只会是每像素 1,3 或 4 个浮点数,因此将其设为常数而不是内核参数是有意义的。编译器可以(并且确实)做了很多优化,当可以将其声明为常量时,这些优化将提高性能。C++ 模板是一种有用的方法
  3. 您选择使用随机数作为像素值使得调试比需要的困难得多。通过将每个输入值的每个通道设置为已知值,就可以查看内核的输出并立即了解索引方案是如何失败的。
  4. 同样,先只用一个线程运行代码,检查输出以确认索引方案是否正确,再用 cuda-memcheck 查看越界读写发生在哪里,这样做既简单又能提供大量信息。
  5. 您的问题的解决方案只需要笔和纸和一些简单的整数运算。一旦你理解了问题的数学原理,编写代码就变得不言而喻了。下次试试。

推荐阅读