首页 > 解决方案 > CPP Main 中的 CUDA 常量内存使用情况

问题描述

我想使用一个位于常量内存中的数组,该数组的内容在 cpp 的 main 函数中填充。当我用 cudaMemcpyToSymbol 把数据复制到符号、再用 cudaMemcpyFromSymbol 复制回来时,得到的全是 0;同样,如果在内核中读取这个常量内存数组,得到的结果也全是 0。

我究竟做错了什么?

(我尝试过“extern”,但所有尝试都以“无法解析的符号”(unresolved symbol)构建错误告终;我还尝试过把 dummy.h 改为 dummy.cuh。也许有人会告诉我该符号只在本地作用域内有定义,但仅凭这一点对我帮助不大 :))

这是编辑后的版本,我使用的是 VS2017 社区版和 CUDA 10.1:

//dummy.h
#include <cuda_runtime.h>
// Host-side launcher: enqueues dummyBackTransferKernel on *stream so that the
// first `size` elements of d_array receive the corresponding values of the
// constant-memory array `order`. The launch is asynchronous with respect to the
// host; the caller is responsible for synchronizing before reading d_array.
//   d_array : destination buffer (presumably device memory - confirm at call sites)
//   size    : number of elements to copy
//   stream  : pointer to the stream the kernel is launched on
void dummyBackTransferStream(float* d_array, int size, const cudaStream_t* stream);

//dummy.cpp
#include "dummy.h"
// Constant-memory array of 300 floats; dummyBackTransferKernel reads from it.
// NOTE(review): this symbol is defined only in this translation unit. main.cpp
// refers to `order` without any declaration, which matches the C2065
// "undeclared identifier" errors in the build log - confirm how the symbol is
// meant to be shared (host wrapper in this file, or extern + relocatable device code).
__constant__ float order[300];
// Returns x divided by y, rounded up to the next integer for the positive
// operands it is used with (element count and block size).
inline int idivCeil(int x, int y)
{
  // Bias the numerator by (y - 1) so truncating division rounds upward.
  const int numerator = x + y - 1;
  const int quotient = numerator / y;
  return quotient;
}

// Copies the leading elements of the constant-memory array `order` into d_array.
//
// Launch layout: 1-D grid of 1-D blocks. Each thread handles the single element
// at index blockIdx.x * blockDim.x + threadIdx.x, so the grid must provide at
// least `size` threads. No shared memory is used.
//
//   d_array : destination buffer holding at least `size` floats
//             (presumably device memory - confirm at call sites)
//   size    : number of elements requested; indices at or beyond the capacity
//             of `order` are left untouched instead of reading past the end
//             of the constant array
__global__ void dummyBackTransferKernel(float* d_array, int size)
{
  // Capacity is derived from the declaration of `order`, so it follows any resize.
  const int capacity = static_cast<int>(sizeof(order) / sizeof(order[0]));
  const int x = blockIdx.x * blockDim.x + threadIdx.x;

  // Tail guard for the grid, plus a bound against out-of-range reads of `order`.
  if (x < size && x < capacity)
  {
    d_array[x] = order[x];
    // For debugging use device-side printf here; std::cout is not available in device code.
  }
}

// Host-side launcher for dummyBackTransferKernel.
//
// Enqueues the kernel with 32-thread blocks and enough blocks to cover `size`
// elements. The launch is asynchronous: the caller must synchronize the stream
// before reading d_array, and should call cudaGetLastError() afterwards because
// this function returns void and cannot report a launch failure itself.
//
//   d_array : destination buffer passed through to the kernel; a null pointer
//             makes the call a no-op
//   size    : number of elements to copy; a non-positive value makes the call
//             a no-op (it would otherwise request a zero-sized grid, which is
//             an invalid launch configuration)
//   stream  : stream to launch on; a null pointer selects the default stream
void dummyBackTransferStream(float* d_array, int size, const cudaStream_t* stream)
{
  if (d_array == nullptr || size <= 0)
  {
    return;
  }

  // Fall back to the default stream rather than dereferencing a null pointer.
  cudaStream_t launchStream = 0;
  if (stream != nullptr)
  {
    launchStream = *stream;
  }

  const dim3 blockSize(32);
  const dim3 gridSize(idivCeil(size, blockSize.x));
  dummyBackTransferKernel<<<gridSize, blockSize, 0, launchStream>>>(d_array, size);
}
//main.cpp
#include "dummy.h"
#include "../Preprocessor.h"
#include <iostream>

#define TEST_SIZE 250

// Test driver for the constant-memory array `order`.
//
// Fills a pinned host buffer with random values, copies it into `order`, reads
// the symbol straight back into a second host buffer, runs the kernel that
// copies `order` into a device buffer, copies that result to the host, and
// prints the three host buffers side by side.
//
// Every CUDA call goes through CHK_CUDA, the project's error-check macro from
// Preprocessor.h (assumed to accept any cudaError_t expression - confirm).
int main(int argc, char** argv)
{
  CHK_CUDA(cudaSetDevice(0));
  cudaStream_t testStream;
  CHK_CUDA(cudaStreamCreate(&testStream));

  std::cout << "Const test" << std::endl;

  const size_t bufBytes = sizeof(float) * TEST_SIZE;

  float* c_buf = nullptr;  // Host array used as input
  CHK_CUDA(cudaMallocHost((void**)&c_buf, bufBytes));
  float* ct_buf = nullptr; // Host array holding the symbol after cudaMemcpyToSymbol + cudaMemcpyFromSymbol
  CHK_CUDA(cudaMallocHost((void**)&ct_buf, bufBytes));
  float* cd_buf = nullptr; // Device array holding the result of the kernel that reads constant memory
  CHK_CUDA(cudaMalloc((void**)&cd_buf, bufBytes));
  float* ch_buf = nullptr; // Kernel result copied back to the host
  CHK_CUDA(cudaMallocHost((void**)&ch_buf, bufBytes));

  for (int pp = 0; pp < TEST_SIZE; ++pp) {
    c_buf[pp] = (float)rand() / RAND_MAX;
  }

  // NOTE(review): `order` is not declared anywhere in main.cpp, which is what the
  // C2065 "undeclared identifier" errors in the build log point at. The symbol is
  // defined in the dummy translation unit; it has to be made visible here (or the
  // two symbol copies moved behind a host function in that unit) before this builds.
  CHK_CUDA(cudaMemcpyToSymbolAsync(order, c_buf, bufBytes, 0, cudaMemcpyHostToDevice, testStream));
  CHK_CUDA(cudaMemcpyFromSymbolAsync(ct_buf, order, bufBytes, 0, cudaMemcpyDeviceToHost, testStream));

  dummyBackTransferStream(cd_buf, TEST_SIZE, &testStream);
  // Kernel launches do not return an error code; pick up launch failures here.
  CHK_CUDA(cudaGetLastError());

  // Copy on the same stream as the kernel so the ordering does not depend on
  // default-stream synchronization semantics.
  CHK_CUDA(cudaMemcpyAsync(ch_buf, cd_buf, bufBytes, cudaMemcpyDeviceToHost, testStream));

  // All host reads below require the asynchronous work on testStream to be complete.
  CHK_CUDA(cudaStreamSynchronize(testStream));

  for (int pp = 0; pp < TEST_SIZE; ++pp) {
    std::cout << c_buf[pp] << "     " << ch_buf[pp] << "     " << ct_buf[pp] << std::endl;
  }

  // Release every buffer and the stream acquired above.
  CHK_CUDA(cudaFree(cd_buf));
  CHK_CUDA(cudaFreeHost(ch_buf));
  CHK_CUDA(cudaFreeHost(ct_buf));
  CHK_CUDA(cudaFreeHost(c_buf));
  CHK_CUDA(cudaStreamDestroy(testStream));

  std::cout << "done!" << std::endl;
  return 0;
}

结果

1>main.cpp(29): error C2065: 'order': undeclared identifier
1>main.cpp(30): error C2065: 'order': undeclared identifier
1>    0 Warning(s)
1>    2 Error(s)
1>
1>Time Elapsed 00:00:04.29
========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped ==========

这是cmake文件的相关部分:

# Extra nvcc options, appended after whatever CUDA_NVCC_FLAGS already holds.
# One -gencode pair per target architecture, in the same order as before.
foreach(_gpu_arch 70 62 61 60 50 35)
    list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_${_gpu_arch},code=sm_${_gpu_arch})
endforeach()

# Fast-math code generation and relocatable device code (separate compilation).
list(APPEND CUDA_NVCC_FLAGS
    -use_fast_math
    -rdc=true
    )

如能提供帮助,我将不胜感激。

标签: c++, arrays, visual-c++, cuda, c++14

解决方案


推荐阅读