c - CUDA + 使用 C 计数 int 元素出现次数
问题描述
在主机端,我正在读取一个 128 x 128 整数数组,随机值在 0-31 之间。我有一个存储值 0-31 的 Occurrences 数组,然后在设备上我尝试执行一个内核,该内核循环遍历 128 x 128 数组中的值,然后计算 0-31 出现的次数。
我在如何拆分 CUDA 中的块/线程以及如何让内核向主机提供通信并打印出每个元素的出现次数方面遇到问题。这是我第一次使用 CUDA,我将不胜感激任何建设性的建议!到目前为止,这是我的代码:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#define MAXR 16
#define MAXC 16
#define N 256
__global__ void count(int *arrayONE_d, int *occurrences_d, int *occurrences_final_d) {
int count = 0;
//provide unique thread ID
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int k;
//for(k=0; k < 32;k++) {
// occurrences_d[k]=k;
// }
if(idx < N) {
//for(k=0; k < MAXR*MAXC; k++) {
for(int j=0; j<32; j++) {
count =0;
if(arrayONE_d[idx]==occurrences_d[j]){
count+=1;
occurrences_final_d[j] =count;
}
else {}
}
}
//occurrences_final_d[0] = 77;
}
}
int main(void) {
//const int N = MAXR*MAXC;
int arr1_h[MAXR][MAXC];
//int *occurrences_h[0][32];
//creating arrays for the device (GPU)
//int *arr1_d;
int occurrences_h[32];
int *occurrences_d;
int *occurrences_final_h[32] = {0};
int *occurrences_final_d;
int *arrayONE_h[256] = {0};
int *arrayONE_d;
int i, j;
// allocating memory for the arrays on the device
cudaMalloc( (void**) &arrayONE_d, MAXR*MAXC*sizeof(int)); // change to 16384 when using 128x128
cudaMalloc( (void**) &occurrences_d, 32* sizeof(int));
cudaMalloc( (void**) &occurrences_final_d, 32*sizeof(int));
/*
for(i=0; i < 32; i++) {
occurrences_h[i] = i;
}
/*
*
*/
//Reading in matrix from .txt file and storing it in arr1 on the host (CPU)
FILE *fp;
fp =fopen("arrays16.txt","r");
// this loop takes the information from .txt file and puts it into arr1 matrix
for(i=0;i<MAXR;i++) {
for(j=0;j<MAXC;j++)
{
fscanf(fp,"%d\t", &arr1_h[i][j]);
}
}
for(i=0;i<MAXR;i++) {
printf("\n");
for(j=0;j<MAXC;j++) {
//printf("d\t", arr1_h[i][j]);
}
printf("\n\n");
}
int x,y;
int z=0;
// this loop flattens the 2d array and makes it a 1d array of length MAXR*MAXC
for(x=0;x<MAXR;x++)
{
for(y=0;y<MAXC;y++)
{
// printf("**%d ",arr1_h[x][y]);
arrayONE_h[z]= &arr1_h[x][y];
z++;
}
}
for(x=0; x < 256; x++) {
printf("%d\n", *arrayONE_h[x]);
//return 0;
}
int length = sizeof(arrayONE_h)/sizeof(arrayONE_h[0]);
printf("\n\n");
printf("**LENGTH = %d", length);
// copying the arrays/memory from the host to the device (GPU)
cudaMemcpy(arrayONE_d, &arrayONE_h, MAXR*MAXC*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(occurrences_d, &occurrences_h, 32*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(occurrences_final_d, &occurrences_final_h, 32*sizeof(int), cudaMemcpyHostToDevice);
// how many blocks we will allocate
//dim3 DimGrid();
//how many threads per block we will allocate
dim3 DimBlock(256);
//kernel launch against the GPU
count<<<1, DimBlock>>>(arrayONE_d,occurrences_d,occurrences_final_d);
//copy the arrays post-computation from the device back to the host (CPU)
cudaMemcpy(&occurrences_final_h, occurrences_final_d, 32*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&occurrences_h, occurrences_d, 32*sizeof(int), cudaMemcpyDeviceToHost);
// some error checking - run this with cuda-memcheck when executing your code
cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();
if (errSync != cudaSuccess)
printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess)
printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
//free up the memory of the device arrays
cudaFree(arrayONE_d);
cudaFree(occurrences_d);
cudaFree(occurrences_final_d);
//print out the number of occurrences of each 0-31 value
for(i=0;i<32;i++) {
printf("\n");
printf("%d\n",occurrences_final_h[i]);
}
}
解决方案
正如我在评论中提到的,您对指针的理解是有缺陷的。我已经在您的代码中的许多地方进行了更改以解决此问题。我已经用评论标记了其中的大部分,// mod
但我可能错过了一些。
此外,当多个线程可以更新同一位置时,您的内核根本无法跟踪元素。解决这个问题的一种方法是使用原子(我已经演示过)。还有各种其他方法,例如并行缩减,但这些都不是对内核的微不足道的更改。此外,您的内核逻辑在一些方面被破坏了。
接下来是我可以对您的代码进行的最小数量的修改以获得合理的东西。您可以使用一些编译开关来探索不同的内核行为:
- 没有开关 - 靠近你的内核,但它不能正常工作
-DUSE_ATOMICS
将演示对内核的修改以使其正确计数。-DUSE_ALT_KERNEL
探索了一种不同的内核逻辑方法:为每个直方图 bin 分配一个线程,并让每个线程遍历整个数组,跟踪属于该 bin 的元素。由于只有一个线程正在写入每个 bin 结果,因此不需要原子。但是,我们只能拥有与 bin 一样多的线程(通过这个简单的实现)。没有太多困难,这种方法可能可以扩展到每个 bin 一个 warp,在一个线程将最终结果写入 bin 之前,使用 warp shuffle 进行最终的 warp 级别减少。这将在一定程度上提高内存访问效率。但是,这也会将复杂性引入您可能尚未了解的内核。
这是代码:
$ cat t316.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#define MAXR 16
#define MAXC 16
#define BINS 32
#define N (MAXR*MAXC)
__global__ void count(int *arrayONE_d, int *occurrences_d, int *occurrences_final_d) {
//provide unique thread ID
int idx = threadIdx.x + blockIdx.x * blockDim.x;
#ifndef USE_ALT_KERNEL
if(idx < N) {
//for(k=0; k < MAXR*MAXC; k++) {
for(int j=0; j<32; j++) {
if(arrayONE_d[idx]==occurrences_d[j]){
#ifndef USE_ATOMICS
occurrences_final_d[j]++;
#else
atomicAdd(occurrences_final_d+j, 1);
#endif
}
else {}
}
}
#else
// use one thread per histo bin
if (idx < BINS){
int count = 0;
int myval = occurrences_d[idx];
for (int i = 0; i < N; i++) if (arrayONE_d[i] == myval) count++;
occurrences_final_d[idx] = count;
}
#endif
}
int main(void) {
//const int N = MAXR*MAXC;
int arr1_h[MAXR][MAXC];
//int *occurrences_h[0][32];
//creating arrays for the device (GPU)
//int *arr1_d;
int occurrences_h[32]; // mod
int *occurrences_d;
int occurrences_final_h[32] = {0}; // mod
int *occurrences_final_d;
int arrayONE_h[256] = {0}; // mod
int *arrayONE_d;
int i, j;
// allocating memory for the arrays on the device
cudaMalloc( (void**) &arrayONE_d, MAXR*MAXC*sizeof(int)); // change to 16384 when using 128x128
cudaMalloc( (void**) &occurrences_d, 32* sizeof(int));
cudaMalloc( (void**) &occurrences_final_d, 32*sizeof(int));
/*
for(i=0; i < 32; i++) {
occurrences_h[i] = i;
}
*/
//Reading in matrix from .txt file and storing it in arr1 on the host (CPU)
// FILE *fp;
// fp =fopen("arrays16.txt","r");
// this loop takes the information from .txt file and puts it into arr1 matrix
for(i=0;i<MAXR;i++) {
for(j=0;j<MAXC;j++)
{
// fscanf(fp,"%d\t", &arr1_h[i][j]);
arr1_h[i][j] = j; // mod
}
}
for(i=0;i<MAXR;i++) {
for(j=0;j<MAXC;j++) {
//printf("d\t", arr1_h[i][j]);
}
}
int x,y;
int z=0;
// this loop flattens the 2d array and makes it a 1d array of length MAXR*MAXC
for(x=0;x<MAXR;x++)
{
for(y=0;y<MAXC;y++)
{
// printf("**%d ",arr1_h[x][y]);
arrayONE_h[z]= arr1_h[x][y]; // mod
z++;
}
}
for(x=0; x < 256; x++) {
// printf("%d\n", arrayONE_h[x]); // mod
//return 0;
}
int length = sizeof(arrayONE_h)/sizeof(arrayONE_h[0]);
printf("**LENGTH = %d\n", length);
// copying the arrays/memory from the host to the device (GPU)
cudaMemcpy(arrayONE_d, arrayONE_h, MAXR*MAXC*sizeof(int), cudaMemcpyHostToDevice); //mod
cudaMemcpy(occurrences_d, occurrences_h, 32*sizeof(int), cudaMemcpyHostToDevice); // mod
cudaMemcpy(occurrences_final_d, occurrences_final_h, 32*sizeof(int), cudaMemcpyHostToDevice); // mod
// how many blocks we will allocate
//dim3 DimGrid();
//how many threads per block we will allocate
#ifndef USE_ALT_KERNEL
dim3 DimBlock(N);
#else
dim3 DimBlock(BINS);
#endif
//kernel launch against the GPU
count<<<1, DimBlock>>>(arrayONE_d,occurrences_d,occurrences_final_d);
//copy the arrays post-computation from the device back to the host (CPU)
cudaMemcpy(occurrences_final_h, occurrences_final_d, 32*sizeof(int), cudaMemcpyDeviceToHost); // mod
cudaMemcpy(occurrences_h, occurrences_d, 32*sizeof(int), cudaMemcpyDeviceToHost); // mod
// some error checking - run this with cuda-memcheck when executing your code
cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();
if (errSync != cudaSuccess)
printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess)
printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
//free up the memory of the device arrays
cudaFree(arrayONE_d);
cudaFree(occurrences_d);
cudaFree(occurrences_final_d);
//print out the number of occurrences of each 0-31 value
for(i=0;i<32;i++) {
printf("%d ",occurrences_final_h[i]);
}
printf("\n");
}
$ nvcc -o t316 t316.cu
$ cuda-memcheck ./t316
========= CUDA-MEMCHECK
**LENGTH = 256
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
========= ERROR SUMMARY: 0 errors
$ nvcc -o t316 t316.cu -DUSE_ATOMICS
$ ./t316
**LENGTH = 256
16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
$ nvcc -o t316 t316.cu -DUSE_ALT_KERNEL
$ cuda-memcheck ./t316
========= CUDA-MEMCHECK
**LENGTH = 256
16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
========= ERROR SUMMARY: 0 errors
$
在上面的输出中,我们看到基本内核产生了不正确的结果。原子内核和备用内核产生正确的结果
(您的代码已被修改为使用合成数据,因此它不需要打开文件。)
推荐阅读
- sql - Postgres jsonb_build_object 为空行返回空而不是空
- swift - 创建自定义框架和 xcframework 时出错
- python - 多元回归,用多个自变量重塑输入
- docker - 来自守护进程的 Docker 错误响应:OCI 运行时创建失败 container_linux.go:380:导致启动容器进程
- javascript - 打字稿:如何创建具有 2 项的元组,其中第 2 项使用第 1 项的值作为键?
- syntax - 使用 Roslyn SyntaxGeneration 生成带有声明的模式匹配
- java - 使用相机模块 NFC 读/写
- mysql - 如果同一用户在一小时内重复访问,则使用 MySQL 时,只有第一个被认为是有效的
- python-3.x - 如何制作拼写错误信息?
- javascript - Bootstrap 5 下拉菜单和导航:无法读取 null 的属性“孩子”