cuda - 如何像 python 一样编写 cuda c argmax 内核并让它解码 one-hot 矩阵?
问题描述
就像标题建议的那样。
这是代码。
其中 show_data() 和 get_data() 是显示和导入数据的函数。
我不知道如何列出一个主要列的单热矩阵(它可以适合 cublas 函数),并像 python 的 argmax 函数一样对其进行解码。
我知道我一定弄乱了 argmax 内核中的索引,不知道如何修复它。
数据是这样的:
[[1。0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0。1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0。0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0。0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0。0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0。0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[0。0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[0。0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0。0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0。0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
输出应该是这样的:
[1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10]
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
__global__ void argmax_kernel(float *data, float *dataout, int ld, int sd) {
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
float temp = 0.0f;
int i = 0;
int j = 0;
for (i = 0; i < ld; i++) {
for (j = (tid + i*sd*stride); j < sd; j+= stride) {
if (j = (tid + i*sd*stride)) {
temp = data[j];
dataout[i*sd*stride] = 1.0f;
}
else if (temp <= data[j]) {
temp = data[j];
dataout[i*sd*stride] += 1.0f;
}
}
}
}
void argmax(float *data, float *dataout, int ld, int sd) {
int i = 0;
int j = 0;
float temp = 0.0f;
for (i = 0; i < ld; i++) {
for (j = 0; j < sd; j++) {
if (j == 0) {
temp = data[IDX2C(i, j, ld)];
dataout[i] = 1.0f;
}
else if (temp <= data[IDX2C(i, j, ld)]) {
temp = data[IDX2C(i, j, ld)];
dataout[i] = dataout[i] + 1.0f;
}
}
}
}
int main() {
float *y_in, *y_out;
float *d_yin, *d_yout;
int rols_y = 20;
int cols_y = 10;
int size_y = rols_y*cols_y;
char y_file_name[100] = "argmax.csv";
y_in = (float*)malloc(size_y * sizeof(float));
y_out = (float*)malloc(rols_y * sizeof(float));
cudaMalloc((void**)&d_yin, size_y * sizeof(*y_in));
cudaMalloc((void**)&d_yout, rols_y * sizeof(*y_out));
get_ydata(y_in, y_file_name);
show_data(y_in, rols_y, cols_y);
cudaMemcpy(d_yin, y_in, size_y * sizeof(float), cudaMemcpyHostToDevice);
argmax(y_in, y_out, rols_y, cols_y);
show_data(y_out, rols_y, 1);
dim3 dimBlock(256);
int threadBlocks = (size_y + (dimBlock.x - 1)) / dimBlock.x;
if (threadBlocks > 65520) threadBlocks = 65520;
dim3 dimGrid(threadBlocks);
argmax_kernel<<<dimGrid, dimBlock>>>(d_yin, d_yout, rols_y, cols_y);
cudaFree(d_yin);
cudaFree(d_yout);
free(y_in);
free(y_out);
system("pause");
return EXIT_SUCCESS;
}
解决方案
You have outright errors in your kernel code such as this:
if (j = (tid + i*sd*stride))
but I wouldn't be able to explain what is going on more generally with your kernel code. None of it makes sense to me. I think the simplest thing to do is just to duplicate your host code in your kernel, converting only the outer loop in ld
to be a grid-stride loop, which seems to be your desire. Here is an example:
$ cat t23.cu
#include <iostream>
#include <cublas_v2.h>
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
__global__ void argmax_kernel(float *data, float *dataout, int ld, int sd) {
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
float temp = 0.0f;
int i = 0;
int j = 0;
for (i = tid; i < ld; i+= stride) {
for (j = 0; j < sd; j++) {
if (j == 0) {
temp = data[IDX2C(i,j, ld)];
dataout[i] = 1.0f;
}
else if (temp <= data[IDX2C(i, j, ld)]) {
temp = data[IDX2C(i, j, ld)];
dataout[i] += 1.0f;
}
}
}
}
void argmax(float *data, float *dataout, int ld, int sd) {
int i = 0;
int j = 0;
float temp = 0.0f;
for (i = 0; i < ld; i++) {
for (j = 0; j < sd; j++) {
if (j == 0) {
temp = data[IDX2C(i, j, ld)];
dataout[i] = 1.0f;
}
else if (temp <= data[IDX2C(i, j, ld)]) {
temp = data[IDX2C(i, j, ld)];
dataout[i] = dataout[i] + 1.0f;
}
}
}
}
int main() {
float *y_in, *y_out;
float *d_yin, *d_yout;
int rows_y = 20;
int cols_y = 10;
int size_y = rows_y*cols_y;
float d_in[cols_y][rows_y] = {
{1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
{0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.},
{0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.},
{0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.},
{0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.},
{0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.},
{0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.},
{0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.},
{0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.},
{0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.}};
y_in = (float*)malloc(size_y * sizeof(float));
y_out = (float*)malloc(rows_y * sizeof(float));
//transpose
for (int r = 0; r < cols_y; r++)
for (int c = 0; c < rows_y; c++)
y_in[IDX2C(c,r,rows_y)] = d_in[r][c];
cudaMalloc((void**)&d_yin, size_y * sizeof(*y_in));
cudaMalloc((void**)&d_yout, rows_y * sizeof(*y_out));
cudaMemcpy(d_yin, y_in, size_y * sizeof(float), cudaMemcpyHostToDevice);
argmax(y_in, y_out, rows_y, cols_y);
for (int i = 0; i < rows_y; i++)
std::cout << y_out[i] << " ";
std::cout << std::endl;
dim3 dimBlock(256);
int threadBlocks = (size_y + (dimBlock.x - 1)) / dimBlock.x;
if (threadBlocks > 65520) threadBlocks = 65520;
dim3 dimGrid(threadBlocks);
argmax_kernel<<<dimGrid, dimBlock>>>(d_yin, d_yout, rows_y, cols_y);
cudaMemcpy(y_out, d_yout, rows_y*sizeof(*y_out), cudaMemcpyDeviceToHost);
for (int i = 0; i < rows_y; i++)
std::cout << y_out[i] << " ";
std::cout << std::endl;
cudaFree(d_yin);
cudaFree(d_yout);
free(y_in);
free(y_out);
}
$ nvcc -o t23 t23.cu
$ cuda-memcheck ./t23
========= CUDA-MEMCHECK
1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10
1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10
========= ERROR SUMMARY: 0 errors
$
There are other things I don't understand here, like why you are using a float
output data type to represent an index, but that doesn't seem to be the crux of your question. I'm not suggesting this code is defect-free or optimal in any way, however it may be best to keep things simple at this point.
If you want to, you can probably simplify your host and device argmax functions like this:
$ cat t23.cu
#include <iostream>
#include <cublas_v2.h>
#define IDX2C(i,j,ld) (((j)*(ld))+(i))
__host__ __device__ void argmax(float *data, float *dataout, int ld, int sd, int start, int inc) {
for (int i = start; i < ld; i+=inc) {
float valout = 1.0f;
float temp = data[IDX2C(i, 0, ld)];
for (int j = 1; j < sd; j++) {
float val = data[IDX2C(i, j, ld)];
if (temp <= val) {
temp = val;
valout += 1.0f;
}
}
dataout[i] = valout;
}
}
__global__ void argmax_kernel(float *data, float *dataout, int ld, int sd) {
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
argmax(data, dataout, ld, sd, tid, stride);
}
int main() {
float *y_in, *y_out;
float *d_yin, *d_yout;
int rows_y = 20;
int cols_y = 10;
int size_y = rows_y*cols_y;
float d_in[cols_y][rows_y] = {
{1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
{0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.},
{0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.},
{0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.},
{0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.},
{0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.},
{0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.},
{0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.},
{0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.},
{0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.}};
y_in = (float*)malloc(size_y * sizeof(float));
y_out = (float*)malloc(rows_y * sizeof(float));
//transpose
for (int r = 0; r < cols_y; r++)
for (int c = 0; c < rows_y; c++)
y_in[IDX2C(c,r,rows_y)] = d_in[r][c];
cudaMalloc((void**)&d_yin, size_y * sizeof(*y_in));
cudaMalloc((void**)&d_yout, rows_y * sizeof(*y_out));
cudaMemcpy(d_yin, y_in, size_y * sizeof(float), cudaMemcpyHostToDevice);
argmax(y_in, y_out, rows_y, cols_y, 0, 1);
for (int i = 0; i < rows_y; i++)
std::cout << y_out[i] << " ";
std::cout << std::endl;
dim3 dimBlock(256);
int threadBlocks = (size_y + (dimBlock.x - 1)) / dimBlock.x;
if (threadBlocks > 65520) threadBlocks = 65520;
dim3 dimGrid(threadBlocks);
argmax_kernel<<<dimGrid, dimBlock>>>(d_yin, d_yout, rows_y, cols_y);
cudaMemcpy(y_out, d_yout, rows_y*sizeof(*y_out), cudaMemcpyDeviceToHost);
for (int i = 0; i < rows_y; i++)
std::cout << y_out[i] << " ";
std::cout << std::endl;
cudaFree(d_yin);
cudaFree(d_yout);
free(y_in);
free(y_out);
}
$ nvcc -o t23 t23.cu
$ cuda-memcheck ./t23
========= CUDA-MEMCHECK
1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10
1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10
========= ERROR SUMMARY: 0 errors
For other readers, I will point out that I don't consider this code to be a typical argmax
implementation; it will not duplicate results for usual argmax
functions on arbitrary data. But it should be usable with this kind of one-hot encoding data (only).
推荐阅读
- asp.net-core - 使用自定义身份验证方案授权失败 - asp.net-core 3.0
- java - 无法使用来自另一个本地依赖模块的接口或类
- javascript - 一次滚动一个 div 的按钮
- python - 来自 pandas read_sql 的“TypeError:'NoneType' 对象不可迭代”
- android - 无法在我的 android 应用程序中创建数据库
- powerbi - 检索与 Power BI 中特定状态相关的最后一条消息
- java - java - 如何在java maven项目中修复“没有GET映射”以显示自定义错误页面?
- css - 更改按钮文本颜色
- mysql - mysqlimport命令的跟踪过程
- android - 构建android Rom时空间不足