首页 > 解决方案 > 如何像 python 一样编写 cuda c argmax 内核并让它解码 one-hot 矩阵?

问题描述

就像标题建议的那样。

这是代码。

其中 show_data() 和 get_data() 是显示和导入数据的函数。

我不知道如何把 one-hot 矩阵按列主序(column-major)排布(以便能配合 cuBLAS 函数使用),并像 Python 的 argmax 函数那样对其进行解码。

我知道我一定弄乱了 argmax 内核中的索引,不知道如何修复它。

数据是这样的:

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]

输出应该是这样的:

[1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10]

#define IDX2C(i,j,ld) (((j)*(ld))+(i))

// Broken per-row "argmax decode" kernel from the question, kept verbatim.
// Intended behavior: for each of the ld rows of a column-major (ld x sd)
// one-hot matrix, write (1-based index of the row maximum) to dataout[row].
__global__ void argmax_kernel(float *data, float *dataout, int ld, int sd) {
    int stride = gridDim.x * blockDim.x;              // total threads in the grid
    int tid = blockDim.x * blockIdx.x + threadIdx.x;  // this thread's global id

    float temp = 0.0f;

    int i = 0;
    int j = 0;


    for (i = 0; i < ld; i++) {
        // BUG(review): 'tid + i*sd*stride' is not a valid element index for a
        // column-major matrix; the loop bound 'j < sd' does not match it either.
        for (j = (tid + i*sd*stride); j < sd; j+= stride) {
            // BUG: '=' is assignment, not comparison ('==' was intended), so
            // this clobbers j and tests the assigned value's truthiness.
            if (j = (tid + i*sd*stride)) {
                temp = data[j];
                // BUG(review): dataout has only ld elements, but 'i*sd*stride'
                // can index far past it (out-of-bounds write); all threads also
                // race on the same dataout location without atomics.
                dataout[i*sd*stride] = 1.0f;
            }
            else if (temp <= data[j]) {
                temp = data[j];
                dataout[i*sd*stride] += 1.0f;
            }
        }
    }
}

/* Host reference "argmax decode": for each of the ld rows of the column-major
 * (ld x sd) matrix, count how many times the running row maximum is matched or
 * exceeded while scanning left to right, and store that count (as a float) in
 * dataout[row].  For one-hot rows this equals (argmax index + 1); it is NOT a
 * general-purpose argmax for arbitrary data. */
void argmax(float *data, float *dataout, int ld, int sd) {
    for (int row = 0; row < ld; row++) {
        float best = data[row];            /* element (row, 0): IDX2C expanded */
        dataout[row] = 1.0f;               /* column 0 seeds the 1-based count */
        for (int col = 1; col < sd; col++) {
            float cur = data[col * ld + row];   /* IDX2C(row, col, ld) */
            if (best <= cur) {
                best = cur;
                dataout[row] += 1.0f;
            }
        }
    }
}

// main() from the question: loads the one-hot CSV, runs the host argmax, then
// launches the (broken) kernel.  NOTE(review): depends on get_ydata()/show_data()
// defined elsewhere; never copies d_yout back to the host, so the kernel result
// is never observed; and no CUDA API return value is checked.
int main() {
    float *y_in, *y_out;    // host input matrix / host result vector
    float *d_yin, *d_yout;  // device copies of the above
    int rols_y = 20;        // row count ("rols" looks like a typo for "rows")
    int cols_y = 10;        // one-hot width (columns per row)
    int size_y = rols_y*cols_y;
    char y_file_name[100] = "argmax.csv";


    y_in = (float*)malloc(size_y * sizeof(float));
    y_out = (float*)malloc(rols_y * sizeof(float));

    // NOTE(review): cudaMalloc/cudaMemcpy results are ignored throughout.
    cudaMalloc((void**)&d_yin, size_y * sizeof(*y_in));
    cudaMalloc((void**)&d_yout, rols_y * sizeof(*y_out));

    get_ydata(y_in, y_file_name);     // external helper: fill y_in from the CSV
    show_data(y_in, rols_y, cols_y);  // external helper: print the matrix

    cudaMemcpy(d_yin, y_in, size_y * sizeof(float), cudaMemcpyHostToDevice);
    argmax(y_in, y_out, rols_y, cols_y);  // CPU reference result
    show_data(y_out, rols_y, 1);


    dim3 dimBlock(256);
    // Ceil-divide over the full element count (more threads than a per-row
    // kernel needs), then clamp to stay inside a safe 1D grid size.
    int threadBlocks = (size_y + (dimBlock.x - 1)) / dimBlock.x;
    if (threadBlocks > 65520) threadBlocks = 65520;
    dim3 dimGrid(threadBlocks);

    argmax_kernel<<<dimGrid, dimBlock>>>(d_yin, d_yout, rols_y, cols_y);
    // BUG(review): missing cudaMemcpy(y_out, d_yout, ...) here — the device
    // result is discarded, and there is no synchronization after the launch.


    cudaFree(d_yin);
    cudaFree(d_yout);

    free(y_in);
    free(y_out);

    system("pause");  // Windows-only console pause; non-portable
    return EXIT_SUCCESS;
}

标签: cuda

解决方案


You have outright errors in your kernel code such as this:

if (j = (tid + i*sd*stride))

but I wouldn't be able to explain what is going on more generally with your kernel code. None of it makes sense to me. I think the simplest thing to do is just to duplicate your host code in your kernel, converting only the outer loop in ld to be a grid-stride loop, which seems to be your desire. Here is an example:

 $ cat t23.cu
#include <iostream>
#include <cublas_v2.h>
#define IDX2C(i,j,ld) (((j)*(ld))+(i))

// Grid-stride "argmax decode" kernel: each thread owns a strided subset of the
// ld rows of the column-major (ld x sd) matrix and writes, per row, a float
// count of how often the running maximum was matched or exceeded.  For one-hot
// rows this equals (argmax index + 1).
__global__ void argmax_kernel(float *data, float *dataout, int ld, int sd) {
    int step = gridDim.x * blockDim.x;  // total threads launched
    int first = blockDim.x * blockIdx.x + threadIdx.x;

    for (int row = first; row < ld; row += step) {
        float best = data[row];         // column 0: IDX2C(row, 0, ld) expanded
        float code = 1.0f;              // running 1-based position
        for (int col = 1; col < sd; col++) {
            float v = data[col * ld + row];  // IDX2C(row, col, ld)
            if (best <= v) {
                best = v;
                code += 1.0f;
            }
        }
        dataout[row] = code;            // one global write per row
    }
}

/* CPU reference for the kernel above.  Scans each row of the column-major
 * (ld x sd) matrix left to right, bumping a 1-based counter whenever the
 * running maximum is matched or exceeded; the final counter lands in
 * dataout[row].  Equals argmax+1 only for one-hot-style rows. */
void argmax(float *data, float *dataout, int ld, int sd) {
    for (int r = 0; r < ld; r++) {
        float running_max = data[r];         /* (r, 0) in column-major layout */
        float position = 1.0f;
        for (int c = 1; c < sd; c++) {
            float cur = data[c * ld + r];    /* IDX2C(r, c, ld) written out */
            if (running_max <= cur) {
                running_max = cur;
                position += 1.0f;
            }
        }
        dataout[r] = position;
    }
}

// Self-contained demo: builds the one-hot data in-line, transposes it into
// column-major storage, runs the CPU reference and the grid-stride kernel,
// and prints both results (they should match).  NOTE(review): CUDA API calls
// are not error-checked here; a CUDA_CHECK-style macro would surface failures.
int main() {
    float *y_in, *y_out;    // host column-major input / host result vector
    float *d_yin, *d_yout;  // device mirrors of the above
    int rows_y = 20;        // samples (rows of the logical matrix)
    int cols_y = 10;        // one-hot width (columns)
    int size_y = rows_y*cols_y;
    // d_in holds the transpose of the logical 20x10 matrix: each initializer
    // row below is one logical column.  (Dimensions are runtime ints, so this
    // is a variable-length array — relies on a compiler extension.)
    float d_in[cols_y][rows_y] = {
{1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
{0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.},
{0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.},
{0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.},
{0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.},
{0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.},
{0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.},
{0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.},
{0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.},
{0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.}};

    y_in = (float*)malloc(size_y * sizeof(float));
    y_out = (float*)malloc(rows_y * sizeof(float));
    // transpose d_in into column-major y_in: element (c, r) = d_in[r][c]
    for (int r = 0; r < cols_y; r++)
      for (int c = 0; c < rows_y; c++)
        y_in[IDX2C(c,r,rows_y)] = d_in[r][c];
    cudaMalloc((void**)&d_yin, size_y * sizeof(*y_in));
    cudaMalloc((void**)&d_yout, rows_y * sizeof(*y_out));

    cudaMemcpy(d_yin, y_in, size_y * sizeof(float), cudaMemcpyHostToDevice);
    argmax(y_in, y_out, rows_y, cols_y);  // CPU reference result
    for (int i = 0; i <  rows_y; i++)
      std::cout << y_out[i] << " ";
    std::cout << std::endl;
    dim3 dimBlock(256);
    // Sized for one thread per matrix element — more than the one-thread-per-row
    // grid-stride kernel needs, which is harmless; clamped to a safe 1D grid.
    int threadBlocks = (size_y + (dimBlock.x - 1)) / dimBlock.x;
    if (threadBlocks > 65520) threadBlocks = 65520;
    dim3 dimGrid(threadBlocks);

    argmax_kernel<<<dimGrid, dimBlock>>>(d_yin, d_yout, rows_y, cols_y);
    // Blocking copy also synchronizes, so the kernel has finished before printing.
    cudaMemcpy(y_out, d_yout, rows_y*sizeof(*y_out), cudaMemcpyDeviceToHost);
    for (int i = 0; i <  rows_y; i++)
      std::cout << y_out[i] << " ";
    std::cout << std::endl;


    cudaFree(d_yin);
    cudaFree(d_yout);

    free(y_in);
    free(y_out);

}
$ nvcc -o t23 t23.cu
$ cuda-memcheck ./t23
========= CUDA-MEMCHECK
1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10
1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10
========= ERROR SUMMARY: 0 errors
$

There are other things I don't understand here, like why you are using a float output data type to represent an index, but that doesn't seem to be the crux of your question. I'm not suggesting this code is defect-free or optimal in any way, however it may be best to keep things simple at this point.

If you want to, you can probably simplify your host and device argmax functions like this:

$ cat t23.cu
#include <iostream>
#include <cublas_v2.h>
#define IDX2C(i,j,ld) (((j)*(ld))+(i))


// Shared host/device "argmax decode" for a column-major (ld x sd) matrix.
// Each processed row receives (as a float) the count of times the running
// row maximum was matched or exceeded — equal to argmax+1 for one-hot rows,
// but not a general argmax.  The start/inc pair lets the same body serve as
// a plain serial loop (start=0, inc=1) or as a grid-stride loop on the GPU.
__host__ __device__ void argmax(float *data, float *dataout, int ld, int sd, int start, int inc) {
    for (int row = start; row < ld; row += inc) {
        float best = data[row];          // column 0: IDX2C(row, 0, ld) expanded
        float code = 1.0f;               // 1-based running position
        for (int col = 1; col < sd; col++) {
            float v = data[col * ld + row];  // IDX2C(row, col, ld)
            if (best <= v) {
                best = v;
                code += 1.0f;
            }
        }
        dataout[row] = code;             // single store per row
    }
}

// Thin device entry point: computes this thread's global id and the grid-wide
// thread count, then delegates to the shared argmax() as a grid-stride loop.
__global__ void argmax_kernel(float *data, float *dataout, int ld, int sd) {
    int first = blockDim.x * blockIdx.x + threadIdx.x;
    int step = blockDim.x * gridDim.x;
    argmax(data, dataout, ld, sd, first, step);
}

// Demo for the refactored __host__ __device__ argmax: builds the one-hot data
// in-line, transposes to column-major, runs the serial host call and the
// kernel launch, and prints both results (they should match).
// NOTE(review): CUDA API calls are not error-checked in this demo.
int main() {
    float *y_in, *y_out;    // host column-major input / host result vector
    float *d_yin, *d_yout;  // device mirrors of the above
    int rows_y = 20;        // samples (rows of the logical matrix)
    int cols_y = 10;        // one-hot width (columns)
    int size_y = rows_y*cols_y;
    // d_in holds the transpose of the logical 20x10 matrix: each initializer
    // row below is one logical column.  (Dimensions are runtime ints, so this
    // is a variable-length array — relies on a compiler extension.)
    float d_in[cols_y][rows_y] = {
{1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
{0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.},
{0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.},
{0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.},
{0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.},
{0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.},
{0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.},
{0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.},
{0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.},
{0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.}};

    y_in = (float*)malloc(size_y * sizeof(float));
    y_out = (float*)malloc(rows_y * sizeof(float));
    // transpose d_in into column-major y_in: element (c, r) = d_in[r][c]
    for (int r = 0; r < cols_y; r++)
      for (int c = 0; c < rows_y; c++)
        y_in[IDX2C(c,r,rows_y)] = d_in[r][c];
    cudaMalloc((void**)&d_yin, size_y * sizeof(*y_in));
    cudaMalloc((void**)&d_yout, rows_y * sizeof(*y_out));

    cudaMemcpy(d_yin, y_in, size_y * sizeof(float), cudaMemcpyHostToDevice);
    argmax(y_in, y_out, rows_y, cols_y, 0, 1);  // serial host call (start=0, inc=1)
    for (int i = 0; i <  rows_y; i++)
      std::cout << y_out[i] << " ";
    std::cout << std::endl;
    dim3 dimBlock(256);
    // Sized for one thread per matrix element — more than the one-thread-per-row
    // grid-stride kernel needs, which is harmless; clamped to a safe 1D grid.
    int threadBlocks = (size_y + (dimBlock.x - 1)) / dimBlock.x;
    if (threadBlocks > 65520) threadBlocks = 65520;
    dim3 dimGrid(threadBlocks);

    argmax_kernel<<<dimGrid, dimBlock>>>(d_yin, d_yout, rows_y, cols_y);
    // Blocking copy also synchronizes, so the kernel has finished before printing.
    cudaMemcpy(y_out, d_yout, rows_y*sizeof(*y_out), cudaMemcpyDeviceToHost);
    for (int i = 0; i <  rows_y; i++)
      std::cout << y_out[i] << " ";
    std::cout << std::endl;


    cudaFree(d_yin);
    cudaFree(d_yout);

    free(y_in);
    free(y_out);

}
$ nvcc -o t23 t23.cu
$ cuda-memcheck ./t23
========= CUDA-MEMCHECK
1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10
1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10
========= ERROR SUMMARY: 0 errors

For other readers, I will point out that I don't consider this code to be a typical argmax implementation; it will not duplicate results for usual argmax functions on arbitrary data. But it should be usable with this kind of one-hot encoding data (only).


推荐阅读