首页 > 解决方案 > Ctypes Cuda - 指针乘法不会导致产品

问题描述

我仅在 C 中实现了一个成功运行的 Cuda 矩阵乘法。现在我正在尝试将 Matrix 初始化转换为 numpy 并使用 Python 的ctypes库来执行 c 代码。似乎带有指针的数组不包含相乘的值。我不太确定问题出在哪里,但已经在 CUDA 代码中 - 即使在调用内核方法并将值从设备加载回主机之后,值仍然为零。

CUDA 代码:

#include <stdio.h>
#include <stdlib.h>
#define BLOCK_SIZE 16
#define RANDOM_MN_RANGE 64

struct Matrix {
    int width;
    int height;
    // contiguously stored Matrix, in row first order
    float *elements;
};

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C){

    // runs for each col - row pair
    float tmpVal = 0;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    for (int i = 0; i < A.width; ++i)
        tmpVal += A.elements[row * A.width + i] *
                  B.elements[i * B.width + col];
    C.elements[ row * C.width + col ] = tmpVal;
}

extern "C" {
    void mMul( Matrix *A, Matrix *B, Matrix *C ){

        Matrix d_A, d_B, d_C;

        // Matrix d_A
        d_A.width    =   A->width;
        d_A.height   =   A->height;
        size_t sizeA =   A->width * A->height * sizeof(float);
        // dynamically allocate cudaMemory for elemenst array
        cudaMalloc(&d_A.elements, sizeA);
        cudaMemcpy(d_A.elements, A->elements, sizeA, cudaMemcpyHostToDevice);

        // Matrix d_B
        d_B.width    =   B->width;
        d_B.height   =   B->height;
        size_t sizeB =   B->width * B->height * sizeof(float);
        // dynamically allocate cudaMemory for elemenst array
        cudaMalloc(&d_B.elements, sizeB);
        cudaMemcpy(d_B.elements, B->elements, sizeB, cudaMemcpyHostToDevice);

        // Matrix d_C
        d_C.width    =   C->width;
        d_C.height   =   C->height;
        size_t sizeC =   C->width * C->height * sizeof(float);

        // dynamically allocate cudaMemory for elemenst array
        cudaMalloc(&d_C.elements, sizeC);

        // 16 * 16 = 256 threads per block
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

        // Blocks per grid
        dim3 dimGrid(B->width / dimBlock.x, A->height / dimBlock.y);

        // calling the Kernel
        MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

        // copy results from result matrix C to the host again
        cudaMemcpy(C->elements, d_C.elements, sizeC, cudaMemcpyDeviceToHost);

        // C->elements[0] contains still 0, the values do not seem to be loaded back to host memory.
        printf("A is %f\n", A->elements[0]);
        printf("B is %f\n", B->elements[0]);
        printf("C is %f\n", C->elements[0]);

        // free the cuda memory
        cudaFree(d_A.elements);
        cudaFree(d_B.elements);
        cudaFree(d_C.elements);

    }
}

要编译代码,我使用以下命令

nvcc  --shared --compiler-options '-fPIC' -o Sequential_Cuda_Python.so Sequential_Cuda_Python.cu

ctypes Python代码

import numpy as np
from numpy.ctypeslib import ndpointer
from ctypes import *


class Matrix(Structure):
    _fields_ = [("width", c_int),
                ("height", c_int),
                ("elements", POINTER(c_float))]

libc = CDLL("./Sequential_Cuda_Python.so")

libc.mMul.argtypes = [ POINTER(Matrix), POINTER(Matrix), POINTER(Matrix) ]
libc.mMul.restype = None


def npArrtoMatrixPtr(data: np.ndarray) -> (POINTER(Matrix), tuple):
    """
    numpy arr to Matrix pointer
    @return (pointer to arr, shape)
    """
    c_float_p = POINTER(c_float)
    data = data.astype(np.float32)

    w, h = np.shape(data)
    # print((w, h))

    mXp = Matrix(height=h, width=w, elements=data.ctypes.data_as(c_float_p))
    return (pointer(mXp), np.shape(data))


def matMulSeqCuda( _mA, _mB, _mC ):
    """
    multiplies mA with mB sequentually using mC
    """

    pmA, mASz = ( _mA[0], _mA[-1] )
    pmB, mBSz = ( _mB[0], _mB[-1] )
    pmC, mCSz = ( _mC[0], _mC[-1] )

    assert np.shape( mASz )[0] == 2 and \
           np.shape( mBSz )[0] == 2 and \
           np.shape( mCSz )[0] == 2, "Only 2D arrays accepted"

    #assert np.shape(mA)[0] == np.shape(mB)[1], "Rows of mA need to be the same as Cols of mB"


    libc.mMul( pmA, pmB, pmC )
    c = pmC.contents

    mtxC = np.ctypeslib.as_array(c.elements, shape=(c.height, c.height))



    # The array still contains only 0. values
    print(mtxC)

    return 0


if __name__ == '__main__':

    a = np.ones( (16, 8) )
    b = np.ones( (16, 8) )
    c = np.zeros( (16, 16) )

    mA = npArrtoMatrixPtr(a)
    mB = npArrtoMatrixPtr(b)
    mC = npArrtoMatrixPtr(c)

    matMulSeqCuda(mA, mB, mC)

解决方案:

正如@Mark Tolonen 指出的错误在于 Python 脚本。通过在与函数npArrtoMatrixPptr相同的范围内调用(创建并返回指向 Matrix 结构的指针),我能够检索到正确的结果 Matrix 。CUDAlibc.mMulmtxC

import numpy as np
from numpy.ctypeslib import ndpointer
from ctypes import *


class Matrix(Structure):
    _fields_ = [("width", c_int),
                ("height", c_int),
                ("elements", POINTER(c_float))]
libc =  CDLL("./Sequential_Cuda_Python.so")

libc.mMul.argtypes = [ POINTER(Matrix), POINTER(Matrix), POINTER(Matrix) ]
libc.mMul.restype = None

def npArrtoMatrixPtr(data: np.ndarray) -> (POINTER(Matrix), tuple):
    """
    numpy arr to Matrix pointer
    @return (pointer to arr, shape)
    """
    #c_float_p = POINTER(c_float)
    data = data.astype(np.float32)

    h, w = data.shape
    mXp = Matrix(w, h, data.ctypes.data_as(POINTER(c_float)))
    return (pointer(mXp), np.shape(data))


def matMulSeqCuda( npMa, npMb, npMc ):
    """
    multiplies mA with mB sequentually using mC
    """
    assert len(np.shape( npMa )) == 2 and \
           len(np.shape( npMb )) == 2 and \
           len(np.shape( npMc )) == 2, "Only 2D arrays accepted"

    pmA, szA = npArrtoMatrixPtr(npMa)
    pmB, szB = npArrtoMatrixPtr(npMb.T)
    pmC, szC = npArrtoMatrixPtr(npMc) # the resulting array

    libc.mMul( pmA, pmB, pmC )

    c = pmC.contents
    mtxC = np.ctypeslib.as_array(c.elements, shape=(c.height, c.width))

    # the result is correct
    print(mtxC)

    return 0


if __name__ == '__main__':

    a = np.ones( (16, 8) )
    b = np.ones( (16, 8) )
    c = np.zeros( (16, 16) )

    matMulSeqCuda(a, b, c)

标签: pythonccudactypes

解决方案


我无法按原样编译您的代码,但问题是np.shape返回(行,列)或等效的(高度,宽度),而不是(宽度,高度):

w, h = np.shape(data) # should be h,w

而且在目视检查时,这条线是错误的(c.height 两次)。

mtxC = np.ctypeslib.as_array(c.elements, shape=(c.height, c.height))

CUDA 代码与问题无关,它正确传递 Matrix* 并接收修改。我在下面做了一个最小的可重现示例,专注于正确传递矩阵:

test.cpp - 接收一个矩阵结构并将其中的值加倍。

#ifdef _WIN32
#   define API __declspec(dllexport)
#else
#   define API
#endif

struct Matrix {
    int width;
    int height;
    float *elements;
};

extern "C" API
void doubleit(Matrix *A) {
    for(int r = 0; r < A->height; ++r)
        for(int c = 0; c < A->width; ++c) {
            A->elements[r * A->width + c] *= 2;
    }
}

测试.py

import numpy as np
from ctypes import *

class Matrix(Structure):
    _fields_ = [("width", c_int),
                ("height", c_int),
                ("elements", POINTER(c_float))]

libc = CDLL('./test')

libc.doubleit.argtypes = POINTER(Matrix),
libc.doubleit.restype = None

def doubleit(a):
    h,w = a.shape # note h,w not w,h
    m = Matrix(w,h,a.ctypes.data_as(POINTER(c_float)))
    libc.doubleit(m)

a = np.arange(0.0,0.6,0.1,dtype=np.float32).reshape((2,3))
print(a)
doubleit(a)
print(a)

输出:

[[0.  0.1 0.2]
 [0.3 0.4 0.5]]
[[0.  0.2 0.4]
 [0.6 0.8 1. ]]

推荐阅读