python - Ctypes Cuda - 指针乘法不会导致产品
问题描述
我仅在 C 中实现了一个成功运行的 Cuda 矩阵乘法。现在我正在尝试将 Matrix 初始化转换为 numpy 并使用 Python 的ctypes
库来执行 c 代码。似乎带有指针的数组不包含相乘的值。我不太确定问题出在哪里,但已经在 CUDA 代码中 - 即使在调用内核方法并将值从设备加载回主机之后,值仍然为零。
CUDA 代码:
#include <stdio.h>
#include <stdlib.h>
#define BLOCK_SIZE 16
#define RANDOM_MN_RANGE 64
struct Matrix {
int width;
int height;
// contiguously stored Matrix, in row first order
float *elements;
};
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C){
// runs for each col - row pair
float tmpVal = 0;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
for (int i = 0; i < A.width; ++i)
tmpVal += A.elements[row * A.width + i] *
B.elements[i * B.width + col];
C.elements[ row * C.width + col ] = tmpVal;
}
extern "C" {
void mMul( Matrix *A, Matrix *B, Matrix *C ){
Matrix d_A, d_B, d_C;
// Matrix d_A
d_A.width = A->width;
d_A.height = A->height;
size_t sizeA = A->width * A->height * sizeof(float);
// dynamically allocate cudaMemory for elemenst array
cudaMalloc(&d_A.elements, sizeA);
cudaMemcpy(d_A.elements, A->elements, sizeA, cudaMemcpyHostToDevice);
// Matrix d_B
d_B.width = B->width;
d_B.height = B->height;
size_t sizeB = B->width * B->height * sizeof(float);
// dynamically allocate cudaMemory for elemenst array
cudaMalloc(&d_B.elements, sizeB);
cudaMemcpy(d_B.elements, B->elements, sizeB, cudaMemcpyHostToDevice);
// Matrix d_C
d_C.width = C->width;
d_C.height = C->height;
size_t sizeC = C->width * C->height * sizeof(float);
// dynamically allocate cudaMemory for elemenst array
cudaMalloc(&d_C.elements, sizeC);
// 16 * 16 = 256 threads per block
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
// Blocks per grid
dim3 dimGrid(B->width / dimBlock.x, A->height / dimBlock.y);
// calling the Kernel
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
// copy results from result matrix C to the host again
cudaMemcpy(C->elements, d_C.elements, sizeC, cudaMemcpyDeviceToHost);
// C->elements[0] contains still 0, the values do not seem to be loaded back to host memory.
printf("A is %f\n", A->elements[0]);
printf("B is %f\n", B->elements[0]);
printf("C is %f\n", C->elements[0]);
// free the cuda memory
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
}
}
要编译代码,我使用以下命令
nvcc --shared --compiler-options '-fPIC' -o Sequential_Cuda_Python.so Sequential_Cuda_Python.cu
ctypes Python代码
import numpy as np
from numpy.ctypeslib import ndpointer
from ctypes import *
class Matrix(Structure):
_fields_ = [("width", c_int),
("height", c_int),
("elements", POINTER(c_float))]
libc = CDLL("./Sequential_Cuda_Python.so")
libc.mMul.argtypes = [ POINTER(Matrix), POINTER(Matrix), POINTER(Matrix) ]
libc.mMul.restype = None
def npArrtoMatrixPtr(data: np.ndarray) -> (POINTER(Matrix), tuple):
"""
numpy arr to Matrix pointer
@return (pointer to arr, shape)
"""
c_float_p = POINTER(c_float)
data = data.astype(np.float32)
w, h = np.shape(data)
# print((w, h))
mXp = Matrix(height=h, width=w, elements=data.ctypes.data_as(c_float_p))
return (pointer(mXp), np.shape(data))
def matMulSeqCuda( _mA, _mB, _mC ):
"""
multiplies mA with mB sequentually using mC
"""
pmA, mASz = ( _mA[0], _mA[-1] )
pmB, mBSz = ( _mB[0], _mB[-1] )
pmC, mCSz = ( _mC[0], _mC[-1] )
assert np.shape( mASz )[0] == 2 and \
np.shape( mBSz )[0] == 2 and \
np.shape( mCSz )[0] == 2, "Only 2D arrays accepted"
#assert np.shape(mA)[0] == np.shape(mB)[1], "Rows of mA need to be the same as Cols of mB"
libc.mMul( pmA, pmB, pmC )
c = pmC.contents
mtxC = np.ctypeslib.as_array(c.elements, shape=(c.height, c.height))
# The array still contains only 0. values
print(mtxC)
return 0
if __name__ == '__main__':
a = np.ones( (16, 8) )
b = np.ones( (16, 8) )
c = np.zeros( (16, 16) )
mA = npArrtoMatrixPtr(a)
mB = npArrtoMatrixPtr(b)
mC = npArrtoMatrixPtr(c)
matMulSeqCuda(mA, mB, mC)
解决方案:
正如@Mark Tolonen 指出的错误在于 Python 脚本。通过在与函数npArrtoMatrixPptr
相同的范围内调用(创建并返回指向 Matrix 结构的指针),我能够检索到正确的结果 Matrix 。CUDA
libc.mMul
mtxC
import numpy as np
from numpy.ctypeslib import ndpointer
from ctypes import *
class Matrix(Structure):
_fields_ = [("width", c_int),
("height", c_int),
("elements", POINTER(c_float))]
libc = CDLL("./Sequential_Cuda_Python.so")
libc.mMul.argtypes = [ POINTER(Matrix), POINTER(Matrix), POINTER(Matrix) ]
libc.mMul.restype = None
def npArrtoMatrixPtr(data: np.ndarray) -> (POINTER(Matrix), tuple):
"""
numpy arr to Matrix pointer
@return (pointer to arr, shape)
"""
#c_float_p = POINTER(c_float)
data = data.astype(np.float32)
h, w = data.shape
mXp = Matrix(w, h, data.ctypes.data_as(POINTER(c_float)))
return (pointer(mXp), np.shape(data))
def matMulSeqCuda( npMa, npMb, npMc ):
"""
multiplies mA with mB sequentually using mC
"""
assert len(np.shape( npMa )) == 2 and \
len(np.shape( npMb )) == 2 and \
len(np.shape( npMc )) == 2, "Only 2D arrays accepted"
pmA, szA = npArrtoMatrixPtr(npMa)
pmB, szB = npArrtoMatrixPtr(npMb.T)
pmC, szC = npArrtoMatrixPtr(npMc) # the resulting array
libc.mMul( pmA, pmB, pmC )
c = pmC.contents
mtxC = np.ctypeslib.as_array(c.elements, shape=(c.height, c.width))
# the result is correct
print(mtxC)
return 0
if __name__ == '__main__':
a = np.ones( (16, 8) )
b = np.ones( (16, 8) )
c = np.zeros( (16, 16) )
matMulSeqCuda(a, b, c)
解决方案
我无法按原样编译您的代码,但问题是np.shape
返回(行,列)或等效的(高度,宽度),而不是(宽度,高度):
w, h = np.shape(data) # should be h,w
而且在目视检查时,这条线是错误的(c.height 两次)。
mtxC = np.ctypeslib.as_array(c.elements, shape=(c.height, c.height))
CUDA 代码与问题无关,它正确传递 Matrix* 并接收修改。我在下面做了一个最小的可重现示例,专注于正确传递矩阵:
test.cpp - 接收一个矩阵结构并将其中的值加倍。
#ifdef _WIN32
# define API __declspec(dllexport)
#else
# define API
#endif
struct Matrix {
int width;
int height;
float *elements;
};
extern "C" API
void doubleit(Matrix *A) {
for(int r = 0; r < A->height; ++r)
for(int c = 0; c < A->width; ++c) {
A->elements[r * A->width + c] *= 2;
}
}
测试.py
import numpy as np
from ctypes import *
class Matrix(Structure):
_fields_ = [("width", c_int),
("height", c_int),
("elements", POINTER(c_float))]
libc = CDLL('./test')
libc.doubleit.argtypes = POINTER(Matrix),
libc.doubleit.restype = None
def doubleit(a):
h,w = a.shape # note h,w not w,h
m = Matrix(w,h,a.ctypes.data_as(POINTER(c_float)))
libc.doubleit(m)
a = np.arange(0.0,0.6,0.1,dtype=np.float32).reshape((2,3))
print(a)
doubleit(a)
print(a)
输出:
[[0. 0.1 0.2]
[0.3 0.4 0.5]]
[[0. 0.2 0.4]
[0.6 0.8 1. ]]
推荐阅读
- spring - 使用 Spring Data 的 postgres 中的时间戳出错:列 $COLUMN_NAME 是没有时区的时间戳类型,但表达式是 bytea 类型
- c++ - 如何判断翻译单元是否正在使用分段堆栈进行编译
- c# - ASP.NET Core Automapper Entity Framework:无法更新对象
- javascript - 将正则表达式限制为仅非 html 文本
- python - 从 Matplotlib.pcolormesh() 图中提取颜色数据以转换为 .kml 或其他格式
- matlab - 诊断 MATLAB MySQL JDBC 驱动程序错误 (Windows)
- python - 从字符串到日期的数据框索引
- c++ - 如何实现一个模棱两可的纯虚方法
- java - 使用 onesignal 推送通知服务时,远程主机在握手期间关闭连接
- wpf - 删除 TextBox NoWrap 最大长度