首页 > 解决方案 > Python 中的 CUDA C 库:将 GPU 内存数据从一个函数传输到另一个函数

问题描述

我想在 Python 中调用用 CUDA 编写的函数来实现一个算法。我希望数据始终留在 GPU 上:在把一个 DLL 函数的结果经由 Python 传给另一个 DLL 函数时,避免先把数据从设备复制回主机。

这是我尝试执行的一些示例代码:

第一个函数在 GPU 内存中创建数组;第二个函数接收这个数组,并将其复制到主机内存。

CUDA C:

// Question's original (non-working) version, kept as posted.
// Builds a host array 0, 1, 2, ... of `len` floats, allocates a device buffer
// and copies the host array into it.
//
// Device_data: pointer received BY VALUE. cudaMalloc below stores the new
//              device address into this local parameter only, so the caller's
//              own pointer is not changed when the function returns.
// len:         number of float elements.
extern "C" __declspec(dllexport) void HostToDevice(void *Device_data, int len)
{
    int  err;

    // Host staging buffer; it is never freed in this function.
    float *host_data = (float*)malloc(len*sizeof(float));
    float k = 0;
    for (int i = 0; i<len; i+=1) { host_data[i] = k; k+=1;}
    // Overwrites the local copy of Device_data with the device address.
    // The return status of cudaMalloc is not checked.
    cudaMalloc((void**)&Device_data, len*sizeof(float));

    err = cudaMemcpy(Device_data, host_data, len*sizeof(float), cudaMemcpyHostToDevice);
    // Prints the address obtained from cudaMalloc, then the cudaMemcpy status as an integer.
    printf("array addres %p\n", Device_data);
    printf("MemcpyHostToDevice error %d\n",err);
}

// Question's original version, kept as posted.
// Copies `len` floats from Device_data to Host_data with cudaMemcpyDeviceToHost
// and prints the source address and the copy status.
//
// Device_data: source pointer, passed straight to cudaMemcpy as the device side.
// Host_data:   destination pointer, passed straight to cudaMemcpy as the host side.
// len:         number of float elements.
//
// NOTE(review): the Python caller shown in the question passes the address of
// a NumPy host array as Device_data; that is presumably why the reported
// status is 1 -- confirm against the caller.
extern "C" __declspec(dllexport) void DeviceToHost(void *Device_data, void *Host_data, int len)
{
    int  err;

    err = cudaMemcpy(Host_data, Device_data, len*sizeof(float), cudaMemcpyDeviceToHost);
    printf("array addres %p\n", Device_data);
    printf("MemcpyDeviceToHost error %d\n",err);

}

Python:

# Question's original (non-working) driver, kept as posted.
# Bind the two functions exported by the DLL and declare their argument types.
# NOTE(review): the C prototypes take `int len`, while c_size_t is declared
# here -- confirm the intended width.
HostToDevice = Synth_dll.HostToDevice
DeviceToHost = Synth_dll.DeviceToHost
HostToDevice.argtypes = [c_void_p, c_size_t]
DeviceToHost.argtypes = [c_void_p, c_void_p, c_size_t]

# Both arrays are NumPy buffers in host memory. `len` is presumably an integer
# defined earlier in the script (it shadows the builtin) -- TODO confirm.
host = np.empty((len)).astype(np.float32)
Device = np.empty((len)).astype(np.float32)


# Passes the address of the NumPy host buffer by value. The C function stores
# the cudaMalloc result in its own local parameter, so nothing here receives
# the device address.
HostToDevice(c_void_p(Device.ctypes.data), len)
# Device.ctypes.data is still the host buffer address; it is handed to
# cudaMemcpy as the device-side source of a device-to-host copy.
DeviceToHost(c_void_p(Device.ctypes.data), c_void_p(host.ctypes.data), len)

但这不起作用:第二个函数(DeviceToHost)中的 cudaMemcpy 返回了错误(错误码 1,即 cudaErrorInvalidValue)。

输出:

array addres 0000000B22A40600
MemcpyHostToDevice error 0
array addres 0000019BD96577C0
MemcpyDeviceToHost error 1

标签: cuda

解决方案


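我找到了解决方案:原来的 HostToDevice 按值接收指针,cudaMalloc 得到的设备地址只写进了函数内部的局部副本,Python 端拿不到它;改为传入指向指针的指针(float **),就能把设备地址带回 Python,再原样交给下一个函数。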

CUDA:

/*
 * Allocates a device buffer of `len` floats, fills it with 0, 1, 2, ... and
 * returns the device address to the caller through `Device_data`.
 *
 * Device_data: out-parameter. On success *Device_data receives the device
 *              pointer; on any failure it is set to NULL.
 * len:         number of float elements; negative values are rejected.
 *
 * The device buffer stays allocated after return and is owned by the caller.
 * No release function is visible in this listing, so it must eventually be
 * freed with cudaFree on the DLL side.
 */
extern "C" __declspec(dllexport) void HostToDevice(float **Device_data, int len)
{
    if (Device_data == NULL) {
        fprintf(stderr, "HostToDevice: Device_data is NULL\n");
        return;
    }
    // Never leave the caller's handle uninitialised on a failure path.
    *Device_data = NULL;
    if (len < 0) {
        fprintf(stderr, "HostToDevice: invalid length %d\n", len);
        return;
    }

    const size_t bytes = (size_t)len * sizeof(float);

    // Host staging buffer holding the ramp 0, 1, 2, ...
    float *host_data = (float*)malloc(bytes);
    if (host_data == NULL && bytes != 0) {
        fprintf(stderr, "HostToDevice: host allocation for %d floats failed\n", len);
        return;
    }
    float k = 0;
    for (int i = 0; i < len; i += 1) { host_data[i] = k; k += 1; }

    float *Data = NULL;
    cudaError_t err = cudaMalloc((void**)&Data, bytes);
    if (err == cudaSuccess) {
        err = cudaMemcpy(Data, host_data, bytes, cudaMemcpyHostToDevice);
        if (err != cudaSuccess) {
            // Do not hand out a buffer whose contents were never written.
            cudaFree(Data);
            Data = NULL;
        }
    } else {
        Data = NULL;
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "HostToDevice: CUDA call failed: %s\n", cudaGetErrorString(err));
    }

    // The staging copy is no longer needed once the upload has been attempted.
    free(host_data);

    *Device_data = Data;

    // Prints the address of the caller's handle (not the device address itself),
    // followed by the status of the first CUDA call that failed (0 on success).
    printf("array addres %p\n", (void*)Device_data);
    printf("MemcpyHostToDevice error %d\n", (int)err);
}


/*
 * Copies `len` floats from the device buffer referenced by *Device_data into
 * a temporary host buffer, prints them, and prints the copy status.
 *
 * Device_data: pointer to the handle that holds the device address.
 * len:         number of float elements; negative values are rejected.
 *
 * The device buffer itself is not freed here.
 */
extern "C" __declspec(dllexport) void DeviceToHost(float **Device_data, int len)
{
    if (Device_data == NULL || len < 0) {
        fprintf(stderr, "DeviceToHost: invalid argument\n");
        return;
    }

    const size_t bytes = (size_t)len * sizeof(float);

    float *Host_data = (float*)malloc(bytes);
    if (Host_data == NULL && bytes != 0) {
        fprintf(stderr, "DeviceToHost: host allocation for %d floats failed\n", len);
        return;
    }

    const cudaError_t err = cudaMemcpy(Host_data, *Device_data, bytes, cudaMemcpyDeviceToHost);

    // Prints the address of the caller's handle (not the device address itself).
    printf("array addres %p\n", (void*)Device_data);
    if (err == cudaSuccess) {
        for (int i = 0; i < len; i += 1) { printf("data %f\n", Host_data[i]); }
    } else {
        // The host buffer was never filled, so its contents are not printed.
        fprintf(stderr, "DeviceToHost: cudaMemcpy failed: %s\n", cudaGetErrorString(err));
    }
    printf("MemcpyDeviceToHost error %d\n", (int)err);

    free(Host_data);
}

Python:

# Solution driver: keep the GPU address in a ctypes pointer object and pass it
# between the two DLL calls, so the data never has to be copied to the host
# in Python.
from ctypes import POINTER, byref, c_float, c_int

HostToDevice = Synth_dll.HostToDevice
DeviceToHost = Synth_dll.DeviceToHost

# Both C functions are declared as `void f(float **Device_data, int len)`:
# the length is a C int (c_int, not c_size_t) and nothing is returned.
HostToDevice.argtypes = [POINTER(POINTER(c_float)), c_int]
HostToDevice.restype = None
DeviceToHost.argtypes = [POINTER(POINTER(c_float)), c_int]
DeviceToHost.restype = None


# NULL float* handle; HostToDevice writes the device address into it.
# It holds a GPU address afterwards, so it must not be indexed from Python.
Device = POINTER(c_float)()

# Pass the handle by reference so the DLL can fill it in and read it back.
HostToDevice(byref(Device), 4)
DeviceToHost(byref(Device), 4)

输出:

array addres 00000276755E1F90
MemcpyHostToDevice error 0


array addres 00000276755E1F90
data 0.000000
data 1.000000
data 2.000000
data 3.000000
MemcpyDeviceToHost error 0

推荐阅读