cuda - Cublas 矩阵-矩阵乘法参数
问题描述
我正在尝试使用 cuBLAS 进行矩阵-矩阵乘法,但它无法正常工作,我不知道问题出在哪里。由于这是我第一次使用 cuBLAS,我不确定自己是否设置了正确的参数,尤其是前导维度(leading dimension)。
例如:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include <stdio.h>
void mulWithCuda(double *c, const double *a, const double *b, unsigned int size);
// Question code (broken): multiplies the 4x3 matrix a by the 3x3 matrix b
// on the GPU and prints the 4x3 result c.
int main(){
// Element count of b (3x3 = 9 doubles).
const int arraySize = 9;
const double a[12] = { 1, 2, 3, 4, 5, 6, 7, 8 ,9, 10, 11, 12 };
const double b[arraySize] = { 10, 20, 30, 40, 50, 60, 70, 80, 90 };
double c[12] = { 0 };
mulWithCuda(c, a, b, arraySize);
// NOTE(review): this loop indexes c as row-major (c[i * 3 + j]), but
// cuBLAS writes the result in column-major order — element (i, j) of the
// 4x3 result lives at c[j * 4 + i]. See the corrected listing below.
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 3; j++) {
printf("%lf ", c[i * 3 + j]);
}
printf("\n");
}
return 0;
}
// Question code (broken): intended to compute c = a * b with cuBLAS, where
// a is 4x3, b is 3x3, c is 4x3 and size is the element count of b (9).
void mulWithCuda(double* c, const double* a, const double* b, unsigned int size){
double *dev_a = 0;
double *dev_b = 0;
double *dev_c = 0;
cudaMalloc((void**)&dev_c, 12 * sizeof(double));
// BUG: the allocation sizes of dev_a and dev_b are swapped — a holds 12
// doubles and b holds `size` (9) doubles, so dev_a needs 12 elements and
// dev_b needs `size` elements.
cudaMalloc((void**)&dev_a, size * sizeof(double));
cudaMalloc((void**)&dev_b, 12 * sizeof(double));
// Copying 12 doubles into the 9-double dev_a buffer overruns the
// allocation (illegal memory access). None of the CUDA or cuBLAS calls
// here are checked for errors, so the failure goes unnoticed.
cudaMemcpy(dev_a, a, 12 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
double alpha = 1.0;
double beta = 0;
// BUG: cuBLAS is column-major, so for the 4x3 matrix A the leading
// dimension lda (DGEMM parameter number 8) must be 4, not 3 — hence the
// "parameter number 8 had an illegal value" message. ldc must be 4 too.
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 4, 3, 3, &alpha, dev_a, 3, dev_b, 3, &beta, dev_c, 3);
cudaMemcpy(c, dev_c, 12 * sizeof(double), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
}
使用的两个矩阵是:
1 2 3
4 5 6
7 8 9
10 11 12
10 20 30
40 50 60
70 80 90
而输出是:
** On entry to DGEMM parameter number 8 had an illegal value
0.000000 0.000000 0.000000
0.000000 0.000000 0.000000
0.000000 0.000000 0.000000
0.000000 0.000000 0.000000
解决方案
您的代码有 3 个问题。
您没有检查 CUDA 错误,也没有检查 cuBLAS 错误。CUDA 错误检查的规范做法见:《使用 CUDA 运行时 API 检查错误的规范方法是什么?》
通过适当的错误检查,您会发现
cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
由于非法内存访问而失败。dev_a 和 dev_b 分配了错误的大小:dev_a 应该是 12 个元素,dev_b 应该是 size 个元素。此外,您对矩阵的内存布局做出了错误的假设:cuBLAS 使用列主序(column-major)存储格式。https://docs.nvidia.com/cuda/cublas/index.html#data-layout
这意味着 A 和 C 的前导维度是 4,而不是 3。这也意味着 A 和 B 是
1 5 9
2 6 10
3 7 11
4 8 12
和
10 40 70
20 50 80
30 60 90
(以上分别为按列主序解释的 A 和 B)。
还必须更改打印 C 的方式,以适应列主序格式。
工作代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include <stdio.h>
void mulWithCuda(double *c, const double *a, const double *b, unsigned int size);
/*
 * Multiplies the 4x3 matrix a by the 3x3 matrix b via mulWithCuda and
 * prints the 4x3 result row by row. The result buffer is column-major,
 * so element (row, col) of c lives at c[col * 4 + row].
 */
int main(){
const int arraySize = 9;
const double a[12] = { 1, 2, 3, 4, 5, 6, 7, 8 ,9, 10, 11, 12 };
const double b[arraySize] = { 10, 20, 30, 40, 50, 60, 70, 80, 90 };
double c[12] = { 0 };
mulWithCuda(c, a, b, arraySize);
for (int row = 0; row < 4; ++row) {
for (int col = 0; col < 3; ++col) {
/* Column-major fetch: column stride is the row count (4). */
printf("%lf ", c[col * 4 + row]);
}
printf("\n");
}
return 0;
}
void mulWithCuda(double* c, const double* a, const double* b, unsigned int size){
double *dev_a = 0;
double *dev_b = 0;
double *dev_c = 0;
cudaMalloc((void**)&dev_c, 12 * sizeof(double));
cudaMalloc((void**)&dev_a, 12 * sizeof(double));
cudaMalloc((void**)&dev_b, size * sizeof(double));
cudaMemcpy(dev_a, a, 12 * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size * sizeof(double), cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
double alpha = 1.0;
double beta = 0;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 4, 3, 3, &alpha, dev_a, 4, dev_b, 3, &beta, dev_c, 4);
cudaMemcpy(c, dev_c, 12 * sizeof(double), cudaMemcpyDeviceToHost);
cublasDestroy(handle);
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
}
Output
380.000000 830.000000 1280.000000
440.000000 980.000000 1520.000000
500.000000 1130.000000 1760.000000
560.000000 1280.000000 2000.000000
推荐阅读
- ruby-on-rails - 在控制器的初始化(构造函数)中访问参数
- matlab - 从循环右侧开始的深度优先搜索
- ruby-on-rails - Klaviyo Form 在主页显示两次
- python - Python如何在不达到100% CPU的情况下优化map函数来处理海量数据
- r - R改变大数的值
- java - 当我不知道文件流在哪里打开时,有什么方法可以关闭它?
- wpf - 如何防止使用 GridView 列调整 Listview 的大小
- react-native - 反应本机路由器通量;加载回上一个组件很慢
- security - 在标头中包含 xsrf 令牌的最佳方法是 cookie 是 httpOnly
- sql - 如何查询范围为每个关联的最新记录的记录