cuda - 为什么 cuda 内核 w/i 发散的性能优于 w/o 发散?
问题描述
嗨,我正在编写 cuda 内核来了解经线发散的行为。这些是我拥有的 3 个内核:
#include <cuda_runtime.h>
#include <stdio.h>
#include "util.h"
#include <chrono>
__global__ void wardUp(float *c)
{
float a = 0.0;
float b = 0.0;
int idx = threadIdx.x + blockIdx.x*blockDim.x;
if ((idx/warpSize)%2 == 0){
a = 100.0f;
}
else{
b = 200.0f;
}
c[idx] = a+b;
}
__global__ void kernel1(float *c)
{
float a = 0.0;
float b = 0.0;
int idx = threadIdx.x + blockIdx.x*blockDim.x;
if ((idx/warpSize)%2 == 0){
a = 100.0f;
}
else{
b = 200.0f;
}
c[idx] = a+b;
}
__global__ void kernel2(float *c)
{
float a = 0.0;
float b = 0.0;
int idx = threadIdx.x + blockIdx.x*blockDim.x;
if (idx%2 == 0){
a = 100.0f;
}
else{
b = 200.0f;
}
c[idx] = a+b;
}
int main(int argc, char **argv)
{
initDevice(0);
int size = 64;
int blocksize = 64;
int nBytes = sizeof(float)*size;
float *a_d;
CHECK(cudaMalloc((float**)&a_d, nBytes));
dim3 block(blocksize, 1);
dim3 grid((blocksize-1)/block.x+1, 1);
wardUp<<<grid, block>>>(a_d);
float elapsed = 0;
cudaEvent_t start1, stop1;
CHECK(cudaEventCreate(&start1));
CHECK(cudaEventCreate(&stop1));
CHECK(cudaEventRecord(start1, 0));
kernel1<<<grid, block>>>(a_d);
CHECK(cudaEventRecord(stop1, 0));
CHECK(cudaEventSynchronize(stop1));
CHECK(cudaEventElapsedTime(&elapsed, start1, stop1));
printf("kernel1 take:%2f ms\n", elapsed);
float elapsed_1 = 0;
cudaEvent_t start2, stop2;
CHECK(cudaEventCreate(&start2));
CHECK(cudaEventCreate(&stop2));
CHECK(cudaEventRecord(start2, 0));
kernel2<<<grid, block>>>(a_d);
CHECK(cudaEventRecord(stop2, 0));
CHECK(cudaEventSynchronize(stop2));
CHECK(cudaEventElapsedTime(&elapsed_1, start2, stop2));
printf("kernel2 take:%2f ms\n", elapsed_1);
cudaFree(a_d);
cudaEventDestroy(start1);
cudaEventDestroy(stop1);
cudaEventDestroy(start2);
cudaEventDestroy(stop2);
return 0;
}
如果我的理解是正确的,kernel1
则不会出现分歧问题,因为if
分支发生在线程 0-31 上,相同的扭曲。
kernel2
由于奇数线程和偶数线程不能同时执行,因此会有分歧问题。但我观察到这kernel1
比kernel2
. 为什么会发生这种情况?
Using device: 0: NVIDIA GeForce RTX 2080 Ti
kernel1 take:0.008864 ms
kernel2 take:0.006752 ms
我切换到使用cudaEventRecord
来重新计算持续时间,但它似乎kernel1
比kernel2
.
解决方案
您的方法存在/存在各种问题。我可能不会全部列出:
- 问题规模太小,无法进行基准测试
- 编译器优化对你不利
- 代码太简单了;编译器对预测的使用正在减轻扭曲发散的影响
- 您的内核持续时间测量方法有缺陷
- 您对
grid
based on的创建blocksize
是不明智的(尽管在size
==时它恰好是明智的blocksize
)。它应该基于问题size
,而不是blocksize
。
以下代码解决了这些问题,并显示内核持续时间增加了大约 2 倍,从基于 warp 边界执行 if/then 决策的代码到为每个其他线程执行此操作的代码:
$ cat t1877.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>
#define CHECK(x) x
__global__ void kernel1(int *c, int y, int z, int l1, int l2)
{
int a = 0;
int b = 0;
int idx = threadIdx.x + blockIdx.x*blockDim.x;
if (idx&32){
for (int i = 0; i < l1; i++){
a = a&y; a = a|z;}
}
else{
for (int i = 0; i < l2; i++){
b = b|y; b = b&z;}
}
c[idx] = a+b;
}
__global__ void kernel2(int *c, int y, int z, int l1, int l2)
{
int a = 0;
int b = 0;
int idx = threadIdx.x + blockIdx.x*blockDim.x;
if (idx&1){
for (int i = 0; i < l1; i++){
a = a&y; a = a|z;}
}
else{
for (int i = 0; i < l2; i++){
b = b|y; b = b&z;}
}
c[idx] = a+b;
}
int main(int argc, char **argv)
{
int blocksize = 64;
int size = blocksize*1048576;
int nBytes = sizeof(int)*size;
int *a_d;
CHECK(cudaMalloc((int**)&a_d, nBytes));
dim3 block(blocksize, 1);
dim3 grid(size/block.x, 1);
kernel1<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
cudaDeviceSynchronize();
auto start1 = std::chrono::system_clock::now();
kernel1<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
cudaDeviceSynchronize();
auto end1 = std::chrono::system_clock::now();
std::chrono::duration<double>diff1 = end1 - start1;
printf("kernel1 take:%2f s\n", diff1.count());
kernel2<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
cudaDeviceSynchronize();
auto start2 = std::chrono::system_clock::now();
kernel2<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
cudaDeviceSynchronize();
auto end2 = std::chrono::system_clock::now();
std::chrono::duration<double>diff2 = end2 - start2;
printf("kernel2 take:%2f s\n", diff2.count());
return 0;
}
$ nvcc -o t1877 t1877.cu -arch=sm_70
$ ./t1877
kernel1 take:0.205650 s
kernel2 take:0.406347 s
$
推荐阅读
- laravel - laravel不同的key有很多关系
- android - 从 Android Studio 获取默认 apk 密钥库,以便在其他 PC 上重新编译
- asp.net-web-api - 如何在 Heroku 上为我的 .net 核心 Web API 配置 Postgres
- amazon-web-services - 如何使用 SingleInstance CloudFormation 模板部署 64 位 Windows Server 2016/IIS 10.0
- react-native - 渲染时不出现的按钮
- node.js - 猫鼬和节点 js findOne 和更新查询承诺未在循环中解决
- android - 如何从图库上传多张图片?
- javascript - 我怎样才能从这个元素中获取文本?
- javascript - 相同的 javascript 变量和 html 元素名称位于不同的命名空间中并且不冲突?
- javascript - 如何使用 javascript/jquery 将数字格式化为印度货币(在数据表中)