Why does a CUDA kernel with divergence perform better than one without divergence?

Problem description

Hi, I am writing CUDA kernels to understand the behavior of warp divergence. These are the 3 kernels I have:

#include <cuda_runtime.h>
#include <stdio.h>
#include "util.h"
#include <chrono>

// warm-up kernel (same body as kernel1), launched once before any timing
__global__ void wardUp(float *c)
{
    float a = 0.0;
    float b = 0.0;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if ((idx/warpSize)%2 == 0){
        a = 100.0f;
    }
    else{
        b = 200.0f;
    }
    c[idx] = a+b;
}

__global__ void kernel1(float *c)
{
    float a = 0.0;
    float b = 0.0;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if ((idx/warpSize)%2 == 0){   // even-numbered warps: all 32 threads of a warp take this path
        a = 100.0f;
    }
    else{
        b = 200.0f;
    }
    c[idx] = a+b;
}

__global__ void kernel2(float *c)
{
    float a = 0.0;
    float b = 0.0;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if (idx%2 == 0){   // even-numbered threads: lanes within each warp take different paths
        a = 100.0f;
    }
    else{
        b = 200.0f;
    }
    c[idx] = a+b;
}

int main(int argc, char **argv)
{
    initDevice(0);
    int size = 64;
    int blocksize = 64;
    int nBytes = sizeof(float)*size;
    float *a_d;
    CHECK(cudaMalloc((float**)&a_d, nBytes));
    dim3 block(blocksize, 1);
    dim3 grid((blocksize-1)/block.x+1, 1);

    wardUp<<<grid, block>>>(a_d);

    float elapsed = 0;
    cudaEvent_t start1, stop1;
    CHECK(cudaEventCreate(&start1));
    CHECK(cudaEventCreate(&stop1));
    CHECK(cudaEventRecord(start1, 0));
    kernel1<<<grid, block>>>(a_d);
    CHECK(cudaEventRecord(stop1, 0));
    CHECK(cudaEventSynchronize(stop1));
    CHECK(cudaEventElapsedTime(&elapsed, start1, stop1));
    printf("kernel1 take:%2f ms\n", elapsed);

    float elapsed_1 = 0;
    cudaEvent_t start2, stop2;
    CHECK(cudaEventCreate(&start2));
    CHECK(cudaEventCreate(&stop2));
    CHECK(cudaEventRecord(start2, 0));
    kernel2<<<grid, block>>>(a_d);
    CHECK(cudaEventRecord(stop2, 0));
    CHECK(cudaEventSynchronize(stop2));
    CHECK(cudaEventElapsedTime(&elapsed_1, start2, stop2));
    printf("kernel2 take:%2f ms\n", elapsed_1);
    
    cudaFree(a_d);
    cudaEventDestroy(start1);
    cudaEventDestroy(stop1);
    cudaEventDestroy(start2);
    cudaEventDestroy(stop2);
    return 0;
}

If my understanding is correct, kernel1 should have no divergence problem, because the if/else split happens at a warp boundary: threads 0-31 belong to the same warp and all take the same path. kernel2 should have a divergence problem, because the odd and even threads within a warp cannot execute their branches at the same time. But I observe that kernel1 is slower than kernel2. Why does this happen?

Using device: 0: NVIDIA GeForce RTX 2080 Ti
kernel1 take:0.008864 ms
kernel2 take:0.006752 ms

I switched to using cudaEventRecord to re-measure the durations, but kernel1 still appears to be slower than kernel2.
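
To make the two branching patterns concrete, here is a small illustration program (not one of the three kernels above) that prints, for a single 64-thread block, which branch each thread would take in kernel1 and in kernel2:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void whichBranch()
{
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    int warp = idx / warpSize;               // 0 for threads 0-31, 1 for threads 32-63
    int lane = idx % warpSize;               // position of the thread within its warp
    int takesIf1 = ((idx/warpSize)%2 == 0);  // kernel1: a whole warp takes the same path
    int takesIf2 = (idx%2 == 0);             // kernel2: lanes alternate within a warp
    printf("idx=%2d warp=%d lane=%2d kernel1_if=%d kernel2_if=%d\n",
           idx, warp, lane, takesIf1, takesIf2);
}

int main()
{
    whichBranch<<<1, 64>>>();
    cudaDeviceSynchronize();
    return 0;
}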

Tags: cuda

Solution


There are various problems with your methodology. I probably won't list them all:

  • The problem size is too small for benchmarking
  • Compiler optimization is working against you
  • The code is too simple; the compiler's use of predication is mitigating the effect of warp divergence
  • Your method of measuring kernel duration is flawed
  • Basing the creation of grid on blocksize is not sensible (although it happens to be sensible when size == blocksize). It should be based on the problem size, not on blocksize (see the sketch after this list)
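
For that last point, here is a minimal sketch of sizing grid from the problem size rather than from the block size (it reuses the variable names from the question's code and is only an illustration, not part of my test code below):

dim3 block(blocksize, 1);
// launch enough blocks to cover all `size` elements, rounding up;
// this stays correct even when size != blocksize
dim3 grid((size + block.x - 1)/block.x, 1);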

The following code addresses these issues, and shows roughly a 2x increase in kernel duration going from the code that makes the if/then decision on a warp boundary to the code that makes that decision for every other thread:

$ cat t1877.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>
#define CHECK(x) x

__global__ void kernel1(int *c, int y, int z, int l1, int l2)
{
    int a = 0;
    int b = 0;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if (idx&32){
        for (int i = 0; i < l1; i++){
          a = a&y; a = a|z;}
    }
    else{
        for (int i = 0; i < l2; i++){
          b = b|y; b = b&z;}
    }
    c[idx] = a+b;
}

__global__ void kernel2(int *c, int y, int z, int l1, int l2)
{
    int a = 0;
    int b = 0;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if (idx&1){
        for (int i = 0; i < l1; i++){
          a = a&y; a = a|z;}
    }
    else{
        for (int i = 0; i < l2; i++){
          b = b|y; b = b&z;}
    }
    c[idx] = a+b;
}

int main(int argc, char **argv)
{
    int blocksize = 64;
    int size = blocksize*1048576;
    int nBytes = sizeof(int)*size;
    int *a_d;
    CHECK(cudaMalloc((int**)&a_d, nBytes));
    dim3 block(blocksize, 1);
    dim3 grid(size/block.x, 1);

    kernel1<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
    cudaDeviceSynchronize();

    auto start1 = std::chrono::system_clock::now();
    kernel1<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
    cudaDeviceSynchronize();
    auto end1 = std::chrono::system_clock::now();
    std::chrono::duration<double>diff1 = end1 - start1;
    printf("kernel1 take:%2f s\n", diff1.count());

    kernel2<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
    cudaDeviceSynchronize();

    auto start2 = std::chrono::system_clock::now();
    kernel2<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
    cudaDeviceSynchronize();
    auto end2 = std::chrono::system_clock::now();
    std::chrono::duration<double>diff2 = end2 - start2;
    printf("kernel2 take:%2f s\n", diff2.count());
    return 0;
}
$ nvcc -o t1877 t1877.cu -arch=sm_70
$ ./t1877
kernel1 take:0.205650 s
kernel2 take:0.406347 s
$
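
If you would rather use cudaEventRecord than std::chrono, the same pattern (a warm-up launch, then a timed launch, on a large enough problem) also works with events. Below is a minimal sketch that reuses kernel1 and the launch parameters from the code above; the helper name timeWithEvents is mine, and error checking is omitted for brevity:

#include <cuda_runtime.h>

// assumes kernel1 from t1877.cu above is defined in the same translation unit
float timeWithEvents(int *a_d, dim3 grid, dim3 block)
{
    kernel1<<<grid, block>>>(a_d, 0, 0, 10000, 10000);   // warm-up launch
    cudaDeviceSynchronize();

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    kernel1<<<grid, block>>>(a_d, 0, 0, 10000, 10000);   // timed launch
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);                          // wait for the timed launch to finish
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);              // elapsed time in milliseconds
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}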
