首页 > 解决方案 > 分区子组的 Cuda 实现

问题描述

是否有更有效的方法来实现 Vulkan/OpenGL 的“分区子组”功能,而不必遍历子组中的所有元素?我当前的实现只使用从 0 到 WARP_SIZE 的循环。

参考:

(幻灯片 37+38)https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9909-nvidia-vulkan-features-update.pdf

https://github.com/KhronosGroup/GLSL/blob/master/extensions/nv/GL_NV_shader_subgroup_partitioned.txt

简单实现:

__device__ uint32_t subgroupPartitionNV(ivec2 p)
{
    uint32_t result = 0;
    for (int i = 0; i < 32; ++i)
    {
        int x = __shfl_sync(0xFFFFFFFF, p(0), i);
        int y = __shfl_sync(0xFFFFFFFF, p(1), i);

        uint32_t b = __ballot_sync(0xFFFFFFFF, p(0) == x && p(1) == y);
        if (i == threadIdx.x & 31) result = b;
    }
    return result;
}

__device__ uint32_t subgroupPartitionedAddNV(float value, uint32_t ballot)
{
    float result = 0;
    for ( unsigned int i = 0; i < 32; ++i)
    {
        float other_value = __shfl_sync(0xFFFFFFFF, value, i);
        if ((1U << i)  & ballot) result += other_value;
    }
    return result;
}

标签: cuda

解决方案


感谢 Abator 的提示,我想出了一个更有效的解决方案。这有点难看,因为labeled_partition仅用于实现int但效果很好。

template <int GROUP_SIZE = 32>
__device__ cooperative_groups::coalesced_group subgroupPartitionNV(ivec2 p)
{
    using namespace cooperative_groups;
    thread_block block                   = this_thread_block();
    thread_block_tile<GROUP_SIZE> tile32 = tiled_partition<GROUP_SIZE>(block);

    coalesced_group g1 = labeled_partition(tile32, p(0));
    coalesced_group g2 = labeled_partition(tile32, p(1));

    details::_coalesced_group_data_access acc;
    return acc.construct_from_mask<coalesced_group>(acc.get_mask(g1) & acc.get_mask(g2));
}


template <typename T, int GROUP_SIZE = 32>
__device__ T subgroupPartitionedAddNV(T value, cooperative_groups::coalesced_group group)
{
    int s = group.size();
    int r = group.thread_rank();

    for (int offset = GROUP_SIZE / 2; offset > 0; offset /= 2)
    {
        auto v = group.template shfl_down(value, offset);
        if (r + offset < s) value += v;
    }
    return value;
}

推荐阅读