首页 > 解决方案 > 为什么这个基数排序 CUDA 代码只对 32 个元素进行排序?

问题描述

当我在 Visual Studio 15 中执行此代码时,它只对 32 个元素进行排序。如果我将 WSIZE 设置为大于 32 或小于 32,它会显示与输出相同的未排序元素。谁能帮帮我吗?

我的系统信息。

处理器 - Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz, 2400 Mhz, 2 Core(s), 4 Logical Processor(s)

内存 - 8GB

专用显卡 - NVIDIA GeForce 940M 4GB(384 CUDA 核心)

这是设置为 32 的 WSIZE 的输出https://i.imgur.com/geQ9VUZ.jpg

这是 WSIZE 的输出设置为 19 https://i.imgur.com/4K0xkep.jpg

这是设置为 50 的 WSIZE 的输出https://i.imgur.com/M8irQhs.jpg

#pragma once
#ifdef __INTELLISENSE__
void __syncthreads();
#endif

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <chrono>

using namespace std;
using namespace std::chrono;

#define WSIZE 32 /*to set No. of elements to sort and also set No. of threads*/
#define LOOPS 2
#define UPPER_BIT 4
#define LOWER_BIT 0

__device__ unsigned int ddata[WSIZE];
__device__ int ddata_s[WSIZE];

__global__ void parallelRadix()
{
    // This data in shared memory
    __shared__ volatile unsigned int sdata[WSIZE];

    // Load from global into shared variable
    sdata[threadIdx.x] = ddata[threadIdx.x];

    unsigned int bitmask = 1;
    unsigned int offset = 0;
    // -1, -2, -4, -8, -16, -32, -64, -128, -256,...
    unsigned int thrmask = 0xFFFFFFFFU << threadIdx.x;
    unsigned int mypos;

    // For each LSB to MSB
    for (int i = LOWER_BIT; i < UPPER_BIT; i++)
    {
        unsigned int mydata = sdata[((WSIZE - 1) - threadIdx.x) + offset];
        unsigned int mybit = mydata&bitmask;
        // Get population of ones and zeroes
        unsigned int ones = __ballot(mybit);
        unsigned int zeroes = ~ones;
        // Switch ping-pong buffers
        offset ^= WSIZE;

        // Do zeroes, then ones
        if (!mybit)
        {
            mypos = __popc(zeroes&thrmask);
        }
        else {      // Threads with a one bit
                    // Get my position in ping-pong buffer
            mypos = __popc(zeroes) + __popc(ones&thrmask);
        }

        // Move to buffer  (or use shfl for cc 3.0)
        sdata[mypos - 1 + offset] = mydata;
        // Repeat for next bit
        bitmask <<= 1;
    }
    // Put results to global
    ddata[threadIdx.x] = sdata[threadIdx.x + offset];
}

int main() {

    /* Parallel Radix Sort */

    unsigned int hdata[WSIZE];
    float totalTime = 0;
    LARGE_INTEGER cicles;
    for (int lcount = 0; lcount < LOOPS; lcount++)
    {
        //srand(time(NULL));
        // Array elements have value in range of 1024
        unsigned int range = 1U << UPPER_BIT;

        // Fill array with random elements
        // Range = 1024
        //srand(time(0));
        QueryPerformanceCounter(&cicles);
        srand(cicles.QuadPart);
        printf("\n input array  %d\n", lcount);
        for (int i = 0; i < WSIZE; i++)
        {
            hdata[i] = rand() % range;
            printf("%u ", hdata[i]);

        }


        // Copy data from host to device
        cudaMemcpyToSymbol(ddata, hdata, WSIZE * sizeof(unsigned int));

        // Execution time measurement, that point starts the clock
        high_resolution_clock::time_point t1 = high_resolution_clock::now();
        parallelRadix << < 1, WSIZE >> >();
        // Make kernel function synchronous
        cudaDeviceSynchronize();
        // Execution time measurement, that point stops the clock
        high_resolution_clock::time_point t2 = high_resolution_clock::now();

        // Execution time measurement, that is the result
        auto duration = duration_cast<milliseconds>(t2 - t1).count();

        // Summination of each loops' execution time
        totalTime += (float)duration / 1000.00;

        // Copy data from device to host
        cudaMemcpyFromSymbol(hdata, ddata, WSIZE * sizeof(unsigned int));

        printf("\n sorted array %d\n", lcount);
        for (int i = 0; i < WSIZE; i++)
            printf("%u ", hdata[i]);


    }
    printf("\n\n");
    printf("Parallel Radix Sort:\n");
    printf("Array size = %d\n", WSIZE * LOOPS);
    printf("Time elapsed = %fseconds\n", totalTime);

}

标签: visual-studiosortingcudagpuradix-sort

解决方案


代码来自这里。我写的。

它仅设计用于对 32 个元素进行排序,因为它旨在仅使用 CUDA 中的单个 warp,即 32 个线程。它从未设计或打算对超过 32 个元素进行排序,并且它在经线级别使用的许多技巧必须重新编写才能在大于经线规模的情况下工作。 WSIZE代表 WARP SIZE,在 CUDA 中,warp 的大小是 32 个线程。您无法更改经纱尺寸。

如果您想对更大的数据集进行排序,请使用推力cub中的操作。


推荐阅读