sorting - 如何提高推力排序的计算时间?
问题描述
我在下面的链接中找到了“矢量化/批量排序”和“嵌套排序”方法。 如何使用 Thrust 对矩阵的行进行排序?
当我对 500 行和 1000 个元素尝试此方法时,它们的结果是
- 矢量化/批量排序:66ms
- 嵌套排序:3290ms
我正在使用 1080ti HOF 模型来执行此操作,但与您的情况相比,它需要的时间太长。
但在下面的链接中,它可能不到 10 毫秒,几乎 100 微秒。
(如何使用 CUDA 在二维数组中找到每列的中值?)
您能否推荐如何优化此方法以减少操作时间?
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/generate.h>
#include <thrust/equal.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>
#include <stdlib.h>
#define NSORTS 500
#define DSIZE 1000
int my_mod_start = 0;
int my_mod() {
return (my_mod_start++) / DSIZE;
}
bool validate(thrust::device_vector<int> &d1, thrust::device_vector<int> &d2) {
return thrust::equal(d1.begin(), d1.end(), d2.begin());
}
struct sort_functor
{
thrust::device_ptr<int> data;
int dsize;
__host__ __device__
void operator()(int start_idx)
{
thrust::sort(thrust::device, data + (dsize*start_idx), data + (dsize*(start_idx + 1)));
}
};
#include <time.h>
#include <windows.h>
unsigned long long dtime_usec(LONG start) {
SYSTEMTIME timer2;
GetSystemTime(&timer2);
LONG end = (timer2.wSecond * 1000) + timer2.wMilliseconds;
return (end-start);
}
int main() {
for (int i = 0; i < 3; i++) {
SYSTEMTIME timer1;
cudaDeviceSetLimit(cudaLimitMallocHeapSize, (16 * DSIZE*NSORTS));
thrust::host_vector<int> h_data(DSIZE*NSORTS);
thrust::generate(h_data.begin(), h_data.end(), rand);
thrust::device_vector<int> d_data = h_data;
// first time a loop
thrust::device_vector<int> d_result1 = d_data;
thrust::device_ptr<int> r1ptr = thrust::device_pointer_cast<int>(d_result1.data());
GetSystemTime(&timer1);
LONG time_ms1 = (timer1.wSecond * 1000) + timer1.wMilliseconds;
for (int i = 0; i < NSORTS; i++)
thrust::sort(r1ptr + (i*DSIZE), r1ptr + ((i + 1)*DSIZE));
cudaDeviceSynchronize();
time_ms1 = dtime_usec(time_ms1);
std::cout << "loop time: " << time_ms1 << "ms" << std::endl;
//vectorized sort
thrust::device_vector<int> d_result2 = d_data;
thrust::host_vector<int> h_segments(DSIZE*NSORTS);
thrust::generate(h_segments.begin(), h_segments.end(), my_mod);
thrust::device_vector<int> d_segments = h_segments;
GetSystemTime(&timer1);
time_ms1 = (timer1.wSecond * 1000) + timer1.wMilliseconds;
thrust::stable_sort_by_key(d_result2.begin(), d_result2.end(), d_segments.begin());
thrust::stable_sort_by_key(d_segments.begin(), d_segments.end(), d_result2.begin());
cudaDeviceSynchronize();
time_ms1 = dtime_usec(time_ms1);
std::cout << "loop time: " << time_ms1 << "ms" << std::endl;
if (!validate(d_result1, d_result2)) std::cout << "mismatch 1!" << std::endl;
//nested sort
thrust::device_vector<int> d_result3 = d_data;
sort_functor f = { d_result3.data(), DSIZE };
thrust::device_vector<int> idxs(NSORTS);
thrust::sequence(idxs.begin(), idxs.end());
GetSystemTime(&timer1);
time_ms1 = (timer1.wSecond * 1000) + timer1.wMilliseconds;
thrust::for_each(idxs.begin(), idxs.end(), f);
cudaDeviceSynchronize();
time_ms1 = dtime_usec(time_ms1);
std::cout << "loop time: " << time_ms1 << "ms" << std::endl;
if (!validate(d_result1, d_result3)) std::cout << "mismatch 2!" << std::endl;
}
return 0;
}
解决方案
-G
从您的推力体验中获得的主要收获是,当您对性能感兴趣时,您永远不应该编译调试项目或使用设备调试开关 ( )。编译设备调试代码会导致编译器忽略许多性能优化。您的案例中的差异非常显着,从调试到发布代码大约提高了 30 倍。
这是一个分段的幼崽排序,我们将启动 500 个块,每个块处理一个单独的 1024 元素数组。CUB 代码是从这里提取的。
$ cat t1761.cu
#include <cub/cub.cuh> // or equivalently <cub/block/block_radix_sort.cuh>
#include <iostream>
const int ipt=8;
const int tpb=128;
__global__ void ExampleKernel(int *data)
{
// Specialize BlockRadixSort for a 1D block of 128 threads owning 8 integer items each
typedef cub::BlockRadixSort<int, tpb, ipt> BlockRadixSort;
// Allocate shared memory for BlockRadixSort
__shared__ typename BlockRadixSort::TempStorage temp_storage;
// Obtain a segment of consecutive items that are blocked across threads
int thread_keys[ipt];
// just create some synthetic data in descending order 1023 1022 1021 1020 ...
for (int i = 0; i < ipt; i++) thread_keys[i] = (tpb-1-threadIdx.x)*ipt+i;
// Collectively sort the keys
BlockRadixSort(temp_storage).Sort(thread_keys);
__syncthreads();
// write results to output array
for (int i = 0; i < ipt; i++) data[blockIdx.x*ipt*tpb + threadIdx.x*ipt+i] = thread_keys[i];
}
int main(){
const int blks = 500;
int *data;
cudaMalloc(&data, blks*ipt*tpb*sizeof(int));
ExampleKernel<<<blks,tpb>>>(data);
int *h_data = new int[blks*ipt*tpb];
cudaMemcpy(h_data, data, blks*ipt*tpb*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < 10; i++) std::cout << h_data[i] << " ";
std::cout << std::endl;
}
$ nvcc -o t1761 t1761.cu -I/path/to/cub/cub-1.8.0
$ CUDA_VISIBLE_DEVICES="2" nvprof ./t1761
==13713== NVPROF is profiling process 13713, command: ./t1761
==13713== Warning: Profiling results might be incorrect with current version of nvcc compiler used to compile cuda app. Compile with nvcc compiler 9.0 or later version to get correct profiling results. Ignore this warning if code is already compiled with the recommended nvcc version
0 1 2 3 4 5 6 7 8 9
==13713== Profiling application: ./t1761
==13713== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 60.35% 308.66us 1 308.66us 308.66us 308.66us [CUDA memcpy DtoH]
39.65% 202.79us 1 202.79us 202.79us 202.79us ExampleKernel(int*)
API calls: 98.39% 210.79ms 1 210.79ms 210.79ms 210.79ms cudaMalloc
0.72% 1.5364ms 1 1.5364ms 1.5364ms 1.5364ms cudaMemcpy
0.32% 691.15us 1 691.15us 691.15us 691.15us cudaLaunchKernel
0.28% 603.26us 97 6.2190us 400ns 212.71us cuDeviceGetAttribute
0.24% 516.56us 1 516.56us 516.56us 516.56us cuDeviceTotalMem
0.04% 79.374us 1 79.374us 79.374us 79.374us cuDeviceGetName
0.01% 13.373us 1 13.373us 13.373us 13.373us cuDeviceGetPCIBusId
0.00% 5.0810us 3 1.6930us 729ns 2.9600us cuDeviceGetCount
0.00% 2.3120us 2 1.1560us 609ns 1.7030us cuDeviceGet
0.00% 748ns 1 748ns 748ns 748ns cuDeviceGetUuid
$
(CUDA 10.2.89,RHEL 7)
上面我在 Tesla K20x 上运行,它的性能比 Tesla V100 更接近 1080ti。我们看到内核执行时间约为 200us。如果我在 Tesla V100 上运行完全相同的代码,内核执行时间会下降到 ~35us:
$ CUDA_VISIBLE_DEVICES="0" nvprof ./t1761
==13814== NVPROF is profiling process 13814, command: ./t1761
0 1 2 3 4 5 6 7 8 9
==13814== Profiling application: ./t1761
==13814== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 82.33% 163.43us 1 163.43us 163.43us 163.43us [CUDA memcpy DtoH]
17.67% 35.073us 1 35.073us 35.073us 35.073us ExampleKernel(int*)
API calls: 98.70% 316.92ms 1 316.92ms 316.92ms 316.92ms cudaMalloc
0.87% 2.7879ms 1 2.7879ms 2.7879ms 2.7879ms cuDeviceTotalMem
0.19% 613.75us 97 6.3270us 389ns 205.37us cuDeviceGetAttribute
0.19% 601.61us 1 601.61us 601.61us 601.61us cudaMemcpy
0.02% 72.718us 1 72.718us 72.718us 72.718us cudaLaunchKernel
0.02% 59.905us 1 59.905us 59.905us 59.905us cuDeviceGetName
0.01% 37.886us 1 37.886us 37.886us 37.886us cuDeviceGetPCIBusId
0.00% 4.6830us 3 1.5610us 546ns 2.7850us cuDeviceGetCount
0.00% 1.9900us 2 995ns 587ns 1.4030us cuDeviceGet
0.00% 677ns 1 677ns 677ns 677ns cuDeviceGetUuid
$
您会注意到没有“输入”数组,我只是在内核中合成数据,因为我们主要对性能感兴趣。如果您需要处理像 1000 这样的数组大小,您可能应该将每个数组填充到 1024(例如,填充一个非常大的数字,然后忽略排序结果中的最后一个数字。)
此代码主要来自外部文档。它是为教学目的而提供的。我并不是说它没有缺陷或适用于任何特定目的。需要您自担风险使用它。
推荐阅读
- python - 使用 BeautifulSoup 从网站上抓取日期和月份
- android - 如何创建带有倾斜模式的文本视图而不添加 \t 的 \n?
- redis - 一个redis集群可以支持的最大master数
- facebook - Facebook Instant Gaming 配对功能因“玩家当前不符合匹配条件”而被屏蔽。
- java - Firebase Android getValue(class) 不起作用
- python - Django 表单错误(__init__() 采用 1 个位置参数,但给出了 2 个)
- go - Leekchan 会计与 ShopSpring Decimal 冲突
- python - 如何继承python中的每个类?
- awk - 在制表符分隔的文件中拆分和移动特定列
- r - Plotly 分类热图中的对角线注释