c++ - Cuda 上的蒙特卡洛
问题描述
我试图编写一个程序来计算任意生成的 N 个数的 pi。
__global__ void kernel(int* count_d, float* randomnums, double N){
int i;
double x, y, z;
//Find the overall ID of the thread
int tid = blockDim.x*blockIdx.x+threadIdx.x;
i = tid;
int xidx=0;
int yidx=0;
//Start the MonteCarlo
xidx = i + i;
yidx = xidx +1;
//Get the random x,y points
x = randomnums[xidx];
y = randomnums[yidx];
z = ((x*x) + (y*y));
if (z<=1)
count_d[tid] = 1;
else
count_d[tid] = 0;
}
int main(){
double N = 100000;
float *randomnums;
double pi;
//Threads per thread block to be launched
int threads = 1024;
//Number of thread blocks launched
int blocks = 100;
int* count_d;
unsigned int reducedcount = 0;
for (int i=threads*blocks; i<=N; i+=threads*blocks){
//Allocate the array for the random numbers
cudaMalloc((void**)&randomnums,(i)*sizeof(float));
//Use CuRand to generate an array of random numbers on the device
int status;
curandGenerator_t gen;
status = curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MRG32K3A);
status |= curandSetPseudoRandomGeneratorSeed(gen, 4294967296ULL^time(NULL));
status |= curandGenerateUniform(gen, randomnums, (i));
status |= curandDestroyGenerator(gen);
//Check to see if there was any problem launching the CURAND kernels and generating
//the random numbers on the device
if (status != CURAND_STATUS_SUCCESS){
printf("CuRand Failure\n");
exit(EXIT_FAILURE);
}
int *count = (int*)malloc(blocks*threads*sizeof(int));
unsigned int reducedcount = 0;
//Allocate the array to hold a value (1,0) whether the point in is the circle (1) or not (0)
cudaMalloc((void**)&count_d, (blocks*threads)*sizeof(int));
kernel <<<blocks, threads>>> (count_d, randomnums, i);
cudaDeviceSynchronize();
cudaMemcpy(count, count_d, blocks*threads*sizeof(int),cudaMemcpyDeviceToHost);
//Reduce array into int
for(int j = 0; j<N; j++){
reducedcount += count[j];
}
//Free the cudaMalloc()'d arrays
cudaFree(randomnums);
cudaFree(count_d);
free(count);
}
//Find the ratio
pi = ((double)reducedcount/N)*4.0;
printf("Pi: %f\n", pi);
return
0;
}
我想出了一个想法,将 N 分成具有块 * 线程大小的部分,并为每个部分午餐内核。事实证明,当我增加 N 变量时,我得到了分段错误:。Segmentation fault (core dumped)
为什么会这样?另外我想问一下我在循环中午餐内核的想法是否正确。
解决方案
你似乎做了很多手动内存管理。这很糟糕,并导致错误和难以阅读的代码。考虑thrust::device_vector
使用此代码而不是原始指针的位置。
#include <curand.h>
#include <iostream>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <chrono>
//Pulls in library in MSVC
#pragma comment(lib, "curand.lib")
struct hypot_add {
//using float2 so may as well get twice the values, for memory throughput reasons you might find that float4 or some larger structure gives better performance
__host__ __device__ float2 operator()(float2 &input) const {
const auto a = hypot(input.x, input.y);
const auto b = hypot(input.y, input.x);
return float2{ a,b };
}
};
struct float2_add
{
__host__ __device__ float2 operator()(float2 &a, float2 &b) const {
return float2{ a.x + b.x, a.y+b.y};
}
};
int main(int argc, char **argv)
{
const auto samples = 1024 * 1024*8;
thrust::device_vector<float2> values(samples);
curandGenerator_t gpu_generator;
curandCreateGenerator(&gpu_generator, CURAND_RNG_PSEUDO_DEFAULT);
const auto seed = std::chrono::system_clock::now().time_since_epoch().count();
curandSetPseudoRandomGeneratorSeed(gpu_generator, seed);
{
auto values_ptr = thrust::raw_pointer_cast(values.data());
curandGenerateUniform(gpu_generator, reinterpret_cast<float*>(values_ptr), 2 * values.size());
}
// Notice that the number of blocks and threads is automatically determined
thrust::transform(values.begin(), values.end(), values.begin(), hypot_add());
const auto sum = thrust::reduce(values.begin(), values.end(), float2{ 0,0 }, float2_add());
std::cout << 2*(sum.x+sum.y) / samples << std::endl;
}
推荐阅读
- javascript - 将 react-native 添加到 monorepo 后,React apollo hooks 失败
- pandas - 在弹性搜索中使用空值但没有 nan 索引 pandas 数据帧
- unity3d - WebGL 构建和 VR/Oculus 构建在同一个项目中
- java - AWS Lambda 的 Chrome 二进制文件
- ffmpeg - 如何在软件中解码像素格式 AV_PIX_FMT_CUDA 的 AVFrame?
- javascript - *ngIf 创建一个无限循环并且永远不会解决
- google-app-engine - 仅允许安全 SSL 连接(Google App Engine 灵活)
- python - 读取 SocketServer 处理程序中的所有等待行
- julia - 在 Windows 10 上的 JuliaPro 中运行 Julia 文件的快捷方式(或如何将 Ctrl-Enter 设置为运行文件的快捷方式)?
- javascript - Express.js 路由:如何防止重新加载上一页?