c++ - 为什么我的程序对于大于 2^29 的数字会失败?
问题描述
我写了下面的程序:它接收一个长度为 2^i 的数组“a”(初始化为全 1),并把其中所有的数字相加。当 i 至少为 30 时,它会返回一个毫无意义的结果。我不明白为什么:我所有的变量都用了 long,而在我的机器上 long 的大小是 8 字节 = 64 位,所以我认为它应该能够容纳高达 2^(8 * 8) / 2 的整数。
// FOR NOW ONLY WORKS WITH N A POWER OF 2
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <chrono>
/*
Parallel reduce helper kernel. Folds the upper half of a[0..n-1] onto the
lower half: a[j] += a[n - 1 - j] for j = 0, ..., n/2 - 1. When n is odd the
middle element a[n/2] is left untouched. For n <= 1 the kernel does nothing.

Works with any 1D launch configuration: each thread starts at its global
index and advances by the total number of launched threads until j reaches
n/2. Written indices (< n/2) and read indices (>= n - n/2) never overlap, so
no synchronization is needed inside a single launch.

a: device pointer to at least n elements.
n: number of elements currently being folded.
*/
__global__ void reduce(long* a, long n)
{
    // Convert to long before multiplying: blockIdx/blockDim/gridDim are
    // unsigned int, so the product would otherwise be computed in 32 bits
    // even on platforms where long is 64 bits wide.
    const long first = (long)blockIdx.x * blockDim.x + threadIdx.x;
    const long stride = (long)gridDim.x * blockDim.x;
    const long half = n / 2;  // loop-invariant upper bound
    for (long j = first; j < half; j += stride)
    {
        a[j] += a[n - 1 - j];
    }
}
/*
For an array a of length n, puts the sum of all elements in a[0].

a must be a device pointer; the other elements of a are overwritten with
partial sums. The function blocks until all launched kernels have finished.
For n <= 1 nothing is done (a[0] already holds the sum).

CUDA failures are reported on stderr and abort the reduction early; in that
case the content of a[0] is not the sum.
*/
void parallelReduce(long* a, long n)
{
    if (n <= 1) {
        return;
    }

    // Get some information about the GPU (device 0) to size the grid.
    cudaDeviceProp prop;
    cudaError_t err = cudaGetDeviceProperties(&prop, 0);
    if (err != cudaSuccess) {
        fprintf(stderr, "parallelReduce: cudaGetDeviceProperties failed: %s\n",
                cudaGetErrorString(err));
        return;
    }

    // The launch configuration does not depend on n, so build it once.
    const int threadsPerBlock = 256;
    const int numberOfBlocks = 32 * prop.multiProcessorCount;

    // Repeatedly fold the upper half of the active range onto the lower half.
    // Launches into the same stream execute in issue order, so no host-side
    // synchronization is needed between passes.
    while (n > 1) {
        reduce<<<numberOfBlocks, threadsPerBlock>>>(a, n);
        err = cudaGetLastError();  // catches invalid launch configurations
        if (err != cudaSuccess) {
            fprintf(stderr, "parallelReduce: kernel launch failed: %s\n",
                    cudaGetErrorString(err));
            return;
        }
        n -= n / 2;  // Rounds n/2 up.
    }

    // Wait for the last pass and surface any error raised during execution.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "parallelReduce: kernel execution failed: %s\n",
                cudaGetErrorString(err));
    }
}
/*
Sums an array of N ones on the host and on the GPU, prints both results with
their wall-clock times, and reports the resulting speed up.
Returns 0 on success and 1 when a CUDA allocation or copy fails.
*/
int main()
{
    // Reports a failed CUDA call on stderr; returns true when it succeeded.
    auto succeeded = [](cudaError_t err, const char* what) -> bool {
        if (err != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
            return false;
        }
        return true;
    };

    // Initialize vector with N 1's. The shift is done on a long literal so
    // that raising the exponent does not overflow int before the assignment.
    const long N = 1L << 29;
    const size_t size = static_cast<size_t>(N) * sizeof(long);

    long* h_a = nullptr;
    long* d_a = nullptr;
    // Frees whatever has been allocated so far.
    auto release = [&]() {
        if (d_a != nullptr) {
            cudaFree(d_a);
        }
        if (h_a != nullptr) {
            cudaFreeHost(h_a);
        }
    };

    // Both buffers are several GB at this N; without the checks below an
    // allocation failure would only show up as a crash or a wrong result.
    if (!succeeded(cudaMallocHost(&h_a, size), "cudaMallocHost")) {
        release();
        return 1;
    }
    for (long i = 0; i < N; i++) {
        h_a[i] = 1;
    }

    // Copy to device (can be done asynchronously to hide transfer time, but
    // that messes up the timing of the kernel).
    if (!succeeded(cudaMalloc(&d_a, size), "cudaMalloc") ||
        !succeeded(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)")) {
        release();
        return 1;
    }

    // Calculate the sum sequentially and time it.
    auto tic = std::chrono::high_resolution_clock::now();
    long hostSolution = 0;
    for (long i = 0; i < N; i++)
    {
        hostSolution += h_a[i];
    }
    auto toc = std::chrono::high_resolution_clock::now();
    const long long duration =
        std::chrono::duration_cast<std::chrono::milliseconds>(toc - tic).count();
    std::cout << "The sequential function says the answer is " << hostSolution << " this took " << duration
              << " ms." << std::endl;

    // Kernel computation
    tic = std::chrono::high_resolution_clock::now();
    parallelReduce(d_a, N);
    toc = std::chrono::high_resolution_clock::now();
    const long long parallelDuration =
        std::chrono::duration_cast<std::chrono::milliseconds>(toc - tic).count();

    // Copy result back to host
    long solution = 0;
    if (!succeeded(cudaMemcpy(&solution, &d_a[0], sizeof(long), cudaMemcpyDeviceToHost),
                   "cudaMemcpy (device to host)")) {
        release();
        return 1;
    }

    // Print the parallel result and speed up:
    std::cout << "The parallel function says the answer is " << solution << " this took " << parallelDuration
              << " ms." << std::endl;
    if (parallelDuration > 0) {
        std::cout << "This means we have achieved a speed up of " << duration / parallelDuration << std::endl;
    } else {
        // A run shorter than 1 ms truncates to 0 and would divide by zero.
        std::cout << "The parallel run took less than 1 ms, so the speed up cannot be computed." << std::endl;
    }

    release();
    return 0;
}
解决方案
我们可以,您只需要正确的类型。编译并执行以下代码。
#include <iostream>
#include <limits>
// Prints the largest representable value of int, long and long long on the
// platform this program is compiled for, one per line.
int main() {
    const int maxInt = std::numeric_limits<int>::max();
    const long maxLong = std::numeric_limits<long>::max();
    const long long maxLongLong = std::numeric_limits<long long>::max();

    std::cout << "Max int value: " << maxInt << '\n'
              << "Max long value: " << maxLong << '\n'
              << "Max long long value: " << maxLongLong << '\n';
    return 0;
}
您的输出取决于您的 ide/architecture/compiler flags 和其他东西,对我来说是以下内容。
Max int value: 2147483647
Max long value: 2147483647
Max long long value: 9223372036854775807
至于为什么“当 i 至少为 30 时,它会返回一个无意义的答案”:有符号整数溢出是未定义行为(UB),你无法预知编译器在这种情况下会做什么。
推荐阅读
- asp.net-mvc - asp.net mvc中的Excel导出和导入
- microsoft-teams - Microsoft Teams 连接器配置保存失败
- angular - ngdart 生成组件不生成 css 文件?
- node.js - 为什么闪存错误并非一直有效?
- mongodb - 如何将数据库移动到mongodb中的另一个分片
- python-3.x - Python - 浏览器在脚本完成之前关闭请求
- python - 在标签 data-reactid 上使用 python 从 espn 中提取数据
- vcf-vcard - 使用链接到图像文件的 URL 导入的“PHOTO”创建 VCF 文件
- javascript - 如何使用 URL-name 获取对象属性的值?
- typescript - VueJS - 只渲染一次组件