c - mpi 进程在信号 11 上退出
问题描述
我有一个简单的 mpi 程序来执行 MPI_Scatter 和 MPI_Gather。它接收要发送到每个进程的元素的参数数,然后使用 MPI_Scatter 和 MPI_Gather 在不同进程上执行随机数的平均值。
float *create_rand_nums(int num_elements){
float *rand_nums = (float *)malloc(sizeof(float) * num_elements);
assert(rand_nums != NULL);
int i;
for (i = 0 ; i < num_elements; i++){
*(rand_nums+i) = (rand()/ (float)RAND_MAX);
printf("Create_rand_nums func val %f \n", *(rand_nums+i));
//printf("Create_rand_nums func address %f \n", (rand_nums+i));
}
return rand_nums;
}
// Computes the average of an array of numbers
float compute_avg(float *array, int num_elements){
float sum = 0.f;
int i;
for (i = 0; i < num_elements; i++) {
sum += array[i];
}
return sum / num_elements;
}
int main(int argc, char *argv[])
{
if (argc != 2) {
fprintf(stderr, "Usage: avg num_elements_per_proc\n");
exit(1);
}
int num_elements_per_proc = atoi(argv[0]);
// Seed the random number generator to get different results each time
srand(time(NULL));
MPI_Init(NULL, NULL);
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
// Create a random array of elements on the root process. Its total
// size will be the number of elements per process times the number of processes
float *rand_nums = NULL;
if (world_rank == 0){
rand_nums = create_rand_nums(num_elements_per_proc * world_size);
}
// For each process, create a buffer that will hold a subset of the entire array
float *sub_rand_nums = (float *)malloc(sizeof(float) * num_elements_per_proc);
assert(sub_rand_nums != NULL);
// Scatter the random numbers from the root process to all processes in the MPI world
MPI_Scatter(rand_nums, num_elements_per_proc, MPI_FLOAT, sub_rand_nums, num_elements_per_proc, MPI_FLOAT, 0, MPI_COMM_WORLD);
// Compute the average of your subset
float sub_avg = compute_avg(sub_rand_nums, num_elements_per_proc);
int i;
printf("Avg inside %f \n", sub_avg);
// Gather all partial averages down to the root process
float *sub_avgs = NULL;
if (world_rank == 0) {
sub_avgs = (float *)malloc(sizeof(float) * world_size);
assert(sub_avgs != NULL);
}
MPI_Gather(&sub_avg, 1, MPI_FLOAT, sub_avgs, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
for (i = 0; i < world_size; i++) {
printf("Avg %f \n", *(sub_avgs+i));
}
// Now that we have all of the partial averages on the root, compute the
// total average of all numbers. Since we are assuming each process computed
// an average across an equal amount of elements, this computation will
// produce the correct answer.
if (world_rank == 0) {
float avg = compute_avg(sub_avgs, world_size);
printf("Avg of all elements is %f\n", avg);
// Compute the average across the original data for comparison
float original_data_avg =
compute_avg(rand_nums, num_elements_per_proc * world_size);
printf("Avg computed across original data is %f\n", original_data_avg);
}
// Clean up
if (world_rank == 0) {
free(rand_nums);
free(sub_avgs);
}
free(sub_rand_nums);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize();
}
在运行此代码时,我得到:
mpicc -o MPI_Scatter_MPI_Gather MPI_Scatter_MPI_Gather.cc
mpirun --oversubscribe -host localhost -np 4 ./MPI_Scatter_MPI_Gather 2
Avg inside nan
Avg inside nan
Avg inside nan
Avg inside nan
[dhcp-10-142-19] *** Process received signal ***
[dhcp-10-142-19] Signal: Segmentation fault: 11 (11)
[dhcp-10-142-19] Signal code: Address not mapped (1)
[dhcp-10-142-19] Failing at address: 0x0
[dhcp-10-142-19] [ 0] 0 libsystem_platform.dylib 0x00007fff50b65f5a _sigtramp + 26
[dhcp-10-142-19] [ 1] 0 ??? 0x0000000000000000 0x0 + 0
[dhcp-10-142-19] [ 2] 0 libdyld.dylib 0x00007fff508e5145 start + 1
[dhcp-10-142-19] [ 3] 0 ??? 0x0000000000000002 0x0 + 2
[dhcp-10-142-19] *** End of error message ***
[dhcp-10-142-19] *** Process received signal ***
[dhcp-10-142-19] Signal: Segmentation fault: 11 (11)
[dhcp-10-142-19] Signal code: Address not mapped (1)
[dhcp-10-142-19] Failing at address: 0x0
[dhcp-10-142-19] [ 0] 0 libsystem_platform.dylib 0x0000--------------------------------------------------------------------------
mpirun noticed that process rank 3 with PID 0 on node dhcp-10-142-194-10 exited on signal 11 (Segmentation fault: 11).
--------------------------------------------------------------------------
7fff50b65f5a _sigtramp + 26
[dhcp-10-142-19] [ 1] 0 ??? 0x0000000000000000 0x0 + 0
[dhcp-10-142-19] [ 2] 0 libdyld.dylib 0x00007fff508e5145 start + 1
[dhcp-10-142-19] [ 3] 0 ??? 0x0000000000000002 0x0 + 2
[dhcp-10-142-19] *** End of error message ***
[dhcp-10-142-19] *** Process received signal ***
[dhcp-10-142-19] Signal: Segmentation fault: 11 (11)
[dhcp-10-142-19] Signal code: Address not mapped (1)
[dhcp-10-142-19] Failing at address: 0x0
[dhcp-10-142-19] [ 0] 0 libsystem_platform.dylib 0x00007fff50b65f5a _sigtramp + 26
[dhcp-10-142-19] [ 1] 0 ??? 0x0000000000000000 0x0 + 0
[dhcp-10-142-19] [ 2] 0 libdyld.dylib 0x00007fff508e5145 start + 1
[dhcp-10-142-19] [ 3] 0 ??? 0x0000000000000002 0x0 + 2
[dhcp-10-142-19] *** End of error message ***
[dhcp-10-142-19] *** Process received signal ***
[dhcp-10-142-19] Signal: Segmentation fault: 11 (11)
[dhcp-10-142-19] Signal code: (0)
[dhcp-10-142-19] Failing at address: 0x0
[dhcp-10-142-19] [ 0] 0 libsystem_platform.dylib 0x00007fff50b65f5a _sigtramp + 26
[dhcp-10-142-19] [ 1] 0 ??? 0x0000000000000000 0x0 + 0
[dhcp-10-142-19] [ 2] 0 libdyld.dylib 0x00007fff508e5145 start + 1
[dhcp-10-142-19] [ 3] 0 ??? 0x0000000000000002 0x0 + 2
[dhcp-10-142-19] *** End of error message ***
我想先在一台有 2 核的机器上运行这个程序,然后在多台机器上运行。但即使在一台机器上它也失败了。请帮忙
解决方案
正如Gilles Gouaillardet 的评论中所指出的,它的格式atoi(argv[0])
很糟糕,sub-avgs
应该只打印在 0 级。
推荐阅读
- c# - System.TypeLoadException:“MySql.Data.EntityFrameworkCore.Query.Internal.MySQLSqlTranslatingExpressionVisitorFactory”类型中的方法“创建”
- angular - 带有新选项的 Highcharts Angular 更新图表 - 包括链接系列
- macos - 通过 NSAppleScript 发送击键时出现错误 1002
- java - JPA Native Query - 导致未知列错误的参数,未转换为字符串
- firebase - Firebase 安全规则不起作用 - 无法弄清楚原因
- html - 如何在旁边显示固定文本
- php - Firebase 数据库查找密钥
- c++ - 在进行文本频率分析时出现浮点异常错误?
- ffmpeg - 评估 vmaf 时出现损坏的帧错误
- vue.js - Vue 组件中带有加载器的 Storybook 6.1x Args 功能