fortran - 请求的处理器多于允许的数量
问题描述
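我用 MPI 并行化了三个嵌套循环。当我运行代码时,出现了如下错误信息:“srun: error: Unable to create step for job 20258899: More processors requested than permitted”(srun:错误:无法为作业 20258899 创建作业步:请求的处理器数量超过允许的数量)。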
这是我用来提交作业的脚本。
#!/bin/bash
# Slurm batch script: asks for two exclusive nodes on the "workq" partition
# with a one-hour time limit, sends stderr/stdout to std.err/std.out, and
# launches ./example_parallel through srun.
#SBATCH --partition=workq
#SBATCH --job-name="code"
#SBATCH --nodes=2
#SBATCH --time=1:00:00
#SBATCH --exclusive
#SBATCH --err=std.err
#SBATCH --output=std.out
#---#
# Swap the Cray programming environment module for the Intel one.
module switch PrgEnv-cray PrgEnv-intel
# One OpenMP thread per task; the same value is reused below for --cpus-per-task.
export OMP_NUM_THREADS=1
#---#
# Record the job id and the allocated node list in the job output.
echo "The job "${SLURM_JOB_ID}" is running on "${SLURM_JOB_NODELIST}
#---#
# Start 1000 tasks, each with OMP_NUM_THREADS CPUs, without hardware multithreading.
# NOTE(review): the header above sets --nodes=2 but no --ntasks, while this step
# asks for 1000 tasks; presumably that mismatch is what makes srun reject the
# step as requesting more processors than permitted. Confirm against the
# site's Slurm configuration.
srun --ntasks=1000 --cpus-per-task=${OMP_NUM_THREADS} --hint=nomultithread ./example_parallel
我在下面粘贴我的代码。有人能告诉我我的代码有什么问题吗?我使用的 MPI 是否错误?非常感谢。
PROGRAM THREEDIMENSION
  ! Sums CAL(point) over every point of an le(1) x le(2) x le(3) index grid,
  ! sharing the work between all ranks of MPI_COMM_WORLD, and writes the total
  ! and the elapsed wall-clock time to out.dat from rank 0.
  !
  ! Work distribution: the (i,j) index pairs are flattened into a single range
  ! 0 .. le(1)*le(2)-1 and dealt out round-robin, so rank r handles pairs
  ! r, r+world_size, r+2*world_size, ...  Each rank runs the full k loop for
  ! its pairs and keeps one partial sum; a single MPI_REDUCE combines them.
  ! Every rank therefore reaches the same collective exactly once, and no rank
  ! has to hold the whole grid in memory (the three components of a point are
  ! generated on the fly instead of being stored in a global array).
  USE MPI
  IMPLICIT NONE
  INTEGER, PARAMETER :: dp = SELECTED_REAL_KIND(p=15,r=14)
  INTEGER, PARAMETER :: out_unit = 3
  INTEGER :: i, j, k, le(3)
  INTEGER :: pair, n_pairs
  INTEGER :: world_size, world_rank, ierr
  REAL (KIND=dp) :: point(3), su, co, tot
  REAL (KIND=dp) :: t0, t1

  CALL MPI_INIT(ierr)
  CALL MPI_COMM_SIZE(MPI_COMM_WORLD, world_size, ierr)
  CALL MPI_COMM_RANK(MPI_COMM_WORLD, world_rank, ierr)
  t0 = MPI_WTIME()

  le(1) = 1000
  le(2) = 600
  le(3) = 900
  n_pairs = le(1) * le(2)

  ! Partial sum of this rank over the (i,j) pairs it owns.
  su = 0.0_dp
  DO pair = world_rank, n_pairs - 1, world_size
    ! Recover the 1-based (i,j) indices from the flattened pair number.
    i = MOD(pair, le(1)) + 1
    j = pair / le(1) + 1
    DO k = 1, le(3), 1
      ! NOTE(review): the first component uses j twice (i+j+j+1) while the
      ! other two use i+j+k; kept as written, but it looks like a typo for
      ! i+j+k+1 -- confirm.
      point(1) = DBLE(i+j+j+1)
      point(2) = DBLE(i+j+k+2)
      point(3) = DBLE(i+j+k+3)
      ! CAL expects a 3-element array, so pass the whole point rather than a
      ! single array element.
      CALL CAL(point, co)
      su = su + co
    END DO
  END DO

  ! Combine the partial sums on rank 0. Called unconditionally by every rank.
  ! MPI_DOUBLE_PRECISION is the Fortran handle (MPI_DOUBLE is the C one); this
  ! assumes kind dp is the compiler's DOUBLE PRECISION kind -- if it is not,
  ! obtain a matching datatype with MPI_TYPE_CREATE_F90_REAL instead.
  tot = 0.0_dp
  CALL MPI_REDUCE(su, tot, 1, MPI_DOUBLE_PRECISION, MPI_SUM, 0, MPI_COMM_WORLD, ierr)

  ! Only rank 0 holds the reduced total; it opens, writes and closes the file
  ! so that both output lines go to out.dat.
  IF (world_rank == 0) THEN
    t1 = MPI_WTIME()
    OPEN (UNIT=out_unit, FILE='out.dat', STATUS='UNKNOWN')
    WRITE (UNIT=out_unit, FMT=*) tot
    WRITE (UNIT=out_unit, FMT=*) 'Total time:', t1 - t0, 'seconds'
    CLOSE (UNIT=out_unit)
  END IF

  CALL MPI_FINALIZE (ierr)
END PROGRAM THREEDIMENSION
SUBROUTINE CAL(arr,co)
  ! Evaluates co = (arr(1)**2 + 31*arr(2)) / (arr(3) + 0.5) for one
  ! three-component input.
  !   arr : the three input components; only read.
  !   co  : receives the result; any previous value is overwritten.
  ! There is no guard against arr(3) == -0.5, which makes the denominator zero.
  IMPLICIT NONE
  INTEGER, PARAMETER :: dp=SELECTED_REAL_KIND(p=15,r=14)
  REAL (KIND=dp), INTENT(IN) :: arr(3)
  REAL (KIND=dp), INTENT(OUT) :: co
  co = (arr(1) ** 2 + arr(2) * 3.1d1) / (arr(3) + 5.0d-1)
END SUBROUTINE CAL
解决方案
通过文件头中的 #SBATCH 指令,您显式地请求了两个节点;并且,由于您没有指定 --ntasks,您得到的是每个节点一个任务的默认值,因此您实际上隐式地只请求了两个任务。
然后,当作业开始时,您的 srun 命令行却试图“使用”1000 个任务。正如 @Gilles 所建议的,您应该在文件头中加入这样一行:
#SBATCH --ntasks=1000
默认情况下,srun 命令会继承这 1000 个任务,因此在这种情况下无需再在 srun 命令行中指定它。
此外,如果 ${OMP_NUM_THREADS} 不是 1,则必须在文件头中用 #SBATCH 指令指定 --cpus-per-task,否则您将遇到同样的错误。
推荐阅读
- python - 在不迭代项目的情况下获取 dict 键的值
- json - 如何使用jq创建一个没有值的干净json模板文件
- python - Flask Celery TypeError: send_() 为参数“time_”获取了多个值
- laravel - Laravel: Designing a DB table for schedules
- javascript - javascript 过滤/搜索 Html 列表
- python - django 获取切片后无法过滤查询
- extjs - 更改时未检查 RadioButton
- json - How do I programmatically find the path for an asset file I have in Angular?
- flutter - 如何在 Flutter 应用中创建堆叠图像的 GridView?
- linker-errors - ScalaJS:引用不存在的类 play.twirl.api.Html