More processors requested than permitted

Problem description

I parallelized three nested loops with MPI. When I run the code, an error pops up saying "srun: error: Unable to create step for job 20258899: More processors requested than permitted".

Here is the script I use to submit the job.

#!/bin/bash
#SBATCH --partition=workq
#SBATCH --job-name="code"
#SBATCH --nodes=2
#SBATCH --time=1:00:00
#SBATCH --exclusive
#SBATCH --err=std.err
#SBATCH --output=std.out
#---#
module switch PrgEnv-cray PrgEnv-intel
export OMP_NUM_THREADS=1
#---#
echo "The job "${SLURM_JOB_ID}" is running on "${SLURM_JOB_NODELIST}
#---#
srun --ntasks=1000 --cpus-per-task=${OMP_NUM_THREADS} --hint=nomultithread ./example_parallel

My code is pasted below. Can anyone tell me what is wrong with it? Am I using MPI incorrectly? Many thanks.

PROGRAM THREEDIMENSION
USE MPI
IMPLICIT NONE
INTEGER, PARAMETER :: dp = SELECTED_REAL_KIND(p=15,r=14)
INTEGER :: i, j, k, le(3)
REAL (KIND=dp), ALLOCATABLE :: kp(:,:,:,:), kpt(:,:), col1(:), col2(:)
REAL (KIND=dp) :: su, co, tot
INTEGER :: world_size, world_rank, ierr
INTEGER :: world_comm_1st, world_comm_2nd, world_comm_3rd
INTEGER :: th3_dimension_size, th3_dimension_size_max, th3_dimension_rank
INTEGER :: th2_dimension_size, th2_dimension_size_max, th2_dimension_rank
INTEGER :: th1_dimension_size, th1_dimension_size_max, th1_dimension_rank
INTEGER :: proc_1st_dimension_len, proc_2nd_dimension_len, proc_3rd_last_len, proc_i, proc_j, proc_k
REAL (KIND=dp) :: t0, t1

CALL MPI_INIT(ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, world_size, ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, world_rank, ierr)

IF (world_rank == 0) THEN
   t0 = MPI_WTIME()
END IF

le(1) = 1000
le(2) = 600
le(3) = 900
ALLOCATE (kp(le(1),le(2),le(3),3))
ALLOCATE (kpt(le(3),3))
ALLOCATE (col1(le(1)))
ALLOCATE (col2(le(2)))

DO i = 1, le(1), 1
   DO j = 1, le(2), 1
      DO k = 1, le(3), 1
         kp(i,j,k,1) = DBLE(i+j+j+1)
         kp(i,j,k,2) = DBLE(i+j+k+2)
         kp(i,j,k,3) = DBLE(i+j+k+3)
      END DO
   END DO
END DO

proc_1st_dimension_len = (world_size - 1) / le(1) + 1
proc_2nd_dimension_len = (world_size - 1 / (le(1) + le(2))) + 1
proc_3rd_last_len = MOD(world_size - 1, le(1)+le(2)) + 1

IF (world_rank <= proc_3rd_last_len*proc_2nd_dimension_len*proc_1st_dimension_len) THEN
   proc_i = MOD(world_rank,proc_1st_dimension_len)
   proc_j = world_rank / proc_1st_dimension_len
   proc_k = world_rank / (proc_1st_dimension_len*proc_2nd_dimension_len)
ELSE
   proc_i = MOD(world_rank-proc_3rd_last_len,proc_1st_dimension_len-1)
   proc_j = (world_rank-proc_3rd_last_len) / proc_1st_dimension_len-1
   proc_k = (world_rank-proc_3rd_last_len) / (proc_2nd_dimension_len*proc_2nd_dimension_len-1)
END IF

CALL MPI_BARRIER(MPI_COMM_WORLD,ierr)

CALL MPI_COMM_SPLIT(MPI_COMM_WORLD,proc_i,world_rank,world_comm_1st,ierr)
CALL MPI_COMM_SIZE(world_comm_1st,th1_dimension_size,ierr)
CALL MPI_COMM_RANK(world_comm_1st,th1_dimension_rank,ierr)

CALL MPI_COMM_SPLIT(MPI_COMM_WORLD,proc_j,world_rank,world_comm_2nd,ierr)
CALL MPI_COMM_SIZE(world_comm_2nd,th2_dimension_size,ierr)
CALL MPI_COMM_RANK(world_comm_2nd,th2_dimension_rank,ierr)

CALL MPI_COMM_SPLIT(MPI_COMM_WORLD,proc_k,world_rank,world_comm_3rd,ierr)
CALL MPI_COMM_SIZE(world_comm_3rd,th3_dimension_size,ierr)
CALL MPI_COMM_RANK(world_comm_3rd,th3_dimension_rank,ierr)

CALL MPI_BARRIER(MPI_COMM_WORLD,ierr)
CALL MPI_ALLREDUCE(th1_dimension_size,th1_dimension_size_max,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD,ierr)
CALL MPI_ALLREDUCE(th2_dimension_size,th2_dimension_size_max,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD,ierr)

IF (world_rank == 0) THEN
   OPEN (UNIT=3, FILE='out.dat', STATUS='UNKNOWN')
END IF

col1 = 0.0
DO i = 1, le(1), 1
   IF (MOD(i-1,th1_dimension_size_max) /= th1_dimension_rank) CYCLE
   col2 = 0.0
   DO j = 1, le(2), 1
      IF (MOD(j-1,th2_dimension_size_max) /= th2_dimension_rank) CYCLE
      kpt = kp(i,j,:,:)
      su = 0.0
      DO k = 1, le(3), 1
         IF(MOD(k-1,th1_dimension_size*th2_dimension_size) /= th3_dimension_rank) CYCLE
         CALL CAL(kpt(k,3),co)
         su = su + co
      END DO
      CALL MPI_BARRIER(world_comm_3rd,ierr)
      CALL MPI_REDUCE(su,col2(j),1,MPI_DOUBLE,MPI_SUM,0,world_comm_3rd,ierr)
   END DO
   CALL MPI_BARRIER(world_comm_2nd,ierr)
   CALL MPI_REDUCE(col2,col1(i),le(2),MPI_DOUBLE,MPI_SUM,0,world_comm_2nd,ierr)
END DO

CALL MPI_BARRIER(world_comm_1st,ierr)
tot = 0.0
IF (th1_dimension_rank == 0) THEN
   CALL MPI_REDUCE(col1,tot,le(1),MPI_DOUBLE,MPI_SUM,0,world_comm_1st,ierr)
   WRITE (UNIT=3, FMT=*) tot
   CLOSE (UNIT=3)
END IF

DEALLOCATE (kp)
DEALLOCATE (kpt)
DEALLOCATE (col1)
DEALLOCATE (col2)

IF (world_rank == 0) THEN
   t1 = MPI_WTIME()
   WRITE (UNIT=3, FMT=*) 'Total time:', t1 - t0, 'seconds'
END IF

CALL MPI_FINALIZE (ierr)

STOP
END PROGRAM THREEDIMENSION

SUBROUTINE CAL(arr,co)
IMPLICIT NONE
INTEGER, PARAMETER :: dp=SELECTED_REAL_KIND(p=15,r=14)
INTEGER :: i
REAL (KIND=dp) :: arr(3), co

co = 0.0d0
co = co + (arr(1) ** 2 + arr(2) * 3.1d1) / (arr(3) + 5.0d-1)

RETURN
END SUBROUTINE CAL

Tags: fortran, mpi, batch-processing, slurm

Solution


With the #SBATCH directives in the header, you explicitly request two nodes, and, because you do not specify --ntasks, you get the default of one task per node, so you implicitly request only two tasks.
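
A quick way to confirm this is to print the relevant Slurm environment variables from inside the job script. The lines below are only an illustrative sketch; note that SLURM_NTASKS may be left unset when --ntasks is not given, hence the fallback text.

echo "Nodes allocated : ${SLURM_JOB_NUM_NODES}"
echo "Tasks in job    : ${SLURM_NTASKS:-unset (one task per node by default)}"
echo "CPUs on node    : ${SLURM_CPUS_ON_NODE}"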

Then, when the job starts, your srun line tries to use 1000 tasks. You should have the line

#SBATCH --ntasks=1000 

in the header, as suggested by @Gilles. The srun command then inherits these 1000 tasks from the allocation by default, so there is no need to specify --ntasks on the srun line in that case.
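
For reference, the corrected submission script could look like the sketch below. It only adds the --ntasks directive and drops it from the srun line; keep in mind that 1000 tasks fit on two nodes only if each node offers at least 500 cores, otherwise --nodes must be raised or left out so that Slurm sizes the allocation itself.

#!/bin/bash
#SBATCH --partition=workq
#SBATCH --job-name="code"
#SBATCH --nodes=2
#SBATCH --ntasks=1000
#SBATCH --time=1:00:00
#SBATCH --exclusive
#SBATCH --err=std.err
#SBATCH --output=std.out
#---#
module switch PrgEnv-cray PrgEnv-intel
export OMP_NUM_THREADS=1
#---#
echo "The job "${SLURM_JOB_ID}" is running on "${SLURM_JOB_NODELIST}
#---#
# --ntasks is inherited from the allocation, so srun no longer repeats it.
srun --cpus-per-task=${OMP_NUM_THREADS} --hint=nomultithread ./example_parallel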

Also, if ${OMP_NUM_THREADS} is not 1, you must specify --cpus-per-task as an #SBATCH directive in the header as well, otherwise you will run into the same error.
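
As an illustration only (the value 4 below is hypothetical, not taken from the question), a hybrid MPI/OpenMP request would then look roughly like this:

#SBATCH --ntasks=1000        # MPI tasks
#SBATCH --cpus-per-task=4    # cores reserved per task, one per OpenMP thread

export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
# Passing --cpus-per-task to srun explicitly keeps the request consistent
# across Slurm versions that do not propagate it automatically.
srun --cpus-per-task=${SLURM_CPUS_PER_TASK} --hint=nomultithread ./example_parallel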

