首页 > 解决方案 > 如何在启动内核函数之前和之后最小化 cuda 内存相关操作带来的开销?

问题描述

为方便起见,下面是可以编译和运行的示例代码:

module elemWiseOps
   use cudafor
   use cublas
   !
   ! Element-wise device kernels for rank-2 arrays, plus the real-kind
   ! parameters (SP/DP) shared by the rest of the example.
   !
   implicit none
   !
   integer, parameter :: SP = selected_real_kind(6,   37)     ! REAL32
   integer, parameter :: DP = selected_real_kind(15, 307)     ! REAL64
   !
   ! Generic names so call sites can say VMUL/VADD independent of kind.
   interface VMUL
      module procedure dVmul
   end interface

   interface VADD
      module procedure dVadd
   end interface

contains

   ! b := a * b, element-wise; one thread handles one (row, col) pair.
   attributes(global) subroutine dVmul(a, b)
      implicit none
      real(DP), dimension(:,:), device, intent(in)    :: a
      real(DP), dimension(:,:), device, intent(inout) :: b
      ! Thread-local indices and extents.
      integer, device :: row, col, nrow, ncol

      row  = (blockIdx%x - 1)*blockDim%x + threadIdx%x
      col  = (blockIdx%y - 1)*blockDim%y + threadIdx%y
      nrow = size(a, 1)
      ncol = size(a, 2)
      ! Guard: the launch grid is rounded up and may overshoot the array.
      if (row <= nrow .and. col <= ncol) b(row, col) = a(row, col) * b(row, col)
   end subroutine dVmul

   ! b := a + b, element-wise; one thread handles one (row, col) pair.
   attributes(global) subroutine dVadd(a, b)
      implicit none
      real(DP), dimension(:,:), device, intent(in)    :: a
      real(DP), dimension(:,:), device, intent(inout) :: b
      ! Thread-local indices and extents.
      integer, device :: row, col, nrow, ncol

      row  = (blockIdx%x - 1)*blockDim%x + threadIdx%x
      col  = (blockIdx%y - 1)*blockDim%y + threadIdx%y
      nrow = size(a, 1)
      ncol = size(a, 2)
      ! Guard: the launch grid is rounded up and may overshoot the array.
      if (row <= nrow .and. col <= ncol) b(row, col) = a(row, col) + b(row, col)
   end subroutine dVadd

   ! Debug helper: prints every element of a, column by column.
   ! NOTE(review): every thread that executes this prints the whole array,
   ! so it is presumably meant to be launched as <<<1,1>>> -- confirm at
   ! the call site (it is never called in this example).
   attributes(global) subroutine dPrintV(a)
      implicit none
      real(DP), dimension(:,:), device, intent(in) :: a
      integer, device :: row, col, nrow, ncol

      nrow = size(a, 1)
      ncol = size(a, 2)

      do col = 1, ncol
         do row = 1, nrow
            print*, a(row, col)
         end do
      end do
   end subroutine dPrintV

   end module elemWiseOps

   PROGRAM Test
   !
   ! Driver: fills two m x n host arrays, copies them to the device,
   ! applies VADD three times in place on b_d, then releases everything.
   !
   USE cudafor
   USE cublas
   USE elemWiseOps

   !
   IMPLICIT NONE
   !
   REAL(DP), ALLOCATABLE, DIMENSION(:,:)           :: a,   b
   REAL(DP), ALLOCATABLE, DEVICE, DIMENSION(:,:)   :: a_d, b_d
   !
   INTEGER, PARAMETER                              :: m = 500, n = 1000
   INTEGER    :: i1, i2, istat
   type(dim3) :: grid, tBlock
   !
   ! 32x32 = 1024 threads per block; grid rounded up so every element
   ! of the m x n arrays is covered (kernels guard against overshoot).
   tBlock    = dim3(32,32,1)
   grid      = dim3(ceiling(real(m)/tBlock%x), &
                    ceiling(real(n)/tBlock%y), 1)
   !
   ! Allocate storage for the host and device arrays
   !
   Allocate(a(m,n),b(m,n))
   Allocate(a_d(m,n),b_d(m,n))
   !
   ! Initialize the host arrays; i1 innermost matches Fortran's
   ! column-major storage for contiguous access.
   !
   Do i2 = 1, n
       Do i1 = 1, m
           a(i1, i2) = REAL(i1, DP)
           b(i1, i2) = REAL(i1, DP)
       Enddo
   Enddo
   !
   ! Copy to the device arrays. For the typed CUDA Fortran overloads the
   ! pitch/width arguments are in elements, not bytes. The returned
   ! status was previously discarded -- check it explicitly.
   !
   istat = cudaMemcpy2D(a_d, m, a, m, m, n)
   if (istat /= cudaSuccess) error stop 'cudaMemcpy2D a -> a_d failed'
   istat = cudaMemcpy2D(b_d, m, b, m, m, n)
   if (istat /= cudaSuccess) error stop 'cudaMemcpy2D b -> b_d failed'
   !!!
   !
   ! Now invoke the kernels: three in-place accumulations into b_d
   !
   Call vadd<<<grid,tBlock>>>(a_d, b_d)
   Call vadd<<<grid,tBlock>>>(a_d, b_d)
   Call vadd<<<grid,tBlock>>>(a_d, b_d)
   !
   ! Kernel launches are asynchronous: surface any launch error and wait
   ! for completion before tearing down the device arrays.
   !
   istat = cudaGetLastError()
   if (istat /= cudaSuccess) error stop 'kernel launch failed'
   istat = cudaDeviceSynchronize()
   if (istat /= cudaSuccess) error stop 'kernel execution failed'
   !
   ! Free storage for the arrays
   !
   Deallocate(a,b)
   Deallocate(a_d,b_d)
   !
END PROGRAM Test

我编译了代码

nvfortran -O3 -cuda -fast -gpu=cc60 -lcufft -lcublas -Minfo=accel test.f90

然后我用

nvprof --print-api-trace ./a.out

如果我连续 3 次调用内核,会发生以下情况:

... ...
171.32ms  287.59ms  cudaMalloc
458.94ms  98.290us  cudaMalloc
462.69ms  816.76us  cudaMemcpy2D
463.51ms  869.13us  cudaMemcpy2D
464.39ms  134.45us  cudaMalloc
464.52ms  12.703us  cudaMemcpy
464.54ms  5.2460us  cudaMalloc
464.54ms  4.1840us  cudaMemcpy
464.55ms  52.608us  cudaLaunchKernel (elemwiseops_dvadd_ [119])
464.61ms  123.66us  cudaFree
464.73ms  78.753us  cudaFree
464.81ms  107.87us  cudaMalloc
464.92ms  6.6480us  cudaMemcpy
464.92ms  3.7720us  cudaMalloc
464.93ms  3.8070us  cudaMemcpy
464.93ms  10.245us  cudaLaunchKernel (elemwiseops_dvadd_ [126])
464.94ms  124.35us  cudaFree
465.07ms  56.498us  cudaFree
465.12ms  93.173us  cudaMalloc
465.22ms  6.2260us  cudaMemcpy
465.22ms  3.6010us  cudaMalloc
465.23ms  3.6800us  cudaMemcpy
465.23ms  7.6920us  cudaLaunchKernel (elemwiseops_dvadd_ [133])
465.24ms  125.72us  cudaFree
465.37ms  47.850us  cudaFree
465.64ms  60.294us  cudaFree
465.70ms  208.73us  cudaFree

                          

我的问题是,在 cuda Fortran 中调用内核时是否可以避免“cudaMalloc”、“cudaMemcpy”和“cudaFree”的开销?非常感谢!

标签: optimization, cuda, fortran, kernel

解决方案


找到原因。问题出在把延迟形状(deferred-shape,即假定形状)数组用作内核的虚参:改用显式形状数组并按值传入维度后,每次启动内核时就不再创建临时的数组描述符。即,将原始内核修改为以下形式即可解决问题:

! Explicit-shape version of the addition kernel. Passing m and n by
! value lets the arrays be declared with explicit bounds, so no
! per-launch temporary descriptor is created (this is the fix the
! answer describes; compare the nvprof traces above and below).
attributes(global) subroutine dVadd(m, n, a, b)
   implicit none
   ! Array extents, passed by value to each thread.
   integer, value :: m, n
   real(DP), dimension(1:m,1:n), device, intent(in)    :: a
   real(DP), dimension(1:m,1:n), device, intent(inout) :: b
   ! Thread-local indices.
   integer, device :: row, col

   row = (blockIdx%x - 1)*blockDim%x + threadIdx%x
   col = (blockIdx%y - 1)*blockDim%y + threadIdx%y

   ! Guard: launch grid is rounded up and may overshoot the array.
   if (row <= m .and. col <= n) b(row, col) = a(row, col) + b(row, col)

end subroutine dVadd

这是更新的 nvprof 输出:

... ...
171.32ms  287.59ms  cudaMalloc
458.94ms  98.290us  cudaMalloc
462.69ms  816.76us  cudaMemcpy2D
463.51ms  869.13us  cudaMemcpy2D
464.55ms  52.608us  cudaLaunchKernel (elemwiseops_dvadd_ [119])
465.12ms  52.608us  cudaLaunchKernel (elemwiseops_dvadd_ [126])
465.61ms  52.608us  cudaLaunchKernel (elemwiseops_dvadd_ [133])
466.93ms  123.66us  cudaFree
467.75ms  78.753us  cudaFree

这个结论初看与普通 Fortran 的经验相冲突:在主机代码中,把可分配数组作为实参传给声明为延迟形状(DEFERRED-SHAPE)虚参的子例程时,并不会创建临时数组。但 CUDA Fortran 内核的情形不同:假定形状虚参需要在每次启动前把数组描述符复制到设备端、启动后再释放,这正是 nvprof 跟踪中每次 cudaLaunchKernel 前后出现成对 cudaMalloc/cudaMemcpy/cudaFree 的原因。


推荐阅读