optimization - 如何在启动内核函数之前和之后最小化 cuda 内存相关操作带来的开销?
问题描述
为方便起见,下面是可以编译和运行的示例代码:
module elemWiseOps
USE cudafor
USE cublas
!
! Definition of symbols for real types (RP) and complex ones (CP)
!
IMPLICIT NONE
!
! Portable kind parameters: SP ~ IEEE single, DP ~ IEEE double.
INTEGER, PARAMETER :: SP = SELECTED_REAL_KIND(6, 37) ! REAL32
INTEGER, PARAMETER :: DP = SELECTED_REAL_KIND(15, 307) ! REAL64
!
! Generic names for the element-wise kernels below (currently only
! the double-precision specifics exist).
INTERFACE VMUL
MODULE PROCEDURE dVmul
END INTERFACE
INTERFACE VADD
MODULE PROCEDURE dVadd
END INTERFACE
Contains
! Element-wise multiply: b(i,j) = a(i,j) * b(i,j), one thread per element
! of a 2-D grid. NOTE(review): the assumed-shape dummies a(:,:)/b(:,:)
! make the compiler build and copy an array descriptor for every launch —
! this is the source of the per-launch cudaMalloc/cudaMemcpy/cudaFree
! overhead discussed in this article (see the nvprof trace below).
attributes(global) subroutine dVmul(a, b)
!
IMPLICIT NONE
!
REAL(DP), DIMENSION(:,:), DEVICE, INTENT(in) :: a
REAL(DP), DIMENSION(:,:), DEVICE, INTENT(inout) :: b
!
! local variables
!
! i1/j: global 2-D thread coordinates (1-based); n holds the array extents.
INTEGER, DEVICE :: i1, j, n(2)
i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
j = (blockIdx%y-1)*blockDim%y + threadIdx%y
n(1) = size(a,1)
n(2) = size(a,2)
! Bounds guard: the launch grid is rounded up, so edge threads must be masked.
if (i1<=n(1) .and. j<=n(2)) b(i1,j) = a(i1,j) * b(i1,j)
end subroutine dVmul
! Element-wise add: b(i,j) = a(i,j) + b(i,j). Same structure (and the
! same assumed-shape-argument overhead) as dVmul above.
attributes(global) subroutine dVadd(a, b)
!
IMPLICIT NONE
!
REAL(DP), DIMENSION(:,:), DEVICE, INTENT(in) :: a
REAL(DP), DIMENSION(:,:), DEVICE, INTENT(inout) :: b
!
! local variables
!
INTEGER, DEVICE :: i1, j, n(2)
i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
j = (blockIdx%y-1)*blockDim%y + threadIdx%y
n(1) = size(a,1)
n(2) = size(a,2)
if (i1<=n(1) .and. j<=n(2)) b(i1,j) = a(i1,j) + b(i1,j)
end subroutine dVadd
! Debug helper: sequentially prints every element of a device array in
! column-major order. Intended to be launched with a single thread
! (<<<1,1>>>) — every launched thread would otherwise print the whole array.
attributes(global) subroutine dPrintV(a)
!
IMPLICIT NONE
!
REAL(DP), DIMENSION(:,:), DEVICE, INTENT(in) :: a
!
! local variables
!
INTEGER, DEVICE :: i1, j1, n(2)
n(1) = size(a,1)
n(2) = size(a,2)
DO j1 = 1, n(2)
Do i1 = 1, n(1)
print*, a(i1,j1)
ENDDO
ENDDO
end subroutine dPrintV
end module elemWiseOps
PROGRAM Test
!
! This is the main program for Test
!
! Driver used to reproduce the nvprof trace discussed in this article:
! it initializes two m-by-n host arrays, copies them to the device,
! and launches the VADD kernel three times back-to-back so the
! per-launch CUDA API overhead can be observed.
!
USE cudafor
USE cublas
USE elemWiseOps
!
IMPLICIT NONE
!
! a, b: host arrays; a_d, b_d: their device mirrors.
REAL(DP), ALLOCATABLE, DIMENSION(:,:) :: a, b
REAL(DP), ALLOCATABLE, DEVICE, DIMENSION(:,:) :: a_d, b_d
!
INTEGER, PARAMETER :: m = 500, n = 1000
INTEGER :: i1, i2, istat
type(dim3) :: grid, tBlock
!
! 32x32 thread blocks; the grid is rounded up so every element is covered
! (the kernels mask out-of-range threads).
tBlock = dim3(32,32,1)
grid = dim3(ceiling(real(m)/tBlock%x), &
ceiling(real(n)/tBlock%y), 1)
!
! Allocate storage for the arrays
!
Allocate(a(m,n),b(m,n))
Allocate(a_d(m,n),b_d(m,n))
!
! Initialize the host arrays
!
! Column-major fill: a(i1,i2) = b(i1,i2) = i1 (row index as a real value).
Do i2 = 1, n
Do i1 = 1, m
a(i1, i2) = REAL(i1, DP)
b(i1, i2) = REAL(i1, DP)
Enddo
Enddo
!
! Copy to the device arrays
!
! cudaMemcpy2D(dst, dpitch, src, spitch, width, height); pitches and width
! are in elements here since both sides are contiguous m-by-n arrays.
! NOTE(review): istat is not checked — fine for a demo, not for production.
istat = cudaMemcpy2D(a_d, m, a, m, m, n)
istat = cudaMemcpy2D(b_d, m, b, m, m, n)
!!!
!
! Now invoke the kernels
!
! Three identical launches so nvprof shows the per-launch API pattern;
! each launch computes b_d = a_d + b_d.
Call vadd<<<grid,tBlock>>>(a_d, b_d)
Call vadd<<<grid,tBlock>>>(a_d, b_d)
Call vadd<<<grid,tBlock>>>(a_d, b_d)
!
! Free storage for the arrays
!
Deallocate(a,b)
Deallocate(a_d,b_d)
!
END PROGRAM Test
我编译了代码
nvfortran -O3 -cuda -fast -gpu=cc60 -lcufft -lcublas -Minfo=accel test.f90
然后我用
nvprof --print-api-trace ./a.out
如果我连续 3 次调用内核,会发生以下情况:
... ...
171.32ms 287.59ms cudaMalloc
458.94ms 98.290us cudaMalloc
462.69ms 816.76us cudaMemcpy2D
463.51ms 869.13us cudaMemcpy2D
464.39ms 134.45us cudaMalloc
464.52ms 12.703us cudaMemcpy
464.54ms 5.2460us cudaMalloc
464.54ms 4.1840us cudaMemcpy
464.55ms 52.608us cudaLaunchKernel (elemwiseops_dvadd_ [119])
464.61ms 123.66us cudaFree
464.73ms 78.753us cudaFree
464.81ms 107.87us cudaMalloc
464.92ms 6.6480us cudaMemcpy
464.92ms 3.7720us cudaMalloc
464.93ms 3.8070us cudaMemcpy
464.93ms 10.245us cudaLaunchKernel (elemwiseops_dvadd_ [126])
464.94ms 124.35us cudaFree
465.07ms 56.498us cudaFree
465.12ms 93.173us cudaMalloc
465.22ms 6.2260us cudaMemcpy
465.22ms 3.6010us cudaMalloc
465.23ms 3.6800us cudaMemcpy
465.23ms 7.6920us cudaLaunchKernel (elemwiseops_dvadd_ [133])
465.24ms 125.72us cudaFree
465.37ms 47.850us cudaFree
465.64ms 60.294us cudaFree
465.70ms 208.73us cudaFree
我的问题是,在 cuda Fortran 中调用内核时是否可以避免“cudaMalloc”、“cudaMemcpy”和“cudaFree”的开销?非常感谢!
解决方案
找到原因了。问题出在把假定形状(assumed-shape,即声明为 (:,:))数组用作内核的哑元参数;改用显式形状(explicit-shape)数组后,编译器不再为每次启动创建临时的数组描述符。即,将原始内核修改为以下形式即可解决问题:
! Fixed version of dVadd: the extents are passed explicitly by value and
! the array dummies are declared explicit-shape (1:m,1:n) instead of
! assumed-shape (:,:). With no descriptor to build, the compiler no longer
! emits the per-launch cudaMalloc/cudaMemcpy/cudaFree calls seen in the
! original trace. Note the caller must now pass m and n:
! call vadd<<<grid,tBlock>>>(m, n, a_d, b_d)
attributes(global) subroutine dVadd(m, n, a, b)
!
IMPLICIT NONE
!
! value: kernel scalar arguments are passed by value (required for
! host-supplied scalars in a global subroutine).
integer, value :: m, n
REAL(DP), DIMENSION(1:m,1:n), DEVICE, INTENT(in) :: a
REAL(DP), DIMENSION(1:m,1:n), DEVICE, INTENT(inout) :: b
!
! local variables
!
INTEGER, DEVICE :: i1, j
i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
j = (blockIdx%y-1)*blockDim%y + threadIdx%y
! Bounds guard against the rounded-up launch grid.
if (i1<=m .and. j<=n) b(i1,j) = a(i1,j) + b(i1,j)
end subroutine dVadd
这是更新的 nvprof 输出:
... ...
171.32ms 287.59ms cudaMalloc
458.94ms 98.290us cudaMalloc
462.69ms 816.76us cudaMemcpy2D
463.51ms 869.13us cudaMemcpy2D
464.55ms 52.608us cudaLaunchKernel (elemwiseops_dvadd_ [119])
465.12ms 52.608us cudaLaunchKernel (elemwiseops_dvadd_ [126])
465.61ms 52.608us cudaLaunchKernel (elemwiseops_dvadd_ [133])
466.93ms 123.66us cudaFree
467.75ms 78.753us cudaFree
这个结论在某种程度上与我在普通 Fortran 代码中的经验相冲突:在普通(主机端)Fortran 中,把可分配数组作为实际参数传递给声明了假定形状(assumed-shape)哑元参数的子例程时,并不应该创建临时数组——但对 CUDA Fortran 的 global 内核而言,假定形状参数需要在每次启动时向设备传递数组描述符,这正是额外的 cudaMalloc/cudaMemcpy/cudaFree 开销的来源。
推荐阅读
- excel - 如果写入同一行的一个单元格,则在空单元格上写入 N/A 或“-”
- javascript - 从本地存储导入 svg/png 图像
- python - Heroku 没有找到 procfile - Flask-SocketIO
- julia - Julia:Flux.jl:“函数渐变不接受关键字参数”
- vba - 如何使用 VBA 将我的 PowerPoint 保存为只读 PDF 文件?
- linux - 我如何将我的软件发布到公共 linux 存储库,以便与“apt”安装程序等一起使用
- python - 在类中创建一个接受虚拟参数的方法
- excel - 如何避免重复记录?
- javascript - 当我单击一个 div 时,我想隐藏所有其他 div
- cassandra - Cassandra 的替代 OR 查询