Looping in Python with CUDA

Problem Description

I am trying to solve a large number of coupled differential equations in a reasonable amount of time. Solving them with regular NumPy quickly becomes very slow, because for a large number of iterations the number of equations I want to solve is on the order of 10^7.

This is essentially a large set of parallel matrix operations, which seems like a good task for a GPU. Using the numba vectorize and CUDA jit decorators on the equations does speed the code up compared with plain NumPy. The problem I run into is that, to make it work, I copy data back from the GPU at every step of the loop. As I understand it, that is a slow process, so if possible I would like to do this on the GPU and only copy the data back once the calculation is finished, but I am not sure how to set up such a loop to run on the GPU.

Here is a simplified version of what I have so far:



import numpy as np
import matplotlib.pyplot as plt

from numba import jit
from numba import vectorize
from numba import cuda


## size of matrices
N=4

## Time for integration
T=0.1
## Length of Time steps
dt=0.0001
## Number of time steps
n = int(T/dt)


##mockup initial values, real arrays are much larger
F_0 = np.array([[ 0.0, -1.74998928e-03, -1.74998928e-03,  0.0],
 [ 1.74998928e-03,  0.00000000e+00+0.00005j, -4.42925874e-19+1j,  1.74998928e-03-1j,],
 [ 1.74998928e-03,  4.42925874e-19,  0.0,  1.74998928e-03,],
 [ 0.0, -1.74998928e-03, -1.74998928e-03,  0.0]], dtype = np.complex64)


G_0 = np.array([[0.00000000e+00, 3.06247186e-06, 3.06247186e-06, 0.0],
 [3.06247186e-06, 1.0, 1.0, 3.06247186e-06,],
 [3.06247186e-06+0.0005j, 1.0, 1.00000000e+00, 3.06247186e-06-0.0004j,],
 [0.0, 3.06247186e-06+ -0.04j, 3.06247186e-06, 0.0]], dtype = np.complex64)


delta_i = np.complex64(1)



### Time derivatives of functions we want to integrate
@cuda.jit(device=True)
def dFdt(F,G,delta):
    first_factor = -2j*F
    second_factor = -1j*delta*(2*G-1)
    return first_factor+second_factor

@cuda.jit(device=True)
def dGdt(F, delta):
    prefactor = -1j
    secondfactor =(delta.conjugate())*F -delta*F.conjugate()
    return prefactor*secondfactor

#Standard RK4 for the specified equation set, all inputs are NxN complex matrices
@vectorize(['c8(c8, c8,c8,c8,c8)'], target='cuda')
def RK4_step_F(G, F, delta, dt, index):  

    G1 = dGdt(F, delta) 
    F1 = dFdt(F, G, delta)
    G2 = dGdt(F + dt/2*F1, delta)
    F2 = dFdt(F + dt/2*F1, G + dt/2*G1, delta)
    G3 = dGdt(F + dt/2*F2, delta)
    F3 = dFdt(F + dt/2*F2, G + dt/2*G2, delta) 
    F4 = dFdt(F + dt*F3, G + dt*G3, delta)
    return (F+(dt/6)*(F1 +(2*F2) +(2*F3) + F4))


@vectorize(['c8(c8, c8,c8,c8,c8)'], target='cuda')
def RK4_step_G(G, F, delta, dt, index):  

    G1 = dGdt(F, delta) 
    F1 = dFdt(F, G, delta)
    G2 = dGdt(F + dt/2*F1, delta)
    F2 = dFdt(F + dt/2*F1, G + dt/2*G1, delta)
    G3 = dGdt(F + dt/2*F2, delta)
    F3 = dFdt(F + dt/2*F2, G + dt/2*G2, delta)
    G4 = dGdt(F + dt*F3, delta)
    return (G + (dt/6)*(G1+ (2*G2) + (2*G3) + G4)) 


### Runs the integration, F_0 and G_0 are NxN matrices of initial values for the functions F and G, delta_i is the initial value of the parameter delta,
### dt is the step size and n is the number of steps
def RK4_method(F_0, G_0):
    delta = np.zeros(n+1, dtype=np.complex64)
    delta[0]= delta_i
    for i in range(n):

        
        RK4_step_G( G_0, F_0,delta[i], dt, 0, out = out_G)
        RK4_step_F( G_0, F_0,delta[i], dt, 0, out = out_F)
        F_0 = out_F.copy_to_host()
        G_0 = out_G.copy_to_host()

        delta[i+1] = np.sum(F_0)
        print("working")
        
    return delta

###copying initial matrices
G_cuda = cuda.to_device(G_0)
F_cuda = cuda.to_device(F_0)

##defining matrices
out_F = cuda.device_array(shape=(N,N), dtype=np.complex64)
out_G = cuda.device_array(shape=(N,N), dtype=np.complex64)


##start calculation
delta_time = RK4_method(F_cuda,G_cuda)

time = np.linspace(0, T, n + 1)

plt.plot(time[1:], delta_time[1:])
plt.show()



I would like to know how to set up a loop like this without having to copy the data back and forth.

My experience is almost entirely in Python, so CUDA is still somewhat new to me.

Edit: included more complete code

Tags: python, cuda, jit, numba

Solution


This should be a fairly straightforward set of changes to do what you want. Unfortunately, numba vectorize seems to have a quirk that complicates things, so we have to install some extra plumbing to reach the goal. Relatedly, you could probably reduce the overall code complexity by converting everything to @cuda.jit kernels, but since your implementation is focused on @vectorize, I will try to stay as close to that as possible, although in the end I will need one numba CUDA kernel.

The goal is to be able to run your for-loop as a set of back-to-back kernel calls, with the minimum amount of data transfer (host->device or device->host) during the loop processing.

Before we make the changes needed to reach that goal, it may be useful/instructive to run your code as-is under a profiler, to see the "unnecessary" copying per loop iteration that we are trying to avoid. To keep things simple we will reduce the loop to just 2 iterations. We will also drop the plotting, since we are interested in profiling the code, not in the results:

$ cat t39.py
import numpy as np
# import matplotlib.pyplot as plt

from numba import jit
from numba import vectorize
from numba import cuda


## size of matrices
N=4

## Time for integration
T=0.1
## Length of Time steps
dt=0.0001
## Number of time steps
n = int(T/dt)


##mockup initial values, real arrays are much larger
F_0 = np.array([[ 0.0, -1.74998928e-03, -1.74998928e-03,  0.0],
 [ 1.74998928e-03,  0.00000000e+00+0.00005j, -4.42925874e-19+1j,  1.74998928e-03-1j,],
 [ 1.74998928e-03,  4.42925874e-19,  0.0,  1.74998928e-03,],
 [ 0.0, -1.74998928e-03, -1.74998928e-03,  0.0]], dtype = np.complex64)


G_0 = np.array([[0.00000000e+00, 3.06247186e-06, 3.06247186e-06, 0.0],
 [3.06247186e-06, 1.0, 1.0, 3.06247186e-06,],
 [3.06247186e-06+0.0005j, 1.0, 1.00000000e+00, 3.06247186e-06-0.0004j,],
 [0.0, 3.06247186e-06+ -0.04j, 3.06247186e-06, 0.0]], dtype = np.complex64)


delta_i = np.complex64(1)



### Time derivatives of functions we want to integrate
@cuda.jit(device=True)
def dFdt(F,G,delta):
    first_factor = -2j*F
    second_factor = -1j*delta*(2*G-1)
    return first_factor+second_factor

@cuda.jit(device=True)
def dGdt(F, delta):
    prefactor = -1j
    secondfactor =(delta.conjugate())*F -delta*F.conjugate()
    return prefactor*secondfactor

#Standard RK4 for the specified equation set, all inputs are NxN complex matrices
@vectorize(['c8(c8, c8,c8,c8,c8)'], target='cuda')
def RK4_step_F(G, F, delta, dt, index):

    G1 = dGdt(F, delta)
    F1 = dFdt(F, G, delta)
    G2 = dGdt(F + dt/2*F1, delta)
    F2 = dFdt(F + dt/2*F1, G + dt/2*G1, delta)
    G3 = dGdt(F + dt/2*F2, delta)
    F3 = dFdt(F + dt/2*F2, G + dt/2*G2, delta)
    F4 = dFdt(F + dt*F3, G + dt*G3, delta)
    return (F+(dt/6)*(F1 +(2*F2) +(2*F3) + F4))


@vectorize(['c8(c8, c8,c8,c8,c8)'], target='cuda')
def RK4_step_G(G, F, delta, dt, index):

    G1 = dGdt(F, delta)
    F1 = dFdt(F, G, delta)
    G2 = dGdt(F + dt/2*F1, delta)
    F2 = dFdt(F + dt/2*F1, G + dt/2*G1, delta)
    G3 = dGdt(F + dt/2*F2, delta)
    F3 = dFdt(F + dt/2*F2, G + dt/2*G2, delta)
    G4 = dGdt(F + dt*F3, delta)
    return (G + (dt/6)*(G1+ (2*G2) + (2*G3) + G4))


### Runs the integration, F_0 and G_0 are NxN matrices of initial values for the functions F and G, delta_i is the initial value of the parameter delta,
### dt is the step size and n is the number of steps
def RK4_method(F_0, G_0):
    delta = np.zeros(n+1, dtype=np.complex64)
    delta[0]= delta_i
    for i in range(2):


        RK4_step_G( G_0, F_0,delta[i], dt, 0, out = out_G)
        RK4_step_F( G_0, F_0,delta[i], dt, 0, out = out_F)
        F_0 = out_F.copy_to_host()
        G_0 = out_G.copy_to_host()

        delta[i+1] = np.sum(F_0)
#        print("working")

    return delta

###copying initial matrices
G_cuda = cuda.to_device(G_0)
F_cuda = cuda.to_device(F_0)

##defining matrices
out_F = cuda.device_array(shape=(N,N), dtype=np.complex64)
out_G = cuda.device_array(shape=(N,N), dtype=np.complex64)


##start calculation
delta_time = RK4_method(F_cuda,G_cuda)

#time = np.linspace(0, T, n + 1)

#plt.plot(time[1:], delta_time[1:])
#plt.show()
$ nvprof --print-gpu-trace python t39.py
==30045== NVPROF is profiling process 30045, command: python t39.py
==30045== Profiling application: python t39.py
==30045== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
319.56ms  1.4080us                    -               -         -         -         -      128B  86.698MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
319.98ms  1.0560us                    -               -         -         -         -      128B  115.60MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
321.38ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
321.78ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
322.17ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
491.96ms  7.3290us              (1 1 1)       (576 1 1)        56        0B        0B         -           -           -           -  GeForce GTX 960         1         7  cudapy::__main__::__vectorized_RK4_step_G$248(Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>) [84]
493.21ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
493.61ms  1.0560us                    -               -         -         -         -      128B  115.60MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
494.00ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
653.33ms  6.2400us              (1 1 1)       (576 1 1)        56        0B        0B         -           -           -           -  GeForce GTX 960         1         7  cudapy::__main__::__vectorized_RK4_step_F$249(Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>) [130]
653.64ms  1.1840us                    -               -         -         -         -      128B  103.10MB/s      Device    Pageable  GeForce GTX 960         1         7  [CUDA memcpy DtoH]
653.73ms  1.1840us                    -               -         -         -         -      128B  103.10MB/s      Device    Pageable  GeForce GTX 960         1         7  [CUDA memcpy DtoH]
654.53ms  1.0560us                    -               -         -         -         -      128B  115.60MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
654.93ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
655.32ms  1.0560us                    -               -         -         -         -      128B  115.60MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
655.72ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
656.11ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
656.92ms  5.8250us              (1 1 1)       (576 1 1)        56        0B        0B         -           -           -           -  GeForce GTX 960         1         7  cudapy::__main__::__vectorized_RK4_step_G$248(Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>) [169]
657.82ms  1.0560us                    -               -         -         -         -      128B  115.60MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
658.22ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
658.62ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
659.01ms  1.0240us                    -               -         -         -         -      128B  119.21MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
659.40ms  1.0560us                    -               -         -         -         -      128B  115.60MB/s    Pageable      Device  GeForce GTX 960         1         7  [CUDA memcpy HtoD]
660.03ms  5.3450us              (1 1 1)       (576 1 1)        56        0B        0B         -           -           -           -  GeForce GTX 960         1         7  cudapy::__main__::__vectorized_RK4_step_F$249(Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>, Array<complex64, int=1, C, mutable, aligned>) [213]
660.24ms  1.1840us                    -               -         -         -         -      128B  103.10MB/s      Device    Pageable  GeForce GTX 960         1         7  [CUDA memcpy DtoH]
660.33ms  1.1840us                    -               -         -         -         -      128B  103.10MB/s      Device    Pageable  GeForce GTX 960         1         7  [CUDA memcpy DtoH]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy
$

Looking at the nvprof output above, we see that for each of the 2 loop iterations there are 2 kernels being called (one per vectorize function), but also 10 or more copy operations. Six of those operations are due to the scalar/temporary-array issue referred to above: the scalar arguments (delta[i], dt, 0) are wrapped by numba in small temporary device arrays and copied host-to-device on every kernel call.
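
To illustrate that behavior in isolation, here is a minimal, self-contained sketch (the names scaled, x_d, s_d are illustrative, not taken from the code above): with the numba version profiled here, a plain Python scalar passed to a target='cuda' vectorize function is staged in a temporary device array and copied host->device on every call, while an argument that is already a device array does not trigger that per-call copy.

import numpy as np
from numba import vectorize, cuda

@vectorize(['f4(f4, f4)'], target='cuda')
def scaled(x, s):
    return x * s

x_d   = cuda.to_device(np.ones(1024, dtype=np.float32))
out_d = cuda.device_array(1024, dtype=np.float32)

# Passing the Python scalar 2.0 makes numba build a temporary device array
# for it and copy it host->device on every call (the small HtoD copies in
# the trace above):
scaled(x_d, np.float32(2.0), out=out_d)

# Staging the "scalar" once as a device array of the same size removes that
# per-call copy; only the kernel launch remains inside a loop:
s_d = cuda.to_device(np.full(1024, 2.0, dtype=np.float32))
scaled(x_d, s_d, out=out_d)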

We will use the following techniques to improve the per-loop copy situation. We cannot eliminate it entirely, but we can reduce it to a very low level that does not grow as the problem size increases. We will do the following:

  1. Move all the input arrays (e.g. F, G) to the device and leave them there.
  2. Create a ping-pong arrangement in the loop, so that on alternating passes the input arrays become the output arrays and vice versa. The ping-pong means we avoid copying the output back to the input at the end of each loop pass, as you are currently doing.
  3. Because of the numba vectorize issue mentioned earlier, we will manually convert your scalar inputs (e.g. dt) into device arrays of the same size (NxN) as the other input arrays. This was the only way I found to prevent numba from creating its own temporary arrays and copying them to the device on every loop iteration.
  4. You have a host-based operation (np.sum) happening on every pass through the loop. We need to keep that data on the device and use a device-based operation instead. Fortunately, numba provides this via its special reduce functionality.
  5. The result of the reduction can be kept on the device, but it is still just a single value, which puts us right back in the scalar boat. We need to convert it back into an NxN array to avoid the vectorize issue already discussed. I considered several numba "native" approaches such as guvectorize, but eventually gave up and used a numba CUDA kernel (@cuda.jit) to perform the distribute function (take a single value and broadcast it across the whole array). A condensed sketch of steps 2, 4, and 5 follows this list.
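
Here is a small, self-contained toy sketch of how steps 2, 4, and 5 fit together. The names (step, in_x, out_x, dev_s, res_h) are hypothetical, and a trivial update function stands in for the two RK4 step functions; sum_reduce and distribute mirror what the full example below uses.

import numpy as np
from numba import vectorize, cuda

N = 256

@vectorize(['c8(c8, c8)'], target='cuda')
def step(x, s):                     # stand-in for the two RK4 step functions
    return x + 0.001*s

@cuda.reduce
def sum_reduce(a, b):               # device-side pairwise sum (replaces np.sum)
    return a + b

@cuda.jit
def distribute(A, B, s):            # broadcast A[0] across all of B
    i = cuda.grid(1)
    if i < s:
        B[i] = A[0]

in_x  = cuda.to_device(np.ones((N, N), dtype=np.complex64))
out_x = cuda.device_array((N, N), dtype=np.complex64)
dev_s = cuda.to_device(np.full((N, N), np.complex64(1), dtype=np.complex64))  # "scalar" staged as NxN
res_h = np.zeros(10, dtype=np.complex64)
b = 256
g = (N*N + b - 1)//b

for i in range(10):
    step(in_x, dev_s, out=out_x)                 # kernel launch, no host traffic
    in_x, out_x = out_x, in_x                    # ping-pong: output becomes next input (step 2)
    sum_reduce(in_x.reshape(N*N), res=dev_s.reshape(N*N))          # device-side sum (step 4)
    distribute[g, b](dev_s.reshape(N*N), dev_s.reshape(N*N), N*N)  # rebroadcast to NxN (step 5)
    res_h[i] = dev_s[0, 0]                       # single complex64 D->H copy per iteration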

With those significant changes in place, we end up with code that performs, per loop iteration, only a single device-to-device copy of a single complex64 quantity (produced by the reduce operation in step 4) and, at the end of each loop iteration, a single device-to-host copy of a single complex64 quantity, used to transfer each per-iteration result back to a host array.

For a small test case (e.g. N = 4) this makes almost no difference, and the whole problem is trivial anyway. For the larger test case I show below, these refactoring changes give roughly a 5x improvement in overall performance. The improvement will vary with the number of loop iterations ultimately executed, and depends heavily on the problem size.

Here is a full example, which you can profile using techniques like the ones shown above (I suggest reducing the loop count to a manageable value, e.g. 3) to see the difference in per-loop-iteration behavior:

$ cat t40.py
import numpy as np
# import matplotlib.pyplot as plt
import time
from numba import jit
from numba import vectorize
from numba import cuda


## size of matrices
N=4

## Time for integration
T =0.01
## Length of Time steps
dt=0.0001
## Number of time steps
n = int(T/dt)


##mockup initial values, real arrays are much larger
F_0 = np.array([[ 0.0, -1.74998928e-03, -1.74998928e-03,  0.0],
 [ 1.74998928e-03,  0.00000000e+00+0.00005j, -4.42925874e-19+1j,  1.74998928e-03-1j,],
 [ 1.74998928e-03,  4.42925874e-19,  0.0,  1.74998928e-03,],
 [ 0.0, -1.74998928e-03, -1.74998928e-03,  0.0]], dtype = np.complex64)


G_0 = np.array([[0.00000000e+00, 3.06247186e-06, 3.06247186e-06, 0.0],
 [3.06247186e-06, 1.0, 1.0, 3.06247186e-06,],
 [3.06247186e-06+0.0005j, 1.0, 1.00000000e+00, 3.06247186e-06-0.0004j,],
 [0.0, 3.06247186e-06+ -0.04j, 3.06247186e-06, 0.0]], dtype = np.complex64)
#create a larger test case
N=512
F_0 = np.ones((N,N), dtype=np.complex64)
G_0 = np.ones((N,N), dtype=np.complex64)


delta_i = np.complex64(1)



### Time derivatives of functions we want to integrate
@cuda.jit(device=True)
def dFdt(F,G,delta):
    first_factor = -2j*F
    second_factor = -1j*delta*(2*G-1)
    return first_factor+second_factor

@cuda.jit(device=True)
def dGdt(F, delta):
    prefactor = -1j
    secondfactor =(delta.conjugate())*F -delta*F.conjugate()
    return prefactor*secondfactor

#Standard RK4 for the specified equation set, all inputs are NxN complex matrices
@vectorize(['c8(c8, c8,c8,c8,c8)'], target='cuda')
def RK4_step_F(G, F, delta, dt, index):

    G1 = dGdt(F, delta)
    F1 = dFdt(F, G, delta)
    G2 = dGdt(F + dt/2*F1, delta)
    F2 = dFdt(F + dt/2*F1, G + dt/2*G1, delta)
    G3 = dGdt(F + dt/2*F2, delta)
    F3 = dFdt(F + dt/2*F2, G + dt/2*G2, delta)
    F4 = dFdt(F + dt*F3, G + dt*G3, delta)
    return (F+(dt/6)*(F1 +(2*F2) +(2*F3) + F4))


@vectorize(['c8(c8, c8,c8,c8,c8)'], target='cuda')
def RK4_step_G(G, F, delta, dt, index):

    G1 = dGdt(F, delta)
    F1 = dFdt(F, G, delta)
    G2 = dGdt(F + dt/2*F1, delta)
    F2 = dFdt(F + dt/2*F1, G + dt/2*G1, delta)
    G3 = dGdt(F + dt/2*F2, delta)
    F3 = dFdt(F + dt/2*F2, G + dt/2*G2, delta)
    G4 = dGdt(F + dt*F3, delta)
    return (G + (dt/6)*(G1+ (2*G2) + (2*G3) + G4))

#code modifications begin here, the above functions are unmodified

@cuda.reduce
def sum_reduce(a, b):
    return a+b

@cuda.jit
def distribute(A, B, s):
    i = cuda.grid(1)
    if i < s:
        B[i] = A[0]


### Runs the integration, F_0 and G_0 are NxN matrices of initial values for the functions F and G, delta_i is the initial value of the parameter delta,
### dt is the step size and n is the number of steps
def RK4_method_new(F_arr, G_arr):
    delta_h = np.zeros(shape=(n+1), dtype=np.complex64)
    delta_h[0] = delta_i
    ##defining device matrices
    in_F = cuda.to_device(F_arr)
    in_G = cuda.to_device(G_arr)
    out_F = cuda.device_array(shape=(N,N), dtype=np.complex64)
    out_G = cuda.device_array(shape=(N,N), dtype=np.complex64)
    #setup "scalar arrays"
    dt_arr    = np.full((N,N), np.complex64(dt), dtype=np.complex64)
    dev_dt    = cuda.to_device(dt_arr)
    index_arr = np.full((N,N), np.complex64(0) , dtype=np.complex64)
    dev_index = cuda.to_device(index_arr)  # stage the (unused) index argument as a device array too
    delta_arr = np.full((N,N), delta_i, dtype=np.complex64)
    dev_delta = cuda.to_device(delta_arr)
    b = 256
    g = (N*N+b-1)//b
    for i in range(n):

        RK4_step_G( in_G, in_F,dev_delta, dev_dt, dev_index, out = out_G)
        RK4_step_F( in_G, in_F,dev_delta, dev_dt, dev_index, out = out_F)
        # ping-pong
        tmp_G = in_G
        tmp_F = in_F
        in_G = out_G
        in_F = out_F
        out_G = tmp_G
        out_F = tmp_F

        sum_reduce(in_F.reshape(N*N), res = dev_delta.reshape(N*N)) # causes 8 byte D->D copy
        distribute[g,b](dev_delta.reshape(N*N), dev_delta.reshape(N*N),N*N)
        delta_h[i+1] = dev_delta[0,0] # causes 8 byte D->H copy
#        print("working")
    return delta_h

def RK4_method(F_0, G_0):
    delta = np.zeros(n+1, dtype=np.complex64)
    delta[0]= delta_i
    out_F = cuda.device_array(shape=(N,N), dtype=np.complex64)
    out_G = cuda.device_array(shape=(N,N), dtype=np.complex64)
    for i in range(n):


        RK4_step_G( G_0, F_0,delta[i], dt, 0, out = out_G)
        RK4_step_F( G_0, F_0,delta[i], dt, 0, out = out_F)
        F_0 = out_F.copy_to_host()
        G_0 = out_G.copy_to_host()

        delta[i+1] = np.sum(F_0)
    #    print("working")

    return delta

##start calculation
# "warm-up runs, to get JIT compilation effects removed from timing comparison
delta_time     = RK4_method(F_0, G_0)
delta_time_new = RK4_method_new(F_0,G_0)
s = time.time()
delta_time     = RK4_method(F_0, G_0)
e = time.time()
print("old time: ", e-s)
s = time.time()
delta_time_new = RK4_method_new(F_0,G_0)
e = time.time()
print("new time: ", e-s)
for i in range(3):
    print("delta time: ", delta_time[i], " delta_time_new: ", delta_time_new[i])
#time = np.linspace(0, T, n + 1)

#plt.plot(time[1:], delta_time[1:])
#plt.show()
$ python t40.py
old time:  2.48877215385437
new time:  0.5504293441772461
delta time:  (1+0j)  delta_time_new:  (1+0j)
delta time:  (262143.98-78.64321j)  delta_time_new:  (262143.98-78.64322j)
delta time:  (1361288.4+3141394700j)  delta_time_new:  (1361289.1+3141394700j)
$
