python - 将回溯算法移植到 cuda 内核时遇到问题
问题描述
我一直在尝试用 numba.cuda 优化我的图像处理库,大部分都成功了。但是,下面的回溯函数却让我很头疼。
该函数接收一个代表图像的 numpy.ndarray,类型为 uint8,形状为 (r, c, 3)。它计算类型为 float32、形状为 (r, c) 的能量图。然后它通过能量图回溯,计算函数返回的回溯数组和最小能量数组。
import numpy as np
from numba import cuda
# 2-D Sobel-style derivative kernels, replicated across the three colour
# channels so each constant has shape (3, 3, 3) and can be applied to an
# (r, c, 3) image.
_DU_BASE = np.array([
    [1.0, 2.0, 1.0],
    [0.0, 0.0, 0.0],
    [-1.0, -2.0, -1.0],
])
_DV_BASE = np.array([
    [1.0, 0.0, -1.0],
    [2.0, 0.0, -2.0],
    [1.0, 0.0, -1.0],
])
FILTER_DU = np.stack((_DU_BASE,) * 3, axis=2)
FILTER_DV = np.stack((_DV_BASE,) * 3, axis=2)
@cuda.jit
def b_convolve(result, img):
    """CUDA kernel: one thread per pixel, writes |du*img| + |dv*img| into result.

    result -- 2-D float output array, same (rows, cols) as img's first two dims
    img    -- 3-D image array with 3 channels in the last dimension
    """
    filter_du = cuda.const.array_like(FILTER_DU)
    filter_dv = cuda.const.array_like(FILTER_DV)
    # Coordinates of this thread in the 2-D launch grid.
    i, j = cuda.grid(2)
    img_x, img_y, img_z = img.shape
    # Threads that fall outside the image do nothing.
    if i >= img_x or j >= img_y:
        return
    # Half-widths of the (odd-sized) filter, for centring it on (i, j).
    delta_x = filter_du.shape[0] // 2
    delta_y = filter_du.shape[1] // 2
    # Accumulate the two directional responses separately; pixels whose
    # filter window falls outside the image are skipped (zero padding).
    s1 = 0
    s2 = 0
    for k in range(filter_du.shape[0]):
        for l in range(filter_du.shape[1]):
            i_k = i - k + delta_x
            j_l = j - l + delta_y
            # Only accumulate when (i_k, j_l) lies inside the image.
            if 0 <= i_k < img_x and 0 <= j_l < img_y:
                # NOTE(review): channel indices (1, 0, -1) visit channels in
                # the order 1, 0, 2. Since FILTER_DU/FILTER_DV replicate the
                # same 2-D kernel on every channel, the permutation does not
                # change the sum — but it looks unintentional; confirm.
                s1 += filter_du[k, l, 0] * img[i_k, j_l, 1]
                s1 += filter_du[k, l, 1] * img[i_k, j_l, 0]
                s1 += filter_du[k, l, 2] * img[i_k, j_l, -1]
                s2 += filter_dv[k, l, 0] * img[i_k, j_l, 1]
                s2 += filter_dv[k, l, 1] * img[i_k, j_l, 0]
                s2 += filter_dv[k, l, 2] * img[i_k, j_l, -1]
    result[i, j] = abs(s1) + abs(s2)
def calc_energy(img):
    """Compute the per-pixel energy map of *img* on the GPU.

    img is expected to be an (r, c, 3) array (presumably uint8 — see the
    surrounding text); it is cast to contiguous float32 before the kernel
    launch. Returns an (r, c) float32 array.
    """
    img = np.ascontiguousarray(img.astype('float32'))
    # Allocate float32 directly; the original np.empty(...).astype('float32')
    # allocated a float64 buffer and then copied it.
    energy_map = np.empty(img.shape[:2], dtype=np.float32)
    blockdim = (32, 32)
    # +1 block per axis guarantees coverage; the kernel bounds-checks extras.
    griddim = (img.shape[0] // blockdim[0] + 1, img.shape[1] // blockdim[1] + 1)
    b_convolve[griddim, blockdim](energy_map, img)
    return energy_map
def create_backtrack(img):
    """Serial (host) backtrack: returns the cumulative-minimum-energy map M
    and the backtrack index array for seam carving.

    backtrack[i, j] holds the column in row i-1 from which the minimum-energy
    path into (i, j) arrives.
    """
    r, c, _ = img.shape
    energy_map = calc_energy(img)
    M = energy_map.copy()
    # np.int was a deprecated alias for the builtin int (removed in
    # NumPy 1.24); use int directly.
    backtrack = np.zeros_like(M, dtype=int)
    for i in range(1, r):
        for j in range(0, c):
            # Handle the left edge so we never index column -1; on the right
            # edge numpy slicing clamps j+2 past the end automatically.
            if j == 0:
                idx = np.argmin(M[i - 1, j:j + 2])
                backtrack[i, j] = idx + j
                min_energy = M[i - 1, idx + j]
            else:
                idx = np.argmin(M[i - 1, j - 1:j + 2])
                backtrack[i, j] = idx + j - 1
                min_energy = M[i - 1, idx + j - 1]
            M[i, j] += min_energy
    return M, backtrack
下面是我尝试将此函数移植到 cuda 内核的结果。注意 idx + j 被替换为 idx,因为 cuda_argmin 已经考虑了 j(或者至少应该如此)。
@cuda.jit('int32(float32[:,:], int32, int32, int32)', device=True)
def cuda_argmin(M, i, j, k):
    """Device helper: column index of the smallest value in M[i, j:k].

    Unlike numpy.argmin on a slice, this returns the absolute column index,
    not an offset relative to j.
    """
    best_val = M[i, j]
    best_col = j
    for col in range(j + 1, k):
        val = M[i, col]
        if val < best_val:
            best_val = val
            best_col = col
    return best_col
@cuda.jit('void(int32[:,:], float32[:,:])')
def cuda_backtrack(backtrack, M):
    """The asker's first (incorrect) kernel attempt — kept as posted.

    NOTE(review): one thread per (i, j) both reads M[i-1, *] and writes
    M[i, j] with no ordering between rows, so results race; the right edge
    is not clamped; and the else-branch subtracts 1 from an already-absolute
    cuda_argmin result. These defects are exactly what the answer below
    discusses.
    """
    i, j = cuda.grid(2)
    # Row 0 has no predecessor row.
    if i == 0:
        return
    if j == 0:
        idx = cuda_argmin(M, i - 1, j, j + 2)
        backtrack[i, j] = idx
        min_energy = M[i - 1, idx]
    else:
        idx = cuda_argmin(M, i - 1, j - 1, j + 2)
        backtrack[i, j] = idx - 1
        min_energy = M[i - 1, idx - 1]
    M[i, j] += min_energy
def create_backtrack(img):
    """GPU wrapper for the asker's cuda_backtrack kernel.

    Launches one thread per matrix element; returns (M, backtrack).
    """
    r, c, _ = img.shape
    energy_map = calc_energy(img)
    M = energy_map.copy()
    # np.int is a removed NumPy alias (gone since 1.24); use builtin int.
    backtrack = np.zeros_like(M, dtype=int)
    blockdim = (32, 32)
    griddim = (r // blockdim[0] + 1, c // blockdim[1] + 1)
    cuda_backtrack[griddim, blockdim](backtrack, M)
    return M, backtrack
查看给定4x4 图像的函数输出,我们可以看到结果甚至不接近匹配。
# Regular backtracking function
M = array([[1750., 1622., 1752., 1176.],
[3364., 2982., 3124., 2538.],
[5060., 4754., 4228., 4212.],
[6278., 6528., 6354., 5866.]], dtype=float32)
backtrack = array([[0, 0, 0, 0],
[1, 1, 3, 3],
[1, 1, 3, 3],
[1, 2, 3, 3]])
# Cuda backtrack function
M = array([[4.2e-45, 2.9e-44, 3.1e-44, 3.2e-44],
[3.6e-44, 1.5e-44, 1.7e-44, 2.0e-44],
[2.2e-44, 2.8e-44, 2.8e-44, 2.9e-44],
[3.2e-44, 3.8e-44, 4.2e-44, 4.3e-44]], dtype=float32)
backtrack = array([[ 0, -1, 2, 3],
[ 4, 0, 2, 2],
[ 2, 4, 4, 6],
[ 6, 9, 10, 11]])
我显然做错了什么,但我不太清楚在哪里。为什么输出变化如此之大?
编辑:根据@talonmies 的建议,我将该calc_energy
函数包含在代码中,因此只要您拥有numba
并numpy
安装它,它至少可以运行。
解决方案
您的代码中至少有 3 个问题,其中第一个问题已经被指出:
您同时
M
用作输入和输出。的输出值M
取决于 的其他值M
。由于 CUDA 没有提供有关线程执行顺序的语句,因此将 CUDA 线程分配给 的每个输出值M
通常不会给出与您的串行版本相同的结果。然而,我们可以观察到M
给定行 (i
) 的值仅取决于M
前一行 (i-1
) 的值。这意味着我们可以跨行并行化,但不能跨列并行化(至少不容易)。在创建
backtrack
数组时,您似乎正确处理了左边缘(j==0
),但没有正确处理右边缘(j==c-1
)。此编码错误将导致 cuda 输出和 numpy 输出之间存在数值差异。
您的两个 argmin 函数返回的内容并不相同,而您没有正确处理这一差异。numpy 的 argmin 返回 0、1 或 2;当您把 j-1 加到该值上时,backtrack 条目的可能取值为 j-1、j 或 j+1。而您的 cuda argmin 函数直接返回绝对列下标 j-1、j 或 j+1;当您再减去 1 时,这些值就与 numpy 的结果不匹配了。
正如建议的那样,我们可以通过跨行并行来解决第一个问题。然而,在我们可以继续计算下一行之前,必须完全计算每一行。为了有效地使用它,我们需要一个跨所有 CUDA 线程的同步点。CUDA 在块级别提供此功能,但在网格级别的 CUDA 中提供此功能并未通过 numba CUDA 公开。
以下代码已解决了足够多的上述项目,因此至少两种不同方法的输出匹配。然而,剩下的问题与第 1 项有关。这种方法只使用单个线程块,以便我们可以利用所有线程的每行同步。对于 GPU 上的性能,这不是一个好的最终解决方案。如果您有多组这些操作要做,那么您可以通过为每个块分配一个图像处理任务来并行执行这些操作,并以这种方式获得良好的性能。
$ cat t48.py
import numpy as np
from numba import cuda
# Derivative kernels for the energy computation. Each base kernel is 2-D;
# stacking three copies along a new last axis gives shape (3, 3, 3), one
# identical slice per colour channel.
_DU_KERNEL = np.array([
    [1.0, 2.0, 1.0],
    [0.0, 0.0, 0.0],
    [-1.0, -2.0, -1.0],
])
_DV_KERNEL = np.array([
    [1.0, 0.0, -1.0],
    [2.0, 0.0, -2.0],
    [1.0, 0.0, -1.0],
])
FILTER_DU = np.stack((_DU_KERNEL,) * 3, axis=2)
FILTER_DV = np.stack((_DV_KERNEL,) * 3, axis=2)
@cuda.jit
def b_convolve(result, img):
    """CUDA kernel computing the energy value for one pixel per thread.

    Writes abs(du response) + abs(dv response) of the 3x3x3 filters centred
    on (i, j) into result[i, j]; out-of-image taps are skipped (zero pad).
    """
    filter_du = cuda.const.array_like(FILTER_DU)
    filter_dv = cuda.const.array_like(FILTER_DV)
    # This thread's pixel coordinates.
    i, j = cuda.grid(2)
    img_x, img_y, img_z = img.shape
    # Discard threads launched past the image bounds.
    if i >= img_x or j >= img_y:
        return
    # Offsets that centre the filter window on the current pixel.
    delta_x = filter_du.shape[0] // 2
    delta_y = filter_du.shape[1] // 2
    s1 = 0
    s2 = 0
    for k in range(filter_du.shape[0]):
        for l in range(filter_du.shape[1]):
            i_k = i - k + delta_x
            j_l = j - l + delta_y
            # Skip taps whose (i_k, j_l) falls outside the image.
            if 0 <= i_k < img_x and 0 <= j_l < img_y:
                # NOTE(review): channels are read in order 1, 0, 2; harmless
                # here because the filter slices are identical per channel,
                # but worth confirming against the original intent.
                s1 += filter_du[k, l, 0] * img[i_k, j_l, 1]
                s1 += filter_du[k, l, 1] * img[i_k, j_l, 0]
                s1 += filter_du[k, l, 2] * img[i_k, j_l, -1]
                s2 += filter_dv[k, l, 0] * img[i_k, j_l, 1]
                s2 += filter_dv[k, l, 1] * img[i_k, j_l, 0]
                s2 += filter_dv[k, l, 2] * img[i_k, j_l, -1]
    result[i, j] = abs(s1) + abs(s2)
def calc_energy(img):
    """Run the b_convolve kernel over *img* and return its (r, c) float32
    energy map.

    img is cast to contiguous float32 first (it presumably arrives as
    uint8 — see the example at the bottom of the file).
    """
    img = np.ascontiguousarray(img.astype('float32'))
    # Allocate float32 up front; np.empty(...).astype('float32') built a
    # throwaway float64 array and copied it.
    energy_map = np.empty(img.shape[:2], dtype=np.float32)
    blockdim = (32, 32)
    # One extra block per axis covers non-multiple-of-32 sizes; the kernel
    # bounds-checks the overhang.
    griddim = (img.shape[0] // blockdim[0] + 1, img.shape[1] // blockdim[1] + 1)
    b_convolve[griddim, blockdim](energy_map, img)
    return energy_map
def create_backtrack(img):
    """Host (serial) reference: cumulative minimum-energy map M plus the
    backtrack column indices, with both image edges handled explicitly so
    the logic mirrors the CUDA kernel below.
    """
    r, c, _ = img.shape
    energy_map = calc_energy(img)
    print(energy_map)
    M = energy_map.copy()
    # np.int was removed in NumPy 1.24; the builtin int is the equivalent.
    backtrack = np.zeros_like(M, dtype=int)
    for i in range(1, r):
        for j in range(0, c):
            # Left edge: window is [j, j+2) so we never index column -1.
            if j == 0:
                idx = np.argmin(M[i - 1, j:j + 2])
                backtrack[i, j] = idx + j
                min_energy = M[i - 1, idx + j]
            # Right edge: window is [j-1, j+1) so we never read past column c-1.
            elif j == c - 1:
                idx = np.argmin(M[i - 1, j - 1:j + 1])
                backtrack[i, j] = idx + j - 1
                min_energy = M[i - 1, idx + j - 1]
            # Interior: full three-column window [j-1, j+2).
            else:
                idx = np.argmin(M[i - 1, j - 1:j + 2])
                backtrack[i, j] = idx + j - 1
                min_energy = M[i - 1, idx + j - 1]
            M[i, j] += min_energy
    return M, backtrack
@cuda.jit('int32(float32[:,:], int32, int32, int32)', device=True)
def cuda_argmin(M, i, j, k):
    """Device function: absolute column of the minimum of M[i, j:k].

    Returns the column index itself (not an offset from j, as numpy's
    argmin-on-a-slice would).
    """
    best = M[i, j]
    best_col = j
    for col in range(j + 1, k):
        cand = M[i, col]
        if cand < best:
            best = cand
            best_col = col
    return best_col
@cuda.jit('void(int32[:,:], float32[:,:])')
def cuda_backtrack(backtrack, M):
    """Fixed kernel: one thread per COLUMN; rows are processed serially.

    Each row i depends only on row i-1 of M, so the grid-wide loop walks
    rows in order with a block-level barrier before every row (this is why
    the launch must use a single block). The three edge cases collapse to a
    clamped window [lo, hi) over the previous row.
    """
    j = cuda.grid(1)
    cols = backtrack.shape[1]
    if j >= cols:
        return
    for i in range(1, backtrack.shape[0]):
        # Row i-1 must be fully updated by all threads before row i reads it.
        cuda.syncthreads()
        # Clamp the three-column window to the image: j==0 drops the left
        # neighbour, j==cols-1 drops the right one.
        lo = j - 1 if j > 0 else 0
        hi = j + 2 if j + 2 <= cols else cols
        idx = cuda_argmin(M, i - 1, lo, hi)
        backtrack[i, j] = idx
        M[i, j] += M[i - 1, idx]
def cuda_create_backtrack(img):
    """GPU version of create_backtrack: returns (M, backtrack).

    Uses a single block whose width equals the image width so that
    cuda.syncthreads() in the kernel synchronizes every column; this limits
    the image width to 1024 columns (the CUDA per-block thread cap).
    """
    r, c, _ = img.shape
    energy_map = calc_energy(img)
    print(energy_map)
    M = energy_map.copy()
    # np.int was removed from NumPy (1.24+); builtin int is the replacement.
    backtrack = np.zeros_like(M, dtype=int)
    # shape[1] must be less than 1025 — one thread per column, one block.
    blockdim = backtrack.shape[1]
    griddim = 1
    cuda_backtrack[griddim, blockdim](backtrack, M)
    return M, backtrack
# 4x4 RGB test image used to compare the host and GPU backtrack results.
img = np.array(
    [
        [[55, 53, 55], [81, 70, 64], [57, 51, 49], [50, 51, 55]],
        [[59, 56, 57], [189, 157, 142], [133, 104, 92], [34, 33, 35]],
        [[31, 32, 34], [174, 144, 131], [176, 145, 131], [18, 18, 20]],
        [[22, 23, 25], [80, 70, 66], [118, 104, 97], [14, 15, 17]],
    ],
    dtype=np.uint8,
)
# Run both implementations on the same image and print their M/backtrack
# outputs so they can be compared side by side.
for _label, _impl in (("host backtrack", create_backtrack),
                      ("GPU backtrack", cuda_create_backtrack)):
    print(_label)
    M, backtrack = _impl(img)
    print(backtrack)
    print(M)
$ python t48.py
host backtrack
[[1750. 1622. 1752. 1176.]
[1742. 1360. 1948. 1362.]
[2078. 1772. 1690. 1674.]
[1524. 2300. 2142. 1654.]]
[[0 0 0 0]
[1 1 3 3]
[1 1 3 3]
[1 2 3 3]]
[[1750. 1622. 1752. 1176.]
[3364. 2982. 3124. 2538.]
[5060. 4754. 4228. 4212.]
[6278. 6528. 6354. 5866.]]
GPU backtrack
[[1750. 1622. 1752. 1176.]
[1742. 1360. 1948. 1362.]
[2078. 1772. 1690. 1674.]
[1524. 2300. 2142. 1654.]]
[[0 0 0 0]
[1 1 3 3]
[1 1 3 3]
[1 2 3 3]]
[[1750. 1622. 1752. 1176.]
[3364. 2982. 3124. 2538.]
[5060. 4754. 4228. 4212.]
[6278. 6528. 6354. 5866.]]
$
我并不是说上述代码没有缺陷或适用于任何特定目的。它主要是你的代码,有一些“修复”的项目来概述我发现的问题。特别是,如果您想将此代码扩展到上面给出的 1-threadblock 示例(或宽度大于 1024 的图像)之外,则需要进行一些额外的设计工作,如前所述。
推荐阅读
- javascript - 以表格格式将 JSON 数据附加到 HTML div 元素
- java - How do I read from or save to an avro file containing '\00' character in Java?
- git - 在 Jenkins / Maven 中仅构建受影响的模块
- javascript - 在对象解构中使用单个对象字段
- java - 谁能解释一下下面的命令?
- javascript - jQuery触发功能不起作用
- ios - UIWebView 在加载 PDF 文件时没有释放内存
- java - 无法解析“onRequestPermissionsResult”
- git - Visual Studio 代码如何将 zip 下载中的代码与服务器上的代码同步
- r - 无法在 rstudio 服务器中部署闪亮的应用程序