graphics - Metal:采样纹理比加载线程组内存更快?
问题描述
我编写了以下计算着色器来使用一些复杂的内核来模糊图像:
// Packed element type for the threadgroup cache used by cs_main.
// Two half4 members are used so that one element is 16 bytes (per the
// original author's note: "use Packhalf5 to align to 16 B").
// In the kernel code shown, only five components are written and read:
// a.xyz (from t_f123) and b.xy (from t_f45); a.w and b.zw are not used there.
struct Packhalf5
{
half4 a;
half4 b;
};
// Compute kernel that blurs an image using a threadgroup-memory cache.
//
// Phases visible in this excerpt:
//   1. Each thread samples the input textures and stores the packed result
//      into the threadgroup array gCache (some threads store extra entries).
//   2. A threadgroup barrier makes the cached entries visible to all threads.
//   3. A loop of kernel_size iterations reads two cached entries per iteration.
//   4. The result is written to t_normal at the thread's grid position.
//
// Parameters:
//   t_f123                - input texture array, sampled with s_f123.
//   t_normal              - write-only output texture array.
//   s_f123                - sampler used for t_f123.
//   gl_GlobalInvocationID - thread position in grid (xy = texel, z = slice on write).
//   gl_LocalInvocationID  - thread position in threadgroup (used to index gCache).
//
// NOTE(review): this is an excerpt. t_f45, s_f45, _2, _3, _31, _34.._37, _16,
// kernel_size, f1, f2, BLOCK_SIZE_X/Y and MAX_PIXR are not declared in the
// visible code; they presumably come from the elided parts or from macros.
kernel void cs_main(
texture2d_array<float> t_f123 [[texture(0)]]
, texture2d_array<float, access::write> t_normal [[texture(3)]]
, sampler s_f123 [[sampler(0)]]
, uint3 gl_GlobalInvocationID [[thread_position_in_grid]]
, uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]
)
{
// Packhalf5 is used as the element type so each cache entry is 16 bytes.
// BLOCK_SIZE_Y = 1
// BLOCK_SIZE_X = 128 or 64 or 32
// MAX_PIXR = 20
// NOTE(review): the first dimension is BLOCK_SIZE_Y but is indexed below with
// gl_LocalInvocationID.x, and the second dimension is indexed with values
// derived from gl_LocalInvocationID.y - confirm this matches the dispatched
// threadgroup shape so that no index goes out of bounds.
threadgroup Packhalf5 gCache[BLOCK_SIZE_Y][BLOCK_SIZE_X + 2 * MAX_PIXR];
// Cache texture samples in threadgroup memory (gCache) so the blur loop can
// read them from there instead of sampling the textures on every iteration.
// Threads whose local y is below _35 store an extra entry at index local y
// (presumably the leading border of the cached row - TODO confirm).
if (gl_LocalInvocationID.y < _35)
{
// ... (code elided in the original post; presumably computes _2)
Packhalf5 pkh5;
pkh5.a.xyz = half3(half4(t_f123.sample(s_f123, _2.xy, uint(round(_2.z)), level(0.0))).xyz);
pkh5.b.xy = half2(half4(t_f45.sample(s_f45, _2.xy, uint(round(_2.z)), level(0.0))).xy);
gCache[gl_LocalInvocationID.x][gl_LocalInvocationID.y] = pkh5;
}
// Threads whose local y is at or above (BLOCK_SIZE_X - _34) store an extra
// entry at index _36 (presumably the trailing border - TODO confirm).
if (gl_LocalInvocationID.y >= uint(BLOCK_SIZE_X - _34))
{
// ... (code elided in the original post; presumably computes _3 and _36)
Packhalf5 pkh5;
pkh5.a.xyz = half3(half4(t_f123.sample(s_f123, _3.xy, uint(round(_3.z)), level(0.0))).xyz);
pkh5.b.xy = half2(half4(t_f45.sample(s_f45, _3.xy, uint(round(_3.z)), level(0.0))).xy);
gCache[gl_LocalInvocationID.x][_36] = pkh5;
}
// Every thread unconditionally samples at _31 and stores the entry at index _37.
Packhalf5 pkh5;
pkh5.a.xyz = half3(half4(t_f123.sample(s_f123, _31.xy, uint(round(_31.z)), level(0.0))).xyz);
pkh5.b.xy = half2(half4(t_f45.sample(s_f45, _31.xy, uint(round(_31.z)), level(0.0))).xy);
gCache[gl_LocalInvocationID.x][_37] = pkh5;
// Wait until all threads in the threadgroup have finished writing gCache
// before any thread reads from it.
threadgroup_barrier(mem_flags::mem_threadgroup);
// Use gCache to blur the image.
for (int i = 0;i<kernel_size;++i)
{
// Calculate the two cache indices read in this iteration.
int a = f1(i);
int b = f2(i);
Packhalf5 pkh5;
// Reported by the author: this load is much slower than sampling the texture
// directly, i.e. t_f123.sample(s_f123, _3.xy, uint(round(_3.z)), level(0.0)).
pkh5 = gCache[gl_LocalInvocationID.x][a];
float2 _42 = float2(pkh5.b.xy);
// Scale/offset applied to the cached xyz: x unchanged, y and z mapped as v * 0.5 - 0.25.
float3 _43 = (float3(pkh5.a.xyz) * float3(1.0, 0.5, 0.5)) + float3(0.0, -0.25, -0.25);
// Reported by the author: this load is also much slower than sampling the texture directly.
pkh5 = gCache[gl_LocalInvocationID.x][b];
float2 _45 = float2(pkh5.b.xy);
float3 _46 = (float3(pkh5.a.xyz) * float3(1.0, 0.5, 0.5)) + float3(0.0, -0.25, -0.25);
// use _42, _43, _45, _46 (accumulation code elided in the original post)
}
// Write the blur result (_16) to the output texture at this thread's grid
// position; z selects the array slice.
t_normal.write(_16, uint2(gl_GlobalInvocationID.xy), uint(gl_GlobalInvocationID.z));
}
我编写了这个着色器来优化一些模糊操作,类似于128x128 图片上的高斯模糊,我在iPhone XR上的 Xcode 帧调试器中对其进行测试和分析发现:
在循环中从线程组内存 gCache 加载数据,比直接采样纹理(即 t_f123.sample(s_f123, _3.xy, uint(round(_3.z)), level(0.0)))慢得多:着色器约 30% 的总时间花在加载 gCache 上,而如果改为直接采样纹理,这部分只需要不到 5% 的总时间。
至于性能计数器,纹理读取实际上正在下降,但奇怪的是主内存带宽几乎没有改变。
我猜测是因为使用了过多的线程组内存,从而产生了数据冒险(data hazard)?
解决方案
推荐阅读
- matlab - matlab中计算均方根平均值的函数
- javascript - 输入处于活动状态时更改标签文本颜色
- laravel - 如何在 laravel 中基于 Auth 用户显示另一个表中的数据
- powershell - 如果指令在 Powershell 中无法正常工作
- javascript - 如何显示函数的输入
- python - 如何在python中移动矩阵的元素?
- javascript - TypeError:无法读取 React 中未定义的属性“多个对象中的状态名称”
- haskell - Haskell 递归中的非详尽模式
- r - 混合效应模型代码中的错误:“(p <- ncol(X)) == ncol(Y) is not TRUE”和“variable lengths different”
- python - 我的类在 Python shell 中创建后立即删除