math - 如何在GPU上找到除法的魔法乘数?
问题描述
我正在考虑实现以下计算,其中divisor
非零且不是 2 的幂
unsigned multiplier(unsigned divisor)
{
unsigned shift = 31 - clz(divisor);
uint64_t t = 1ull << (32 + shift);
return t / div;
}
以一种对缺乏 64 位整数和浮点指令但可能具有 32 位融合乘加(例如 GPU,它也将缺乏除法)的处理器有效的方式。
此计算对于查找优化除法所涉及的“魔术乘法器”很有用,当除数提前知道时,乘高指令后跟按位移位。与编译器中使用的代码和 libdivide 中的参考代码不同,它会找到最大的乘数。
另一个转折是,在我正在查看的应用程序中,我预计它divisor
几乎总是可以用float
类型表示。因此,有一个有效的“快速路径”来处理这些除数,以及一个大小优化的“慢路径”来处理其余部分是有意义的。
解决方案
我提出的解决方案在“快速路径”上执行 6 次或 8 次 FMA 操作,在“快速路径”上执行 6 次或 8 次 FMA 操作中专门针对此特定场景(股息是 2 的幂)的长除法,然后在“慢路”。
以下程序对建议的解决方案执行详尽的测试(在支持 FMA 的 CPU 上需要大约 1-2 分钟)。
#include <math.h>
#include <stdint.h>
#include <stdio.h>
struct quomod {
unsigned long quo;
unsigned long mod;
};
// Divide 1 << (32 + SHIFT) by DIV, return quotient and modulus
struct quomod
quomod_ref(unsigned div, unsigned shift)
{
uint64_t t = 1ull << (32 + shift);
return (struct quomod){t / div, t % div};
}
// Reinterpret given bits as float
static inline float int_as_float(uint32_t bits)
{
return (union{ unsigned b; float f; }){bits}.f;
}
// F contains integral value in range [-2**32 .. 2**32]. Convert it to integer,
// with wrap-around on overflow. If the GPU implements saturating conversion,
// it also may be used
static inline uint32_t cvt_f32_u32_wrap(float f)
{
return (uint32_t)(long long)f;
}
struct quomod
quomod_alt(unsigned div, unsigned shift)
{
// t = float(1ull << (32 + shift))
float t = int_as_float(0x4f800000 + (shift << 23));
// mask with max(0, shift - 23) low bits zero
uint32_t mask = (int)(~0u << shift) >> 23;
// No roundoff in conversion
float div_f = div & mask;
// Caution: on the CPU this is correctly rounded, but on the GPU
// native reciprocal may be off by a few ULP, in which case a
// refinement step may be necessary:
// recip = fmaf(fmaf(recip, -div_f, 1), recip, recip)
float recip = 1.f / div_f;
// Higher part of the quotient, integer in range 2^31 .. 2^32
float quo_hi = t * recip;
// No roundoff
float res = fmaf(quo_hi, -div_f, t);
float quo_lo_approx = res * recip;
float res2 = fmaf(quo_lo_approx, -div_f, res);
// Lower part of the quotient, may be negative
float quo_lo = floorf(fmaf(res2, recip, quo_lo_approx));
// Remaining part of the dividend
float mod_f = fmaf(quo_lo, -div_f, res);
// Quotient as sum of parts
unsigned quo = cvt_f32_u32_wrap(quo_hi) + (int)quo_lo;
// Adjust quotient down if remainder is negative
if (mod_f < 0) {
quo--;
}
if (div & ~mask) {
// The quotient was computed for a truncated divisor, so
// it matches or exceeds the true result
// High part of the dividend
uint32_t ref_hi = 1u << shift;
// Unless quotient is zero after wraparound, increment it so
// it's higher than true quotient (its high bit must be 1)
quo -= (int)quo >> 31;
// Binary search for the true quotient; search invariant:
// quo is higher than true quotient, quo-2*bit is lower
for (unsigned bit = 256; bit; bit >>= 1) {
unsigned try = quo - bit;
// One multiply-high instruction
uint32_t prod_hi = 1ull * try * div >> 32;
if (prod_hi >= ref_hi)
quo = try;
}
// quo is zero or exceeds the true quotient, so quo-1 must be it
quo--;
}
// Use the "left-pointing short magic wand" operator
// to recover the remainder
return (struct quomod){quo, quo *- div};
}
int main()
{
fprintf(stderr, "%66c\r[", ']');
unsigned step = 1;
for (unsigned div = 3; div; div += step) {
// Progress bar
if (!(div & 0x03ffffff)) fprintf(stderr, "=");
// Skip powers of two
if (!(div & (div-1))) continue;
unsigned shift = 31 - __builtin_clz(div);
struct quomod ref = quomod_ref(div, shift);
struct quomod alt = quomod_alt(div, shift);
if (ref.quo != alt.quo || ref.mod != alt.mod) {
printf("\nerror at %u\n", div);
return 1;
}
}
fprintf(stderr, "=\nAll ok\n");
return 0;
}
推荐阅读
- acumatica - 在表中保存一行后,通用查询中的 Int32 转换错误
- javascript - 使用单个 URL 创建多页面设计
- mysql - MySQL查询字符串替换并设置字符串
- postgresql - 需要索引一个巨大的 postgres 表文本列,该列在开头和结尾都适用于通配符搜索 '%xyz%'
- websocket - WebSocket onMessage 是原子的吗?
- oracle - Oracle 数据插入奇怪的行为。插入时数据被截断
- hyperledger-fabric - 如何在使用 fabric-mock-stub 测试链码时模拟两个链码
- c# - 从 Xamarin 表单中的 Listview 获取检查值
- java - java- ClassCastException- BigInt 不能转换为 Long
- centos - 坚持配置清漆