c++ - _mm256_setr_epi32() 的延迟和吞吐量
解决方案
英特尔本身并未给出具体数值。英特尔的指令集参考手册(https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf)表 C.2 的 COMPOSITE INTRINSIC 部分暗示,该内建函数会被展开为一组指令,具体是哪些指令取决于编译器看到的输入。
在下面的例子中,编译器无法在编译期预知输入(变量被声明为 volatile),于是生成了如下指令序列:
volatile int a = 1, b = 2, c = 3, d = 4, e = 5, f = 6, g = 7, h = 8;
00007FF71DE91013 mov dword ptr [rbp+1Ch],1
00007FF71DE9101A mov dword ptr [rbp+18h],2
00007FF71DE91021 mov dword ptr [rbp+14h],3
00007FF71DE91028 mov dword ptr [rbp+10h],4
00007FF71DE9102F mov dword ptr [rbp+0Ch],5
00007FF71DE91036 mov dword ptr [rbp+8],6
00007FF71DE9103D mov dword ptr [rbp+4],7
00007FF71DE91044 mov dword ptr [rbp],8
volatile __m256i reg = _mm256_setr_epi32(a,b,c,d,e,f,g,h);
00007FF71DE9104B mov ebx,dword ptr [rbp]
00007FF71DE9104E mov r11d,dword ptr [g]
00007FF71DE91052 mov r10d,dword ptr [f]
00007FF71DE91056 mov r9d,dword ptr [e]
00007FF71DE9105A mov r8d,dword ptr [d]
00007FF71DE9105E mov edx,dword ptr [c]
00007FF71DE91061 mov ecx,dword ptr [b]
00007FF71DE91064 mov eax,dword ptr [a]
00007FF71DE91067 vmovd xmm1,eax
00007FF71DE9106B vpinsrd xmm1,xmm1,ecx,1
00007FF71DE91071 vpinsrd xmm1,xmm1,edx,2
00007FF71DE91077 vmovd xmm0,r9d
00007FF71DE9107C vpinsrd xmm0,xmm0,r10d,1
00007FF71DE91082 vpinsrd xmm0,xmm0,r11d,2
00007FF71DE91088 vpinsrd xmm1,xmm1,r8d,3
00007FF71DE9108E vpinsrd xmm0,xmm0,ebx,3
00007FF71DE91094 vinsertf128 ymm0,ymm1,xmm0,1
00007FF71DE9109A vmovdqu ymmword ptr [rbp+20h],ymm0
但是,如果编译器在编译期就知道输入(全部为常量),生成的代码就短得多:
volatile __m256i reg = _mm256_setr_epi32(1,2,3,4,5,6,7,8);
00007FF7789C100F vmovdqu ymm0,ymmword ptr [__ymm@0000000800000007000000060000000500000004000000030000000200000001 (07FF7789C2200h)]
00007FF7789C1017 vmovdqu ymmword ptr [rbp],ymm0
所以,延迟和周期数甚至连粗略的数值都无法给出。无论如何,我认为在指令集参考手册中应当查看的是 VINSERTI128 的说明(按上面的链接,目前位于第 1670 页);注意上面的反汇编中实际生成的是 vinsertf128。
推荐阅读
- arrays - 当我将输入作为字符串输入到大小小于字符串的数组中时会发生什么?
- python - 如何在数据框中句子标记化
- java - 使用 DiffUtil 删除旧的更新列表到 recyclerview 适配器?
- python - 将行分配给其他行
- count - 如何计算unix中的目录?
- sonarqube - 创建一个新的覆盖率规则来阻止代码覆盖率低于 80% 的 PR 以及如何修复 not inside git tree 问题
- python - 如何使用 biobert 嵌入对医学数据进行二元分类?
- azure-cosmosdb - Azure sdk for python cosmosdb 属性错误
- javascript - Filter an array of objects with a nested array
- nlp - 质量保证系统中缺少黄金语料库