c# - C# 编译成程序集。为什么运行更少的汇编指令需要更长的时间?
问题描述
我有一些 C# 代码仅在一行中有所不同。
生成的 Asm 有 4 行不同。1 是重复的,但在不同的位置,而 3 被完全删除。
我希望较短的代码运行得更快,但我看到的是显着放缓。
请忽略一个事实,还有很多东西可以优化。这是我正在写的一篇文章,我逐步优化它,我没想到这个中间结果是回归。
这是我放入Sharplab.io以获得 Asm 输出的内容。它与BenchmarkDotNet的反汇编程序生成的相匹配:
using System;
using System.Collections.Generic;
using System.Text;
namespace PerformanceTesting
{
public readonly struct SuggestItem
{
public readonly string Term;
public readonly ulong Count;
public SuggestItem(string term, ulong count)
{
Term = term;
Count = count;
}
}
public sealed class CompressedSparseRowGraph
{
public readonly char[] EdgeCharacters;
public readonly uint[] FirstChildEdgeIndex;
public readonly int[] EdgeToNodeIndex;
public readonly int RootNodeIndex;
public readonly ushort[] ReachableTerminalNodes;
public readonly ulong[] WordCounts;
}
public sealed class Dawg
{
private readonly CompressedSparseRowGraph _graph;
public Dawg(CompressedSparseRowGraph graph)
{
_graph = graph;
}
private readonly struct ClosureVariable
{
public ClosureVariable(string word, uint maxEdits, CompressedSparseRowGraph graph, int[][] matrix, StringBuilder builder, List<SuggestItem> results) : this()
{
this.word = word;
this.maxEdits = maxEdits;
this.graph = graph;
this.matrix = matrix;
this.builder = builder;
this.results = results;
}
public readonly string word;
public readonly uint maxEdits;
public readonly CompressedSparseRowGraph graph;
public readonly int[][] matrix;
public readonly StringBuilder builder;
public readonly List<SuggestItem> results;
}
public IEnumerable<SuggestItem> Lookup(string word, uint maxEdits)
{
var builder = new StringBuilder(word.Length + (int)maxEdits);
builder.Append(new string(' ', word.Length + (int)maxEdits));
var results = new List<SuggestItem>();
var matrix = new int[word.Length + maxEdits + 1][];
for (var i = 0; i < matrix.Length; i++)
{
matrix[i] = new int[word.Length + 1];
matrix[i][0] = i - (int)maxEdits;
var stripeEnd = i + maxEdits + 1;
if (stripeEnd <= word.Length)
{
matrix[i][stripeEnd] = 0;
}
}
for (var i = 0; i < matrix[0].Length; i++)
{
matrix[0][i] = i - (int)maxEdits;
}
var closure = new ClosureVariable(word, maxEdits, _graph, matrix, builder, results);
Recurse(_graph.RootNodeIndex, 0, ref closure);
return results;
}
private static void Recurse(int currentNode, int depth, ref ClosureVariable closure)
{
if (depth == closure.word.Length + closure.maxEdits)
{
return;
}
var firstChild = closure.graph.FirstChildEdgeIndex[currentNode];
var lastChild = closure.graph.FirstChildEdgeIndex[currentNode + 1];
var from = depth - (int)closure.maxEdits;
if (from < 0)
{
from = 0;
}
from++;
- var to = Math.Min(closure.word.Length + 1, depth + closure.maxEdits + 2);
+ var to = (long)Math.Min(closure.word.Length + 1, depth + (int)closure.maxEdits + 2);
var previousCharacter = depth > 0 ? closure.builder[depth - 1] : (char)0;
var previousRow = closure.matrix[depth];
var currentRow = closure.matrix[depth + 1];
for (var childEdge = firstChild; childEdge < lastChild; childEdge++)
{
var any = false;
var currentCharacter = closure.graph.EdgeCharacters[childEdge];
closure.builder[depth] = currentCharacter;
var calculatedCost = depth + 1;
var previousRowEntry = previousRow[from - 1];
var targetCharacter = (char)0;
for (var i = from; i < to; i++)
{
var previousTargetCharacter = targetCharacter;
targetCharacter = closure.word[i - 1];
var previousRowPreviousEntry = previousRowEntry;
previousRowEntry = previousRow[i];
if (currentCharacter == targetCharacter)
{
calculatedCost = previousRowPreviousEntry;
}
else
{
if (previousRowEntry < calculatedCost)
{
calculatedCost = previousRowEntry;
}
if (targetCharacter == previousCharacter
&& previousTargetCharacter == currentCharacter)
{
previousRowPreviousEntry = closure.matrix[depth - 1][i - 2];
}
if (previousRowPreviousEntry < calculatedCost)
{
calculatedCost = previousRowPreviousEntry;
}
calculatedCost++;
}
if (calculatedCost <= 0)
{
any = true;
}
currentRow[i] = calculatedCost;
}
if (!any)
{
continue;
}
var nextNode = closure.graph.EdgeToNodeIndex[childEdge];
if (nextNode < 0)
{
nextNode = -nextNode;
if (depth >= closure.word.Length - closure.maxEdits - 1
&& calculatedCost <= 0)
{
closure.results.Add(new SuggestItem(closure.builder.ToString(0, depth + 1), 0));
}
}
Recurse(nextNode, depth + 1, ref closure);
}
}
}
}
生成程序集的指令差异如下:
- L0099: movsxd r15, ecx
- L009c: movsxd rcx, edi
- L009f: mov edx, eax
- L00a1: lea r12, [rcx+rdx+2]
+ L0099: lea edx, [rdi+rax+2]
- L00a6: cmp r15, r12
+ L009d: cmp ecx, edx
- L00a9: jle short L00ad
+ L009f: jle short L00a3
- L00ab: jmp short L00b0
+ L00a1: jmp short L00a5
- L00ad: mov r12, r15
+ L00a3: mov edx, ecx
+ L00a5: movsxd r15, edx
所以使用的寄存器发生了变化,最后有一个 movxsd 而不是比较之前的两个。
由于寄存器差异,存在一些下游变化,第一个样本在 L0339 完成,而第二个样本在 L0336 完成。其中的差异分别小于 L00b0 和 L00a5 差异后的前两条指令之间的差异。
我在使用 zen 2 架构的 ryzen 3800X 上运行它。
我已经阅读了 Agner Fog 的优化手册,没有什么让我印象深刻的原因是减速的明显原因。也许对齐边界是罪魁祸首,但我不知道如何验证。
1000 次 Lookup 调用的基准差异约为 740 毫秒,而第二个版本的基准差异约为 770 毫秒。
解决方案
推荐阅读
- html - HTML 中的制表符顺序是否合乎逻辑?
- python - Python - 相当于 Ruby 新方法
- java - 如何反转组成句子的字符数组
- asp.net - ASP.NET MVC @Html.Checkbox() 问题
- css - 如何仅使用 css 查找 HTML 标记的前一个兄弟
- jenkins - 詹金斯:如何暂停父作业,直到子作业的后期构建完成
- php - 在php中的数据库中插入表单输入值
- java - 当原始地图包含集合作为值时如何创建反向地图?
- authentication - Cognito - 使用电子邮件注册,使用首选用户名登录
- wpf - 如果我从代码中设置名称,Template.FindName 不起作用