首页 > 解决方案 > C# 编译成程序集。为什么运行更少的汇编指令需要更长的时间?

问题描述

我有一些 C# 代码仅在一行中有所不同。
生成的 Asm 有 4 行不同。1 是重复的,但在不同的位置,而 3 被完全删除。

我希望较短的代码运行得更快,但我看到的是显着放缓。

请忽略一个事实,还有很多东西可以优化。这是我正在写的一篇文章,我逐步优化它,我没想到这个中间结果是回归。

这是我放入Sharplab.io以获得 Asm 输出的内容。它与BenchmarkDotNet反汇编程序生成的相匹配:

using System;
using System.Collections.Generic;
using System.Text;

namespace PerformanceTesting
{
    public readonly struct SuggestItem
    {
        public readonly string Term;
        public readonly ulong Count;

        public SuggestItem(string term, ulong count)
        {
            Term = term;
            Count = count;
        }
    }
    
    public sealed class CompressedSparseRowGraph
    {
        public readonly char[] EdgeCharacters;
        public readonly uint[] FirstChildEdgeIndex;
        public readonly int[] EdgeToNodeIndex;
        public readonly int RootNodeIndex;
        public readonly ushort[] ReachableTerminalNodes;
        public readonly ulong[] WordCounts;
    }
    
    public sealed class Dawg
    {
        private readonly CompressedSparseRowGraph _graph;

        public Dawg(CompressedSparseRowGraph graph)
        {
            _graph = graph;
        }

        private readonly struct ClosureVariable
        {
            public ClosureVariable(string word, uint maxEdits, CompressedSparseRowGraph graph, int[][] matrix, StringBuilder builder, List<SuggestItem> results) : this()
            {
                this.word = word;
                this.maxEdits = maxEdits;
                this.graph = graph;
                this.matrix = matrix;
                this.builder = builder;
                this.results = results;
            }

            public readonly string word;
            public readonly uint maxEdits;
            public readonly CompressedSparseRowGraph graph;
            public readonly int[][] matrix;
            public readonly StringBuilder builder;
            public readonly List<SuggestItem> results;
        }

        public IEnumerable<SuggestItem> Lookup(string word, uint maxEdits)
        {
            var builder = new StringBuilder(word.Length + (int)maxEdits);
            builder.Append(new string(' ', word.Length + (int)maxEdits));
            var results = new List<SuggestItem>();

            var matrix = new int[word.Length + maxEdits + 1][];
            for (var i = 0; i < matrix.Length; i++)
            {
                matrix[i] = new int[word.Length + 1];
                matrix[i][0] = i - (int)maxEdits;

                var stripeEnd = i + maxEdits + 1;
                if (stripeEnd <= word.Length)
                {
                    matrix[i][stripeEnd] = 0;
                }
            }

            for (var i = 0; i < matrix[0].Length; i++)
            {
                matrix[0][i] = i - (int)maxEdits;
            }

            var closure = new ClosureVariable(word, maxEdits, _graph, matrix, builder, results);
            Recurse(_graph.RootNodeIndex, 0, ref closure);
            return results;
        }

        private static void Recurse(int currentNode, int depth, ref ClosureVariable closure)
        {
            if (depth == closure.word.Length + closure.maxEdits)
            {
                return;
            }

            var firstChild = closure.graph.FirstChildEdgeIndex[currentNode];
            var lastChild = closure.graph.FirstChildEdgeIndex[currentNode + 1];

            var from = depth - (int)closure.maxEdits;
            if (from < 0)
            {
                from = 0;
            }

            from++;

-            var to = Math.Min(closure.word.Length + 1, depth + closure.maxEdits + 2);
+            var to = (long)Math.Min(closure.word.Length + 1, depth + (int)closure.maxEdits + 2);
            var previousCharacter = depth > 0 ? closure.builder[depth - 1] : (char)0;

            var previousRow = closure.matrix[depth];
            var currentRow = closure.matrix[depth + 1];

            for (var childEdge = firstChild; childEdge < lastChild; childEdge++)
            {
                var any = false;
                var currentCharacter = closure.graph.EdgeCharacters[childEdge];
                closure.builder[depth] = currentCharacter;
                var calculatedCost = depth + 1;
                var previousRowEntry = previousRow[from - 1];
                var targetCharacter = (char)0;
                for (var i = from; i < to; i++)
                {
                    var previousTargetCharacter = targetCharacter;
                    targetCharacter = closure.word[i - 1];

                    var previousRowPreviousEntry = previousRowEntry;
                    previousRowEntry = previousRow[i];

                    if (currentCharacter == targetCharacter)
                    {
                        calculatedCost = previousRowPreviousEntry;
                    }
                    else
                    {
                        if (previousRowEntry < calculatedCost)
                        {
                            calculatedCost = previousRowEntry;
                        }

                        if (targetCharacter == previousCharacter
                            && previousTargetCharacter == currentCharacter)
                        {
                            previousRowPreviousEntry = closure.matrix[depth - 1][i - 2];
                        }

                        if (previousRowPreviousEntry < calculatedCost)
                        {
                            calculatedCost = previousRowPreviousEntry;
                        }

                        calculatedCost++;
                    }

                    if (calculatedCost <= 0)
                    {
                        any = true;
                    }

                    currentRow[i] = calculatedCost;
                }

                if (!any)
                {
                    continue;
                }

                var nextNode = closure.graph.EdgeToNodeIndex[childEdge];
                if (nextNode < 0)
                {
                    nextNode = -nextNode;
                    if (depth >= closure.word.Length - closure.maxEdits - 1
                        && calculatedCost <= 0)
                    {
                        closure.results.Add(new SuggestItem(closure.builder.ToString(0, depth + 1), 0));
                    }
                }

                Recurse(nextNode, depth + 1, ref closure);
            }
        }
    }
}

生成程序集的指令差异如下:

-    L0099: movsxd r15, ecx
-    L009c: movsxd rcx, edi
-    L009f: mov edx, eax
    
-    L00a1: lea r12, [rcx+rdx+2]
+    L0099: lea edx, [rdi+rax+2]
-    L00a6: cmp r15, r12
+    L009d: cmp ecx, edx
-    L00a9: jle short L00ad
+    L009f: jle short L00a3
-    L00ab: jmp short L00b0
+    L00a1: jmp short L00a5
-    L00ad: mov r12, r15
+    L00a3: mov edx, ecx
   
+    L00a5: movsxd r15, edx

所以使用的寄存器发生了变化,最后有一个 movxsd 而不是比较之前的两个。
由于寄存器差异,存在一些下游变化,第一个样本在 L0339 完成,而第二个样本在 L0336 完成。其中的差异分别小于 L00b0 和 L00a5 差异后的前两条指令之间的差异。

我在使用 zen 2 架构的 ryzen 3800X 上运行它。

我已经阅读了 Agner Fog 的优化手册,没有什么让我印象深刻的原因是减速的明显原因。也许对齐边界是罪魁祸首,但我不知道如何验证。

1000 次 Lookup 调用的基准差异约为 740 毫秒,而第二个版本的基准差异约为 770 毫秒。

标签: c#assemblyx86-64cpu-architectureamd-processor

解决方案


推荐阅读