首页 > 解决方案 > 偏移大数组的 C# 内存访问优化(为什么这段代码很慢)

问题描述

我正在使用 C# 进行图像处理。在 C# 中以带偏移量的方式访问大块内存时,我遇到了性能问题:与以零偏移访问相同大小的内存相比,速度明显更慢。而在 C++ 中,这种差异没有 C# 中那么大。

你能告诉我为什么我的代码有这个问题吗?另外,有什么解决办法吗?

源代码

using System;
using System.Runtime.InteropServices;
using System.Diagnostics;
using System.Numerics;

namespace Test
{
    /// <summary>
    /// Benchmark program: reads every element of one width x height buffer of
    /// <see cref="Vector4"/>, combines its components with the coefficients
    /// m11..m54, writes the result to a second buffer, and prints the elapsed
    /// time in milliseconds.
    /// </summary>
    class Program
    {
        /// <summary>Runs the timed loop once and writes the duration to the console.</summary>
        /// <param name="args">Command-line arguments; not read.</param>
        unsafe static void Main(string[] args)
        {
            var width = 8000;
            var height = 8000;

            // Two unmanaged buffers of height * width elements each. The byte count is
            // computed in int arithmetic.
            // NOTE(review): nothing writes to `data` before the timed loop reads it.
            // var data = new Vector4[height * width]; <- a similar problem occurs
            var data = (Vector4*)Marshal.AllocHGlobal(height * width * sizeof(Vector4));
            var data2 = (Vector4*)Marshal.AllocHGlobal(height * width * sizeof(Vector4));

            // MATRIX
            // Coefficients m{row}{column}: rows 1-4 multiply the b, g, r, a inputs,
            // row 5 is added as a constant term.
            float m11 = .7297023F, m12 = 0, m13 = 0, m14 = 0, m21 = 0, m22 = .6109577F,
                m23 = 0, m24 = 0, m31 = 0, m33 = .597218F, m32 = 0, m34 = 0, m41 = 0, m42 = 0,
                m43 = 0, m44 = 1F, m51 = .105F, m52 = .145F, m53 = .155F, m54 = 0;

            // Only the nested loop below is timed; allocation and freeing are outside.
            var sw = new Stopwatch();
            sw.Start();

            for (int y = 0; y < height; ++y)
            {
                // Index of the first element of row y.
                var offset = width * y;
                for (int x = 0; x < width; ++x)
                {
                    // Slow ( 600ms )
                    // Visits every element of both buffers exactly once.
                    ref var sData = ref data[offset + x];
                    ref var dData = ref data2[offset + x];

                    // Fast ( 200ms )
                    // Note: this variant only ever touches elements 0..width-1,
                    // repeating the same row for every value of y.
                    // ref var sData = ref data[x];
                    // ref var dData = ref data2[x];

                    // Source components, named b, g, r, a by the author.
                    float b = sData.X;
                    float g = sData.Y;
                    float r = sData.Z;
                    float a = sData.W;

                    // Each output component is a weighted sum of the four inputs
                    // plus the row-5 constant for that column.
                    dData.X = (b * m11) + (g * m21) + (r * m31) + (a * m41) + m51;
                    dData.Y = (b * m12) + (g * m22) + (r * m32) + (a * m42) + m52;
                    dData.Z = (b * m13) + (g * m23) + (r * m33) + (a * m43) + m53;
                    dData.W = (b * m14) + (g * m24) + (r * m34) + (a * m44) + m54;
                }
            }

            sw.Stop();
            Console.WriteLine(sw.ElapsedMilliseconds);

            // Release both unmanaged buffers.
            Marshal.FreeHGlobal((IntPtr)data);
            Marshal.FreeHGlobal((IntPtr)data2);
        }
    }
}

使用托管数组指针时

// Variant: the same nested loops, but over managed arrays accessed through
// pointers obtained with `fixed`. This snippet is an excerpt; the rest of the
// loop body and the closing braces are not shown.
var array1 = new Vector4[width * height];
var array2 = new Vector4[width * height];
// Take a pointer to the first element of each array for the duration of the loops.
fixed (Vector4* data = &array1[0])
fixed (Vector4* data2 = &array2[0])
    for (int y = 0; y < height; ++y)
    {
        for (int x = 0; x < width; ++x)
        {
            // Slow ( 600ms )
            // The row offset width * y is recomputed in each index expression.
            ref var sData = ref data[width * y + x];
            ref var dData = ref data2[width * y + x];

在外循环中偏移指针

(略有改进)


// Variant: the row offset is applied to the base pointers once per row, so the
// inner loop indexes with x only. This snippet is an excerpt; the rest of the
// loop body and the closing braces are not shown.
for (int y = 0; y < height; ++y)
{
    // Pointers to the first element of row y in each buffer.
    var offsetData1 = data + width * y;
    var offsetData2 = data2 + width * y;
    for (int x = 0; x < width; ++x)
    {
        // Slow ( 470ms )
        ref var sData = ref offsetData1[x];
        ref var dData = ref offsetData2[x];

C++版本

#include <chrono>
#include <cstddef>
#include <iostream>
#include <vector>

/// Four-component single-precision vector. Every component defaults to zero,
/// and the type remains an aggregate (no user-declared constructors).
struct Vector4 {
    float X{0.0F};
    float Y{0.0F};
    float Z{0.0F};
    float W{0.0F};
};

int main()
{
    long width = 8000;
    long height = 8000;

    auto buffer = new Vector4[width * height];
    auto buffer2 = new Vector4[width * height];

    // MATRIX
    float m11 = .7297023F, m12 = 0, m13 = 0, m14 = 0, m21 = 0, m22 = .6109577F,
        m23 = 0, m24 = 0, m31 = 0, m33 = .597218F, m32 = 0, m34 = 0, m41 = 0, m42 = 0,
        m43 = 0, m44 = 1, m51 = .105F, m52 = .145F, m53 = .155F, m54 = 0;

    std::chrono::system_clock::time_point  start, end;
    start = std::chrono::system_clock::now();

    for (int y = 0; y < height; ++y)
    {
        int offset = width * y;
        for (int x = 0; x < width; ++x)
        {
            Vector4& sData = buffer[offset + x];
            Vector4& dData = buffer2[offset + x];

            float b = sData.X;
            float g = sData.Y;
            float r = sData.Z;
            float a = sData.W;
            dData.X = (b * m11) + (g * m21) + (r * m31) + (a * m41) + m51;
            dData.Y = (b * m12) + (g * m22) + (r * m32) + (a * m42) + m52;
            dData.Z = (b * m13) + (g * m23) + (r * m33) + (a * m43) + m53;
            dData.W = (b * m14) + (g * m24) + (r * m34) + (a * m44) + m54;
        }
    }

    end = std::chrono::system_clock::now();
    double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
    std::cout << elapsed << "\n";

    delete[] buffer;
    delete[] buffer2;
}

基准

描述 时间(毫秒)
C# 偏移指针 600毫秒
C# 零偏移指针 200毫秒
C++ 零偏移指针 190 毫秒
C++ 偏移指针 260毫秒
C# 在外循环中偏移指针 370毫秒
C# 带偏移量的托管数组指针 990 毫秒

其他信息

SharpLab 中的 IL

CPU 英特尔酷睿 i7-6700K
内存 DDR4 16GB
操作系统 Windows 10 20H2
运行时 .NET 5
语言 C# 9
平台 x64

标签: c# arrays performance memory

解决方案


推荐阅读