首页 > 解决方案 > 如何从 C# 执行 HLSL?

问题描述

因此,我在 Visual Studio 2019 社区中编写了一个 C# 程序,但是对于某些操作,我希望它在 GPU 而不是 CPU 上运行。
我对 HLSL 有一点经验,例如为一些 Unity 项目编写过计算着色器,但是我(通过 Google)实在找不到任何在 Unity 之外从 C# 程序执行 HLSL 代码的方法。

假设我对内核一词的理解是正确的(以及它在 Unity 中如何用于运行计算着色器),具体来说,我想要:
1:从 CPU 将一些数据写入着色器内核的缓冲区,
2:运行内核一定次数,并且
3:让 CPU 从内核读取一些缓冲区。

举一个我想要的例子,下面是我如何让 C# 代码通过 UnityEngine 运行 HLSL 内核的
(例如:C# 代码生成一些 -1 到 1 之间的随机数,然后着色器将每个条目乘以 4):
C#

using UnityEngine;

/// <summary>
/// Unity component that fills a 4x4 grid with random integers, runs compute-shader
/// kernel 0 over it, reads the values back and prints them four per line.
/// </summary>
public class Test : MonoBehaviour
{
    ComputeBuffer buffer;
    public ComputeShader shader; // Assigned in Unity so that it references the shader

    /// <summary>
    /// Uploads the random values, dispatches the kernel once, then reads back and prints the buffer.
    /// </summary>
    void Start ()
    {
        const int side = 4;

        // Fill the grid with random values from -1 to 1.
        int[] values = new int[side * side];
        int slot = 0;
        while (slot < values.Length)
        {
            values[slot] = Random.Range(-1, 2);
            slot++;
        }

        // Allocate the buffer, bind it as "Result" on kernel 0, then upload the values.
        buffer = new ComputeBuffer(values.Length, sizeof(int));
        shader.SetBuffer(0, "Result", buffer);
        buffer.SetData(values);

        // Run the kernel with side / 2 thread groups along both x and y.
        shader.Dispatch(0, side / 2, side / 2, 1);

        // Read the buffer back into the same array and release it.
        buffer.GetData(values);
        buffer.Dispose();

        // Print the values four at a time.
        for (int start = 0; start < values.Length; start += side)
        {
            int first = values[start + 0];
            int second = values[start + 1];
            int third = values[start + 2];
            int fourth = values[start + 3];
            print($"{first},{second},{third},{fourth}");
        }
    }
}

HLSL

#pragma kernel CSMain

// Integer buffer that the kernel reads and overwrites in place.
RWStructuredBuffer<int> Result;

// Each thread multiplies one entry of Result by 4.
// The entry is addressed as x + y * 4, i.e. rows of four values.
[numthreads(2,2,1)]
void CSMain (uint3 id : SV_DispatchThreadID)
{
    const uint rowWidth = 4;
    uint cell = id.y * rowWidth + id.x;

    int scaled = 4 * Result[cell];
    Result[cell] = scaled;
}

打印:

-4, 4, -4, 0
0, 4, 0, 0
-4, -4, -4, 4
4, 0, 4, 4

编辑:我尝试过的事情: 2 周过去了,虽然还没有人回答这个问题,但我一直在寻找解决方案。我仍在搜索,但想用迄今为止看到的一些方法来更新这个问题
(请注意,其中一些我并没有深入研究,因为它们似乎不是我想要的):

标签: c# visual-studio hlsl compute-shader

解决方案


通过 SharpDX 在 DirectX 中使用计算着色器的简单示例(未优化,仅为概念验证)。实际的着色器应该与此类似......

关于使用 DirectX 和 HLSL(包括 Compute)进行编程的一个非常好的来源 (imo) 是“使用 DirectX 进行 3D 游戏编程简介”(Frank D. Luna,isbn 978-1-942270-06-5)

using SharpDX;
using SharpDX.D3DCompiler;
using SharpDX.Direct3D;
using SharpDX.Direct3D11;
using System;
using System.Diagnostics;
using Buffer = SharpDX.Direct3D11.Buffer;

namespace GpGpuDemo.Backend
{
  /// <summary>
  /// Computes <c>c[i] = a[i] * b[i] + a[i]</c> on the GPU with a DirectCompute (cs_5_0) shader
  /// driven through SharpDX, reading the result back to the CPU through a staging buffer.
  /// </summary>
  public class DirectComputeCalculatorWithReadBackSharpDx : IParallelCalculator
  {
    // Threads per thread group; spliced into the [numthreads] attribute of the shader source.
    private const int ThreadGroupSize = 128;

    // Number of timed runs, and number of dispatches issued inside each timed run.
    private const int TimedRuns = 5;
    private const int DispatchesPerRun = 10;

    public string Description => "GPU-accelerated via SharpDX/DirectCompute (with readback) ";

    /// <summary>
    /// Runs the kernel over the inputs, copies the result into <paramref name="arrayC"/> and
    /// reports the elapsed time of each timed run (dispatches, copy and readback).
    /// </summary>
    /// <param name="arrayA">First input vector.</param>
    /// <param name="arrayB">Second input vector; must have the same length as <paramref name="arrayA"/>.</param>
    /// <param name="arrayC">Receives the result; must have the same length as <paramref name="arrayA"/>.</param>
    /// <param name="Report">Callback that receives one timing message per timed run.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    /// <exception cref="ArgumentException">The three arrays do not have the same length.</exception>
    public unsafe void Calculate(float[] arrayA, float[] arrayB, float[] arrayC, Action<string> Report)
    {
        if (arrayA == null) throw new ArgumentNullException(nameof(arrayA));
        if (arrayB == null) throw new ArgumentNullException(nameof(arrayB));
        if (arrayC == null) throw new ArgumentNullException(nameof(arrayC));
        if (Report == null) throw new ArgumentNullException(nameof(Report));

        int count = arrayA.Length;

        // Every GPU buffer is sized from arrayA, so a shorter arrayB or arrayC would be
        // read or written past its end while pinned.
        if (arrayB.Length != count || arrayC.Length != count)
        {
            throw new ArgumentException("arrayA, arrayB and arrayC must all have the same length.");
        }

        // A zero-sized buffer cannot be created; there is nothing to compute anyway.
        if (count == 0)
        {
            return;
        }

        // checked: a wrapped byte size would silently create undersized buffers.
        int sizeInBytes = checked(count * sizeof(float));

        // Round up so the tail is still processed when count is not a multiple of the group
        // size. The surplus threads of the last group are rejected by the bounds test in the shader.
        // NOTE(review): Direct3D 11 caps each dispatch dimension at 65535 groups — confirm for very large inputs.
        int groupCount = (count + ThreadGroupSize - 1) / ThreadGroupSize;

        string shaderSource = @"

            StructuredBuffer<float> a;
            StructuredBuffer<float> b;
            RWStructuredBuffer<float> c;
            [numthreads(" + ThreadGroupSize.ToString() + @",1,1)]
            void VectorAdd(uint3 threadId : SV_DispatchThreadID)
            {
                    uint index = threadId.x;
                    uint elementCount;
                    uint stride;
                    c.GetDimensions(elementCount, stride);
                    if (index >= elementCount)
                    {
                        return;
                    }
                    c[index] = a[index] * b[index]+a[index];
            }
        ";

        // Description shared by both input buffers.
        var inputBufferDescription = new BufferDescription
        {
            BindFlags = BindFlags.ShaderResource,
            OptionFlags = ResourceOptionFlags.BufferStructured,
            Usage = ResourceUsage.Dynamic,
            CpuAccessFlags = CpuAccessFlags.Write,
            SizeInBytes = sizeInBytes,
            StructureByteStride = sizeof(float)
        };

        // Description for the output buffer the shader writes through an unordered access view.
        var outputBufferDescription = new BufferDescription
        {
            BindFlags = BindFlags.UnorderedAccess,
            OptionFlags = ResourceOptionFlags.BufferStructured,
            Usage = ResourceUsage.Default,
            CpuAccessFlags = CpuAccessFlags.None,
            SizeInBytes = sizeInBytes,
            StructureByteStride = sizeof(float)
        };

        // CPU-readable staging buffer the output is copied into before being mapped.
        var stagingBufferDescription = new BufferDescription
        {
            BindFlags = BindFlags.None,
            CpuAccessFlags = CpuAccessFlags.Read,
            OptionFlags = ResourceOptionFlags.BufferStructured,
            SizeInBytes = sizeInBytes,
            StructureByteStride = sizeof(float),
            Usage = ResourceUsage.Staging,
        };

        var outputViewDescription = new UnorderedAccessViewDescription
        {
            Buffer = new UnorderedAccessViewDescription.BufferResource() { FirstElement = 0, Flags = UnorderedAccessViewBufferFlags.None, ElementCount = count },
            Format = SharpDX.DXGI.Format.Unknown,
            Dimension = UnorderedAccessViewDimension.Buffer
        };

        // Every Direct3D object is released when the block exits, including on exceptions.
        using (var device = new Device(DriverType.Hardware, DeviceCreationFlags.None))
        using (var computeShaderCode = ShaderBytecode.Compile(shaderSource, "VectorAdd", "cs_5_0", ShaderFlags.None, EffectFlags.None))
        using (var computeShader = new ComputeShader(device, computeShaderCode))
        using (var stagingBuffer = new Buffer(device, stagingBufferDescription))
        using (var outputBuffer = new Buffer(device, outputBufferDescription))
        using (var outputView = new UnorderedAccessView(device, outputBuffer, outputViewDescription))
        using (var inputA = CreateInputBuffer(device, arrayA, inputBufferDescription))
        using (var inputAView = new ShaderResourceView(device, inputA))
        using (var inputB = CreateInputBuffer(device, arrayB, inputBufferDescription))
        using (var inputBView = new ShaderResourceView(device, inputB))
        {
            var context = device.ImmediateContext;
            context.ComputeShader.Set(computeShader);
            context.ComputeShader.SetUnorderedAccessView(0, outputView);
            context.ComputeShader.SetShaderResource(0, inputAView);
            context.ComputeShader.SetShaderResource(1, inputBView);

            var sw = new Stopwatch();
            for (int run = 0; run < TimedRuns; run++)
            {
                sw.Restart();
                for (int dispatch = 0; dispatch < DispatchesPerRun; dispatch++)
                {
                    context.Dispatch(groupCount, 1, 1);
                }

                context.CopyResource(outputBuffer, stagingBuffer);

                DataStream mapped;
                context.MapSubresource(stagingBuffer, MapMode.Read, MapFlags.None, out mapped);
                try
                {
                    fixed (float* cAddress = arrayC)
                    {
                        mapped.Read((IntPtr)cAddress, 0, sizeInBytes);
                    }
                }
                finally
                {
                    // Always unmap, even if the copy throws.
                    context.UnmapSubresource(stagingBuffer, 0);
                }

                sw.Stop();
                var s = sw.Elapsed;

                Report($"Operation finished in {s.Minutes} minutes, {s.Seconds} seconds, {s.Milliseconds} milliseconds");
            }
        }
    }

    // Creates a GPU buffer initialised from `data`, keeping the array pinned for the whole
    // time the native pointer is in use.
    private static unsafe Buffer CreateInputBuffer(Device device, float[] data, BufferDescription description)
    {
        fixed (float* address = data)
        {
            using (var stream = new DataStream((IntPtr)address, System.Buffer.ByteLength(data), true, false))
            {
                return new Buffer(device, stream, description);
            }
        }
    }
  }
}

您可以发邮件向我索取完整的可运行解决方案,其中比较了在 CPU 上(单线程和多线程)、OpenCL 以及 DirectCompute 中的执行情况。


推荐阅读