c# - 在 .NET 中用递增整数序列填充 Span 的最快方式是什么?
问题描述
我正在寻找最快的 C# / .NET Core 方法,用递增整数序列 0、1、2、3…… 填充一个 Span<int>。
下面是基本的 for 循环方案。
// Baseline approach from the question: each element receives its own index,
// one element per loop iteration.
Span<int> buffer = ..; // snipped
for(var i = 0; i < buffer.Length; i++)
buffer[i] = i;
如何使用 SIMD 加速这种缓冲区填充方法?
解决方案
下面是一些优化尝试。第一个 Default 是基本的 for 循环。第二个 Batch4 与之相同,但在单次循环迭代中初始化 4 个索引。第四个(Batch8)和第五个(Batch16)与第二个类似,只是每次迭代初始化更多的索引。
第三个使用 System.Numerics.Vector<T>。JIT 认识这种数据类型,会用对应的 SIMD 指令替换其算术运算。在我的机器上,它比默认实现快约两倍。
这里的缺点是缓冲区大小必须是 4 的倍数(Batch8 / Batch16 则分别为 8 / 16 的倍数)。如果不是,末尾剩余的元素必须在主循环之外手动处理。
using System;
using System.Numerics;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
namespace bench
{
/// <summary>
/// Console entry point for the benchmark project.
/// </summary>
class Program
{
    /// <summary>
    /// Hands the <c>Sp</c> benchmark class to BenchmarkDotNet's runner.
    /// The summary object returned by the runner is not used.
    /// </summary>
    /// <param name="args">Command-line arguments; not read.</param>
    static void Main(string[] args) => BenchmarkRunner.Run<Sp>();
}
/// <summary>
/// Benchmarks several ways of filling an <c>int</c> buffer with the sequence
/// 0, 1, 2, ... : a plain loop, manually unrolled loops (4/8/16 stores per
/// iteration) and a <see cref="Vector{T}"/>-based loop.
/// </summary>
[SimpleJob]
[MemoryDiagnoser]
//[DisassemblyDiagnoser(printAsm: true, printIL: true, printSource: true, printDiff: true)]
public class Sp
{
    // Largest number of stores performed per iteration by the unrolled
    // benchmarks (Batch16); the buffer length must be a multiple of it.
    private const int LargestUnroll = 16;

    // Backing storage shared by every benchmark method.
    private readonly int[] spanBack = new int[100000];

    // Every lane holds Vector<int>.Count: the stride added after each chunk.
    private readonly Vector<int> baseV;

    // Lanes hold 0, 1, 2, ... Count-1: the indices of the first chunk.
    private readonly Vector<int> accV;

    /// <summary>
    /// Builds the index ramp and stride vectors for whatever vector width the
    /// runtime reports, instead of supporting only 4 or 8 lanes.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The buffer length is not a multiple of the vector width or of the
    /// largest unroll factor, so the chunked loops would run past the end.
    /// </exception>
    public Sp()
    {
        int lanes = Vector<int>.Count;

        if (spanBack.Length % lanes != 0 || spanBack.Length % LargestUnroll != 0)
        {
            throw new InvalidOperationException("Invalid array size");
        }

        var ramp = new int[lanes];
        for (int lane = 0; lane < lanes; lane++)
        {
            ramp[lane] = lane;
        }

        accV = new Vector<int>(ramp);
        baseV = new Vector<int>(lanes); // broadcasts the lane count to every lane
    }

    /// <summary>Baseline: one store per loop iteration.</summary>
    /// <returns>The backing array.</returns>
    [Benchmark(Baseline = true)]
    public int[] Default()
    {
        Span<int> buffer = spanBack.AsSpan();
        for (var i = 0; i < buffer.Length; i++)
        {
            buffer[i] = i;
        }

        return spanBack;
    }

    /// <summary>Manually unrolled loop: four stores per iteration.</summary>
    /// <returns>The backing array.</returns>
    [Benchmark]
    public int[] Batch4()
    {
        Span<int> buffer = spanBack.AsSpan();
        for (var i = 0; i < buffer.Length; i += 4)
        {
            buffer[i + 0] = i + 0;
            buffer[i + 1] = i + 1;
            buffer[i + 2] = i + 2;
            buffer[i + 3] = i + 3;
        }

        return spanBack;
    }

    /// <summary>
    /// Vectorised loop: loads each chunk, adds the running index vector to it
    /// and stores the sum back.
    /// </summary>
    /// <remarks>
    /// Because the chunk's existing contents take part in the addition, the
    /// result equals 0, 1, 2, ... only when the buffer held zeros on entry;
    /// calling this on an already-filled buffer adds the indices on top of
    /// the old values. The load/add/store shape is kept as-is because it is
    /// the variant being measured.
    /// </remarks>
    /// <returns>The backing array.</returns>
    [Benchmark]
    public int[] BatchSimd()
    {
        int batchSize = Vector<int>.Count;
        var accV = this.accV; // local copy: the readonly field must not change between calls
        Span<int> buffer = spanBack.AsSpan();
        for (var i = 0; i < buffer.Length; i += batchSize)
        {
            var currentSlice = buffer.Slice(i, batchSize);
            var v = new Vector<int>(currentSlice);
            v = v + accV;
            accV = accV + baseV; // advance the indices to the next chunk
            v.CopyTo(currentSlice);
        }

        return spanBack;
    }

    /// <summary>Manually unrolled loop: eight stores per iteration.</summary>
    /// <returns>The backing array.</returns>
    [Benchmark]
    public int[] Batch8()
    {
        Span<int> buffer = spanBack.AsSpan();
        for (var i = 0; i < buffer.Length; i += 8)
        {
            buffer[i + 0] = i + 0;
            buffer[i + 1] = i + 1;
            buffer[i + 2] = i + 2;
            buffer[i + 3] = i + 3;
            buffer[i + 4] = i + 4;
            buffer[i + 5] = i + 5;
            buffer[i + 6] = i + 6;
            buffer[i + 7] = i + 7;
        }

        return spanBack;
    }

    /// <summary>Manually unrolled loop: sixteen stores per iteration.</summary>
    /// <returns>The backing array.</returns>
    [Benchmark]
    public int[] Batch16()
    {
        Span<int> buffer = spanBack.AsSpan();
        for (var i = 0; i < buffer.Length; i += 16)
        {
            buffer[i + 0] = i + 0;
            buffer[i + 1] = i + 1;
            buffer[i + 2] = i + 2;
            buffer[i + 3] = i + 3;
            buffer[i + 4] = i + 4;
            buffer[i + 5] = i + 5;
            buffer[i + 6] = i + 6;
            buffer[i + 7] = i + 7;
            buffer[i + 8] = i + 8;
            buffer[i + 9] = i + 9;
            buffer[i + 10] = i + 10;
            buffer[i + 11] = i + 11;
            buffer[i + 12] = i + 12;
            buffer[i + 13] = i + 13;
            buffer[i + 14] = i + 14;
            buffer[i + 15] = i + 15;
        }

        return spanBack;
    }
}
}
Csproj:
<Project Sdk="Microsoft.NET.Sdk">
<!-- Project file for the benchmark program: an executable targeting netcoreapp3.0. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp3.0</TargetFramework>
<!-- Debug symbol settings: pdb-only symbols, with symbol generation enabled. -->
<DebugType>pdbonly</DebugType>
<DebugSymbols>true</DebugSymbols>
</PropertyGroup>
<ItemGroup>
<!-- The only package dependency: the BenchmarkDotNet harness, pinned to 0.12.0. -->
<PackageReference Include="BenchmarkDotNet" Version="0.12.0" />
</ItemGroup>
</Project>
`dotnet run -c Release` 的结果:
BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18362
Intel Core i7-2600K CPU 3.40GHz (Sandy Bridge), 1 CPU, 8 logical and 4 physical cores
.NET Core SDK=3.1.100-preview1-014459
[Host] : .NET Core 3.1.0 (CoreCLR 4.700.19.50403, CoreFX 4.700.19.50410), X64 RyuJIT
DefaultJob : .NET Core 3.1.0 (CoreCLR 4.700.19.50403, CoreFX 4.700.19.50410), X64 RyuJIT
| Method | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated |
|----------- |---------:|---------:|---------:|------:|------:|------:|------:|----------:|
| Default | 45.55 us | 0.081 us | 0.067 us | 1.00 | - | - | - | - |
| Batch4 | 34.23 us | 0.069 us | 0.065 us | 0.75 | - | - | - | 1 B |
| Batch4Simd | 22.23 us | 0.054 us | 0.051 us | 0.49 | - | - | - | - |
| Batch8 | 31.53 us | 0.160 us | 0.134 us | 0.69 | - | - | - | - |
| Batch16 | 32.10 us | 0.197 us | 0.164 us | 0.70 | - | - | - | - |
编辑:来自@harold 的建议
/// <summary>
/// Vectorised fill that stores the running index vector directly into each
/// chunk, without first loading the chunk's previous contents.
/// </summary>
/// <returns>The backing array.</returns>
[Benchmark]
public int[] BatchSimd_harold()
{
    Span<int> target = spanBack.AsSpan();
    int lanes = Vector<int>.Count;

    // Start from the field's index ramp; work on a local so the field is untouched.
    Vector<int> indices = this.accV;

    for (int offset = 0; offset < target.Length; offset += lanes)
    {
        indices.CopyTo(target.Slice(offset, lanes));
        indices += baseV; // advance every lane to the next chunk's indices
    }

    return spanBack;
}
结果 :
| Method | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated |
|----------------- |---------:|---------:|---------:|------:|------:|------:|------:|----------:|
| Default | 46.08 us | 0.331 us | 0.310 us | 1.00 | - | - | - | - |
| BatchSimd | 22.37 us | 0.150 us | 0.141 us | 0.49 | - | - | - | - |
| BatchSimd_harold | 18.72 us | 0.255 us | 0.239 us | 0.41 | - | - | - | - |
编辑 2:在较新的 CPU 上的运行结果
BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18362
Intel Core i7-6820HQ CPU 2.70GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET Core SDK=3.0.100
[Host] : .NET Core 3.0.0 (CoreCLR 4.700.19.46205, CoreFX 4.700.19.46214), X64 RyuJIT
DefaultJob : .NET Core 3.0.0 (CoreCLR 4.700.19.46205, CoreFX 4.700.19.46214), X64 RyuJIT
| Method | Mean | Error | StdDev | Median | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|----------------- |---------:|---------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:|
| Default | 59.05 us | 1.169 us | 2.362 us | 59.01 us | 1.00 | 0.00 | - | - | - | - |
| Batch4 | 44.39 us | 0.865 us | 0.722 us | 44.48 us | 0.76 | 0.03 | - | - | - | - |
| BatchSimd | 15.37 us | 0.364 us | 1.049 us | 15.07 us | 0.26 | 0.02 | - | - | - | - |
| BatchSimd_harold | 11.77 us | 0.219 us | 0.205 us | 11.80 us | 0.20 | 0.01 | - | - | - | - |
| Batch8 | 43.62 us | 0.871 us | 1.838 us | 43.46 us | 0.74 | 0.04 | - | - | - | - |
| Batch16 | 42.53 us | 0.846 us | 2.317 us | 41.92 us | 0.73 | 0.05 | - | - | - | - |
推荐阅读
- javascript - 我在@Component 中遇到主机问题
- entity-framework - 在 EF 核心中禁用跟踪返回跟踪错误
- c# - 无法解析符号“notnull”
- apache-spark - 调用 split() 函数时出现“split”的 Pyspark 错误不在列表中
- netcdf - 连接具有不同变量的 netcdf 文件 - 使用 nco
- vue.js - 嵌套样式在我的组件样式中不起作用
- java - JSON Schema 中使用 Jackson 注释的自定义属性关键字
- ios - iOS 13:带有 LeftView 间距问题的 UITextField - Xcode 11
- typescript - Typescript - 从另一种类型创建详尽的元组类型
- mongodb - 发生异常。MongoDartError (MongoDart Error: Invalid scheme in uri: mongodb+srv://) with flutter