Redesign Mesh
Requirements
-
Initially specify mesh data with given arrays (constructor)
-
Completely exchange mesh data
- Changes in buffer sizes are not allowed
-
Change single array entries
-
Automatically identify changed indices
-
Allow users to explicitely identify changed indices
-
Collect changed indices into ranges and trigger copying changed ranges to GPU once a frame (or at user specified frames)
-
Enable mechanism to be easily applied to new vertex attributes without code copying
-
Keep AABB up-to-date
-
Meshes can be "static" or "dynamic"
- Combine classes 'Mesh' and 'GpuMesh'
-
[ ] Evaluate performance differences in mechanisms to track changed indices
- Observable Collection (involves events and probably data copying)
- Indexer setter (involves data copying)
- SetVertices (involves data copying)
- Explicit index specification after changing original data array (no copying)
-
[ ] Implement mechanism taking "changed" indices, performantly store them and combine them into more or less contiguous ranges.
First observation: Observable Collection (involves events and probably data copying) crashes the benchmark with 100_000_000 elements, as all necessary operations are either too slow or too memory hungry. Removing from test preemptively.
Code
using System.Collections.ObjectModel;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Diagnosers;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
namespace MeshPerfTest
{
public struct float3
{
public float X;
public float Y;
public float Z;
}
[SimpleJob(RuntimeMoniker.Net60)]
[MemoryDiagnoser] // check used memory
[HardwareCounters(
HardwareCounter.BranchMispredictions,
HardwareCounter.BranchInstructions)]
[HtmlExporter] // export results in a neat html file
[MarkdownExporter] // export results for markdown
public class TestMesh
{
// before with getter, this should be very slow
public float3[] GetData
{
get => _data;
set => _data = value;
}
// direct access
private float3[] _data = { };
private float3[] _secondDataFieldForRealWorld = { };
// span access
private Span<float3> DataAsSpan => new Span<float3>(_data);
// span access
private ReadOnlySpan<float3> DataAsReadOnlySpanSpan => new ReadOnlySpan<float3>(_data);
[Params(10000, 100_000, 100_000_000)] public int N;
public float3 this[int idx]
{
get => _data[idx];
set => _data[idx] = value;
}
[GlobalSetup]
public void Setup()
{
_data = new float3[N];
_secondDataFieldForRealWorld = new float3[N];
var rnd = new Random();
Array.Fill(_data,
new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
Array.Fill(_secondDataFieldForRealWorld, new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
}
[Benchmark]
public void ForLoop_Baseline_AddValue()
{
for (var i = 0; i < GetData.Length; i++)
{
var currentVal = GetData[i];
GetData[i] = new float3
{ X = currentVal.X * (i + 1), Y = currentVal.Y * (i + 3), Z = currentVal.Z * (i + 2) };
}
}
[Benchmark]
public void ForLoop_AddValue()
{
for (var i = 0; i < _data.Length; i++)
{
_data[i] = new float3 { X = _data[i].X * (i + 1), Y = _data[i].Y * (i + 3), Z = _data[i].Z * (i + 2) };
}
}
[Benchmark]
public void ForLoop_AsSpan_AddValue()
{
for (var i = 0; i < DataAsSpan.Length; i++)
{
var currentVal = DataAsSpan[i];
DataAsSpan[i] = new float3
{ X = currentVal.X * (i + 1), Y = currentVal.Y * (i + 3), Z = currentVal.Z * (i + 2) };
}
}
[Benchmark]
public void ForLoop_AsReadOnlySpan_AddValue()
{
for (var i = 0; i < DataAsReadOnlySpanSpan.Length; i++)
{
var currentVal = DataAsReadOnlySpanSpan[i];
_data[i] = new float3
{ X = currentVal.X * (i + 1), Y = currentVal.Y * (i + 3), Z = currentVal.Z * (i + 2) };
}
}
[Benchmark]
public void ForLoop_AsIndexer_AddValue()
{
for (var i = 0; i < _data.Length; i++)
{
var currentVal = this[i];
this[i] = new float3
{ X = currentVal.X * (i + 1), Y = currentVal.Y * (i + 3), Z = currentVal.Z * (i + 2) };
}
}
[Benchmark]
public void ForLoop_Baseline_ReplaceValue()
{
for (var i = 0; i < GetData.Length; i++)
{
var currentVal = GetData[i];
GetData[i].X *= (i + 1);
GetData[i].Y *= (i + 3);
GetData[i].Z *= (i + 2);
}
}
[Benchmark]
public void ForLoop_ReplaceValue()
{
for (var i = 0; i < _data.Length; i++)
{
_data[i].X *= (i + 1);
_data[i].Y *= (i + 3);
_data[i].Z *= (i + 2);
}
}
[Benchmark]
public void ForLoop_AsSpan_ReplaceValue()
{
for (var i = 0; i < DataAsSpan.Length; i++)
{
DataAsSpan[i].X *= (i + 1);
DataAsSpan[i].Y *= (i + 3);
DataAsSpan[i].Z *= (i + 2);
}
}
[Benchmark]
public void ForLoop_AsReadOnlySpan_ReplaceValue()
{
for (var i = 0; i < DataAsReadOnlySpanSpan.Length; i++)
{
var currentVal = DataAsReadOnlySpanSpan[i];
_data[i].X = currentVal.X * (i + 1);
_data[i].Y = currentVal.Y * (i + 3);
_data[i].Z = currentVal.Z * (i + 2);
;
}
}
[Benchmark]
public void ForLoop_AsIndexer_ReplaceValue()
{
for (var i = 0; i < _data.Length; i++)
{
var currentVal = this[i];
currentVal.X *= (i + 1);
currentVal.Y *= (i + 3);
currentVal.Z *= (i + 2);
this[i] = currentVal;
}
}
[Benchmark]
public void RealWorldImpl_WithSimpleSetVertices_Copy()
{
for (var i = N / 2; i < N - 100; i++)
{
_data[i] = _secondDataFieldForRealWorld[i];
}
}
[Benchmark]
public void RealWorldImpl_ReplacingAllDataWithOtherArray()
{
for (var i = 0; i < _secondDataFieldForRealWorld.Length; i++)
{
_data[i] = _secondDataFieldForRealWorld[i];
}
}
}
public static class Program
{
public static void Main()
{
Console.WriteLine("Starting Benchmark");
var summary = BenchmarkRunner.Run<TestMesh>();
}
}
}
Results
BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19044.1889 (21H2)
Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET SDK=6.0.400
[Host] : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT [AttachedDebugger]
DefaultJob : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT
| Method | N | Mean | Error | StdDev | BranchInstructions/Op | BranchMispredictions/Op | Allocated |
|---|---|---|---|---|---|---|---|
| ForLoop_Baseline_AddValue | 10000 | 17.753 μs | 0.1542 μs | 0.1367 μs | 20,176 | 6 | - |
| ForLoop_AddValue | 10000 | 17.511 μs | 0.2026 μs | 0.1796 μs | 20,181 | 8 | - |
| ForLoop_AsSpan_AddValue | 10000 | 26.377 μs | 0.1449 μs | 0.1284 μs | 60,277 | 13 | - |
| ForLoop_AsReadOnlySpan_AddValue | 10000 | 23.909 μs | 0.2351 μs | 0.2084 μs | 50,267 | 13 | - |
| ForLoop_AsIndexer_AddValue | 10000 | 17.874 μs | 0.2151 μs | 0.2012 μs | 20,183 | 7 | - |
| ForLoop_Baseline_ReplaceValue | 10000 | 19.452 μs | 0.0987 μs | 0.0824 μs | 20,193 | 7 | - |
| ForLoop_ReplaceValue | 10000 | 18.801 μs | 0.0683 μs | 0.0606 μs | 20,182 | 6 | - |
| ForLoop_AsSpan_ReplaceValue | 10000 | 29.063 μs | 0.3297 μs | 0.2923 μs | 80,315 | 18 | - |
| ForLoop_AsReadOnlySpan_ReplaceValue | 10000 | 23.931 μs | 0.1213 μs | 0.1076 μs | 60,254 | 13 | - |
| ForLoop_AsIndexer_ReplaceValue | 10000 | 28.203 μs | 0.1537 μs | 0.1363 μs | 30,275 | 9 | - |
| RealWorldImpl_WithSimpleSetVertices_Copy | 10000 | 5.937 μs | 0.0320 μs | 0.0284 μs | 14,764 | 4 | - |
| RealWorldImpl_ReplacingAllDataWithOtherArray | 10000 | 12.703 μs | 0.1225 μs | 0.1086 μs | 30,169 | 9 | - |
| ForLoop_Baseline_AddValue | 100000 | 177.257 μs | 0.6038 μs | 0.5353 μs | 201,745 | 51 | - |
| ForLoop_AddValue | 100000 | 172.977 μs | 0.5293 μs | 0.4132 μs | 201,698 | 50 | - |
| ForLoop_AsSpan_AddValue | 100000 | 262.381 μs | 1.1548 μs | 1.0237 μs | 602,706 | 122 | - |
| ForLoop_AsReadOnlySpan_AddValue | 100000 | 239.112 μs | 1.6832 μs | 1.3142 μs | 502,566 | 118 | - |
| ForLoop_AsIndexer_AddValue | 100000 | 178.840 μs | 2.4601 μs | 2.1808 μs | 201,785 | 55 | - |
| ForLoop_Baseline_ReplaceValue | 100000 | 195.011 μs | 0.8040 μs | 0.7127 μs | 201,907 | 54 | - |
| ForLoop_ReplaceValue | 100000 | 189.299 μs | 1.1598 μs | 1.0282 μs | 201,861 | 53 | - |
| ForLoop_AsSpan_ReplaceValue | 100000 | 288.885 μs | 1.4949 μs | 1.3252 μs | 802,954 | 161 | - |
| ForLoop_AsReadOnlySpan_ReplaceValue | 100000 | 239.613 μs | 1.3238 μs | 1.1054 μs | 602,505 | 122 | - |
| ForLoop_AsIndexer_ReplaceValue | 100000 | 283.333 μs | 0.7748 μs | 0.7248 μs | 302,769 | 76 | - |
| RealWorldImpl_WithSimpleSetVertices_Copy | 100000 | 62.468 μs | 0.5889 μs | 0.5509 μs | 150,368 | 32 | - |
| RealWorldImpl_ReplacingAllDataWithOtherArray | 100000 | 128.034 μs | 1.0584 μs | 0.9901 μs | 301,365 | 63 | - |
| ForLoop_Baseline_AddValue | 100000000 | 184,885.571 μs | 1,360.7537 μs | 1,206.2727 μs | 202,385,362 | 86,380 | 128 B |
| ForLoop_AddValue | 100000000 | 180,727.119 μs | 968.1744 μs | 858.2613 μs | 202,264,485 | 78,643 | 1,371 B |
| ForLoop_AsSpan_AddValue | 100000000 | 268,765.647 μs | 1,576.5453 μs | 1,474.7014 μs | 603,424,905 | 164,386 | 1,364 B |
| ForLoop_AsReadOnlySpan_AddValue | 100000000 | 244,021.818 μs | 1,775.4558 μs | 1,482.5860 μs | 503,051,423 | 142,905 | 128 B |
| ForLoop_AsIndexer_AddValue | 100000000 | 184,510.056 μs | 1,133.5554 μs | 1,060.3284 μs | 202,299,438 | 80,100 | 341 B |
| ForLoop_Baseline_ReplaceValue | 100000000 | 202,071.036 μs | 1,925.9486 μs | 1,707.3033 μs | 202,344,585 | 83,376 | 1,371 B |
| ForLoop_ReplaceValue | 100000000 | 197,450.618 μs | 1,739.1837 μs | 1,452.2972 μs | 202,532,454 | 91,659 | 128 B |
| ForLoop_AsSpan_ReplaceValue | 100000000 | 295,217.950 μs | 1,857.6172 μs | 1,737.6162 μs | 803,724,766 | 194,833 | 2,096 B |
| ForLoop_AsReadOnlySpan_ReplaceValue | 100000000 | 245,030.749 μs | 1,414.8232 μs | 1,181.4414 μs | 602,980,716 | 150,278 | 1,341 B |
| ForLoop_AsIndexer_ReplaceValue | 100000000 | 291,626.064 μs | 2,277.9259 μs | 2,019.3220 μs | 303,394,543 | 117,009 | 2,056 B |
| RealWorldImpl_WithSimpleSetVertices_Copy | 100000000 | 74,050.853 μs | 1,422.2323 μs | 2,336.7684 μs | 150,965,551 | 47,628 | 55 B |
| RealWorldImpl_ReplacingAllDataWithOtherArray | 100000000 | 148,863.958 μs | 1,371.5788 μs | 1,282.9757 μs | 301,881,754 | 86,084 | 256 B |
Findings
This code is faster
[Benchmark]
public void ForLoop_AddValue()
{
for (var i = 0; i < _data.Length; i++)
{
_data[i] = new float3 { X = _data[i].X * (i + 1), Y = _data[i].Y * (i + 3), Z = _data[i].Z * (i + 2) };
}
}
than this
[Benchmark]
public void ForLoop_ReplaceValue()
{
for (var i = 0; i < _data.Length; i++)
{
_data[i].X *= (i + 1);
_data[i].Y *= (i + 3);
_data[i].Z *= (i + 2);
}
}
And replacing array elements by already existing data consumes more memory (however not during the actual file operation), but is faster than everything else.
The results speak for themselves. I think in this setting, it is usually slower to change existing array elements than to replace them with data from another array or just assign a new float3. That's kinda strange ...
Lookup via getter or indexer is slower and more memory hungry (my guess is due to defensive copies).
Furthermore, I've implemented two versions (without Span<T> as it is slower than the other methods).
The first version is the "new" implementation with BeginEdit() and EndEdit() in a slightly modified version (ref instead of Span<T>). The second version is with SetVertices(float3[] data, int startIdx, int length) and compared both implementations with some standard file operations.
My expectation is, given the results above, that the second version with the SetVertices method is faster, even while copying data (at least that is what the human suspects is happening here). Results are pending...
Mesh Implementations
Code
using System.Collections.ObjectModel;
using System.ComponentModel.DataAnnotations;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Diagnosers;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
namespace MeshPerfTest
{
public struct float3
{
public float X;
public float Y;
public float Z;
}
public class MeshVariantOne
{
public float3[] Data; // do not access by public var!
public EventHandler<(int startIdx, int length)> callMe;
public ReadOnlySpan<float3> GetReadonlyRefToData => new ReadOnlySpan<float3>(Data);
public void SetVertices(ref float3[] dataIn, int startIdx = 0, int length = -1)
{
for (var i = startIdx; i < (length == -1 ? dataIn.Length : length); i++)
{
Data[i] = dataIn[i];
}
callMe?.Invoke(this, (startIdx, length));
}
}
public class MeshVariantTwo
{
public float3[] Data; // do not access by public var!
public EventHandler<(int startIdx, int length)> callMe;
public ref float3[] BeginEdit()
{
return ref Data;
}
public void EndEdit(int startIdx, int length) {
callMe?.Invoke(this, (startIdx, length));
}
}
//[SimpleJob(RuntimeMoniker.Net60)]
//[MemoryDiagnoser] // check used memory
//[HardwareCounters(
// HardwareCounter.BranchMispredictions,
// HardwareCounter.BranchInstructions)]
//[HtmlExporter] // export results in a neat html file
[MarkdownExporter] // export results for markdown
public class TestMesh
{
[Params(10000, 100_000, 100_000_000)]
public int N;
public MeshVariantOne m1;
public MeshVariantTwo m2;
public float3[] ReplacementData;
[GlobalSetup]
public void Setup()
{
m1 = new MeshVariantOne();
m1.Data = new float3[N];
m2 = new MeshVariantTwo();
m2.Data = new float3[N];
ReplacementData = new float3[N];
var rnd = new Random();
Array.Fill(m2.Data,
new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
Array.Fill(m1.Data,
new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
Array.Fill(ReplacementData,
new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
}
[Benchmark]
public void ReplaceAllVertices_M1()
{
m1.SetVertices(ref ReplacementData);
}
[Benchmark]
public void ReplaceAllVertices_M2()
{
var reference = m2.BeginEdit();
for (var i = 0; i < reference.Length; i++)
reference[i] = ReplacementData[i];
m2.EndEdit(0, reference.Length);
}
[Benchmark]
public void ReplaceSomeVertices_M1()
{
m1.SetVertices(ref ReplacementData, 10, 20);
m1.SetVertices(ref ReplacementData, 50, 70);
m1.SetVertices(ref ReplacementData, N - 100, N - 1);
}
[Benchmark]
public void ReplaceSomeVertices_M2()
{
var reference = m2.BeginEdit();
for (var i = 10; i < 20; i++)
reference[i] = ReplacementData[i];
for (var i = 50; i < 70; i++)
reference[i] = ReplacementData[i];
for (var i = N - 100; i < N - 1; i++)
reference[i] = ReplacementData[i];
// normally we would need to call 3 times to m2.EndEdit(), however let's just call one time
// usually we would have an implementation with an array of tuples
m2.EndEdit(10, 20);
}
[Benchmark]
public void ChangeSomeVertices_M1()
{
var data = m1.GetReadonlyRefToData.ToArray(); // data copy :(
// change
for (var i = 0; i < 100; i++)
data[i] = new float3 { X = data[i].X * i + 10, Y = data[i].Y * i + 20, Z = data[i].Z * i + 5};
m1.SetVertices(ref data, 0, 100);
}
[Benchmark]
public void ChangeSomeVertices_M2()
{
var data = m2.BeginEdit();
// change
for (var i = 0; i < 100; i++)
data[i] = new float3 { X = data[i].X * i + 10, Y = data[i].Y * i + 20, Z = data[i].Z * i + 5};
m2.EndEdit(0, 100);
}
}
public static class Program
{
public static void Main()
{
Console.WriteLine("Starting Benchmark");
var summary = BenchmarkRunner.Run<TestMesh>();
}
}
}
Results
BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19044.1889 (21H2)
Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET SDK=6.0.400
[Host] : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT [AttachedDebugger]
DefaultJob : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT
| Method | N | Mean | Error | StdDev |
|---|---|---|---|---|
| ReplaceAllVertices_M1 | 10000 | 14,516.6 ns | 76.20 ns | 67.55 ns |
| ReplaceAllVertices_M2 | 10000 | 10,162.5 ns | 154.10 ns | 128.68 ns |
| ReplaceSomeVertices_M1 | 10000 | 200.5 ns | 1.01 ns | 0.95 ns |
| ReplaceSomeVertices_M2 | 10000 | 147.5 ns | 2.18 ns | 1.93 ns |
| ChangeSomeVertices_M1 | 10000 | 73,644.8 ns | 760.38 ns | 711.26 ns |
| ChangeSomeVertices_M2 | 10000 | 126.2 ns | 1.05 ns | 0.82 ns |
| ReplaceAllVertices_M1 | 100000 | 145,922.9 ns | 446.97 ns | 396.22 ns |
| ReplaceAllVertices_M2 | 100000 | 118,489.6 ns | 2,056.64 ns | 3,321.09 ns |
| ReplaceSomeVertices_M1 | 100000 | 179.6 ns | 0.90 ns | 0.84 ns |
| ReplaceSomeVertices_M2 | 100000 | 146.4 ns | 1.53 ns | 1.36 ns |
| ChangeSomeVertices_M1 | 100000 | 335,919.5 ns | 3,811.63 ns | 3,565.40 ns |
| ChangeSomeVertices_M2 | 100000 | 125.7 ns | 0.89 ns | 0.75 ns |
| ReplaceAllVertices_M1 | 100000000 | 163,270,260.7 ns | 1,775,762.38 ns | 1,574,167.10 ns |
| ReplaceAllVertices_M2 | 100000000 | 135,759,776.0 ns | 2,608,763.27 ns | 3,482,624.04 ns |
| ReplaceSomeVertices_M1 | 100000000 | 200.1 ns | 0.74 ns | 0.69 ns |
| ReplaceSomeVertices_M2 | 100000000 | 144.6 ns | 0.47 ns | 0.40 ns |
| ChangeSomeVertices_M1 | 100000000 | 452,193,086.7 ns | 5,400,314.15 ns | 5,051,457.20 ns |
| ChangeSomeVertices_M2 | 100000000 | 126.5 ns | 0.57 ns | 0.50 ns |
Version M2 (BeginEdit(), EndEdit()) performs better in each scenario. Especially when GetReadonlyRefToData from version 1 is being used (data copying).
Performance optimizations to Mesh impl
- using agressive inline
- testing Vector3 with SIMD
- testing
new float3{}vsfloat3.x =
using System.Collections.ObjectModel;
using System.Numerics;
using System.Runtime.CompilerServices;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
namespace MeshPerfTest
{
public struct float3
{
public float X;
public float Y;
public float Z;
}
public class MeshVariantOne
{
public float3[] Data; // do not access by public var!
public EventHandler<(int startIdx, int length)> callMe;
public ReadOnlyCollection<float3> GetReadonlyRefToData => new ReadOnlyCollection<float3>(Data);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void SetVertices(in float3[] dataIn, int startIdx = 0, int length = -1)
{
var loopEnd = (length == -1 ? dataIn.Length : length);
for (var i = startIdx; i < loopEnd; i++)
{
Data[i] = dataIn[i];
}
callMe?.Invoke(this, (startIdx, length));
}
}
public class MeshVariantTwo
{
public float3[] Data; // do not access by public var!
public EventHandler<(int startIdx, int length)> callMe;
public ref float3[] BeginEdit()
{
return ref Data;
}
public void EndEdit(int startIdx, int length)
{
callMe?.Invoke(this, (startIdx, length));
}
}
public class MeshVariantThree
{
public Vector3[] Data; // do not access by public var!
public EventHandler<(int startIdx, int length)> callMe;
public ReadOnlyCollection<Vector3> GetReadonlyRefToData => new(Data);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void SetVertices(in Vector3[] dataIn, int startIdx = 0, int length = -1)
{
var loopEnd = (length == -1 ? dataIn.Length : length);
for (var i = startIdx; i < loopEnd; i++)
{
Data[i] = dataIn[i];
}
callMe?.Invoke(this, (startIdx, length));
}
}
public class MeshVariantFour
{
public Vector3[] Data; // do not access by public var!
public EventHandler<(int startIdx, int length)> callMe;
public ref Vector3[] BeginEdit()
{
return ref Data;
}
public void EndEdit(int startIdx, int length)
{
callMe?.Invoke(this, (startIdx, length));
}
}
//[SimpleJob(RuntimeMoniker.Net60)]
//[MemoryDiagnoser] // check used memory
//[HardwareCounters(
// HardwareCounter.BranchMispredictions,
// HardwareCounter.BranchInstructions)]
//[HtmlExporter] // export results in a neat html file
[MarkdownExporter] // export results for markdown
public class TestMesh
{
[Params(10000, 100_000, 100_000_000)] public int N;
public MeshVariantOne m1;
public MeshVariantTwo m2;
public MeshVariantThree m3;
public MeshVariantFour m4;
public float3[] ReplacementData;
public Vector3[] ReplacementDataV3;
[GlobalSetup]
public void Setup()
{
m1 = new MeshVariantOne();
m1.Data = new float3[N];
m2 = new MeshVariantTwo();
m2.Data = new float3[N];
m3 = new MeshVariantThree();
m3.Data = new Vector3[N];
m4 = new MeshVariantFour();
m4.Data = new Vector3[N];
ReplacementData = new float3[N];
ReplacementDataV3 = new Vector3[N];
var rnd = new Random();
Array.Fill(m2.Data,
new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
Array.Fill(m1.Data,
new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
Array.Fill(m3.Data,
new Vector3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
Array.Fill(ReplacementData,
new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
Array.Fill(ReplacementDataV3,
new Vector3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
}
[Benchmark]
public void ReplaceAllVertices_M1()
{
m1.SetVertices(in ReplacementData);
}
[Benchmark]
public void ReplaceAllVertices_M2()
{
var reference = m2.BeginEdit();
for (var i = 0; i < reference.Length; i++)
reference[i] = ReplacementData[i];
m2.EndEdit(0, reference.Length);
}
[Benchmark]
public void ReplaceAllVertices_M3()
{
m3.SetVertices(in ReplacementDataV3);
}
[Benchmark]
public void ReplaceAllVertices_M4()
{
var reference = m4.BeginEdit();
for (var i = 0; i < reference.Length; i++)
reference[i] = ReplacementDataV3[i];
m4.EndEdit(0, reference.Length);
}
[Benchmark]
public void ReplaceSomeVertices_M1()
{
m1.SetVertices(in ReplacementData, 10, 20);
m1.SetVertices(in ReplacementData, 50, 70);
m1.SetVertices(in ReplacementData, N - 100, N - 1);
}
[Benchmark]
public void ReplaceSomeVertices_M2()
{
var reference = m2.BeginEdit();
for (var i = 10; i < 20; i++)
reference[i] = ReplacementData[i];
for (var i = 50; i < 70; i++)
reference[i] = ReplacementData[i];
for (var i = N - 100; i < N - 1; i++)
reference[i] = ReplacementData[i];
// normally we would need to call 3 times to m2.EndEdit(), however let's just call one time
// usually we would have an implementation with an array of tuples
m2.EndEdit(10, 20);
}
[Benchmark]
public void ReplaceSomeVertices_M3()
{
m3.SetVertices(in ReplacementDataV3, 10, 20);
m3.SetVertices(in ReplacementDataV3, 50, 70);
m3.SetVertices(in ReplacementDataV3, N - 100, N - 1);
}
[Benchmark]
public void ReplaceSomeVertices_M4()
{
var reference = m4.BeginEdit();
for (var i = 10; i < 20; i++)
reference[i] = ReplacementDataV3[i];
for (var i = 50; i < 70; i++)
reference[i] = ReplacementDataV3[i];
for (var i = N - 100; i < N - 1; i++)
reference[i] = ReplacementDataV3[i];
// normally we would need to call 3 times to m2.EndEdit(), however let's just call one time
// usually we would have an implementation with an array of tuples
m4.EndEdit(10, 20);
}
[Benchmark]
public void ChangeSomeVertices_M1()
{
var data = new float3[100];
// change
for (var i = 0; i < 100; i++)
data[i] = new float3
{
X = m1.GetReadonlyRefToData[i].X * i + 10, Y = m1.GetReadonlyRefToData[i].Y * i + 20,
Z = m1.GetReadonlyRefToData[i].Z * i + 5
};
m1.SetVertices(in data, 0, 100);
}
[Benchmark]
public void ChangeSomeVertices_M2()
{
var data = m2.BeginEdit();
// change
for (var i = 0; i < 100; i++)
data[i] = new float3 { X = data[i].X * i + 10, Y = data[i].Y * i + 20, Z = data[i].Z * i + 5 };
m2.EndEdit(0, 100);
}
[Benchmark]
public void ChangeSomeVertices_M2_AlternativeVersion()
{
var data = m2.BeginEdit();
// change
for (var i = 0; i < 100; i++)
{
data[i].X *= i + 10;
data[i].Y *= i + 20;
data[i].Z *= i + 5;
}
m2.EndEdit(0, 100);
}
[Benchmark]
public void ChangeSomeVertices_M3()
{
var data = new Vector3[100];
// change
for (var i = 0; i < 100; i++)
data[i] = new Vector3()
{
X = m3.GetReadonlyRefToData[i].X * i + 10, Y = m3.GetReadonlyRefToData[i].Y * i + 20,
Z = m3.GetReadonlyRefToData[i].Z * i + 5
};
m3.SetVertices(in data, 0, 100);
}
[Benchmark]
public void ChangeSomeVertices_M4()
{
var data = m4.BeginEdit();
// change
for (var i = 0; i < 100; i++)
data[i] = new Vector3 { X = data[i].X * i + 10, Y = data[i].Y * i + 20, Z = data[i].Z * i + 5 };
m4.EndEdit(0, 100);
}
}
public static class Program
{
public static void Main()
{
Console.WriteLine("Starting Benchmark");
var summary = BenchmarkRunner.Run<TestMesh>();
}
}
}
Results
Some strange stuff: ChangeSomeVertices_M2_AlternativeVersion() is slower than ChangeSomeVertices_M2() without the *=
BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19044.1889 (21H2)
Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET SDK=6.0.400
[Host] : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT [AttachedDebugger]
DefaultJob : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT
| Method | N | Mean | Error | StdDev | Median |
|---|---|---|---|---|---|
| ReplaceAllVertices_M1 | 10000 | 12,368.5 ns | 245.25 ns | 272.60 ns | 12,344.7 ns |
| ReplaceAllVertices_M2 | 10000 | 10,227.9 ns | 166.97 ns | 217.11 ns | 10,178.4 ns |
| ReplaceAllVertices_M3 | 10000 | 11,170.2 ns | 102.77 ns | 85.82 ns | 11,168.9 ns |
| ReplaceAllVertices_M4 | 10000 | 10,274.1 ns | 108.57 ns | 90.66 ns | 10,271.1 ns |
| ReplaceSomeVertices_M1 | 10000 | 202.1 ns | 4.22 ns | 11.96 ns | 205.6 ns |
| ReplaceSomeVertices_M2 | 10000 | 145.8 ns | 0.54 ns | 0.50 ns | 145.6 ns |
| ReplaceSomeVertices_M3 | 10000 | 145.2 ns | 0.79 ns | 0.74 ns | 145.3 ns |
| ReplaceSomeVertices_M4 | 10000 | 144.2 ns | 0.55 ns | 0.51 ns | 144.2 ns |
| ChangeSomeVertices_M1 | 10000 | 2,793.8 ns | 55.63 ns | 103.11 ns | 2,770.3 ns |
| ChangeSomeVertices_M2 | 10000 | 126.2 ns | 0.91 ns | 0.81 ns | 125.9 ns |
| ChangeSomeVertices_M2_AlternativeVersion | 10000 | 155.6 ns | 1.54 ns | 1.44 ns | 155.0 ns |
| ChangeSomeVertices_M3 | 10000 | 2,840.6 ns | 25.90 ns | 22.96 ns | 2,840.0 ns |
| ChangeSomeVertices_M4 | 10000 | 153.3 ns | 0.61 ns | 0.51 ns | 153.4 ns |
| ReplaceAllVertices_M1 | 100000 | 125,962.3 ns | 2,421.91 ns | 2,146.96 ns | 125,850.4 ns |
| ReplaceAllVertices_M2 | 100000 | 105,787.7 ns | 1,242.14 ns | 1,161.90 ns | 105,537.4 ns |
| ReplaceAllVertices_M3 | 100000 | 114,437.5 ns | 1,197.81 ns | 1,120.43 ns | 114,212.4 ns |
| ReplaceAllVertices_M4 | 100000 | 106,866.9 ns | 2,047.79 ns | 2,437.75 ns | 105,724.5 ns |
| ReplaceSomeVertices_M1 | 100000 | 162.2 ns | 0.51 ns | 0.47 ns | 162.2 ns |
| ReplaceSomeVertices_M2 | 100000 | 145.2 ns | 0.29 ns | 0.24 ns | 145.2 ns |
| ReplaceSomeVertices_M3 | 100000 | 146.0 ns | 1.06 ns | 0.99 ns | 145.9 ns |
| ReplaceSomeVertices_M4 | 100000 | 143.2 ns | 0.88 ns | 0.78 ns | 143.1 ns |
| ChangeSomeVertices_M1 | 100000 | 2,649.8 ns | 39.24 ns | 36.71 ns | 2,643.7 ns |
| ChangeSomeVertices_M2 | 100000 | 125.2 ns | 0.37 ns | 0.31 ns | 125.2 ns |
| ChangeSomeVertices_M2_AlternativeVersion | 100000 | 153.3 ns | 0.77 ns | 0.72 ns | 153.0 ns |
| ChangeSomeVertices_M3 | 100000 | 2,723.1 ns | 18.99 ns | 15.86 ns | 2,722.1 ns |
| ChangeSomeVertices_M4 | 100000 | 152.9 ns | 0.83 ns | 0.65 ns | 153.0 ns |
| ReplaceAllVertices_M1 | 100000000 | 145,332,460.4 ns | 1,319,414.43 ns | 1,030,112.00 ns | 145,057,925.0 ns |
| ReplaceAllVertices_M2 | 100000000 | 134,879,753.3 ns | 1,490,533.54 ns | 1,394,245.99 ns | 135,314,100.0 ns |
| ReplaceAllVertices_M3 | 100000000 | 139,177,286.7 ns | 1,235,668.59 ns | 1,155,845.16 ns | 138,949,550.0 ns |
| ReplaceAllVertices_M4 | 100000000 | 132,078,203.6 ns | 697,992.64 ns | 618,752.30 ns | 132,109,287.5 ns |
| ReplaceSomeVertices_M1 | 100000000 | 160.9 ns | 2.23 ns | 4.19 ns | 159.5 ns |
| ReplaceSomeVertices_M2 | 100000000 | 146.0 ns | 1.06 ns | 0.89 ns | 146.2 ns |
| ReplaceSomeVertices_M3 | 100000000 | 146.7 ns | 1.23 ns | 1.09 ns | 146.5 ns |
| ReplaceSomeVertices_M4 | 100000000 | 141.7 ns | 1.59 ns | 1.41 ns | 141.4 ns |
| ChangeSomeVertices_M1 | 100000000 | 2,785.7 ns | 54.80 ns | 53.82 ns | 2,788.4 ns |
| ChangeSomeVertices_M2 | 100000000 | 126.7 ns | 0.89 ns | 0.79 ns | 126.8 ns |
| ChangeSomeVertices_M2_AlternativeVersion | 100000000 | 154.6 ns | 1.23 ns | 1.09 ns | 154.4 ns |
| ChangeSomeVertices_M3 | 100000000 | 2,882.4 ns | 51.35 ns | 45.52 ns | 2,876.9 ns |
| ChangeSomeVertices_M4 | 100000000 | 151.2 ns | 1.77 ns | 1.47 ns | 150.8 ns |
Questions
-
How relevant are the ReadOnlySpan tests? I don't see the point of reading from a ReadOnlySpan but writing directly to the backing array.
-
In the Span benchmarks: Would it make a difference (although the code was probably optimized) to only get the
Spanonce outside the loop instead of accessing theDataAs[ReadOnly]Spangetters (involving the new operation) once, twice or even three times in each loop iteration? E.G:[Benchmark] public void ForLoop_AsSpan_AddValue() { var dataAsSpan = DataAsSpan; // ONLY HERE THE new Span<float3>(_data) OPERATION IS PERFORMED for (var i = 0; i < dataAsSpan.Length; i++) { var currentVal = dataAsSpan[i]; // NO new Span<float3>(_data) HERE dataAsSpan[i] = new float3 // NO new Span<float3>(_data) HERE { X = currentVal.X * (i + 1), Y = currentVal.Y * (i + 3), Z = currentVal.Z * (i + 2) }; } } -
I'd love to see a way to make the indexer work for us because it would allow us to implement a mechanism keeping track of the changed vertices and propagate them to the GPU in a way that's totally opaque for the user. Some questions arise with the indexer
-
The timing difference in reading from/replacing items directly on the backing array compared to reading/replacing using an indexer seems to be neglectable (~0.3 μs), while the difference between backing array vs. indexer when changing existing contents is substantially higher (~10 μs):
Method Mean ForLoop_AddValue 17.511 μs ForLoop_AsIndexer_AddValue 17.874 μs ForLoop_ReplaceValue 18.801 μs ForLoop_AsIndexer_ReplaceValue 28.203 μs Why that? More data copying involved? Would be interesting to know, because if this holds, we need to eplain users how to set up their loops and should be able to explain why and not just how.
-
Does the minimal difference of ~0.3 μs still hold if we add just the slightest bit of functionality to the setter of the indexer? My suspicion is that the "just-pass-it-to-the-array"-implementation of the indexer's setter (
set => _data[idx] = value;) can be optimized by the compiler much better as if the setter contained some more instructions - even if the additional instruction(s) itself do not eat up much time.
-
Question 1)
Just testing if ReadOnlySpan has any performance implications. Yes, in this case it's not useful.
Question 2)
Code
using System;
using System.Collections.ObjectModel;
using System.Runtime.CompilerServices;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
namespace MeshPerf
{
public struct float3
{
public float X;
public float Y;
public float Z;
}
public class Mesh
{
public float3[] Data; // do not access by public var!
public Span<float3> DataAsSpan => new Span<float3>(Data);
}
[MarkdownExporter] // export results for markdown
public class MeshTests
{
[Params(10000, 100_000, 100_000_000)]
public int N;
public Mesh m;
[GlobalSetup]
public void Setup()
{
m = new Mesh
{
Data = new float3[N]
};
var rnd = new Random();
Array.Fill(m.Data,
new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
}
[Benchmark]
public void ForLoop_AsSpan_AddValue_Original()
{
var dataAsSpan = m.DataAsSpan;
for (var i = 0; i < dataAsSpan.Length; i++)
{
var currentVal = dataAsSpan[i];
dataAsSpan[i] = new float3
{ X = currentVal.X * (i + 1), Y = currentVal.Y * (i + 3), Z = currentVal.Z * (i + 2) };
}
}
[Benchmark]
public void ForLoop_AsSpan_AddValue_New()
{
var dataAsSpan = m.DataAsSpan; // ONLY HERE THE new Span<float3>(_data) OPERATION IS PERFORMED
for (var i = 0; i < dataAsSpan.Length; i++)
{
// var currentVal = dataAsSpan[i]; // NO new Span<float3>(_data) HERE
dataAsSpan[i] = new float3 // NO new Span<float3>(_data) HERE
{ X = dataAsSpan[i].X * (i + 1), Y = dataAsSpan[i].Y * (i + 3), Z = dataAsSpan[i].Z * (i + 2) };
}
}
}
public static class MeshPerf
{
public static void Main()
{
var summary = BenchmarkRunner.Run<MeshTests>();
}
}
}
Results
Actually, there is a difference. I would have guessed that the optimizer is able to identify and remove this…
BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19043.1889 (21H1/May2021Update)
AMD Ryzen 7 2700X, 1 CPU, 16 logical and 8 physical cores
.NET SDK=6.0.400
[Host] : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT
DefaultJob : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT
| Method | N | Mean | Error | StdDev |
|---|---|---|---|---|
| ForLoop_AsSpan_AddValue_Original | 10000 | 13.58 μs | 0.132 μs | 0.117 μs |
| ForLoop_AsSpan_AddValue_New | 10000 | 12.56 μs | 0.133 μs | 0.124 μs |
| ForLoop_AsSpan_AddValue_Original | 100000 | 137.37 μs | 1.262 μs | 1.181 μs |
| ForLoop_AsSpan_AddValue_New | 100000 | 125.47 μs | 0.682 μs | 0.604 μs |
| ForLoop_AsSpan_AddValue_Original | 100000000 | 147,904.77 μs | 2,134.418 μs | 1,782.336 μs |
| ForLoop_AsSpan_AddValue_New | 100000000 | 143,569.10 μs | 2,836.517 μs | 3,786.669 μs |
Some more example implementations regarding
Question 3)
The first mesh implementation uses RAII for change notifications and is unsafe and optimized for speed. The other implementation is "memory safe" and uses the indexer and a bool for change notifications. Added an extra method for value replacement, since the indexer is much slower than Array.Copy()
Code
using System;
using System.Runtime.CompilerServices;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
namespace MeshPerf
{
public struct float3
{
public float X;
public float Y;
public float Z;
}
public enum MeshDataType
{
Vertices,
Normals
}
public class MeshChangedEvent : EventArgs
{
public MeshDataType Type;
}
public unsafe class MeshData : IDisposable
{
private readonly float3* _internalRef;
private readonly int _internalLength;
private readonly MeshDataType _type;
private readonly EventHandler<MeshChangedEvent> _hndl; // dummy
private bool disposedValue;
public Span<float3> Data => new(_internalRef, _internalLength);
public MeshData(float3* _data, in int length, in MeshDataType type, in EventHandler<MeshChangedEvent> hndl)
{
_internalRef = _data;
_internalLength = length;
_type = type;
_hndl = hndl;
}
protected virtual void Dispose(bool disposing)
{
if (!disposedValue)
{
_hndl?.Invoke(this, new MeshChangedEvent { Type = _type });
disposedValue = true;
}
}
public void Dispose()
{
// Do not change this code. Put cleanup code in 'Dispose(bool disposing)' method
Dispose(disposing: true);
GC.SuppressFinalize(this);
}
}
public unsafe class RAIIMesh
{
public float3[] _verticesData; // do not access publicly
public EventHandler<MeshChangedEvent> OnChangeEvent;
public MeshData GetVertices()
{
fixed (float3* ptr = &_verticesData[0])
return new MeshData(ptr, _verticesData.Length, MeshDataType.Vertices, in OnChangeEvent);
}
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
public void SetVertices(in float3[] data, int dstOffset, int length)
{
var float3Offset = dstOffset * 3;
var float3Length = length * 3;
var destInBytes = _verticesData.Length * 3 * sizeof(float);
var dataInBytes = float3Length * sizeof(float);
fixed (float* ptr = &data[0].X)
fixed (float* localPtr = &_verticesData[0].X)
{
var localPtrOffset = localPtr + float3Offset;
Buffer.MemoryCopy(ptr, localPtrOffset, destInBytes, dataInBytes);
}
OnChangeEvent?.Invoke(this, new MeshChangedEvent
{
Type = MeshDataType.Vertices
});
}
}
public class ThisMesh
{
public float3[] _verticesData; // do not access publicly
public EventHandler<MeshChangedEvent> OnChangeEvent;
public int Length => _verticesData.Length;
public bool DataModified { get; private set; } = false;
public float3 this[int idx]
{
get => _verticesData[idx];
set
{
_verticesData[idx] = value;
// do not invoke event here, this is done every call and leads to a massive performance hit
// set a variable, that things are changed and leave the call to the event for later
DataModified = true;
//OnChangeEvent?.Invoke(this, new MeshChangedEvent
//{
//Type = MeshDataType.Vertices
//});
}
}
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
public void SetVertices(in float3[] data, int offset, int length)
{
Array.Copy(data, offset, _verticesData, offset, length);
OnChangeEvent?.Invoke(this, new MeshChangedEvent
{
Type = MeshDataType.Vertices
});
}
}
public class Mesh
{
public float3[] Data;
}
[MarkdownExporter] // export results for markdown
[MemoryDiagnoser]
public class MeshTests
{
[Params(10000, 100_000, 100_000_000)]
public int N = 100;
public Mesh m;
public RAIIMesh raiiMesh;
public ThisMesh indexerMesh;
public float3[] DataForReplacement;
public static long DummyCounter = 0;
[GlobalSetup]
public void Setup()
{
m = new Mesh
{
Data = new float3[N]
};
raiiMesh = new RAIIMesh
{
_verticesData = new float3[N]
};
indexerMesh = new ThisMesh
{
_verticesData = new float3[N]
};
DataForReplacement = new float3[N];
var rnd = new Random();
Array.Fill(m.Data,
new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
Array.Fill(raiiMesh._verticesData,
new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
Array.Fill(indexerMesh._verticesData,
new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
Array.Fill(DataForReplacement,
new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
DummyCounter = 0;
raiiMesh.OnChangeEvent += (s, e) => { if (e.Type == MeshDataType.Vertices) DummyCounter++; }; // do some dummy work
indexerMesh.OnChangeEvent += (s, e) => { if (e.Type == MeshDataType.Vertices) DummyCounter++; }; // do some dummy work
}
[Benchmark]
public void ChangeAllElements_Baseline()
{
var data = m.Data;
for (var i = 0; i < data.Length; i++)
{
data[i] = new float3 { X = (data[i].X + 1) * (i + 1), Y = (data[i].Y + 1) * (i + 3), Z = (data[i].Z + 1) * (i + 2) };
}
}
[Benchmark]
public unsafe void ChangeAllElements_RAIIMesh()
{
using var vert = raiiMesh.GetVertices();
var data = vert.Data;
for (var i = 0; i < data.Length; i++)
{
data[i] = new float3 { X = (data[i].X + 1) * (i + 1), Y = (data[i].Y + 1) * (i + 3), Z = (data[i].Z + 1) * (i + 2) };
}
}
[Benchmark]
public unsafe void ChangeAllElements_IndexerMesh()
{
var data = indexerMesh;
for (var i = 0; i < data.Length; i++)
{
data[i] = new float3 { X = (data[i].X + 1) * (i + 1), Y = (data[i].Y + 1) * (i + 3), Z = (data[i].Z + 1) * (i + 2) };
}
// call event after data modification
if (indexerMesh.DataModified)
indexerMesh.OnChangeEvent?.Invoke(this, new MeshChangedEvent { Type = MeshDataType.Vertices });
}
[Benchmark]
public void ReplaceAllElements_Baseline()
{
var data = m.Data;
for (var i = 0; i < data.Length; i++)
{
var replaceData = DataForReplacement[i];
data[i] = new float3 { X = (replaceData.X + 1) * (i + 1), Y = (replaceData.Y + 1) * (i + 3), Z = (replaceData.Z + 1) * (i + 2) };
}
}
[Benchmark]
public unsafe void ReplaceAllElements_RAIIMesh()
{
raiiMesh.SetVertices(in DataForReplacement, 0, DataForReplacement.Length);
}
[Benchmark]
public void ReplaceAllElements_IndexerMesh()
{
indexerMesh.SetVertices(in DataForReplacement, 0, DataForReplacement.Length);
}
[Benchmark]
public void ReplaceAllElements_IndexerMesh_NoExtraMethod()
{
var data = indexerMesh;
for (var i = 0; i < data.Length; i++)
{
var replaceData = DataForReplacement[i];
data[i] = new float3 { X = (replaceData.X + 1) * (i + 1), Y = (replaceData.Y + 1) * (i + 3), Z = (replaceData.Z + 1) * (i + 2) };
}
// call event after data modification
if (indexerMesh.DataModified)
indexerMesh.OnChangeEvent?.Invoke(this, new MeshChangedEvent { Type = MeshDataType.Vertices });
}
[Benchmark]
public void ReplaceSomeElements_Baseline()
{
var data = m.Data;
for (var i = 100; i < data.Length - 100; i++)
{
var replaceData = DataForReplacement[i];
data[i] = new float3 { X = (replaceData.X + 1) * (i + 1), Y = (replaceData.Y + 1) * (i + 3), Z = (replaceData.Z + 1) * (i + 2) };
}
}
[Benchmark]
public unsafe void ReplaceSomeElements_RAIIMesh()
{
raiiMesh.SetVertices(in DataForReplacement, 100, DataForReplacement.Length - 100);
}
[Benchmark]
public unsafe void ReplaceSomeElements_IndexerMesh()
{
indexerMesh.SetVertices(in DataForReplacement, 100, DataForReplacement.Length - 100);
}
[Benchmark]
public void ReplaceSomeElements_IndexerMesh_NoExtraMethod()
{
var data = indexerMesh;
for (var i = 100; i < data.Length - 100; i++)
{
var replaceData = DataForReplacement[i];
data[i] = new float3 { X = (replaceData.X + 1) * (i + 1), Y = (replaceData.Y + 1) * (i + 3), Z = (replaceData.Z + 1) * (i + 2) };
}
// call event after data modification
if (indexerMesh.DataModified)
indexerMesh.OnChangeEvent?.Invoke(this, new MeshChangedEvent { Type = MeshDataType.Vertices });
}
}
public static class MeshPerf
{
public static void Main(string[] args)
{
//var testor = new MeshTests();
//testor.Setup();
//testor.ReplaceAllElements_RAIIMesh();
var summary = BenchmarkRunner.Run<MeshTests>();
Console.WriteLine(MeshTests.DummyCounter);
}
}
}
Results
BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19043.1889 (21H1/May2021Update)
AMD Ryzen 7 2700X, 1 CPU, 16 logical and 8 physical cores
.NET SDK=6.0.400
[Host] : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT
DefaultJob : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT
| Method | N | Mean | Gen 0 | Allocated |
|---|---|---|---|---|
| ChangeAllElements_Baseline | 10000 | 15.243 μs | - | - |
| ChangeAllElements_RAIIMesh | 10000 | 16.348 μs | - | 72 B |
| ChangeAllElements_IndexerMesh | 10000 | 20.100 μs | - | 24 B |
| ReplaceAllElements_Baseline | 10000 | 18.504 μs | - | - |
| ReplaceAllElements_RAIIMesh | 10000 | 2.146 μs | 0.0038 | 24 B |
| ReplaceAllElements_IndexerMesh | 10000 | 3.102 μs | 0.0038 | 24 B |
| ReplaceAllElements_IndexerMesh_NoExtraMethod | 10000 | 27.049 μs | - | 24 B |
| ReplaceSomeElements_Baseline | 10000 | 17.638 μs | - | - |
| ReplaceSomeElements_RAIIMesh | 10000 | 2.014 μs | 0.0038 | 24 B |
| ReplaceSomeElements_IndexerMesh | 10000 | 3.052 μs | 0.0038 | 24 B |
| ReplaceSomeElements_IndexerMesh_NoExtraMethod | 10000 | 27.422 μs | - | 24 B |
| ChangeAllElements_Baseline | 100000 | 150.544 μs | - | - |
| ChangeAllElements_RAIIMesh | 100000 | 162.869 μs | - | 72 B |
| ChangeAllElements_IndexerMesh | 100000 | 278.018 μs | - | 24 B |
| ReplaceAllElements_Baseline | 100000 | 185.578 μs | - | - |
| ReplaceAllElements_RAIIMesh | 100000 | 29.779 μs | - | 24 B |
| ReplaceAllElements_IndexerMesh | 100000 | 33.617 μs | - | 24 B |
| ReplaceAllElements_IndexerMesh_NoExtraMethod | 100000 | 285.607 μs | - | 24 B |
| ReplaceSomeElements_Baseline | 100000 | 183.384 μs | - | - |
| ReplaceSomeElements_RAIIMesh | 100000 | 25.871 μs | - | 24 B |
| ReplaceSomeElements_IndexerMesh | 100000 | 33.678 μs | - | 24 B |
| ReplaceSomeElements_IndexerMesh_NoExtraMethod | 100000 | 261.310 μs | - | 24 B |
| ChangeAllElements_Baseline | 100000000 | 162,162.093 μs | - | 578 B |
| ChangeAllElements_RAIIMesh | 100000000 | 174,144.947 μs | - | 845 B |
| ChangeAllElements_IndexerMesh | 100000000 | 285,332.773 μs | - | 1,180 B |
| ReplaceAllElements_Baseline | 100000000 | 216,335.931 μs | - | 771 B |
| ReplaceAllElements_RAIIMesh | 100000000 | 81,180.116 μs | - | 93 B |
| ReplaceAllElements_IndexerMesh | 100000000 | 80,354.836 μs | - | 203 B |
| ReplaceAllElements_IndexerMesh_NoExtraMethod | 100000000 | 293,278.668 μs | - | 564 B |
| ReplaceSomeElements_Baseline | 100000000 | 212,722.011 μs | - | 771 B |
| ReplaceSomeElements_RAIIMesh | 100000000 | 81,746.793 μs | - | 206 B |
| ReplaceSomeElements_IndexerMesh | 100000000 | 79,903.332 μs | - | 93 B |
| ReplaceSomeElements_IndexerMesh_NoExtraMethod | 100000000 | 293,259.157 μs | - | 1,180 B |
One interessting finding: Array.Copy() performs better than Memory.BlockCopy for large elements.
Results, branch predition / cache misses
BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19044.1889 (21H2)
Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET SDK=6.0.400
[Host] : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT
DefaultJob : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT
| Method | N | Mean | Error | StdDev | BranchInstructions/Op | BranchMispredictions/Op |
|---|---|---|---|---|---|---|
| ChangeAllElements_Baseline | 10000 | 17.579 μs | 0.3509 μs | 0.6053 μs | 10,202 | 8 |
| ChangeAllElements_RAIIMesh | 10000 | 18.502 μs | 0.3165 μs | 0.2805 μs | 10,227 | 8 |
| ChangeAllElements_IndexerMesh | 10000 | 20.878 μs | 0.3437 μs | 0.2870 μs | 20,248 | 10 |
| ReplaceAllElements_Baseline | 10000 | 19.761 μs | 0.3206 μs | 0.3938 μs | 20,222 | 9 |
| ReplaceAllElements_RAIIMesh | 10000 | 2.624 μs | 0.0480 μs | 0.0449 μs | 67 | 1 |
| ReplaceAllElements_IndexerMesh | 10000 | 2.543 μs | 0.0289 μs | 0.0241 μs | 71 | 1 |
| ReplaceAllElements_IndexerMesh_NoExtraMethod | 10000 | 22.226 μs | 0.3524 μs | 0.3297 μs | 30,263 | 10 |
| ReplaceSomeElements_Baseline | 10000 | 19.325 μs | 0.3750 μs | 0.3683 μs | 19,842 | 9 |
| ReplaceSomeElements_RAIIMesh | 10000 | 2.121 μs | 0.0307 μs | 0.0272 μs | 60 | 0 |
| ReplaceSomeElements_IndexerMesh | 10000 | 2.590 μs | 0.0487 μs | 0.0432 μs | 72 | 1 |
| ReplaceSomeElements_IndexerMesh_NoExtraMethod | 10000 | 22.870 μs | 0.2953 μs | 0.2762 μs | 29,677 | 11 |
| ChangeAllElements_Baseline | 100000 | 176.943 μs | 3.4493 μs | 3.8339 μs | 102,100 | 62 |
| ChangeAllElements_RAIIMesh | 100000 | 180.424 μs | 2.3812 μs | 2.1109 μs | 101,995 | 52 |
| ChangeAllElements_IndexerMesh | 100000 | 207.774 μs | 2.4982 μs | 2.3368 μs | 202,294 | 75 |
| ReplaceAllElements_Baseline | 100000 | 199.659 μs | 2.3540 μs | 2.2019 μs | 202,243 | 72 |
| ReplaceAllElements_RAIIMesh | 100000 | 35.487 μs | 0.6606 μs | 0.6179 μs | 573 | 11 |
| ReplaceAllElements_IndexerMesh | 100000 | 34.294 μs | 0.6512 μs | 0.6092 μs | 443 | 6 |
| ReplaceAllElements_IndexerMesh_NoExtraMethod | 100000 | 223.151 μs | 3.2693 μs | 3.0581 μs | 302,743 | 96 |
| ReplaceSomeElements_Baseline | 100000 | 199.397 μs | 2.2433 μs | 1.9886 μs | 202,231 | 81 |
| ReplaceSomeElements_RAIIMesh | 100000 | 35.615 μs | 0.4313 μs | 0.4034 μs | 449 | 5 |
| ReplaceSomeElements_IndexerMesh | 100000 | 35.243 μs | 0.3341 μs | 0.2790 μs | 396 | 4 |
| ReplaceSomeElements_IndexerMesh_NoExtraMethod | 100000 | 233.689 μs | 4.3176 μs | 3.8275 μs | 302,408 | 104 |
| ChangeAllElements_Baseline | 100000000 | 189,533.622 μs | 3,739.5204 μs | 4,001.2450 μs | 102,645,153 | 102,324 |
| ChangeAllElements_RAIIMesh | 100000000 | 188,567.398 μs | 2,536.0843 μs | 2,248.1727 μs | 102,435,681 | 76,550 |
| ChangeAllElements_IndexerMesh | 100000000 | 216,362.010 μs | 4,270.5804 μs | 4,194.2810 μs | 202,713,771 | 102,656 |
| ReplaceAllElements_Baseline | 100000000 | 208,290.382 μs | 4,065.6367 μs | 4,518.9445 μs | 202,922,451 | 103,262 |
| ReplaceAllElements_RAIIMesh | 100000000 | 81,626.107 μs | 640.0073 μs | 567.3498 μs | 5,820,325 | 28,581 |
| ReplaceAllElements_IndexerMesh | 100000000 | 85,533.504 μs | 1,870.6457 μs | 5,367.2331 μs | 5,759,678 | 26,735 |
| ReplaceAllElements_IndexerMesh_NoExtraMethod | 100000000 | 236,609.964 μs | 2,506.1896 μs | 2,221.6718 μs | 304,499,917 | 166,434 |
| ReplaceSomeElements_Baseline | 100000000 | 209,677.622 μs | 2,289.9830 μs | 2,142.0515 μs | 202,798,967 | 98,031 |
| ReplaceSomeElements_RAIIMesh | 100000000 | 82,316.572 μs | 1,383.3185 μs | 1,698.8394 μs | 5,927,367 | 33,198 |
| ReplaceSomeElements_IndexerMesh | 100000000 | 86,159.622 μs | 1,722.1894 μs | 3,632.6819 μs | 5,926,457 | 33,655 |
| ReplaceSomeElements_IndexerMesh_NoExtraMethod | 100000000 | 246,318.263 μs | 2,573.1784 μs | 2,406.9527 μs | 303,324,638 | 127,795 |
The timing difference in reading from/replacing items directly on the backing array compared to reading/replacing using an indexer seems to be neglectable (~0.3 μs), while the difference between backing array vs. indexer when changing existing contents is substantially higher (~10 μs)
Why that?
TLDR: Right now, not really explainable
Use https://sharplab.io/ for testing
Example code
using System;
using System.Runtime.CompilerServices;
public struct float3
{
public float X;
public float Y;
public float Z;
public float3(float x, float y, float z) => (X, Y, Z) = (x,y,z);
}
public class ThisMesh
{
public float3[] _data = new float3[100];
public bool DataModified { get; private set; } = false;
public float3 this[int idx]
{
get => _data[idx];
set
{
_data[idx] = value;
DataModified = true;
}
}
}
public class BaseMesh
{
public float3[] _data;
}
public class DoWork
{
void LoopAdd_This()
{
var m = new ThisMesh();
for(var i = 0; i < 100; i++)
m[i] = new float3(i, i+1, i+2);
}
void LoopAdd_Base()
{
var m = new BaseMesh();
for(var i = 0; i < 100; i++)
m._data[i] = new float3(i, i+1, i+2);
}
}
Release IL code with indexer
.method private hidebysig
instance void LoopAdd_This () cil managed
{
// Method begins at RVA 0x20cc
// Code size 42 (0x2a)
.maxstack 6
.locals init (
[0] class ThisMesh m,
[1] int32 i
)
IL_0000: newobj instance void ThisMesh::.ctor()
IL_0005: stloc.0
IL_0006: ldc.i4.0
IL_0007: stloc.1
// sequence point: hidden
IL_0008: br.s IL_0024
// loop start (head: IL_0024)
IL_000a: ldloc.0
IL_000b: ldloc.1
IL_000c: ldloc.1
IL_000d: conv.r4
IL_000e: ldloc.1
IL_000f: ldc.i4.1
IL_0010: add
IL_0011: conv.r4
IL_0012: ldloc.1
IL_0013: ldc.i4.2
IL_0014: add
IL_0015: conv.r4
IL_0016: newobj instance void float3::.ctor(float32, float32, float32)
IL_001b: callvirt instance void ThisMesh::set_Item(int32, valuetype float3) // <- calls to method below via callvirt (prevents null references!)
IL_0020: ldloc.1
IL_0021: ldc.i4.1
IL_0022: add
IL_0023: stloc.1
IL_0024: ldloc.1
IL_0025: ldc.i4.s 100
IL_0027: blt.s IL_000a
// end loop
IL_0029: ret
} // end of method DoWork::LoopAdd_This
callvirt set_Item()
.method public hidebysig specialname
instance void set_Item (
int32 idx,
valuetype float3 'value'
) cil managed
{
// Method begins at RVA 0x2097
// Code size 21 (0x15)
.maxstack 8
IL_0000: ldarg.0
IL_0001: ldfld valuetype float3[] ThisMesh::_data
IL_0006: ldarg.1
IL_0007: ldarg.2
IL_0008: stelem float3
IL_000d: ldarg.0
IL_000e: ldc.i4.1
IL_000f: call instance void ThisMesh::set_DataModified(bool)
IL_0014: ret
} // end of method ThisMesh::set_Item
Release IL code without indexer (base loop)
.method private hidebysig
instance void LoopAdd_Base () cil managed
{
// Method begins at RVA 0x2104
// Code size 47 (0x2f)
.maxstack 6
.locals init (
[0] class BaseMesh m,
[1] int32 i
)
IL_0000: newobj instance void BaseMesh::.ctor()
IL_0005: stloc.0
IL_0006: ldc.i4.0
IL_0007: stloc.1
// sequence point: hidden
IL_0008: br.s IL_0029
// loop start (head: IL_0029)
IL_000a: ldloc.0
IL_000b: ldfld valuetype float3[] BaseMesh::_data
IL_0010: ldloc.1
IL_0011: ldloc.1
IL_0012: conv.r4
IL_0013: ldloc.1
IL_0014: ldc.i4.1
IL_0015: add
IL_0016: conv.r4
IL_0017: ldloc.1
IL_0018: ldc.i4.2
IL_0019: add
IL_001a: conv.r4
IL_001b: newobj instance void float3::.ctor(float32, float32, float32)
IL_0020: stelem float3
IL_0025: ldloc.1
IL_0026: ldc.i4.1
IL_0027: add
IL_0028: stloc.1
IL_0029: ldloc.1
IL_002a: ldc.i4.s 100
IL_002c: blt.s IL_000a
// end loop
IL_002e: ret
} // end of method DoWork::LoopAdd_Base
The difference is at least one additional call to callvirt set_Item() and only there the _data array is being pushed on the stack, each loop iteration.
Furthermore, looking inside the source code for Array.Copy() (https://source.dot.net/#System.Private.CoreLib/src/System/Array.CoreCLR.cs,24) I would advise using this/these method/s as often as possible instead of relying upon plain for-loops and or an indexer due to the massive runtime differences.
~~Wanted to test branch & cache misses for indexer version, however Windows 10 is unable to read this data reliable from AMD processors.~~
Changed to Intel PC, test results using code posted above
BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19044.1889 (21H2)
Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET SDK=6.0.400
[Host] : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT
DefaultJob : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT
| Method | N | Mean | Error | StdDev | BranchInstructions/Op | BranchMispredictions/Op |
|---|---|---|---|---|---|---|
| LoopAdd_This | 10000 | 13.91 μs | 0.192 μs | 0.171 μs | 20,141 | 6 |
| LoopAdd_Base | 10000 | 14.00 μs | 0.218 μs | 0.193 μs | 20,151 | 7 |
| LoopAdd_This | 100000 | 145.46 μs | 2.871 μs | 6.056 μs | 201,663 | 73 |
| LoopAdd_Base | 100000 | 145.80 μs | 2.486 μs | 2.660 μs | 201,630 | 71 |
| LoopAdd_This | 100000000 | 151,834.48 μs | 2,958.620 μs | 4,606.214 μs | 202,032,097 | 92,098 |
| LoopAdd_Base | 100000000 | 147,553.75 μs | 1,969.378 μs | 1,842.157 μs | 202,243,004 | 89,975 |
Example implementation
//#define WITH_GUARD // <- define guard, disable for benchmark
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
using CommunityToolkit.Diagnostics;
using System;
using System.Linq;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
namespace NewMeshImp
{
public struct float3
{
public float X;
public float Y;
public float Z;
public float3(float _x, float _y, float _z) => (X, Y, Z) = (_x, _y, _z);
}
public enum MeshDataType
{
Vertices,
Normals
}
public class MeshChangedEvent : EventArgs
{
public MeshDataType Type;
public IReadOnlyCollection<int>? ChangedIndices;
}
public unsafe class MeshFloat3Data : IDisposable
{
private readonly float3* _internalRef;
private readonly int _internalLength;
private readonly MeshDataType _type;
private readonly EventHandler<MeshChangedEvent> _hndl; // dummy
private bool disposedValue;
private readonly List<int> _changedIndices = new();
public ReadOnlySpan<float3> Data => new(_internalRef, _internalLength);
public MeshFloat3Data(float3* _data, in int length, in MeshDataType type, in EventHandler<MeshChangedEvent> hndl)
{
_internalRef = _data;
_internalLength = length;
_type = type;
_hndl = hndl;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void SetValues(in float3[] data, int destStartOffset, int length)
{
#if WITH_GUARD
Guard.IsGreaterThanOrEqualTo(destStartOffset, 0);
Guard.IsGreaterThanOrEqualTo(length, 1);
Guard.IsLessThan(length, _internalLength);
#endif
if (_changedIndices.Capacity < length)
_changedIndices.Capacity = length;
var float3Offset = destStartOffset * 3;
var float3Length = length * 3;
var destInBytes = _internalLength * 3 * sizeof(float);
var dataInBytes = float3Length * sizeof(float);
fixed (float* ptr = &data[0].X)
{
var localPtrOffset = _internalRef + float3Offset;
Buffer.MemoryCopy(ptr, localPtrOffset, destInBytes, dataInBytes);
}
_changedIndices.AddRange(Enumerable.Range(destStartOffset, length));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void SetValues(int startIdx, int endIdx, in Func<float3, float3> actionToPerformOnValue)
{
#if WITH_GUARD
Guard.IsGreaterThanOrEqualTo(startIdx, 0);
Guard.IsLessThan(endIdx, _internalLength);
Guard.IsGreaterThan(endIdx, startIdx);
#endif
for (var i = 0; i < endIdx; i++)
{
_internalRef[i] = actionToPerformOnValue(_internalRef[i]);
}
_changedIndices.AddRange(Enumerable.Range(startIdx, endIdx));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void SetValue(int index, in Func<float3, float3> actionToPerformOnValue)
{
#if WITH_GUARD
Guard.IsGreaterThanOrEqualTo(index, 0);
Guard.IsLessThan(index, _internalLength);
#endif
_internalRef[index] = actionToPerformOnValue(_internalRef[index]);
_changedIndices.Add(index);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void SetValue(int index, float3 value)
{
#if WITH_GUARD
Guard.IsGreaterThanOrEqualTo(index, 0);
Guard.IsLessThan(index, _internalLength);
#endif
_internalRef[index] = value;
_changedIndices.Add(index);
}
protected virtual void Dispose(bool disposing)
{
if (!disposedValue)
{
_hndl?.Invoke(this, new MeshChangedEvent { Type = _type, ChangedIndices = _changedIndices.AsReadOnly() });
disposedValue = true;
}
}
public void Dispose()
{
Dispose(disposing: true);
GC.SuppressFinalize(this);
}
}
public unsafe class Mesh
{
private readonly float3[] _vertices;
private readonly float3[] _normals;
public EventHandler<MeshChangedEvent>? OnChangeEvent;
public Mesh(in float3[] vertices, in float3[] normals)
{
_vertices = new float3[vertices.Length];
_normals = new float3[normals.Length];
Array.Copy(vertices, _vertices, vertices.Length); // copy to be sure we do not use the same array twice
Array.Copy(vertices, _normals, vertices.Length); // copy to be sure we do not use the same array twice
}
public MeshFloat3Data GetVertices()
{
fixed (float3* ptr = &_vertices[0])
return new MeshFloat3Data(ptr, _vertices.Length, MeshDataType.Vertices, in OnChangeEvent);
}
public MeshFloat3Data GetNormals()
{
fixed (float3* ptr = &_normals[0])
return new MeshFloat3Data(ptr, _normals.Length, MeshDataType.Normals, in OnChangeEvent);
}
}
public class SimpleMesh
{
public readonly float3[] _vertices;
public readonly float3[] _normals;
public SimpleMesh(in float3[] vertices, in float3[] normals)
{
_vertices = new float3[vertices.Length];
_normals = new float3[normals.Length];
Array.Copy(vertices, _vertices, vertices.Length); // copy to be sure we do not use the same array twice
Array.Copy(vertices, _normals, vertices.Length); // copy to be sure we do not use the same array twice
}
}
[MarkdownExporter]
[MemoryDiagnoser]
public class Benchmark
{
[Params(1_000_000, 100_000_000)]
public int N;
public Mesh mesh;
public SimpleMesh simpleMesh;
public event EventHandler<MeshChangedEvent> Event;
public static int DummyVariable = 0;
[GlobalSetup]
public void Setup()
{
var vertices = new float3[N];
var normals = new float3[N];
var rnd = new Random();
Array.Fill(vertices, new float3((float)rnd.NextDouble(), (float)rnd.NextDouble(), (float)rnd.NextDouble()));
Array.Fill(normals, new float3((float)rnd.NextDouble(), (float)rnd.NextDouble(), (float)rnd.NextDouble()));
mesh = new Mesh(in vertices, in normals);
simpleMesh = new SimpleMesh(in vertices, in normals);
mesh.OnChangeEvent += (s, e) => { DummyVariable += e.ChangedIndices.Count; };
Event += (s, e) => { DummyVariable += e.ChangedIndices.Count; };
}
[Benchmark]
public void BenchmarkSimpleMeshOp_1()
{
// Simple Mesh
for (var i = 10_00_00; i < simpleMesh._vertices.Length - 1000; i++)
{
var data = simpleMesh._vertices[i];
data.X += 10;
data.Y += 5;
data.Z += 1;
}
Event.Invoke(this, new MeshChangedEvent
{
ChangedIndices = Enumerable.Range(10_00_00, simpleMesh._vertices.Length - 1000).ToList(),
Type = MeshDataType.Vertices
});
}
[Benchmark]
public void BenchmarkSimpleMeshOp_2()
{
for (var i = 0; i < simpleMesh._vertices.Length; i++)
{
var data = simpleMesh._vertices[i];
data.X += (data.X * i);
}
Event.Invoke(this, new MeshChangedEvent
{
ChangedIndices = Enumerable.Range(0, simpleMesh._vertices.Length).ToList(),
Type = MeshDataType.Vertices
});
}
[Benchmark]
public void BenchmarkMeshOp_1()
{
using var vertData = mesh.GetVertices();
vertData.SetValues(10_00_00, vertData.Data.Length - 1000, static (float3 data) => { data.X += 10; data.Y += 5; data.Z += 1; return data; });
}
[Benchmark]
public void BenchmarkMeshOp_ReplaceStaticDelegateWithForLoop()
{
using var vertData = mesh.GetVertices();
for (var i = 10_00_00; i < vertData.Data.Length - 1000; i++)
{
vertData.SetValue(i, new float3(vertData.Data[i].X + 10, vertData.Data[i].Y + 5, vertData.Data[i].Z + 1));
}
}
[Benchmark]
public void BenchmarkMeshOp_ReplaceStaticDelegateWithArrayCopy()
{
using var vertData = mesh.GetVertices();
var tmpArray = new float3[vertData.Data.Length - 1000 - 10_00_00];
var cnt = 0;
for(var i = 10_00_00; i < vertData.Data.Length - 1000; i++)
{
tmpArray[cnt++] = (new float3(vertData.Data[i].X + 10, vertData.Data[i].Y + 5, vertData.Data[i].Z + 1));
}
vertData.SetValues(tmpArray, 10_00_00, vertData.Data.Length - 1000);
}
[Benchmark]
public void BenchmarkMeshOp_2()
{
using var vertData = mesh.GetVertices();
for (var i = 0; i < vertData.Data.Length; i++)
{
vertData.SetValue(i, (float3 data) => { data.X += (data.X * i); return data; }); // closure, no static possible ...
}
}
[Benchmark]
public void BenchmarkMeshOp_ReplaceClosureWithForLoop()
{
using var vertData = mesh.GetVertices();
for (var i = 0; i < vertData.Data.Length; i++)
{
vertData.SetValue(i, new float3(vertData.Data[i].X + (vertData.Data[i].X * i), vertData.Data[i].Y, vertData.Data[i].Z));
}
}
[Benchmark]
public void BenchmarkMeshOp_ReplaceClosureWithArrayCopy()
{
using var vertData = mesh.GetVertices();
var tmpArray = new float3[vertData.Data.Length];
for (var i = 0; i < vertData.Data.Length; i++)
{
tmpArray[i] = new float3(vertData.Data[i].X + (vertData.Data[i].X * i), vertData.Data[i].Y, vertData.Data[i].Z);
}
vertData.SetValues(tmpArray, 0, vertData.Data.Length);
}
}
public static class Program
{
public static void Main()
{
var _ = BenchmarkRunner.Run<Benchmark>();
Console.WriteLine(Benchmark.DummyVariable);
}
}
}
Results
BenchmarkSimpleMeshOp_1 4.280 ms and 3996144 B allocated
BenchmarkSimpleMeshOp_2 4.065 ms and 4000141 B allocated
vs.
BenchmarkMeshOp_ReplaceStaticDelegateWithForLoop 4.121 ms and 8389242 B allocated
BenchmarkMeshOp_ReplaceClosureWithForLoop 4.554 ms and 8389242 B allocated
vs.
BenchmarkMeshOp_ReplaceClosureWithArrayCopy 10.402 ms and 16000312 B allocated
Basic for loops should be the way to go, array copy uses even more resources. However we have to live with the memory overhead ...
Final implementation recommendation with all findings
This implementation is nearly as fast as using POD-arrays with all public methods. However, the memory overhead is significant, usually the allocated memory is 2x (this is also due to the fact, that we do track changed indices, perhaps there is a better method to keep track while changing elements). When this is ovoerhead is too much, one needs to fall back to passing methods (preferably static delegates) in which the modification on each point is being described.
using System.Runtime.CompilerServices;
using CommunityToolkit.Diagnostics;
public struct float3
{
public float X;
public float Y;
public float Z;
public float3(float _x, float _y, float _z) => (X, Y, Z) = (_x, _y, _z);
}
public enum MeshDataType
{
Vertices,
Normals
}
public class MeshChangedEvent : EventArgs
{
public MeshDataType Type;
public IReadOnlyCollection<int>? ChangedIndices;
}
public unsafe class MeshFloat3Data : IDisposable
{
private readonly float3* _internalRef;
private readonly int _internalLength;
private readonly MeshDataType _type;
private readonly EventHandler<MeshChangedEvent> _hndl; // dummy
private bool disposedValue;
private readonly List<int> _changedIndices = new();
public ReadOnlySpan<float3> Data => new(_internalRef, _internalLength);
public MeshFloat3Data(float3* _data, in int length, in MeshDataType type, in EventHandler<MeshChangedEvent> hndl)
{
_internalRef = _data;
_internalLength = length;
_type = type;
_hndl = hndl;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void SetValue(int index, float3 value)
{
Guard.IsGreaterThanOrEqualTo(index, 0);
Guard.IsLessThan(index, _internalLength);
_internalRef[index] = value;
_changedIndices.Add(index);
}
protected virtual void Dispose(bool disposing)
{
if (!disposedValue)
{
_hndl?.Invoke(this, new MeshChangedEvent { Type = _type, ChangedIndices = _changedIndices.AsReadOnly() });
disposedValue = true;
}
}
public void Dispose()
{
Dispose(disposing: true);
GC.SuppressFinalize(this);
}
}
public unsafe class Mesh
{
private readonly float3[] _vertices;
private readonly float3[] _normals;
public EventHandler<MeshChangedEvent>? OnChangeEvent;
public Mesh(in float3[] vertices, in float3[] normals)
{
_vertices = vertices;
_normals = normals;
}
public MeshFloat3Data GetVertices()
{
fixed (float3* ptr = &_vertices[0])
return new MeshFloat3Data(ptr, _vertices.Length, MeshDataType.Vertices, in OnChangeEvent);
}
public MeshFloat3Data GetNormals()
{
fixed (float3* ptr = &_normals[0])
return new MeshFloat3Data(ptr, _normals.Length, MeshDataType.Normals, in OnChangeEvent);
}
}
public static class Program
{
public static void Main()
{
var mesh = new Mesh(new float3[] { new float3(1, 2, 3), new float3(1, 2, 3) }, new float3[] { new float3(1, 2, 3), new float3(1, 2, 3) });
using var vertData = mesh.GetVertices();
for (var i = 0; i < vertData.Data.Length; i++)
{
vertData.SetValue(i, new float3(vertData.Data[i].X + (vertData.Data[i].X * i), vertData.Data[i].Y, vertData.Data[i].Z));
}
}
}