Fusee icon indicating copy to clipboard operation
Fusee copied to clipboard

Redesign Mesh

Open griestopf opened this issue 3 years ago • 11 comments

Requirements

  • Initially specify mesh data with given arrays (constructor)

  • Completely exchange mesh data

    • Changes in buffer sizes are not allowed
  • Change single array entries

  • Automatically identify changed indices

  • Allow users to explicitely identify changed indices

  • Collect changed indices into ranges and trigger copying changed ranges to GPU once a frame (or at user specified frames)

  • Enable mechanism to be easily applied to new vertex attributes without code copying

  • Keep AABB up-to-date

  • Meshes can be "static" or "dynamic"

    • Combine classes 'Mesh' and 'GpuMesh'
  • [ ] Evaluate performance differences in mechanisms to track changed indices

    • Observable Collection (involves events and probably data copying)
    • Indexer setter (involves data copying)
    • SetVertices (involves data copying)
    • Explicit index specification after changing original data array (no copying)
  • [ ] Implement mechanism taking "changed" indices, performantly store them and combine them into more or less contiguous ranges.

griestopf avatar Aug 17 '22 14:08 griestopf

First observation: Observable Collection (involves events and probably data copying) crashes the benchmark with 100_000_000 elements, as all necessary operations are either too slow or too memory hungry. Removing from test preemptively.

wrestledBearOnce avatar Aug 18 '22 12:08 wrestledBearOnce

Code

using System.Collections.ObjectModel;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Diagnosers;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;

namespace MeshPerfTest
{
    public struct float3
    {
        public float X;
        public float Y;
        public float Z;
    }

    [SimpleJob(RuntimeMoniker.Net60)]
    [MemoryDiagnoser] // check used memory
    [HardwareCounters(
        HardwareCounter.BranchMispredictions,
        HardwareCounter.BranchInstructions)]
    [HtmlExporter] // export results in a neat html file 
    [MarkdownExporter] // export results for markdown 
    public class TestMesh
    {
        // before with getter, this should be very slow
        public float3[] GetData
        {
            get => _data;
            set => _data = value;
        }

        // direct access
        private float3[] _data = { };

        private float3[] _secondDataFieldForRealWorld = { };
        
        // span access
        private Span<float3> DataAsSpan => new Span<float3>(_data);

        // span access
        private ReadOnlySpan<float3> DataAsReadOnlySpanSpan => new ReadOnlySpan<float3>(_data);

        [Params(10000, 100_000, 100_000_000)] public int N;

        public float3 this[int idx]
        {
            get => _data[idx];
            set => _data[idx] = value;
        }

        [GlobalSetup]
        public void Setup()
        {
            _data = new float3[N];
            _secondDataFieldForRealWorld = new float3[N];
            var rnd = new Random();
            Array.Fill(_data,
                new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });

            Array.Fill(_secondDataFieldForRealWorld, new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
        }

        [Benchmark]
        public void ForLoop_Baseline_AddValue()
        {
            for (var i = 0; i < GetData.Length; i++)
            {
                var currentVal = GetData[i];
                GetData[i] = new float3
                    { X = currentVal.X * (i + 1), Y = currentVal.Y * (i + 3), Z = currentVal.Z * (i + 2) };
            }
        }

        [Benchmark]
        public void ForLoop_AddValue()
        {
            for (var i = 0; i < _data.Length; i++)
            {
                _data[i] = new float3 { X = _data[i].X * (i + 1), Y = _data[i].Y * (i + 3), Z = _data[i].Z * (i + 2) };
            }
        }

        [Benchmark]
        public void ForLoop_AsSpan_AddValue()
        {
            for (var i = 0; i < DataAsSpan.Length; i++)
            {
                var currentVal = DataAsSpan[i];
                DataAsSpan[i] = new float3
                    { X = currentVal.X * (i + 1), Y = currentVal.Y * (i + 3), Z = currentVal.Z * (i + 2) };
            }
        }

        [Benchmark]
        public void ForLoop_AsReadOnlySpan_AddValue()
        {
            for (var i = 0; i < DataAsReadOnlySpanSpan.Length; i++)
            {
                var currentVal = DataAsReadOnlySpanSpan[i];
                _data[i] = new float3
                    { X = currentVal.X * (i + 1), Y = currentVal.Y * (i + 3), Z = currentVal.Z * (i + 2) };
            }
        }

        [Benchmark]
        public void ForLoop_AsIndexer_AddValue()
        {
            for (var i = 0; i < _data.Length; i++)
            {
                var currentVal = this[i];
                this[i] = new float3
                    { X = currentVal.X * (i + 1), Y = currentVal.Y * (i + 3), Z = currentVal.Z * (i + 2) };
            }
        }

        [Benchmark]
        public void ForLoop_Baseline_ReplaceValue()
        {
            for (var i = 0; i < GetData.Length; i++)
            {
                var currentVal = GetData[i];
                GetData[i].X *= (i + 1);
                GetData[i].Y *= (i + 3);
                GetData[i].Z *= (i + 2);
            }
        }

        [Benchmark]
        public void ForLoop_ReplaceValue()
        {
            for (var i = 0; i < _data.Length; i++)
            {
                _data[i].X *= (i + 1);
                _data[i].Y *= (i + 3);
                _data[i].Z *= (i + 2);
            }
        }

        [Benchmark]
        public void ForLoop_AsSpan_ReplaceValue()
        {
            for (var i = 0; i < DataAsSpan.Length; i++)
            {
                DataAsSpan[i].X *= (i + 1);
                DataAsSpan[i].Y *= (i + 3);
                DataAsSpan[i].Z *= (i + 2);
            }
        }

        [Benchmark]
        public void ForLoop_AsReadOnlySpan_ReplaceValue()
        {
            for (var i = 0; i < DataAsReadOnlySpanSpan.Length; i++)
            {
                var currentVal = DataAsReadOnlySpanSpan[i];
                _data[i].X = currentVal.X * (i + 1);
                _data[i].Y = currentVal.Y * (i + 3);
                _data[i].Z = currentVal.Z * (i + 2);
                ;
            }
        }

        [Benchmark]
        public void ForLoop_AsIndexer_ReplaceValue()
        {
            for (var i = 0; i < _data.Length; i++)
            {
                var currentVal = this[i];
                currentVal.X *= (i + 1);
                currentVal.Y *= (i + 3);
                currentVal.Z *= (i + 2);
                this[i] = currentVal;
            }
        }
        
        [Benchmark]
        public void RealWorldImpl_WithSimpleSetVertices_Copy()
        {
            for (var i = N / 2; i < N - 100; i++)
            {
                _data[i] = _secondDataFieldForRealWorld[i];
            }
        }
        
        [Benchmark]
        public void RealWorldImpl_ReplacingAllDataWithOtherArray()
        {
            for (var i = 0; i < _secondDataFieldForRealWorld.Length; i++)
            {
                _data[i] = _secondDataFieldForRealWorld[i];
            }
        }
    }

    public static class Program
    {
        public static void Main()
        {
            Console.WriteLine("Starting Benchmark");
            var summary = BenchmarkRunner.Run<TestMesh>();
        }
    }
}

Results


BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19044.1889 (21H2)
Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET SDK=6.0.400
  [Host]     : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT  [AttachedDebugger]
  DefaultJob : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT


Method N Mean Error StdDev BranchInstructions/Op BranchMispredictions/Op Allocated
ForLoop_Baseline_AddValue 10000 17.753 μs 0.1542 μs 0.1367 μs 20,176 6 -
ForLoop_AddValue 10000 17.511 μs 0.2026 μs 0.1796 μs 20,181 8 -
ForLoop_AsSpan_AddValue 10000 26.377 μs 0.1449 μs 0.1284 μs 60,277 13 -
ForLoop_AsReadOnlySpan_AddValue 10000 23.909 μs 0.2351 μs 0.2084 μs 50,267 13 -
ForLoop_AsIndexer_AddValue 10000 17.874 μs 0.2151 μs 0.2012 μs 20,183 7 -
ForLoop_Baseline_ReplaceValue 10000 19.452 μs 0.0987 μs 0.0824 μs 20,193 7 -
ForLoop_ReplaceValue 10000 18.801 μs 0.0683 μs 0.0606 μs 20,182 6 -
ForLoop_AsSpan_ReplaceValue 10000 29.063 μs 0.3297 μs 0.2923 μs 80,315 18 -
ForLoop_AsReadOnlySpan_ReplaceValue 10000 23.931 μs 0.1213 μs 0.1076 μs 60,254 13 -
ForLoop_AsIndexer_ReplaceValue 10000 28.203 μs 0.1537 μs 0.1363 μs 30,275 9 -
RealWorldImpl_WithSimpleSetVertices_Copy 10000 5.937 μs 0.0320 μs 0.0284 μs 14,764 4 -
RealWorldImpl_ReplacingAllDataWithOtherArray 10000 12.703 μs 0.1225 μs 0.1086 μs 30,169 9 -
ForLoop_Baseline_AddValue 100000 177.257 μs 0.6038 μs 0.5353 μs 201,745 51 -
ForLoop_AddValue 100000 172.977 μs 0.5293 μs 0.4132 μs 201,698 50 -
ForLoop_AsSpan_AddValue 100000 262.381 μs 1.1548 μs 1.0237 μs 602,706 122 -
ForLoop_AsReadOnlySpan_AddValue 100000 239.112 μs 1.6832 μs 1.3142 μs 502,566 118 -
ForLoop_AsIndexer_AddValue 100000 178.840 μs 2.4601 μs 2.1808 μs 201,785 55 -
ForLoop_Baseline_ReplaceValue 100000 195.011 μs 0.8040 μs 0.7127 μs 201,907 54 -
ForLoop_ReplaceValue 100000 189.299 μs 1.1598 μs 1.0282 μs 201,861 53 -
ForLoop_AsSpan_ReplaceValue 100000 288.885 μs 1.4949 μs 1.3252 μs 802,954 161 -
ForLoop_AsReadOnlySpan_ReplaceValue 100000 239.613 μs 1.3238 μs 1.1054 μs 602,505 122 -
ForLoop_AsIndexer_ReplaceValue 100000 283.333 μs 0.7748 μs 0.7248 μs 302,769 76 -
RealWorldImpl_WithSimpleSetVertices_Copy 100000 62.468 μs 0.5889 μs 0.5509 μs 150,368 32 -
RealWorldImpl_ReplacingAllDataWithOtherArray 100000 128.034 μs 1.0584 μs 0.9901 μs 301,365 63 -
ForLoop_Baseline_AddValue 100000000 184,885.571 μs 1,360.7537 μs 1,206.2727 μs 202,385,362 86,380 128 B
ForLoop_AddValue 100000000 180,727.119 μs 968.1744 μs 858.2613 μs 202,264,485 78,643 1,371 B
ForLoop_AsSpan_AddValue 100000000 268,765.647 μs 1,576.5453 μs 1,474.7014 μs 603,424,905 164,386 1,364 B
ForLoop_AsReadOnlySpan_AddValue 100000000 244,021.818 μs 1,775.4558 μs 1,482.5860 μs 503,051,423 142,905 128 B
ForLoop_AsIndexer_AddValue 100000000 184,510.056 μs 1,133.5554 μs 1,060.3284 μs 202,299,438 80,100 341 B
ForLoop_Baseline_ReplaceValue 100000000 202,071.036 μs 1,925.9486 μs 1,707.3033 μs 202,344,585 83,376 1,371 B
ForLoop_ReplaceValue 100000000 197,450.618 μs 1,739.1837 μs 1,452.2972 μs 202,532,454 91,659 128 B
ForLoop_AsSpan_ReplaceValue 100000000 295,217.950 μs 1,857.6172 μs 1,737.6162 μs 803,724,766 194,833 2,096 B
ForLoop_AsReadOnlySpan_ReplaceValue 100000000 245,030.749 μs 1,414.8232 μs 1,181.4414 μs 602,980,716 150,278 1,341 B
ForLoop_AsIndexer_ReplaceValue 100000000 291,626.064 μs 2,277.9259 μs 2,019.3220 μs 303,394,543 117,009 2,056 B
RealWorldImpl_WithSimpleSetVertices_Copy 100000000 74,050.853 μs 1,422.2323 μs 2,336.7684 μs 150,965,551 47,628 55 B
RealWorldImpl_ReplacingAllDataWithOtherArray 100000000 148,863.958 μs 1,371.5788 μs 1,282.9757 μs 301,881,754 86,084 256 B

Findings

This code is faster

[Benchmark]
public void ForLoop_AddValue()
{
     for (var i = 0; i < _data.Length; i++)
     {
         _data[i] = new float3 { X = _data[i].X * (i + 1), Y = _data[i].Y * (i + 3), Z = _data[i].Z * (i + 2) };
     }
 }

than this

 [Benchmark]
        public void ForLoop_ReplaceValue()
        {
            for (var i = 0; i < _data.Length; i++)
            {
                _data[i].X *= (i + 1);
                _data[i].Y *= (i + 3);
                _data[i].Z *= (i + 2);
            }
        }

And replacing array elements by already existing data consumes more memory (however not during the actual file operation), but is faster than everything else.

wrestledBearOnce avatar Aug 18 '22 13:08 wrestledBearOnce

The results speak for themselves. I think in this setting, it is usually slower to change existing array elements than to replace them with data from another array or just assign a new float3. That's kinda strange ... Lookup via getter or indexer is slower and more memory hungry (my guess is due to defensive copies).

Furthermore, I've implemented two versions (without Span<T> as it is slower than the other methods). The first version is the "new" implementation with BeginEdit() and EndEdit() in a slightly modified version (ref instead of Span<T>). The second version is with SetVertices(float3[] data, int startIdx, int length) and compared both implementations with some standard file operations. My expectation is, given the results above, that the second version with the SetVertices method is faster, even while copying data (at least that is what the human suspects is happening here). Results are pending...

wrestledBearOnce avatar Aug 18 '22 13:08 wrestledBearOnce

Mesh Implementations

Code

using System.Collections.ObjectModel;
using System.ComponentModel.DataAnnotations;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Diagnosers;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;

namespace MeshPerfTest
{
    public struct float3
    {
        public float X;
        public float Y;
        public float Z;
    }

    
    public class MeshVariantOne
    {
        public float3[] Data; // do not access by public var!
        public EventHandler<(int startIdx, int length)> callMe;
        public ReadOnlySpan<float3> GetReadonlyRefToData => new ReadOnlySpan<float3>(Data);

        public void SetVertices(ref float3[] dataIn, int startIdx = 0, int length = -1)
        {
            for (var i = startIdx; i < (length == -1 ? dataIn.Length : length); i++)
            {
                Data[i] = dataIn[i];
            }
            
            callMe?.Invoke(this, (startIdx, length));
        }
    }
    
    public class MeshVariantTwo
    {
        public float3[] Data; // do not access by public var!
        public EventHandler<(int startIdx, int length)> callMe;

        public ref float3[] BeginEdit()
        {
            return ref Data;
        }
        
        public void EndEdit(int startIdx, int length) {
            callMe?.Invoke(this, (startIdx, length));
        }
    }

    //[SimpleJob(RuntimeMoniker.Net60)]
    //[MemoryDiagnoser] // check used memory
    //[HardwareCounters(
    //    HardwareCounter.BranchMispredictions,
    //    HardwareCounter.BranchInstructions)]
    //[HtmlExporter] // export results in a neat html file 
    [MarkdownExporter] // export results for markdown 
    public class TestMesh
    {
        [Params(10000, 100_000, 100_000_000)] 
        public int N;

        public MeshVariantOne m1;
        public MeshVariantTwo m2;

        public float3[] ReplacementData;
        
        [GlobalSetup]
        public void Setup()
        {
            m1 = new MeshVariantOne();
            m1.Data = new float3[N];

            m2 = new MeshVariantTwo();
            m2.Data = new float3[N];

            ReplacementData = new float3[N];
            
            var rnd = new Random();
                
            Array.Fill(m2.Data,
                new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
            
            Array.Fill(m1.Data,
                new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
            
            Array.Fill(ReplacementData,
                new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
        }

        [Benchmark]
        public void ReplaceAllVertices_M1()
        {
            m1.SetVertices(ref ReplacementData);
        }
        
        [Benchmark]
        public void ReplaceAllVertices_M2()
        {
            var reference = m2.BeginEdit();
            for (var i = 0; i < reference.Length; i++)
                reference[i] = ReplacementData[i];
            m2.EndEdit(0, reference.Length);
        }
        
        [Benchmark]
        public void ReplaceSomeVertices_M1()
        {
            m1.SetVertices(ref ReplacementData, 10, 20);
            m1.SetVertices(ref ReplacementData, 50, 70);
            m1.SetVertices(ref ReplacementData, N - 100, N - 1);
        }
        
        [Benchmark]
        public void ReplaceSomeVertices_M2()
        {
            var reference = m2.BeginEdit();
            for (var i = 10; i < 20; i++)
                reference[i] = ReplacementData[i];
            
            for (var i = 50; i < 70; i++)
                reference[i] = ReplacementData[i];
            
            
            for (var i = N - 100; i < N - 1; i++)
                reference[i] = ReplacementData[i];

            // normally we would need to call 3 times to m2.EndEdit(), however let's just call one time
            // usually we would have an implementation with an array of tuples
            m2.EndEdit(10, 20);
        }
        
        [Benchmark]
        public void ChangeSomeVertices_M1()
        {
            var data = m1.GetReadonlyRefToData.ToArray(); // data copy :(
            
            // change
            for (var i = 0; i < 100; i++)
                data[i] = new float3 { X = data[i].X * i + 10, Y = data[i].Y * i + 20, Z = data[i].Z * i + 5};
            
            m1.SetVertices(ref data, 0, 100);
        }
        
        [Benchmark]
        public void ChangeSomeVertices_M2()
        {
            var data = m2.BeginEdit();
            
            // change
            for (var i = 0; i < 100; i++)
                data[i] = new float3 { X = data[i].X * i + 10, Y = data[i].Y * i + 20, Z = data[i].Z * i + 5};
            
            m2.EndEdit(0, 100);
        }
    }

    public static class Program
    {
        public static void Main()
        {
            Console.WriteLine("Starting Benchmark");
            var summary = BenchmarkRunner.Run<TestMesh>();
        }
    }
}

Results


BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19044.1889 (21H2)
Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET SDK=6.0.400
  [Host]     : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT  [AttachedDebugger]
  DefaultJob : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT


Method N Mean Error StdDev
ReplaceAllVertices_M1 10000 14,516.6 ns 76.20 ns 67.55 ns
ReplaceAllVertices_M2 10000 10,162.5 ns 154.10 ns 128.68 ns
ReplaceSomeVertices_M1 10000 200.5 ns 1.01 ns 0.95 ns
ReplaceSomeVertices_M2 10000 147.5 ns 2.18 ns 1.93 ns
ChangeSomeVertices_M1 10000 73,644.8 ns 760.38 ns 711.26 ns
ChangeSomeVertices_M2 10000 126.2 ns 1.05 ns 0.82 ns
ReplaceAllVertices_M1 100000 145,922.9 ns 446.97 ns 396.22 ns
ReplaceAllVertices_M2 100000 118,489.6 ns 2,056.64 ns 3,321.09 ns
ReplaceSomeVertices_M1 100000 179.6 ns 0.90 ns 0.84 ns
ReplaceSomeVertices_M2 100000 146.4 ns 1.53 ns 1.36 ns
ChangeSomeVertices_M1 100000 335,919.5 ns 3,811.63 ns 3,565.40 ns
ChangeSomeVertices_M2 100000 125.7 ns 0.89 ns 0.75 ns
ReplaceAllVertices_M1 100000000 163,270,260.7 ns 1,775,762.38 ns 1,574,167.10 ns
ReplaceAllVertices_M2 100000000 135,759,776.0 ns 2,608,763.27 ns 3,482,624.04 ns
ReplaceSomeVertices_M1 100000000 200.1 ns 0.74 ns 0.69 ns
ReplaceSomeVertices_M2 100000000 144.6 ns 0.47 ns 0.40 ns
ChangeSomeVertices_M1 100000000 452,193,086.7 ns 5,400,314.15 ns 5,051,457.20 ns
ChangeSomeVertices_M2 100000000 126.5 ns 0.57 ns 0.50 ns

Version M2 (BeginEdit(), EndEdit()) performs better in each scenario. Especially when GetReadonlyRefToData from version 1 is being used (data copying).

wrestledBearOnce avatar Aug 18 '22 14:08 wrestledBearOnce

Performance optimizations to Mesh impl

  • using agressive inline
  • testing Vector3 with SIMD
  • testing new float3{} vs float3.x =
using System.Collections.ObjectModel;
using System.Numerics;
using System.Runtime.CompilerServices;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;

namespace MeshPerfTest
{
    public struct float3
    {
        public float X;
        public float Y;
        public float Z;
    }


    public class MeshVariantOne
    {
        public float3[] Data; // do not access by public var!
        public EventHandler<(int startIdx, int length)> callMe;
        public ReadOnlyCollection<float3> GetReadonlyRefToData => new ReadOnlyCollection<float3>(Data);

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public void SetVertices(in float3[] dataIn, int startIdx = 0, int length = -1)
        {
            var loopEnd = (length == -1 ? dataIn.Length : length);
            for (var i = startIdx; i < loopEnd; i++)
            {
                Data[i] = dataIn[i];
            }

            callMe?.Invoke(this, (startIdx, length));
        }
    }

    public class MeshVariantTwo
    {
        public float3[] Data; // do not access by public var!
        public EventHandler<(int startIdx, int length)> callMe;

        public ref float3[] BeginEdit()
        {
            return ref Data;
        }

        public void EndEdit(int startIdx, int length)
        {
            callMe?.Invoke(this, (startIdx, length));
        }
    }

    public class MeshVariantThree
    {
        public Vector3[] Data; // do not access by public var!
        public EventHandler<(int startIdx, int length)> callMe;
        public ReadOnlyCollection<Vector3> GetReadonlyRefToData => new(Data);

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public void SetVertices(in Vector3[] dataIn, int startIdx = 0, int length = -1)
        {
            var loopEnd = (length == -1 ? dataIn.Length : length);
            for (var i = startIdx; i < loopEnd; i++)
            {
                Data[i] = dataIn[i];
            }

            callMe?.Invoke(this, (startIdx, length));
        }
    }

    public class MeshVariantFour
    {
        public Vector3[] Data; // do not access by public var!
        public EventHandler<(int startIdx, int length)> callMe;

        public ref Vector3[] BeginEdit()
        {
            return ref Data;
        }

        public void EndEdit(int startIdx, int length)
        {
            callMe?.Invoke(this, (startIdx, length));
        }
    }

    //[SimpleJob(RuntimeMoniker.Net60)]
    //[MemoryDiagnoser] // check used memory
    //[HardwareCounters(
    //    HardwareCounter.BranchMispredictions,
    //    HardwareCounter.BranchInstructions)]
    //[HtmlExporter] // export results in a neat html file 
    [MarkdownExporter] // export results for markdown 
    public class TestMesh
    {
        [Params(10000, 100_000, 100_000_000)] public int N;

        public MeshVariantOne m1;
        public MeshVariantTwo m2;
        public MeshVariantThree m3;
        public MeshVariantFour m4;

        public float3[] ReplacementData;
        public Vector3[] ReplacementDataV3;

        [GlobalSetup]
        public void Setup()
        {
            m1 = new MeshVariantOne();
            m1.Data = new float3[N];

            m2 = new MeshVariantTwo();
            m2.Data = new float3[N];

            m3 = new MeshVariantThree();
            m3.Data = new Vector3[N];
            
            m4 = new MeshVariantFour();
            m4.Data = new Vector3[N];

            ReplacementData = new float3[N];
            ReplacementDataV3 = new Vector3[N];

            var rnd = new Random();

            Array.Fill(m2.Data,
                new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });

            Array.Fill(m1.Data,
                new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });

            Array.Fill(m3.Data,
                new Vector3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });

            Array.Fill(ReplacementData,
                new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });

            Array.Fill(ReplacementDataV3,
                new Vector3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });
        }

        [Benchmark]
        public void ReplaceAllVertices_M1()
        {
            m1.SetVertices(in ReplacementData);
        }

        [Benchmark]
        public void ReplaceAllVertices_M2()
        {
            var reference = m2.BeginEdit();
            for (var i = 0; i < reference.Length; i++)
                reference[i] = ReplacementData[i];
            m2.EndEdit(0, reference.Length);
        }

        [Benchmark]
        public void ReplaceAllVertices_M3()
        {
            m3.SetVertices(in ReplacementDataV3);
        }

        [Benchmark]
        public void ReplaceAllVertices_M4()
        {
            var reference = m4.BeginEdit();
            for (var i = 0; i < reference.Length; i++)
                reference[i] = ReplacementDataV3[i];
            m4.EndEdit(0, reference.Length);
        }

        [Benchmark]
        public void ReplaceSomeVertices_M1()
        {
            m1.SetVertices(in ReplacementData, 10, 20);
            m1.SetVertices(in ReplacementData, 50, 70);
            m1.SetVertices(in ReplacementData, N - 100, N - 1);
        }

        [Benchmark]
        public void ReplaceSomeVertices_M2()
        {
            var reference = m2.BeginEdit();
            for (var i = 10; i < 20; i++)
                reference[i] = ReplacementData[i];

            for (var i = 50; i < 70; i++)
                reference[i] = ReplacementData[i];


            for (var i = N - 100; i < N - 1; i++)
                reference[i] = ReplacementData[i];

            // normally we would need to call 3 times to m2.EndEdit(), however let's just call one time
            // usually we would have an implementation with an array of tuples
            m2.EndEdit(10, 20);
        }

        [Benchmark]
        public void ReplaceSomeVertices_M3()
        {
            m3.SetVertices(in ReplacementDataV3, 10, 20);
            m3.SetVertices(in ReplacementDataV3, 50, 70);
            m3.SetVertices(in ReplacementDataV3, N - 100, N - 1);
        }

        [Benchmark]
        public void ReplaceSomeVertices_M4()
        {
            var reference = m4.BeginEdit();
            for (var i = 10; i < 20; i++)
                reference[i] = ReplacementDataV3[i];

            for (var i = 50; i < 70; i++)
                reference[i] = ReplacementDataV3[i];


            for (var i = N - 100; i < N - 1; i++)
                reference[i] = ReplacementDataV3[i];

            // normally we would need to call 3 times to m2.EndEdit(), however let's just call one time
            // usually we would have an implementation with an array of tuples
            m4.EndEdit(10, 20);
        }

        [Benchmark]
        public void ChangeSomeVertices_M1()
        {
            var data = new float3[100];
            // change
            for (var i = 0; i < 100; i++)
                data[i] = new float3
                {
                    X = m1.GetReadonlyRefToData[i].X * i + 10, Y = m1.GetReadonlyRefToData[i].Y * i + 20,
                    Z = m1.GetReadonlyRefToData[i].Z * i + 5
                };

            m1.SetVertices(in data, 0, 100);
        }

        [Benchmark]
        public void ChangeSomeVertices_M2()
        {
            var data = m2.BeginEdit();

            // change
            for (var i = 0; i < 100; i++)
                data[i] = new float3 { X = data[i].X * i + 10, Y = data[i].Y * i + 20, Z = data[i].Z * i + 5 };

            m2.EndEdit(0, 100);
        }

        [Benchmark]
        public void ChangeSomeVertices_M2_AlternativeVersion()
        {
            var data = m2.BeginEdit();

            // change
            for (var i = 0; i < 100; i++)
            {
                data[i].X *= i + 10;
                data[i].Y *= i + 20;
                data[i].Z *= i + 5;
            }

            m2.EndEdit(0, 100);
        }

        [Benchmark]
        public void ChangeSomeVertices_M3()
        {
            var data = new Vector3[100];
            // change
            for (var i = 0; i < 100; i++)
                data[i] = new Vector3()
                {
                    X = m3.GetReadonlyRefToData[i].X * i + 10, Y = m3.GetReadonlyRefToData[i].Y * i + 20,
                    Z = m3.GetReadonlyRefToData[i].Z * i + 5
                };

            m3.SetVertices(in data, 0, 100);
        }

        [Benchmark]
        public void ChangeSomeVertices_M4()
        {
            var data = m4.BeginEdit();

            // change
            for (var i = 0; i < 100; i++)
                data[i] = new Vector3 { X = data[i].X * i + 10, Y = data[i].Y * i + 20, Z = data[i].Z * i + 5 };

            m4.EndEdit(0, 100);
        }
    }


    public static class Program
    {
        public static void Main()
        {
            Console.WriteLine("Starting Benchmark");
            var summary = BenchmarkRunner.Run<TestMesh>();
        }
    }
}

Results

Some strange stuff: ChangeSomeVertices_M2_AlternativeVersion() is slower than ChangeSomeVertices_M2() without the *=


BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19044.1889 (21H2)
Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET SDK=6.0.400
  [Host]     : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT  [AttachedDebugger]
  DefaultJob : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT


Method N Mean Error StdDev Median
ReplaceAllVertices_M1 10000 12,368.5 ns 245.25 ns 272.60 ns 12,344.7 ns
ReplaceAllVertices_M2 10000 10,227.9 ns 166.97 ns 217.11 ns 10,178.4 ns
ReplaceAllVertices_M3 10000 11,170.2 ns 102.77 ns 85.82 ns 11,168.9 ns
ReplaceAllVertices_M4 10000 10,274.1 ns 108.57 ns 90.66 ns 10,271.1 ns
ReplaceSomeVertices_M1 10000 202.1 ns 4.22 ns 11.96 ns 205.6 ns
ReplaceSomeVertices_M2 10000 145.8 ns 0.54 ns 0.50 ns 145.6 ns
ReplaceSomeVertices_M3 10000 145.2 ns 0.79 ns 0.74 ns 145.3 ns
ReplaceSomeVertices_M4 10000 144.2 ns 0.55 ns 0.51 ns 144.2 ns
ChangeSomeVertices_M1 10000 2,793.8 ns 55.63 ns 103.11 ns 2,770.3 ns
ChangeSomeVertices_M2 10000 126.2 ns 0.91 ns 0.81 ns 125.9 ns
ChangeSomeVertices_M2_AlternativeVersion 10000 155.6 ns 1.54 ns 1.44 ns 155.0 ns
ChangeSomeVertices_M3 10000 2,840.6 ns 25.90 ns 22.96 ns 2,840.0 ns
ChangeSomeVertices_M4 10000 153.3 ns 0.61 ns 0.51 ns 153.4 ns
ReplaceAllVertices_M1 100000 125,962.3 ns 2,421.91 ns 2,146.96 ns 125,850.4 ns
ReplaceAllVertices_M2 100000 105,787.7 ns 1,242.14 ns 1,161.90 ns 105,537.4 ns
ReplaceAllVertices_M3 100000 114,437.5 ns 1,197.81 ns 1,120.43 ns 114,212.4 ns
ReplaceAllVertices_M4 100000 106,866.9 ns 2,047.79 ns 2,437.75 ns 105,724.5 ns
ReplaceSomeVertices_M1 100000 162.2 ns 0.51 ns 0.47 ns 162.2 ns
ReplaceSomeVertices_M2 100000 145.2 ns 0.29 ns 0.24 ns 145.2 ns
ReplaceSomeVertices_M3 100000 146.0 ns 1.06 ns 0.99 ns 145.9 ns
ReplaceSomeVertices_M4 100000 143.2 ns 0.88 ns 0.78 ns 143.1 ns
ChangeSomeVertices_M1 100000 2,649.8 ns 39.24 ns 36.71 ns 2,643.7 ns
ChangeSomeVertices_M2 100000 125.2 ns 0.37 ns 0.31 ns 125.2 ns
ChangeSomeVertices_M2_AlternativeVersion 100000 153.3 ns 0.77 ns 0.72 ns 153.0 ns
ChangeSomeVertices_M3 100000 2,723.1 ns 18.99 ns 15.86 ns 2,722.1 ns
ChangeSomeVertices_M4 100000 152.9 ns 0.83 ns 0.65 ns 153.0 ns
ReplaceAllVertices_M1 100000000 145,332,460.4 ns 1,319,414.43 ns 1,030,112.00 ns 145,057,925.0 ns
ReplaceAllVertices_M2 100000000 134,879,753.3 ns 1,490,533.54 ns 1,394,245.99 ns 135,314,100.0 ns
ReplaceAllVertices_M3 100000000 139,177,286.7 ns 1,235,668.59 ns 1,155,845.16 ns 138,949,550.0 ns
ReplaceAllVertices_M4 100000000 132,078,203.6 ns 697,992.64 ns 618,752.30 ns 132,109,287.5 ns
ReplaceSomeVertices_M1 100000000 160.9 ns 2.23 ns 4.19 ns 159.5 ns
ReplaceSomeVertices_M2 100000000 146.0 ns 1.06 ns 0.89 ns 146.2 ns
ReplaceSomeVertices_M3 100000000 146.7 ns 1.23 ns 1.09 ns 146.5 ns
ReplaceSomeVertices_M4 100000000 141.7 ns 1.59 ns 1.41 ns 141.4 ns
ChangeSomeVertices_M1 100000000 2,785.7 ns 54.80 ns 53.82 ns 2,788.4 ns
ChangeSomeVertices_M2 100000000 126.7 ns 0.89 ns 0.79 ns 126.8 ns
ChangeSomeVertices_M2_AlternativeVersion 100000000 154.6 ns 1.23 ns 1.09 ns 154.4 ns
ChangeSomeVertices_M3 100000000 2,882.4 ns 51.35 ns 45.52 ns 2,876.9 ns
ChangeSomeVertices_M4 100000000 151.2 ns 1.77 ns 1.47 ns 150.8 ns

wrestledBearOnce avatar Aug 18 '22 15:08 wrestledBearOnce

Questions

  1. How relevant are the ReadOnlySpan tests? I don't see the point of reading from a ReadOnlySpan but writing directly to the backing array.

  2. In the Span benchmarks: Would it make a difference (although the code was probably optimized) to only get the Span once outside the loop instead of accessing the DataAs[ReadOnly]Span getters (involving the new operation) once, twice or even three times in each loop iteration? E.G:

    [Benchmark]
    public void ForLoop_AsSpan_AddValue()
    {
        var dataAsSpan = DataAsSpan; // ONLY HERE THE new Span<float3>(_data) OPERATION IS PERFORMED
        for (var i = 0; i < dataAsSpan.Length; i++)
        {
            var currentVal = dataAsSpan[i]; // NO new Span<float3>(_data) HERE
            dataAsSpan[i] = new float3      // NO new Span<float3>(_data) HERE
                { X = currentVal.X * (i + 1), Y = currentVal.Y * (i + 3), Z = currentVal.Z * (i + 2) };
        }
    }
    
  3. I'd love to see a way to make the indexer work for us because it would allow us to implement a mechanism keeping track of the changed vertices and propagate them to the GPU in a way that's totally opaque for the user. Some questions arise with the indexer

    1. The timing difference in reading from/replacing items directly on the backing array compared to reading/replacing using an indexer seems to be neglectable (~0.3 μs), while the difference between backing array vs. indexer when changing existing contents is substantially higher (~10 μs):

      Method Mean
      ForLoop_AddValue 17.511 μs
      ForLoop_AsIndexer_AddValue 17.874 μs
      ForLoop_ReplaceValue 18.801 μs
      ForLoop_AsIndexer_ReplaceValue 28.203 μs

      Why that? More data copying involved? Would be interesting to know, because if this holds, we need to eplain users how to set up their loops and should be able to explain why and not just how.

    2. Does the minimal difference of ~0.3 μs still hold if we add just the slightest bit of functionality to the setter of the indexer? My suspicion is that the "just-pass-it-to-the-array"-implementation of the indexer's setter (set => _data[idx] = value;) can be optimized by the compiler much better as if the setter contained some more instructions - even if the additional instruction(s) itself do not eat up much time.

griestopf avatar Aug 19 '22 08:08 griestopf

Question 1)

Just testing if ReadOnlySpan has any performance implications. Yes, in this case it's not useful.

Question 2)

Code

using System;
using System.Collections.ObjectModel;
using System.Runtime.CompilerServices;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;

namespace MeshPerf
{
    public struct float3
    {
        public float X;
        public float Y;
        public float Z;
    }

    public class Mesh
    {
        public float3[] Data; // do not access by public var!
        public Span<float3> DataAsSpan => new Span<float3>(Data);
    }

    [MarkdownExporter] // export results for markdown
    public class MeshTests
    {
        [Params(10000, 100_000, 100_000_000)]
        public int N;

        public Mesh m;

        [GlobalSetup]
        public void Setup()
        {
            m = new Mesh
            {
                Data = new float3[N]
            };

            var rnd = new Random();
            Array.Fill(m.Data,
                new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });

        }

        [Benchmark]
        public void ForLoop_AsSpan_AddValue_Original()
        {
            var dataAsSpan = m.DataAsSpan;
            for (var i = 0; i < dataAsSpan.Length; i++)
            {
                var currentVal = dataAsSpan[i];
                dataAsSpan[i] = new float3
                { X = currentVal.X * (i + 1), Y = currentVal.Y * (i + 3), Z = currentVal.Z * (i + 2) };
            }
        }

        [Benchmark]
        public void ForLoop_AsSpan_AddValue_New()
        {
            var dataAsSpan = m.DataAsSpan; // ONLY HERE THE new Span<float3>(_data) OPERATION IS PERFORMED
            for (var i = 0; i < dataAsSpan.Length; i++)
            {
                // var currentVal = dataAsSpan[i]; // NO new Span<float3>(_data) HERE
                dataAsSpan[i] = new float3      // NO new Span<float3>(_data) HERE
                { X = dataAsSpan[i].X * (i + 1), Y = dataAsSpan[i].Y * (i + 3), Z = dataAsSpan[i].Z * (i + 2) };
            }
        }
    }

    public static class MeshPerf
    {
        public static void Main()
        {
            var summary = BenchmarkRunner.Run<MeshTests>();
        }
    }
}

Results

Actually, there is a difference. I would have guessed that the optimizer is able to identify and remove this…


BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19043.1889 (21H1/May2021Update)
AMD Ryzen 7 2700X, 1 CPU, 16 logical and 8 physical cores
.NET SDK=6.0.400
  [Host]     : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT
  DefaultJob : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT

Method N Mean Error StdDev
ForLoop_AsSpan_AddValue_Original 10000 13.58 μs 0.132 μs 0.117 μs
ForLoop_AsSpan_AddValue_New 10000 12.56 μs 0.133 μs 0.124 μs
ForLoop_AsSpan_AddValue_Original 100000 137.37 μs 1.262 μs 1.181 μs
ForLoop_AsSpan_AddValue_New 100000 125.47 μs 0.682 μs 0.604 μs
ForLoop_AsSpan_AddValue_Original 100000000 147,904.77 μs 2,134.418 μs 1,782.336 μs
ForLoop_AsSpan_AddValue_New 100000000 143,569.10 μs 2,836.517 μs 3,786.669 μs

wrestledBearOnce avatar Aug 22 '22 08:08 wrestledBearOnce

Some more example implementations regarding

Question 3)

The first mesh implementation uses RAII for change notifications and is unsafe and optimized for speed. The other implementation is "memory safe" and uses the indexer and a bool for change notifications. Added an extra method for value replacement, since the indexer is much slower than Array.Copy()

Code

using System;
using System.Runtime.CompilerServices;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;

namespace MeshPerf
{
    public struct float3
    {
        public float X;
        public float Y;
        public float Z;
    }

    public enum MeshDataType
    {
        Vertices,
        Normals
    }

    public class MeshChangedEvent : EventArgs
    {
        public MeshDataType Type;
    }

    public unsafe class MeshData : IDisposable
    {
        private readonly float3* _internalRef;
        private readonly int _internalLength;
        private readonly MeshDataType _type;
        private readonly EventHandler<MeshChangedEvent> _hndl; // dummy
        private bool disposedValue;

        public Span<float3> Data => new(_internalRef, _internalLength);

        public MeshData(float3* _data, in int length, in MeshDataType type, in EventHandler<MeshChangedEvent> hndl)
        {
            _internalRef = _data;
            _internalLength = length;
            _type = type;
            _hndl = hndl;
        }

        protected virtual void Dispose(bool disposing)
        {
            if (!disposedValue)
            {
                _hndl?.Invoke(this, new MeshChangedEvent { Type = _type });
                disposedValue = true;
            }
        }

        public void Dispose()
        {
            // Do not change this code. Put cleanup code in 'Dispose(bool disposing)' method
            Dispose(disposing: true);
            GC.SuppressFinalize(this);
        }
    }

    public unsafe class RAIIMesh
    {
        public float3[] _verticesData; // do not access publicly

        public EventHandler<MeshChangedEvent> OnChangeEvent;

        public MeshData GetVertices()
        {
            fixed (float3* ptr = &_verticesData[0])
                return new MeshData(ptr, _verticesData.Length, MeshDataType.Vertices, in OnChangeEvent);
        }

        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
        public void SetVertices(in float3[] data, int dstOffset, int length)
        {
            var float3Offset = dstOffset * 3;
            var float3Length = length * 3;

            var destInBytes = _verticesData.Length * 3 * sizeof(float);
            var dataInBytes = float3Length * sizeof(float);

            fixed (float* ptr = &data[0].X)
            fixed (float* localPtr = &_verticesData[0].X)
            {
                var localPtrOffset = localPtr + float3Offset;
                Buffer.MemoryCopy(ptr, localPtrOffset, destInBytes, dataInBytes);
            }

            OnChangeEvent?.Invoke(this, new MeshChangedEvent
            {
                Type = MeshDataType.Vertices
            });
        }

    }

    public class ThisMesh
    {
        public float3[] _verticesData; // do not access publicly
        public EventHandler<MeshChangedEvent> OnChangeEvent;

        public int Length => _verticesData.Length;

        public bool DataModified { get; private set; } = false;

        public float3 this[int idx]
        {
            get => _verticesData[idx];
            set
            {
                _verticesData[idx] = value;
                // do not invoke event here, this is done every call and leads to a massive performance hit
                // set a variable, that things are changed and leave the call to the event for later
                DataModified = true;
                //OnChangeEvent?.Invoke(this, new MeshChangedEvent
                //{
                //Type = MeshDataType.Vertices
                //});

            }
        }

        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
        public void SetVertices(in float3[] data, int offset, int length)
        {
            Array.Copy(data, offset, _verticesData, offset, length);

            OnChangeEvent?.Invoke(this, new MeshChangedEvent
            {
                Type = MeshDataType.Vertices
            });
        }
    }

    public class Mesh
    {
        public float3[] Data;
    }


    [MarkdownExporter] // export results for markdown
    [MemoryDiagnoser]
    public class MeshTests
    {
        [Params(10000, 100_000, 100_000_000)]
        public int N = 100;

        public Mesh m;
        public RAIIMesh raiiMesh;
        public ThisMesh indexerMesh;

        public float3[] DataForReplacement;

        public static long DummyCounter = 0;

        [GlobalSetup]
        public void Setup()
        {
            m = new Mesh
            {
                Data = new float3[N]
            };

            raiiMesh = new RAIIMesh
            {
                _verticesData = new float3[N]
            };

            indexerMesh = new ThisMesh
            {
                _verticesData = new float3[N]
            };

            DataForReplacement = new float3[N];

            var rnd = new Random();
            Array.Fill(m.Data,
                new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });

            Array.Fill(raiiMesh._verticesData,
               new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });

            Array.Fill(indexerMesh._verticesData,
               new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });

            Array.Fill(DataForReplacement,
              new float3 { X = (float)rnd.NextDouble(), Y = (float)rnd.NextDouble(), Z = (float)rnd.NextDouble() });

            DummyCounter = 0;

            raiiMesh.OnChangeEvent += (s, e) => { if (e.Type == MeshDataType.Vertices) DummyCounter++; }; // do some dummy work
            indexerMesh.OnChangeEvent += (s, e) => { if (e.Type == MeshDataType.Vertices) DummyCounter++; }; // do some dummy work

        }

        [Benchmark]
        public void ChangeAllElements_Baseline()
        {
            var data = m.Data;
            for (var i = 0; i < data.Length; i++)
            {
                data[i] = new float3 { X = (data[i].X + 1) * (i + 1), Y = (data[i].Y + 1) * (i + 3), Z = (data[i].Z + 1) * (i + 2) };
            }
        }

        [Benchmark]
        public unsafe void ChangeAllElements_RAIIMesh()
        {
            using var vert = raiiMesh.GetVertices();
            var data = vert.Data;
            for (var i = 0; i < data.Length; i++)
            {
                data[i] = new float3 { X = (data[i].X + 1) * (i + 1), Y = (data[i].Y + 1) * (i + 3), Z = (data[i].Z + 1) * (i + 2) };
            }
        }

        [Benchmark]
        public unsafe void ChangeAllElements_IndexerMesh()
        {
            var data = indexerMesh;
            for (var i = 0; i < data.Length; i++)
            {
                data[i] = new float3 { X = (data[i].X + 1) * (i + 1), Y = (data[i].Y + 1) * (i + 3), Z = (data[i].Z + 1) * (i + 2) };
            }

            // call event after data modification
            if (indexerMesh.DataModified)
                indexerMesh.OnChangeEvent?.Invoke(this, new MeshChangedEvent { Type = MeshDataType.Vertices });
        }


        [Benchmark]
        public void ReplaceAllElements_Baseline()
        {
            var data = m.Data;
            for (var i = 0; i < data.Length; i++)
            {
                var replaceData = DataForReplacement[i];
                data[i] = new float3 { X = (replaceData.X + 1) * (i + 1), Y = (replaceData.Y + 1) * (i + 3), Z = (replaceData.Z + 1) * (i + 2) };
            }
        }

        [Benchmark]
        public unsafe void ReplaceAllElements_RAIIMesh()
        {
            raiiMesh.SetVertices(in DataForReplacement, 0, DataForReplacement.Length);
        }

        [Benchmark]
        public void ReplaceAllElements_IndexerMesh()
        {
            indexerMesh.SetVertices(in DataForReplacement, 0, DataForReplacement.Length);
        }

        [Benchmark]
        public void ReplaceAllElements_IndexerMesh_NoExtraMethod()
        {
            var data = indexerMesh;
            for (var i = 0; i < data.Length; i++)
            {
                var replaceData = DataForReplacement[i];
                data[i] = new float3 { X = (replaceData.X + 1) * (i + 1), Y = (replaceData.Y + 1) * (i + 3), Z = (replaceData.Z + 1) * (i + 2) };
            }

            // call event after data modification
            if (indexerMesh.DataModified)
                indexerMesh.OnChangeEvent?.Invoke(this, new MeshChangedEvent { Type = MeshDataType.Vertices });
        }

        [Benchmark]
        public void ReplaceSomeElements_Baseline()
        {
            var data = m.Data;
            for (var i = 100; i < data.Length - 100; i++)
            {
                var replaceData = DataForReplacement[i];
                data[i] = new float3 { X = (replaceData.X + 1) * (i + 1), Y = (replaceData.Y + 1) * (i + 3), Z = (replaceData.Z + 1) * (i + 2) };
            }
        }

        [Benchmark]
        public unsafe void ReplaceSomeElements_RAIIMesh()
        {
            raiiMesh.SetVertices(in DataForReplacement, 100, DataForReplacement.Length - 100);
        }

        [Benchmark]
        public unsafe void ReplaceSomeElements_IndexerMesh()
        {
            indexerMesh.SetVertices(in DataForReplacement, 100, DataForReplacement.Length - 100);
        }

        [Benchmark]
        public void ReplaceSomeElements_IndexerMesh_NoExtraMethod()
        {
            var data = indexerMesh;
            for (var i = 100; i < data.Length - 100; i++)
            {
                var replaceData = DataForReplacement[i];
                data[i] = new float3 { X = (replaceData.X + 1) * (i + 1), Y = (replaceData.Y + 1) * (i + 3), Z = (replaceData.Z + 1) * (i + 2) };
            }

            // call event after data modification
            if (indexerMesh.DataModified)
                indexerMesh.OnChangeEvent?.Invoke(this, new MeshChangedEvent { Type = MeshDataType.Vertices });
        }

    }

    public static class MeshPerf
    {
        public static void Main(string[] args)
        {
            //var testor = new MeshTests();
            //testor.Setup();
            //testor.ReplaceAllElements_RAIIMesh();
            var summary = BenchmarkRunner.Run<MeshTests>();
            Console.WriteLine(MeshTests.DummyCounter);

        }
    }
}

Results


BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19043.1889 (21H1/May2021Update)
AMD Ryzen 7 2700X, 1 CPU, 16 logical and 8 physical cores
.NET SDK=6.0.400
  [Host]     : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT
  DefaultJob : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT


Method N Mean Gen 0 Allocated
ChangeAllElements_Baseline 10000 15.243 μs - -
ChangeAllElements_RAIIMesh 10000 16.348 μs - 72 B
ChangeAllElements_IndexerMesh 10000 20.100 μs - 24 B
ReplaceAllElements_Baseline 10000 18.504 μs - -
ReplaceAllElements_RAIIMesh 10000 2.146 μs 0.0038 24 B
ReplaceAllElements_IndexerMesh 10000 3.102 μs 0.0038 24 B
ReplaceAllElements_IndexerMesh_NoExtraMethod 10000 27.049 μs - 24 B
ReplaceSomeElements_Baseline 10000 17.638 μs - -
ReplaceSomeElements_RAIIMesh 10000 2.014 μs 0.0038 24 B
ReplaceSomeElements_IndexerMesh 10000 3.052 μs 0.0038 24 B
ReplaceSomeElements_IndexerMesh_NoExtraMethod 10000 27.422 μs - 24 B
ChangeAllElements_Baseline 100000 150.544 μs - -
ChangeAllElements_RAIIMesh 100000 162.869 μs - 72 B
ChangeAllElements_IndexerMesh 100000 278.018 μs - 24 B
ReplaceAllElements_Baseline 100000 185.578 μs - -
ReplaceAllElements_RAIIMesh 100000 29.779 μs - 24 B
ReplaceAllElements_IndexerMesh 100000 33.617 μs - 24 B
ReplaceAllElements_IndexerMesh_NoExtraMethod 100000 285.607 μs - 24 B
ReplaceSomeElements_Baseline 100000 183.384 μs - -
ReplaceSomeElements_RAIIMesh 100000 25.871 μs - 24 B
ReplaceSomeElements_IndexerMesh 100000 33.678 μs - 24 B
ReplaceSomeElements_IndexerMesh_NoExtraMethod 100000 261.310 μs - 24 B
ChangeAllElements_Baseline 100000000 162,162.093 μs - 578 B
ChangeAllElements_RAIIMesh 100000000 174,144.947 μs - 845 B
ChangeAllElements_IndexerMesh 100000000 285,332.773 μs - 1,180 B
ReplaceAllElements_Baseline 100000000 216,335.931 μs - 771 B
ReplaceAllElements_RAIIMesh 100000000 81,180.116 μs - 93 B
ReplaceAllElements_IndexerMesh 100000000 80,354.836 μs - 203 B
ReplaceAllElements_IndexerMesh_NoExtraMethod 100000000 293,278.668 μs - 564 B
ReplaceSomeElements_Baseline 100000000 212,722.011 μs - 771 B
ReplaceSomeElements_RAIIMesh 100000000 81,746.793 μs - 206 B
ReplaceSomeElements_IndexerMesh 100000000 79,903.332 μs - 93 B
ReplaceSomeElements_IndexerMesh_NoExtraMethod 100000000 293,259.157 μs - 1,180 B

One interessting finding: Array.Copy() performs better than Memory.BlockCopy for large elements.

Results, branch predition / cache misses


BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19044.1889 (21H2)
Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET SDK=6.0.400
  [Host]     : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT
  DefaultJob : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT


Method N Mean Error StdDev BranchInstructions/Op BranchMispredictions/Op
ChangeAllElements_Baseline 10000 17.579 μs 0.3509 μs 0.6053 μs 10,202 8
ChangeAllElements_RAIIMesh 10000 18.502 μs 0.3165 μs 0.2805 μs 10,227 8
ChangeAllElements_IndexerMesh 10000 20.878 μs 0.3437 μs 0.2870 μs 20,248 10
ReplaceAllElements_Baseline 10000 19.761 μs 0.3206 μs 0.3938 μs 20,222 9
ReplaceAllElements_RAIIMesh 10000 2.624 μs 0.0480 μs 0.0449 μs 67 1
ReplaceAllElements_IndexerMesh 10000 2.543 μs 0.0289 μs 0.0241 μs 71 1
ReplaceAllElements_IndexerMesh_NoExtraMethod 10000 22.226 μs 0.3524 μs 0.3297 μs 30,263 10
ReplaceSomeElements_Baseline 10000 19.325 μs 0.3750 μs 0.3683 μs 19,842 9
ReplaceSomeElements_RAIIMesh 10000 2.121 μs 0.0307 μs 0.0272 μs 60 0
ReplaceSomeElements_IndexerMesh 10000 2.590 μs 0.0487 μs 0.0432 μs 72 1
ReplaceSomeElements_IndexerMesh_NoExtraMethod 10000 22.870 μs 0.2953 μs 0.2762 μs 29,677 11
ChangeAllElements_Baseline 100000 176.943 μs 3.4493 μs 3.8339 μs 102,100 62
ChangeAllElements_RAIIMesh 100000 180.424 μs 2.3812 μs 2.1109 μs 101,995 52
ChangeAllElements_IndexerMesh 100000 207.774 μs 2.4982 μs 2.3368 μs 202,294 75
ReplaceAllElements_Baseline 100000 199.659 μs 2.3540 μs 2.2019 μs 202,243 72
ReplaceAllElements_RAIIMesh 100000 35.487 μs 0.6606 μs 0.6179 μs 573 11
ReplaceAllElements_IndexerMesh 100000 34.294 μs 0.6512 μs 0.6092 μs 443 6
ReplaceAllElements_IndexerMesh_NoExtraMethod 100000 223.151 μs 3.2693 μs 3.0581 μs 302,743 96
ReplaceSomeElements_Baseline 100000 199.397 μs 2.2433 μs 1.9886 μs 202,231 81
ReplaceSomeElements_RAIIMesh 100000 35.615 μs 0.4313 μs 0.4034 μs 449 5
ReplaceSomeElements_IndexerMesh 100000 35.243 μs 0.3341 μs 0.2790 μs 396 4
ReplaceSomeElements_IndexerMesh_NoExtraMethod 100000 233.689 μs 4.3176 μs 3.8275 μs 302,408 104
ChangeAllElements_Baseline 100000000 189,533.622 μs 3,739.5204 μs 4,001.2450 μs 102,645,153 102,324
ChangeAllElements_RAIIMesh 100000000 188,567.398 μs 2,536.0843 μs 2,248.1727 μs 102,435,681 76,550
ChangeAllElements_IndexerMesh 100000000 216,362.010 μs 4,270.5804 μs 4,194.2810 μs 202,713,771 102,656
ReplaceAllElements_Baseline 100000000 208,290.382 μs 4,065.6367 μs 4,518.9445 μs 202,922,451 103,262
ReplaceAllElements_RAIIMesh 100000000 81,626.107 μs 640.0073 μs 567.3498 μs 5,820,325 28,581
ReplaceAllElements_IndexerMesh 100000000 85,533.504 μs 1,870.6457 μs 5,367.2331 μs 5,759,678 26,735
ReplaceAllElements_IndexerMesh_NoExtraMethod 100000000 236,609.964 μs 2,506.1896 μs 2,221.6718 μs 304,499,917 166,434
ReplaceSomeElements_Baseline 100000000 209,677.622 μs 2,289.9830 μs 2,142.0515 μs 202,798,967 98,031
ReplaceSomeElements_RAIIMesh 100000000 82,316.572 μs 1,383.3185 μs 1,698.8394 μs 5,927,367 33,198
ReplaceSomeElements_IndexerMesh 100000000 86,159.622 μs 1,722.1894 μs 3,632.6819 μs 5,926,457 33,655
ReplaceSomeElements_IndexerMesh_NoExtraMethod 100000000 246,318.263 μs 2,573.1784 μs 2,406.9527 μs 303,324,638 127,795

wrestledBearOnce avatar Aug 22 '22 12:08 wrestledBearOnce

The timing difference in reading from/replacing items directly on the backing array compared to reading/replacing using an indexer seems to be neglectable (~0.3 μs), while the difference between backing array vs. indexer when changing existing contents is substantially higher (~10 μs)

Why that?

TLDR: Right now, not really explainable

Use https://sharplab.io/ for testing

Example code

using System;
using System.Runtime.CompilerServices;

public struct float3
{
    public float X;
    public float Y;
    public float Z;
    
    public float3(float x, float y, float z) => (X, Y, Z) = (x,y,z);
}

public class ThisMesh
{
    public float3[] _data = new float3[100];
    public bool DataModified { get; private set; } = false;

    public float3 this[int idx]
    {
        get => _data[idx];
        set
        {
            _data[idx] = value;               
            DataModified = true; 
        }
    }
}


public class BaseMesh
{
    public float3[] _data;
}


public class DoWork
{
    void LoopAdd_This()
    {
         var m = new ThisMesh();
         for(var i = 0; i < 100; i++)
             m[i] = new float3(i, i+1, i+2);
    }
    
    void LoopAdd_Base()
    {
         var m = new BaseMesh();
         for(var i = 0; i < 100; i++)
             m._data[i] = new float3(i, i+1, i+2);
    }
}

Release IL code with indexer

.method private hidebysig 
        instance void LoopAdd_This () cil managed 
    {
        // Method begins at RVA 0x20cc
        // Code size 42 (0x2a)
        .maxstack 6
        .locals init (
            [0] class ThisMesh m,
            [1] int32 i
        )

        IL_0000: newobj instance void ThisMesh::.ctor()
        IL_0005: stloc.0
        IL_0006: ldc.i4.0
        IL_0007: stloc.1
        // sequence point: hidden
        IL_0008: br.s IL_0024
        // loop start (head: IL_0024)
            IL_000a: ldloc.0
            IL_000b: ldloc.1
            IL_000c: ldloc.1
            IL_000d: conv.r4
            IL_000e: ldloc.1
            IL_000f: ldc.i4.1
            IL_0010: add
            IL_0011: conv.r4
            IL_0012: ldloc.1
            IL_0013: ldc.i4.2
            IL_0014: add
            IL_0015: conv.r4
            IL_0016: newobj instance void float3::.ctor(float32, float32, float32)
            IL_001b: callvirt instance void ThisMesh::set_Item(int32, valuetype float3) // <- calls to method below via callvirt (prevents null references!)
            IL_0020: ldloc.1
            IL_0021: ldc.i4.1
            IL_0022: add
            IL_0023: stloc.1

            IL_0024: ldloc.1
            IL_0025: ldc.i4.s 100
            IL_0027: blt.s IL_000a
        // end loop

        IL_0029: ret
    } // end of method DoWork::LoopAdd_This

callvirt set_Item()

.method public hidebysig specialname 
        instance void set_Item (
            int32 idx,
            valuetype float3 'value'
        ) cil managed 
    {
        // Method begins at RVA 0x2097
        // Code size 21 (0x15)
        .maxstack 8

        IL_0000: ldarg.0
        IL_0001: ldfld valuetype float3[] ThisMesh::_data
        IL_0006: ldarg.1
        IL_0007: ldarg.2
        IL_0008: stelem float3
        IL_000d: ldarg.0
        IL_000e: ldc.i4.1
        IL_000f: call instance void ThisMesh::set_DataModified(bool)
        IL_0014: ret
    } // end of method ThisMesh::set_Item

Release IL code without indexer (base loop)

.method private hidebysig 
        instance void LoopAdd_Base () cil managed 
    {
        // Method begins at RVA 0x2104
        // Code size 47 (0x2f)
        .maxstack 6
        .locals init (
            [0] class BaseMesh m,
            [1] int32 i
        )

        IL_0000: newobj instance void BaseMesh::.ctor()
        IL_0005: stloc.0
        IL_0006: ldc.i4.0
        IL_0007: stloc.1
        // sequence point: hidden
        IL_0008: br.s IL_0029
        // loop start (head: IL_0029)
            IL_000a: ldloc.0
            IL_000b: ldfld valuetype float3[] BaseMesh::_data
            IL_0010: ldloc.1
            IL_0011: ldloc.1
            IL_0012: conv.r4
            IL_0013: ldloc.1
            IL_0014: ldc.i4.1
            IL_0015: add
            IL_0016: conv.r4
            IL_0017: ldloc.1
            IL_0018: ldc.i4.2
            IL_0019: add
            IL_001a: conv.r4
            IL_001b: newobj instance void float3::.ctor(float32, float32, float32)
            IL_0020: stelem float3
            IL_0025: ldloc.1
            IL_0026: ldc.i4.1
            IL_0027: add
            IL_0028: stloc.1

            IL_0029: ldloc.1
            IL_002a: ldc.i4.s 100
            IL_002c: blt.s IL_000a
        // end loop

        IL_002e: ret
    } // end of method DoWork::LoopAdd_Base

The difference is at least one additional call to callvirt set_Item() and only there the _data array is being pushed on the stack, each loop iteration.

Furthermore, looking inside the source code for Array.Copy() (https://source.dot.net/#System.Private.CoreLib/src/System/Array.CoreCLR.cs,24) I would advise using this/these method/s as often as possible instead of relying upon plain for-loops and or an indexer due to the massive runtime differences.

~~Wanted to test branch & cache misses for indexer version, however Windows 10 is unable to read this data reliable from AMD processors.~~

Changed to Intel PC, test results using code posted above


BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19044.1889 (21H2)
Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET SDK=6.0.400
  [Host]     : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT
  DefaultJob : .NET 6.0.8 (6.0.822.36306), X64 RyuJIT


Method N Mean Error StdDev BranchInstructions/Op BranchMispredictions/Op
LoopAdd_This 10000 13.91 μs 0.192 μs 0.171 μs 20,141 6
LoopAdd_Base 10000 14.00 μs 0.218 μs 0.193 μs 20,151 7
LoopAdd_This 100000 145.46 μs 2.871 μs 6.056 μs 201,663 73
LoopAdd_Base 100000 145.80 μs 2.486 μs 2.660 μs 201,630 71
LoopAdd_This 100000000 151,834.48 μs 2,958.620 μs 4,606.214 μs 202,032,097 92,098
LoopAdd_Base 100000000 147,553.75 μs 1,969.378 μs 1,842.157 μs 202,243,004 89,975

wrestledBearOnce avatar Aug 22 '22 13:08 wrestledBearOnce

Example implementation

//#define WITH_GUARD // <- define guard, disable for benchmark

using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
using CommunityToolkit.Diagnostics;
using System;
using System.Linq;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;



namespace NewMeshImp
{
    public struct float3
    {
        public float X;
        public float Y;
        public float Z;

        public float3(float _x, float _y, float _z) => (X, Y, Z) = (_x, _y, _z);
    }

    public enum MeshDataType
    {
        Vertices,
        Normals
    }

    public class MeshChangedEvent : EventArgs
    {
        public MeshDataType Type;
        public IReadOnlyCollection<int>? ChangedIndices;
    }


    public unsafe class MeshFloat3Data : IDisposable
    {
        private readonly float3* _internalRef;
        private readonly int _internalLength;
        private readonly MeshDataType _type;
        private readonly EventHandler<MeshChangedEvent> _hndl; // dummy
        private bool disposedValue;

        private readonly List<int> _changedIndices = new();

        public ReadOnlySpan<float3> Data => new(_internalRef, _internalLength);

        public MeshFloat3Data(float3* _data, in int length, in MeshDataType type, in EventHandler<MeshChangedEvent> hndl)
        {
            _internalRef = _data;
            _internalLength = length;
            _type = type;
            _hndl = hndl;
        }

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public void SetValues(in float3[] data, int destStartOffset, int length)
        {

#if WITH_GUARD
            Guard.IsGreaterThanOrEqualTo(destStartOffset, 0);
            Guard.IsGreaterThanOrEqualTo(length, 1);
            Guard.IsLessThan(length, _internalLength);
#endif

            if (_changedIndices.Capacity < length)
                _changedIndices.Capacity = length;

            var float3Offset = destStartOffset * 3;
            var float3Length = length * 3;

            var destInBytes = _internalLength * 3 * sizeof(float);
            var dataInBytes = float3Length * sizeof(float);

            fixed (float* ptr = &data[0].X)
            {
                var localPtrOffset = _internalRef + float3Offset;
                Buffer.MemoryCopy(ptr, localPtrOffset, destInBytes, dataInBytes);
            }

            _changedIndices.AddRange(Enumerable.Range(destStartOffset, length));
        }

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public void SetValues(int startIdx, int endIdx, in Func<float3, float3> actionToPerformOnValue)
        {

#if WITH_GUARD
            Guard.IsGreaterThanOrEqualTo(startIdx, 0);
            Guard.IsLessThan(endIdx, _internalLength);
            Guard.IsGreaterThan(endIdx, startIdx);
#endif

            for (var i = 0; i < endIdx; i++)
            {
                _internalRef[i] = actionToPerformOnValue(_internalRef[i]);
            }

            _changedIndices.AddRange(Enumerable.Range(startIdx, endIdx));
        }

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public void SetValue(int index, in Func<float3, float3> actionToPerformOnValue)
        {

#if WITH_GUARD
            Guard.IsGreaterThanOrEqualTo(index, 0);
            Guard.IsLessThan(index, _internalLength);
#endif

            _internalRef[index] = actionToPerformOnValue(_internalRef[index]);
            _changedIndices.Add(index);
        }

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public void SetValue(int index, float3 value)
        {

#if WITH_GUARD
            Guard.IsGreaterThanOrEqualTo(index, 0);
            Guard.IsLessThan(index, _internalLength);
#endif

            _internalRef[index] = value;
            _changedIndices.Add(index);
        }

        protected virtual void Dispose(bool disposing)
        {
            if (!disposedValue)
            {
                _hndl?.Invoke(this, new MeshChangedEvent { Type = _type, ChangedIndices = _changedIndices.AsReadOnly() });
                disposedValue = true;
            }
        }

        public void Dispose()
        {
            Dispose(disposing: true);
            GC.SuppressFinalize(this);
        }
    }

    public unsafe class Mesh
    {
        private readonly float3[] _vertices;
        private readonly float3[] _normals;

        public EventHandler<MeshChangedEvent>? OnChangeEvent;

        public Mesh(in float3[] vertices, in float3[] normals)
        {
            _vertices = new float3[vertices.Length];
            _normals = new float3[normals.Length];
            Array.Copy(vertices, _vertices, vertices.Length); // copy to be sure we do not use the same array twice
            Array.Copy(vertices, _normals, vertices.Length); // copy to be sure we do not use the same array twice
            
        }

        public MeshFloat3Data GetVertices()
        {
            fixed (float3* ptr = &_vertices[0])
                return new MeshFloat3Data(ptr, _vertices.Length, MeshDataType.Vertices, in OnChangeEvent);
        }

        public MeshFloat3Data GetNormals()
        {
            fixed (float3* ptr = &_normals[0])
                return new MeshFloat3Data(ptr, _normals.Length, MeshDataType.Normals, in OnChangeEvent);
        }
    }

    public class SimpleMesh
    {
        public readonly float3[] _vertices;
        public readonly float3[] _normals;

        public SimpleMesh(in float3[] vertices, in float3[] normals)
        {
            _vertices = new float3[vertices.Length];
            _normals = new float3[normals.Length];
            Array.Copy(vertices, _vertices, vertices.Length); // copy to be sure we do not use the same array twice
            Array.Copy(vertices, _normals, vertices.Length); // copy to be sure we do not use the same array twice
        }
    }

    [MarkdownExporter]
    [MemoryDiagnoser]
    public class Benchmark
    {
        [Params(1_000_000, 100_000_000)]
        public int N;

        public Mesh mesh;
        public SimpleMesh simpleMesh;
        public event EventHandler<MeshChangedEvent> Event;

        public static int DummyVariable = 0;

        [GlobalSetup]
        public void Setup()
        {
            var vertices = new float3[N];
            var normals = new float3[N];

            var rnd = new Random();

            Array.Fill(vertices, new float3((float)rnd.NextDouble(), (float)rnd.NextDouble(), (float)rnd.NextDouble()));
            Array.Fill(normals, new float3((float)rnd.NextDouble(), (float)rnd.NextDouble(), (float)rnd.NextDouble()));

            mesh = new Mesh(in vertices, in normals);
            simpleMesh = new SimpleMesh(in vertices, in normals);

            mesh.OnChangeEvent += (s, e) => { DummyVariable += e.ChangedIndices.Count; };
            Event += (s, e) => { DummyVariable += e.ChangedIndices.Count; };
        }

        [Benchmark]
        public void BenchmarkSimpleMeshOp_1()
        {
            // Simple Mesh
            for (var i = 10_00_00; i < simpleMesh._vertices.Length - 1000; i++)
            {
                var data = simpleMesh._vertices[i];
                data.X += 10;
                data.Y += 5;
                data.Z += 1;
            }

            Event.Invoke(this, new MeshChangedEvent
            {
                ChangedIndices = Enumerable.Range(10_00_00, simpleMesh._vertices.Length - 1000).ToList(),
                Type = MeshDataType.Vertices
            });
        }

        [Benchmark]
        public void BenchmarkSimpleMeshOp_2()
        {
            for (var i = 0; i < simpleMesh._vertices.Length; i++)
            {
                var data = simpleMesh._vertices[i];
                data.X += (data.X * i);
            }

            Event.Invoke(this, new MeshChangedEvent
            {
                ChangedIndices = Enumerable.Range(0, simpleMesh._vertices.Length).ToList(),
                Type = MeshDataType.Vertices
            });
        }

        [Benchmark]
        public void BenchmarkMeshOp_1()
        {
            using var vertData = mesh.GetVertices();
            vertData.SetValues(10_00_00, vertData.Data.Length - 1000, static (float3 data) => { data.X += 10; data.Y += 5; data.Z += 1; return data; });
        }

        [Benchmark]
        public void BenchmarkMeshOp_ReplaceStaticDelegateWithForLoop()
        {
            using var vertData = mesh.GetVertices();
            for (var i = 10_00_00; i < vertData.Data.Length - 1000; i++)
            {
                vertData.SetValue(i, new float3(vertData.Data[i].X + 10, vertData.Data[i].Y + 5, vertData.Data[i].Z + 1));
            }
        }

        [Benchmark]
        public void BenchmarkMeshOp_ReplaceStaticDelegateWithArrayCopy()
        {
            using var vertData = mesh.GetVertices();
            var tmpArray = new float3[vertData.Data.Length - 1000 - 10_00_00];

            var cnt = 0;
            for(var i = 10_00_00; i < vertData.Data.Length - 1000; i++)
            {
                tmpArray[cnt++] = (new float3(vertData.Data[i].X + 10, vertData.Data[i].Y + 5, vertData.Data[i].Z + 1));
            }

            vertData.SetValues(tmpArray, 10_00_00, vertData.Data.Length - 1000);
        }

        [Benchmark]
        public void BenchmarkMeshOp_2()
        {
            using var vertData = mesh.GetVertices();

            for (var i = 0; i < vertData.Data.Length; i++)
            {
                vertData.SetValue(i, (float3 data) => { data.X += (data.X * i); return data; }); // closure, no static possible ...
            }
        }

        [Benchmark]
        public void BenchmarkMeshOp_ReplaceClosureWithForLoop()
        {
            using var vertData = mesh.GetVertices();

            for (var i = 0; i < vertData.Data.Length; i++)
            {
                vertData.SetValue(i, new float3(vertData.Data[i].X + (vertData.Data[i].X * i), vertData.Data[i].Y, vertData.Data[i].Z));
            }
        }

        [Benchmark]
        public void BenchmarkMeshOp_ReplaceClosureWithArrayCopy()
        {
            using var vertData = mesh.GetVertices();
            var tmpArray = new float3[vertData.Data.Length];
            for (var i = 0; i < vertData.Data.Length; i++)
            {
                tmpArray[i] = new float3(vertData.Data[i].X + (vertData.Data[i].X * i), vertData.Data[i].Y, vertData.Data[i].Z);
            }

            vertData.SetValues(tmpArray, 0, vertData.Data.Length);
        }
    }


    public static class Program
    {
        public static void Main()
        {
            var _ = BenchmarkRunner.Run<Benchmark>();

            Console.WriteLine(Benchmark.DummyVariable);
        }
    }
}

Results

BenchmarkSimpleMeshOp_1   4.280 ms and 3996144 B allocated
BenchmarkSimpleMeshOp_2   4.065 ms and  4000141 B allocated

vs.

BenchmarkMeshOp_ReplaceStaticDelegateWithForLoop  4.121 ms and 8389242 B allocated
BenchmarkMeshOp_ReplaceClosureWithForLoop   4.554 ms and 8389242 B  allocated

vs.

BenchmarkMeshOp_ReplaceClosureWithArrayCopy  10.402 ms and 16000312 B allocated

Basic for loops should be the way to go, array copy uses even more resources. However we have to live with the memory overhead ...

wrestledBearOnce avatar Aug 26 '22 10:08 wrestledBearOnce

Final implementation recommendation with all findings

This implementation is nearly as fast as using POD-arrays with all public methods. However, the memory overhead is significant, usually the allocated memory is 2x (this is also due to the fact, that we do track changed indices, perhaps there is a better method to keep track while changing elements). When this is ovoerhead is too much, one needs to fall back to passing methods (preferably static delegates) in which the modification on each point is being described.

using System.Runtime.CompilerServices;
using CommunityToolkit.Diagnostics;

 public struct float3
    {
        public float X;
        public float Y;
        public float Z;

        public float3(float _x, float _y, float _z) => (X, Y, Z) = (_x, _y, _z);
    }

    public enum MeshDataType
    {
        Vertices,
        Normals
    }

    public class MeshChangedEvent : EventArgs
    {
        public MeshDataType Type;
        public IReadOnlyCollection<int>? ChangedIndices;
    }


    public unsafe class MeshFloat3Data : IDisposable
    {
        private readonly float3* _internalRef;
        private readonly int _internalLength;
        private readonly MeshDataType _type;
        private readonly EventHandler<MeshChangedEvent> _hndl; // dummy
        private bool disposedValue;

        private readonly List<int> _changedIndices = new();

        public ReadOnlySpan<float3> Data => new(_internalRef, _internalLength);

        public MeshFloat3Data(float3* _data, in int length, in MeshDataType type, in EventHandler<MeshChangedEvent> hndl)
        {
            _internalRef = _data;
            _internalLength = length;
            _type = type;
            _hndl = hndl;
        }

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public void SetValue(int index, float3 value)
        {
            Guard.IsGreaterThanOrEqualTo(index, 0);
            Guard.IsLessThan(index, _internalLength);

            _internalRef[index] = value;
            _changedIndices.Add(index);
        }

        protected virtual void Dispose(bool disposing)
        {
            if (!disposedValue)
            {
                _hndl?.Invoke(this, new MeshChangedEvent { Type = _type, ChangedIndices = _changedIndices.AsReadOnly() });
                disposedValue = true;
            }
        }

        public void Dispose()
        {
            Dispose(disposing: true);
            GC.SuppressFinalize(this);
        }
    }

    public unsafe class Mesh
    {
        private readonly float3[] _vertices;
        private readonly float3[] _normals;

        public EventHandler<MeshChangedEvent>? OnChangeEvent;

        public Mesh(in float3[] vertices, in float3[] normals)
        {
            _vertices = vertices;
            _normals = normals;
        }

        public MeshFloat3Data GetVertices()
        {
            fixed (float3* ptr = &_vertices[0])
                return new MeshFloat3Data(ptr, _vertices.Length, MeshDataType.Vertices, in OnChangeEvent);
        }

        public MeshFloat3Data GetNormals()
        {
            fixed (float3* ptr = &_normals[0])
                return new MeshFloat3Data(ptr, _normals.Length, MeshDataType.Normals, in OnChangeEvent);
        }
    }

    public static class Program
    {
        public static void Main()
        {
            var mesh = new Mesh(new float3[] { new float3(1, 2, 3), new float3(1, 2, 3) }, new float3[] { new float3(1, 2, 3), new float3(1, 2, 3) });

            using var vertData = mesh.GetVertices();

            for (var i = 0; i < vertData.Data.Length; i++)
            {
                vertData.SetValue(i, new float3(vertData.Data[i].X + (vertData.Data[i].X * i), vertData.Data[i].Y, vertData.Data[i].Z));
            }
        }
    }

wrestledBearOnce avatar Aug 26 '22 11:08 wrestledBearOnce