In the serialization process, MessagePack for C# uses Buffer.BlockCopy heavily.
For example, when serializing contractless (string key) objects, the property name is cached and written with a block copy.
Here is a benchmark of the copy speed for a property name (Encoding.UTF8.GetBytes("MyProperty1"), length = 11).
| Method | Jit | Platform | Mean | Error | Scaled | Allocated |
|---|---|---|---|---|---|---|
| ArrayCopy | LegacyJit | X86 | 29.714 ns | NA | 1.31 | 0 B |
| BlockCopy | LegacyJit | X86 | 22.731 ns | NA | 1.00 | 0 B |
| MemoryCopy | LegacyJit | X86 | 6.256 ns | NA | 0.28 | 0 B |
| ByteCopy | LegacyJit | X86 | 10.119 ns | NA | 0.45 | 0 B |
| LongCopy | LegacyJit | X86 | 5.429 ns | NA | 0.24 | 0 B |
| Copy11 | LegacyJit | X86 | 2.992 ns | NA | 0.13 | 0 B |
| CopyBlock | LegacyJit | X86 | 15.780 ns | NA | 0.69 | 0 B |
| CopyBlockUnaligned | LegacyJit | X86 | 13.963 ns | NA | 0.61 | 0 B |
| ArrayCopy | RyuJit | X64 | 9.942 ns | NA | 0.93 | 0 B |
| BlockCopy | RyuJit | X64 | 10.666 ns | NA | 1.00 | 0 B |
| MemoryCopy | RyuJit | X64 | 3.173 ns | NA | 0.30 | 0 B |
| ByteCopy | RyuJit | X64 | 7.308 ns | NA | 0.69 | 0 B |
| LongCopy | RyuJit | X64 | 2.586 ns | NA | 0.24 | 0 B |
| Copy11 | RyuJit | X64 | 2.586 ns | NA | 0.24 | 0 B |
| CopyBlock | RyuJit | X64 | 3.406 ns | NA | 0.32 | 0 B |
| CopyBlockUnaligned | RyuJit | X64 | 3.304 ns | NA | 0.31 | 0 B |
MemoryCopy is fast, but it is only supported from .NET 4.6 onward.
CopyBlock and CopyBlockUnaligned come from System.Runtime.CompilerServices.Unsafe, which uses the cpblk opcode directly.
https://github.com/dotnet/corefx/blob/master/src/System.Runtime.CompilerServices.Unsafe/src/System.Runtime.CompilerServices.Unsafe.il#L162
That is nice, but on x86 it is too slow.
ByteCopy, LongCopy, and Copy11 are self-defined unsafe copy routines.
Surprisingly, they are very fast for small sizes (like the binary of a property name).
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Diagnosers;
using BenchmarkDotNet.Exporters;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
using System;
using System.Runtime.CompilerServices;
using System.Text;
/// <summary>
/// Entry point: registers <see cref="StandardBenchmark"/> with a
/// <c>BenchmarkSwitcher</c> and, in non-DEBUG builds, runs it.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        Type[] benchmarkTypes = { typeof(StandardBenchmark) };
        var switcher = new BenchmarkSwitcher(benchmarkTypes);

        // The incoming command-line arguments are replaced with the single argument "0".
        args = new[] { "0" };

#if DEBUG
        // DEBUG builds only construct the benchmark class; nothing is measured.
        var benchmark = new StandardBenchmark();
#else
        switcher.Run(args);
#endif
    }
}
/// <summary>
/// Benchmark configuration: GitHub-flavoured markdown export, the memory
/// diagnoser, and two jobs (RyuJit x64 and LegacyJit x86), each limited to
/// one warmup, one launch and one target iteration.
/// </summary>
public class BenchmarkConfig : ManualConfig
{
    public BenchmarkConfig()
    {
        Add(MarkdownExporter.GitHub);
        Add(MemoryDiagnoser.Default);

        Add(SingleIteration(Job.RyuJitX64),
            SingleIteration(Job.LegacyJitX86));
    }

    // Applies the shared warmup/launch/target counts (all 1) to the given job.
    private static Job SingleIteration(Job job)
    {
        return job.WithWarmupCount(1).WithLaunchCount(1).WithTargetCount(1);
    }
}
/// <summary>
/// Benchmarks several ways of copying the UTF-8 bytes of "MyProperty1"
/// (11 bytes) from one managed array into another of the same length.
/// <see cref="BlockCopy"/> is the baseline.
/// </summary>
[Config(typeof(BenchmarkConfig))]
public class StandardBenchmark
{
    // Source buffer: the UTF-8 encoding of "MyProperty1".
    readonly byte[] bytes = Encoding.UTF8.GetBytes("MyProperty1");

    // Destination buffer, allocated with the same length as the source.
    readonly byte[] to;

    public StandardBenchmark()
    {
        to = new byte[bytes.Length];
    }

    /// <summary>Copies with <see cref="Array.Copy(Array, int, Array, int, int)"/>.</summary>
    [Benchmark]
    public void ArrayCopy()
    {
        Array.Copy(bytes, 0, to, 0, to.Length);
    }

    /// <summary>Copies with <see cref="Buffer.BlockCopy"/>; this is the baseline measurement.</summary>
    [Benchmark(Baseline = true)]
    public void BlockCopy()
    {
        Buffer.BlockCopy(bytes, 0, to, 0, to.Length);
    }

    /// <summary>Copies with <c>Buffer.MemoryCopy</c> over pinned pointers.</summary>
    [Benchmark]
    public unsafe void MemoryCopy()
    {
        fixed (void* s = &bytes[0])
        fixed (void* p = &to[0])
        {
            // Parameter order is (source, destination, destinationSizeInBytes, sourceBytesToCopy):
            // the destination capacity comes first, then the number of source bytes to copy.
            Buffer.MemoryCopy(s, p, to.Length, bytes.Length);
        }
    }

    /// <summary>Copies one byte at a time via <c>UnsafeUtility.ByteCopy</c>.</summary>
    [Benchmark]
    public unsafe void UnsafeSimple()
    {
        fixed (byte* s = &bytes[0])
        fixed (byte* p = &to[0])
        {
            UnsafeUtility.ByteCopy(s, p, to.Length);
        }
    }

    /// <summary>Copies in 8/4/2/1-byte steps via <c>UnsafeUtility.LongCopy</c>.</summary>
    [Benchmark]
    public unsafe void UnsafeBlock()
    {
        fixed (byte* s = &bytes[0])
        fixed (byte* p = &to[0])
        {
            UnsafeUtility.LongCopy(s, p, to.Length);
        }
    }

    /// <summary>Copies with <c>UnsafeUtility.Copy11</c>, which is hard-wired to exactly 11 bytes.</summary>
    [Benchmark]
    public unsafe void UnsafeOptimized()
    {
        fixed (byte* s = &bytes[0])
        fixed (byte* p = &to[0])
        {
            UnsafeUtility.Copy11(s, p);
        }
    }

    /// <summary>Copies with <c>Unsafe.CopyBlock</c> (destination first, then source).</summary>
    [Benchmark]
    public unsafe void CopyBlock()
    {
        fixed (void* s = &bytes[0])
        fixed (void* p = &to[0])
        {
            Unsafe.CopyBlock(p, s, (uint)bytes.Length);
        }
    }

    /// <summary>Copies with <c>Unsafe.CopyBlockUnaligned</c> (destination first, then source).</summary>
    [Benchmark]
    public unsafe void CopyBlockUnaligned()
    {
        fixed (void* s = &bytes[0])
        fixed (void* p = &to[0])
        {
            Unsafe.CopyBlockUnaligned(p, s, (uint)bytes.Length);
        }
    }
}
/// <summary>
/// Hand-written pointer-based copy routines for small buffers.
/// None of the methods validate their arguments; the caller must supply
/// pointers to memory large enough for the requested copy.
/// </summary>
public static unsafe class UnsafeUtility
{
    /// <summary>
    /// Copies <paramref name="count"/> bytes from <paramref name="src"/> to
    /// <paramref name="dst"/>, one byte per iteration.
    /// Nothing is copied when <paramref name="count"/> is zero or negative.
    /// </summary>
    public static unsafe void ByteCopy(byte* src, byte* dst, int count)
    {
        for (int offset = 0; offset < count; offset++)
        {
            dst[offset] = src[offset];
        }
    }

    /// <summary>
    /// Copies <paramref name="count"/> bytes from <paramref name="src"/> to
    /// <paramref name="dst"/> using 8-byte moves while at least 8 bytes remain,
    /// then at most one 4-byte, one 2-byte and one 1-byte move for the tail.
    /// Nothing is copied when <paramref name="count"/> is zero or negative.
    /// </summary>
    public static unsafe void LongCopy(byte* src, byte* dst, int count)
    {
        int remaining = count;
        int offset = 0;

        // Bulk phase: whole 8-byte words.
        for (; remaining >= 8; remaining -= 8, offset += 8)
        {
            *(ulong*)(dst + offset) = *(ulong*)(src + offset);
        }

        // Tail phase: fewer than 8 bytes are left, handled as 4 + 2 + 1.
        if (remaining >= 4)
        {
            *(uint*)(dst + offset) = *(uint*)(src + offset);
            offset += 4;
            remaining -= 4;
        }

        if (remaining >= 2)
        {
            *(ushort*)(dst + offset) = *(ushort*)(src + offset);
            offset += 2;
            remaining -= 2;
        }

        if (remaining >= 1)
        {
            dst[offset] = src[offset];
        }
    }

    /// <summary>
    /// Copies exactly 11 bytes from <paramref name="src"/> to
    /// <paramref name="dst"/> as one 8-byte, one 2-byte and one 1-byte move.
    /// </summary>
    public static unsafe void Copy11(byte* src, byte* dst)
    {
        // Bytes 0..7
        *(ulong*)dst = *(ulong*)src;

        // Bytes 8..9
        *(ushort*)(dst + 8) = *(ushort*)(src + 8);

        // Byte 10
        dst[10] = src[10];
    }
}
I'll write Copy1 ~ Copy32 helpers and embed them in IL.
I do not know whether Buffer.BlockCopy or the self-written code is faster for large sizes.
I'll measure and then decide which to use.
Note, here is Buffer.BlockCopy code
https://github.com/dotnet/coreclr/blob/5c07c5aa98f8a088bf25099f1ab2d38b59ea5478/src/vm/comutilnative.cpp#L1366
In the serialization process, MessagePack for C# uses Buffer.BlockCopy heavily.
For example, when serializing contractless (string key) objects, the property name is cached and written with a block copy.
Here is a benchmark of the copy speed for a property name (Encoding.UTF8.GetBytes("MyProperty1"), length = 11).
MemoryCopy is fast, but it is only supported from .NET 4.6 onward.
CopyBlock and CopyBlockUnaligned come from System.Runtime.CompilerServices.Unsafe, which uses the cpblk opcode directly.
https://github.com/dotnet/corefx/blob/master/src/System.Runtime.CompilerServices.Unsafe/src/System.Runtime.CompilerServices.Unsafe.il#L162
That is nice, but on x86 it is too slow.
ByteCopy, LongCopy, and Copy11 are self-defined unsafe copy routines.
Surprisingly, they are very fast for small sizes (like the binary of a property name).
I'll write Copy1 ~ Copy32 helpers and embed them in IL.
I do not know whether Buffer.BlockCopy or the self-written code is faster for large sizes.
I'll measure and then decide which to use.
Note, here is Buffer.BlockCopy code
https://github.com/dotnet/coreclr/blob/5c07c5aa98f8a088bf25099f1ab2d38b59ea5478/src/vm/comutilnative.cpp#L1366