From 49e93641b49713d186b6d4df06755db19b12e904 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 30 Oct 2020 20:38:46 +0000 Subject: [PATCH 01/20] Initial 3padshuffle4 --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 84 +++++++++++++++++-- .../Common/Helpers/SimdUtils.Shuffle.cs | 63 ++++++++++++-- .../PixelFormats/Utils/PixelConverter.cs | 12 +-- .../Color/Bulk/Pad3Shuffle4Channel.cs | 67 +++++++++++++++ .../Color/Bulk/ShuffleByte4Channel.cs | 2 +- .../Color/Bulk/ShuffleFloat4Channel.cs | 2 +- .../Common/SimdUtilsTests.Shuffle.cs | 71 ++++++++++++++-- .../ImageSharp.Tests/Common/SimdUtilsTests.cs | 2 + 8 files changed, 274 insertions(+), 29 deletions(-) create mode 100644 tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 782328eddf..8a0b5460c5 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -26,7 +26,7 @@ public static class HwIntrinsics /// The destination span of floats. /// The byte control. [MethodImpl(InliningOptions.ShortMethod)] - public static void Shuffle4ChannelReduce( + public static void Shuffle4Reduce( ref ReadOnlySpan source, ref Span dest, byte control) @@ -41,7 +41,7 @@ public static void Shuffle4ChannelReduce( if (adjustedCount > 0) { - Shuffle4Channel( + Shuffle4( source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount), control); @@ -53,14 +53,14 @@ public static void Shuffle4ChannelReduce( } /// - /// Shuffle 8-bit integers in a within 128-bit lanes in + /// Shuffle 8-bit integers within 128-bit lanes in /// using the control and store the results in . /// /// The source span of bytes. /// The destination span of bytes. /// The byte control. [MethodImpl(InliningOptions.ShortMethod)] - public static void Shuffle4ChannelReduce( + public static void Shuffle4Reduce( ref ReadOnlySpan source, ref Span dest, byte control) @@ -75,7 +75,7 @@ public static void Shuffle4ChannelReduce( if (adjustedCount > 0) { - Shuffle4Channel( + Shuffle4( source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount), control); @@ -86,8 +86,41 @@ public static void Shuffle4ChannelReduce( } } + /// + /// Pads then shuffles 8-bit integers within 128-bit lanes in + /// using the control and store the results in . + /// + /// The source span of bytes. + /// The destination span of bytes. + /// The byte control. + [MethodImpl(InliningOptions.ShortMethod)] + public static unsafe void Pad3Shuffle4Reduce( + ref ReadOnlySpan source, + ref Span dest, + byte control) + { + if (Ssse3.IsSupported) + { + int remainder = ImageMaths.ModuloP2(source.Length, Vector128.Count); + + int adjustedCount = source.Length - remainder; + int sourceSlice = (int)(adjustedCount * (3 / 4F)); + + if (adjustedCount > 0) + { + Pad3Shuffle4( + source.Slice(0, adjustedCount), + dest.Slice(0, adjustedCount), + control); + + source = source.Slice(sourceSlice); + dest = dest.Slice(adjustedCount); + } + } + } + [MethodImpl(InliningOptions.ShortMethod)] - private static void Shuffle4Channel( + private static void Shuffle4( ReadOnlySpan source, Span dest, byte control) @@ -165,7 +198,7 @@ private static void Shuffle4Channel( } [MethodImpl(InliningOptions.ShortMethod)] - private static void Shuffle4Channel( + private static void Shuffle4( ReadOnlySpan source, Span dest, byte control) @@ -246,6 +279,43 @@ private static void Shuffle4Channel( } } + [MethodImpl(InliningOptions.ShortMethod)] + private static unsafe void Pad3Shuffle4( + ReadOnlySpan source, + Span dest, + byte control) + { + if (Ssse3.IsSupported) + { + Vector128 wMask = Vector128.Create(0xff000000u).AsByte(); + Vector128 padMask = Vector128.Create(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1).AsByte(); + + Span bytes = stackalloc byte[Vector128.Count]; + Shuffle.MmShuffleSpan(ref bytes, control); + Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + + fixed (byte* sBase = &source.GetPinnableReference()) + fixed (byte* dBase = &dest.GetPinnableReference()) + { + byte* s = sBase; + byte* d = dBase; + + // TODO: Consider unrolling and shuffling 4 at a time using Ssse3.AlignRight + // See https://stackoverflow.com/questions/2973708/fast-24-bit-array-32-bit-array-conversion + for (int i = 0; i < source.Length; i += 16) + { + Vector128 vs0 = Sse2.LoadVector128(s); + Vector128 val = Sse2.Or(wMask, Ssse3.Shuffle(vs0, padMask)); + val = Ssse3.Shuffle(val, vcm); + Sse2.Store(d, val); + + s += 12; + d += 16; + } + } + } + } + /// /// Performs a multiplication and an addition of the . /// diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index a4a40fb4fa..81d77d6551 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -18,7 +18,7 @@ internal static partial class SimdUtils /// The destination span of floats. /// The byte control. [MethodImpl(InliningOptions.ShortMethod)] - public static void Shuffle4Channel( + public static void Shuffle4( ReadOnlySpan source, Span dest, byte control) @@ -26,13 +26,13 @@ public static void Shuffle4Channel( VerifyShuffleSpanInput(source, dest); #if SUPPORTS_RUNTIME_INTRINSICS - HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, control); + HwIntrinsics.Shuffle4Reduce(ref source, ref dest, control); #endif // Deal with the remainder: if (source.Length > 0) { - ShuffleRemainder4Channel(source, dest, control); + Shuffle4Remainder(source, dest, control); } } @@ -44,7 +44,7 @@ public static void Shuffle4Channel( /// The destination span of bytes. /// The type of shuffle to perform. [MethodImpl(InliningOptions.ShortMethod)] - public static void Shuffle4Channel( + public static void Shuffle4( ReadOnlySpan source, Span dest, TShuffle shuffle) @@ -53,7 +53,7 @@ public static void Shuffle4Channel( VerifyShuffleSpanInput(source, dest); #if SUPPORTS_RUNTIME_INTRINSICS - HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, shuffle.Control); + HwIntrinsics.Shuffle4Reduce(ref source, ref dest, shuffle.Control); #endif // Deal with the remainder: @@ -63,7 +63,26 @@ public static void Shuffle4Channel( } } - public static void ShuffleRemainder4Channel( + [MethodImpl(InliningOptions.ShortMethod)] + public static void Pad3Shuffle4( + ReadOnlySpan source, + Span dest, + byte control) + { + VerifyPadShuffleSpanInput(source, dest); + +#if SUPPORTS_RUNTIME_INTRINSICS + HwIntrinsics.Pad3Shuffle4Reduce(ref source, ref dest, control); +#endif + + // Deal with the remainder: + if (source.Length > 0) + { + Pad3Shuffle4Remainder(source, dest, control); + } + } + + public static void Shuffle4Remainder( ReadOnlySpan source, Span dest, byte control) @@ -81,6 +100,24 @@ public static void ShuffleRemainder4Channel( } } + public static void Pad3Shuffle4Remainder( + ReadOnlySpan source, + Span dest, + byte control) + { + ref byte sBase = ref MemoryMarshal.GetReference(source); + ref byte dBase = ref MemoryMarshal.GetReference(dest); + Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0); + + for (int i = 0, j = 0; i < dest.Length; i += 4, j += 3) + { + Unsafe.Add(ref dBase, p0 + i) = Unsafe.Add(ref sBase, j); + Unsafe.Add(ref dBase, p1 + i) = Unsafe.Add(ref sBase, j + 1); + Unsafe.Add(ref dBase, p2 + i) = Unsafe.Add(ref sBase, j + 2); + Unsafe.Add(ref dBase, p3 + i) = byte.MaxValue; + } + } + [Conditional("DEBUG")] private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span dest) where T : struct @@ -96,6 +133,20 @@ private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span de "Input spans must be divisiable by 4!"); } + [Conditional("DEBUG")] + private static void VerifyPadShuffleSpanInput(ReadOnlySpan source, Span dest) + { + DebugGuard.IsTrue( + source.Length == (int)(dest.Length * 3 / 4F), + nameof(source), + "Input spans must be 3/4 the length of the output span!"); + + DebugGuard.IsTrue( + source.Length % 3 == 0, + nameof(source), + "Input spans must be divisiable by 3!"); + } + public static class Shuffle { [MethodImpl(InliningOptions.ShortMethod)] diff --git a/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs b/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs index ab9011a5c7..5afd369be3 100644 --- a/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs +++ b/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs @@ -27,7 +27,7 @@ public static class FromRgba32 /// [MethodImpl(InliningOptions.ShortMethod)] public static void ToArgb32(ReadOnlySpan source, Span dest) - => SimdUtils.Shuffle4Channel(source, dest, default); + => SimdUtils.Shuffle4(source, dest, default); /// /// Converts a representing a collection of @@ -36,7 +36,7 @@ public static void ToArgb32(ReadOnlySpan source, Span dest) /// [MethodImpl(InliningOptions.ShortMethod)] public static void ToBgra32(ReadOnlySpan source, Span dest) - => SimdUtils.Shuffle4Channel(source, dest, default); + => SimdUtils.Shuffle4(source, dest, default); } public static class FromArgb32 @@ -48,7 +48,7 @@ public static class FromArgb32 /// [MethodImpl(InliningOptions.ShortMethod)] public static void ToRgba32(ReadOnlySpan source, Span dest) - => SimdUtils.Shuffle4Channel(source, dest, default); + => SimdUtils.Shuffle4(source, dest, default); /// /// Converts a representing a collection of @@ -57,7 +57,7 @@ public static void ToRgba32(ReadOnlySpan source, Span dest) /// [MethodImpl(InliningOptions.ShortMethod)] public static void ToBgra32(ReadOnlySpan source, Span dest) - => SimdUtils.Shuffle4Channel(source, dest, default); + => SimdUtils.Shuffle4(source, dest, default); } public static class FromBgra32 @@ -69,7 +69,7 @@ public static class FromBgra32 /// [MethodImpl(InliningOptions.ShortMethod)] public static void ToArgb32(ReadOnlySpan source, Span dest) - => SimdUtils.Shuffle4Channel(source, dest, default); + => SimdUtils.Shuffle4(source, dest, default); /// /// Converts a representing a collection of @@ -78,7 +78,7 @@ public static void ToArgb32(ReadOnlySpan source, Span dest) /// [MethodImpl(InliningOptions.ShortMethod)] public static void ToRgba32(ReadOnlySpan source, Span dest) - => SimdUtils.Shuffle4Channel(source, dest, default); + => SimdUtils.Shuffle4(source, dest, default); } } } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs new file mode 100644 index 0000000000..c529b2af1e --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs @@ -0,0 +1,67 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using BenchmarkDotNet.Attributes; + +namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk +{ + [Config(typeof(Config.HwIntrinsics_SSE_AVX))] + public class Pad3Shuffle4Channel + { + private byte[] source; + private byte[] destination; + + [GlobalSetup] + public void Setup() + { + this.source = new byte[this.Count]; + new Random(this.Count).NextBytes(this.source); + this.destination = new byte[this.Count]; + } + + [Params(96, 384, 768, 1536)] + public int Count { get; set; } + + [Benchmark] + public void Shuffle4Channel() + { + SimdUtils.Shuffle4(this.source, this.destination, default); + } + } + + // 2020-10-29 + // ########## + // + // BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1) + // Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores + // .NET Core SDK=3.1.403 + // [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // + // Runtime=.NET Core 3.1 + // + // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |---------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:| + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 17.39 ns | 0.187 ns | 0.175 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 128 | 21.72 ns | 0.299 ns | 0.279 ns | 1.25 | 0.02 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 128 | 18.10 ns | 0.346 ns | 0.289 ns | 1.04 | 0.02 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 35.51 ns | 0.711 ns | 0.790 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 256 | 23.90 ns | 0.508 ns | 0.820 ns | 0.69 | 0.02 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 256 | 20.40 ns | 0.133 ns | 0.111 ns | 0.57 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 73.39 ns | 0.310 ns | 0.259 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 512 | 26.10 ns | 0.418 ns | 0.391 ns | 0.36 | 0.01 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 512 | 27.59 ns | 0.556 ns | 0.571 ns | 0.38 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 150.64 ns | 2.903 ns | 2.716 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 1024 | 38.67 ns | 0.801 ns | 1.889 ns | 0.24 | 0.02 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 47.13 ns | 0.948 ns | 1.054 ns | 0.31 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 315.29 ns | 5.206 ns | 6.583 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 2048 | 57.37 ns | 1.152 ns | 1.078 ns | 0.18 | 0.01 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 65.75 ns | 1.198 ns | 1.600 ns | 0.21 | 0.01 | - | - | - | - | +} diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs index 749859eac9..db49470011 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs @@ -26,7 +26,7 @@ public void Setup() [Benchmark] public void Shuffle4Channel() { - SimdUtils.Shuffle4Channel(this.source, this.destination, default); + SimdUtils.Shuffle4(this.source, this.destination, default); } } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs index 6f5b5001be..4a2512fea2 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs @@ -27,7 +27,7 @@ public void Setup() [Benchmark] public void Shuffle4Channel() { - SimdUtils.Shuffle4Channel(this.source, this.destination, control); + SimdUtils.Shuffle4(this.source, this.destination, control); } } diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs index 06f61e617d..1c456e5a22 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs @@ -22,7 +22,7 @@ static void RunTest(string serialized) TestShuffleFloat4Channel( size, - (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, control), + (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, control), control); } @@ -49,43 +49,43 @@ static void RunTest(string serialized) WXYZShuffle4 wxyz = default; TestShuffleByte4Channel( size, - (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, wxyz), + (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, wxyz), wxyz.Control); WZYXShuffle4 wzyx = default; TestShuffleByte4Channel( size, - (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, wzyx), + (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, wzyx), wzyx.Control); YZWXShuffle4 yzwx = default; TestShuffleByte4Channel( size, - (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, yzwx), + (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, yzwx), yzwx.Control); ZYXWShuffle4 zyxw = default; TestShuffleByte4Channel( size, - (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, zyxw), + (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, zyxw), zyxw.Control); var xwyz = new DefaultShuffle4(2, 1, 3, 0); TestShuffleByte4Channel( size, - (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, xwyz), + (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, xwyz), xwyz.Control); var yyyy = new DefaultShuffle4(1, 1, 1, 1); TestShuffleByte4Channel( size, - (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, yyyy), + (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, yyyy), yyyy.Control); var wwww = new DefaultShuffle4(3, 3, 3, 3); TestShuffleByte4Channel( size, - (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, wwww), + (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, wwww), wwww.Control); } } @@ -97,6 +97,29 @@ static void RunTest(string serialized) HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE); } + [Theory] + [MemberData(nameof(ArraySizesDivisibleBy3))] + public void BulkPad3Shuffle4Channel(int count) + { + static void RunTest(string serialized) + { + // No need to test multiple shuffle controls as the + // pipeline is always the same. + int size = FeatureTestRunner.Deserialize(serialized); + byte control = default(WZYXShuffle4).Control; + + TestPad3Shuffle4Channel( + size, + (s, d) => SimdUtils.Pad3Shuffle4(s.Span, d.Span, control), + control); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + count, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE); + } + private static void TestShuffleFloat4Channel( int count, Action, Memory> convert, @@ -157,5 +180,37 @@ private static void TestShuffleByte4Channel( Assert.Equal(expected, result); } + + private static void TestPad3Shuffle4Channel( + int count, + Action, Memory> convert, + byte control) + { + byte[] source = new byte[count]; + new Random(count).NextBytes(source); + + var result = new byte[(int)(count * (4 / 3F))]; + + byte[] expected = new byte[result.Length]; + + SimdUtils.Shuffle.InverseMmShuffle( + control, + out int p3, + out int p2, + out int p1, + out int p0); + + for (int i = 0, j = 0; i < expected.Length; i += 4, j += 3) + { + expected[p0 + i] = source[j]; + expected[p1 + i] = source[j + 1]; + expected[p2 + i] = source[j + 2]; + expected[p3 + i] = byte.MaxValue; + } + + convert(source, result); + + Assert.Equal(expected, result); + } } } diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index bddadff4da..fe432107a2 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -163,6 +163,8 @@ public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithNonRoundedDa public static readonly TheoryData ArraySizesDivisibleBy8 = new TheoryData { 0, 8, 16, 1024 }; public static readonly TheoryData ArraySizesDivisibleBy4 = new TheoryData { 0, 4, 8, 28, 1020 }; + public static readonly TheoryData ArraySizesDivisibleBy3 = new TheoryData { 0, 3, 9, 36, 957 }; + public static readonly TheoryData ArraySizesDivisibleBy32 = new TheoryData { 0, 32, 512 }; From 1d21dc9b59b727cdf6cb463184905932365c147a Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 30 Oct 2020 23:03:47 +0000 Subject: [PATCH 02/20] Add Shuffle4Slice3 --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 77 +++++++++++++++++++ .../Common/Helpers/SimdUtils.Shuffle.cs | 72 +++++++++++++++-- .../Color/Bulk/Pad3Shuffle4Channel.cs | 47 ++++++----- .../Color/Bulk/Shuffle4Slice3Channel.cs | 68 ++++++++++++++++ .../Color/Bulk/ShuffleFloat4Channel.cs | 4 +- .../Common/SimdUtilsTests.Shuffle.cs | 64 +++++++++++++++ 6 files changed, 299 insertions(+), 33 deletions(-) create mode 100644 tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle4Slice3Channel.cs diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 8a0b5460c5..29b569a809 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -119,6 +119,39 @@ public static unsafe void Pad3Shuffle4Reduce( } } + /// + /// Shuffles then slices 8-bit integers within 128-bit lanes in + /// using the control and store the results in . + /// + /// The source span of bytes. + /// The destination span of bytes. + /// The byte control. + [MethodImpl(InliningOptions.ShortMethod)] + public static unsafe void Shuffle4Slice3Reduce( + ref ReadOnlySpan source, + ref Span dest, + byte control) + { + if (Ssse3.IsSupported) + { + int remainder = ImageMaths.ModuloP2(dest.Length, Vector128.Count); + + int adjustedCount = dest.Length - remainder; + int destSlice = (int)(adjustedCount * (3 / 4F)); + + if (adjustedCount > 0) + { + Shuffle4Slice3( + source.Slice(0, adjustedCount), + dest.Slice(0, adjustedCount), + control); + + source = source.Slice(adjustedCount); + dest = dest.Slice(destSlice); + } + } + } + [MethodImpl(InliningOptions.ShortMethod)] private static void Shuffle4( ReadOnlySpan source, @@ -316,6 +349,50 @@ private static unsafe void Pad3Shuffle4( } } + [MethodImpl(InliningOptions.ShortMethod)] + private static unsafe void Shuffle4Slice3( + ReadOnlySpan source, + Span dest, + byte control) + { + if (Ssse3.IsSupported) + { + Vector128 sliceMask = Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1).AsByte(); + + Span bytes = stackalloc byte[Vector128.Count]; + Shuffle.MmShuffleSpan(ref bytes, control); + Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + + // var control = MmShuffle(3, 0, 1, 2); + // Span bytes = stackalloc byte[Vector128.Count]; + // MmShuffleSpan(ref bytes, control); + // Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + // + // Vector128 s0 = Vector128.Create((byte)1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1).Dump("s0"); + // Vector128 padded = Ssse3.Shuffle(s0, padMask).Dump("padded"); + // + // padded = Sse3.Or(Vector128.Create(0xff000000u).AsByte(), padded).Dump("0r"); + // + // var shuffled = Ssse3.Shuffle(padded, vcm).Dump("shuffled"); + // var d0 = Ssse3.Shuffle(shuffled, sliceMask).Dump("d0"); + fixed (byte* sBase = &source.GetPinnableReference()) + fixed (byte* dBase = &dest.GetPinnableReference()) + { + byte* s = sBase; + byte* d = dBase; + + for (int i = 0; i < source.Length; i += 16) + { + Vector128 vs0 = Ssse3.Shuffle(Sse2.LoadVector128(s), vcm); + Sse2.Store(d, Ssse3.Shuffle(vs0, sliceMask)); + + s += 16; + d += 12; + } + } + } + } + /// /// Performs a multiplication and an addition of the . /// diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index 81d77d6551..f3946361b1 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -69,7 +69,7 @@ public static void Pad3Shuffle4( Span dest, byte control) { - VerifyPadShuffleSpanInput(source, dest); + VerifyPad3Shuffle4SpanInput(source, dest); #if SUPPORTS_RUNTIME_INTRINSICS HwIntrinsics.Pad3Shuffle4Reduce(ref source, ref dest, control); @@ -82,6 +82,25 @@ public static void Pad3Shuffle4( } } + [MethodImpl(InliningOptions.ShortMethod)] + public static void Shuffle4Slice3( + ReadOnlySpan source, + Span dest, + byte control) + { + VerifyShuffle4Slice3SpanInput(source, dest); + +#if SUPPORTS_RUNTIME_INTRINSICS + HwIntrinsics.Shuffle4Slice3Reduce(ref source, ref dest, control); +#endif + + // Deal with the remainder: + if (source.Length > 0) + { + Shuffle4Slice3Remainder(source, dest, control); + } + } + public static void Shuffle4Remainder( ReadOnlySpan source, Span dest, @@ -118,6 +137,23 @@ public static void Pad3Shuffle4Remainder( } } + public static void Shuffle4Slice3Remainder( + ReadOnlySpan source, + Span dest, + byte control) + { + ref byte sBase = ref MemoryMarshal.GetReference(source); + ref byte dBase = ref MemoryMarshal.GetReference(dest); + Shuffle.InverseMmShuffle(control, out int _, out int p2, out int p1, out int p0); + + for (int i = 0, j = 0; i < dest.Length; i += 3, j += 4) + { + Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + j); + Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + j); + Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + j); + } + } + [Conditional("DEBUG")] private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span dest) where T : struct @@ -130,21 +166,45 @@ private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span de DebugGuard.IsTrue( source.Length % 4 == 0, nameof(source), - "Input spans must be divisiable by 4!"); + "Input spans must be divisable by 4!"); } [Conditional("DEBUG")] - private static void VerifyPadShuffleSpanInput(ReadOnlySpan source, Span dest) + private static void VerifyPad3Shuffle4SpanInput(ReadOnlySpan source, Span dest) { + DebugGuard.IsTrue( + source.Length % 3 == 0, + nameof(source), + "Input span must be divisable by 3!"); + + DebugGuard.IsTrue( + dest.Length % 4 == 0, + nameof(dest), + "Output span must be divisable by 4!"); + DebugGuard.IsTrue( source.Length == (int)(dest.Length * 3 / 4F), nameof(source), - "Input spans must be 3/4 the length of the output span!"); + "Input span must be 3/4 the length of the output span!"); + } + [Conditional("DEBUG")] + private static void VerifyShuffle4Slice3SpanInput(ReadOnlySpan source, Span dest) + { DebugGuard.IsTrue( - source.Length % 3 == 0, + source.Length % 4 == 0, + nameof(source), + "Input span must be divisable by 4!"); + + DebugGuard.IsTrue( + dest.Length % 3 == 0, + nameof(dest), + "Output span must be divisable by 3!"); + + DebugGuard.IsTrue( + source.Length == (int)(dest.Length * 4 / 3F), nameof(source), - "Input spans must be divisiable by 3!"); + "Output span must be 3/4 the length of the input span!"); } public static class Shuffle diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs index c529b2af1e..8286fea0e5 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs @@ -9,6 +9,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Config(typeof(Config.HwIntrinsics_SSE_AVX))] public class Pad3Shuffle4Channel { + private static readonly byte Control = default(WXYZShuffle4).Control; private byte[] source; private byte[] destination; @@ -17,20 +18,20 @@ public void Setup() { this.source = new byte[this.Count]; new Random(this.Count).NextBytes(this.source); - this.destination = new byte[this.Count]; + this.destination = new byte[(int)(this.Count * (4 / 3F))]; } [Params(96, 384, 768, 1536)] public int Count { get; set; } [Benchmark] - public void Shuffle4Channel() + public void Pad3Shuffle4() { - SimdUtils.Shuffle4(this.source, this.destination, default); + SimdUtils.Pad3Shuffle4(this.source, this.destination, Control); } } - // 2020-10-29 + // 2020-10-30 // ########## // // BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1) @@ -43,25 +44,21 @@ public void Shuffle4Channel() // // Runtime=.NET Core 3.1 // - // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | - // |---------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:| - // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 17.39 ns | 0.187 ns | 0.175 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 2. AVX | Empty | 128 | 21.72 ns | 0.299 ns | 0.279 ns | 1.25 | 0.02 | - | - | - | - | - // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 128 | 18.10 ns | 0.346 ns | 0.289 ns | 1.04 | 0.02 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 35.51 ns | 0.711 ns | 0.790 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 2. AVX | Empty | 256 | 23.90 ns | 0.508 ns | 0.820 ns | 0.69 | 0.02 | - | - | - | - | - // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 256 | 20.40 ns | 0.133 ns | 0.111 ns | 0.57 | 0.01 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 73.39 ns | 0.310 ns | 0.259 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 2. AVX | Empty | 512 | 26.10 ns | 0.418 ns | 0.391 ns | 0.36 | 0.01 | - | - | - | - | - // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 512 | 27.59 ns | 0.556 ns | 0.571 ns | 0.38 | 0.01 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 150.64 ns | 2.903 ns | 2.716 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 2. AVX | Empty | 1024 | 38.67 ns | 0.801 ns | 1.889 ns | 0.24 | 0.02 | - | - | - | - | - // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 47.13 ns | 0.948 ns | 1.054 ns | 0.31 | 0.01 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 315.29 ns | 5.206 ns | 6.583 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 2. AVX | Empty | 2048 | 57.37 ns | 1.152 ns | 1.078 ns | 0.18 | 0.01 | - | - | - | - | - // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 65.75 ns | 1.198 ns | 1.600 ns | 0.21 | 0.01 | - | - | - | - | + // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:| + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 62.91 ns | 1.240 ns | 1.569 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 96 | 44.34 ns | 0.371 ns | 0.329 ns | 0.70 | 0.02 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 96 | 44.46 ns | 0.617 ns | 0.515 ns | 0.70 | 0.02 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 247.93 ns | 2.640 ns | 2.470 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 384 | 92.91 ns | 1.204 ns | 1.127 ns | 0.37 | 0.01 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 384 | 91.42 ns | 1.234 ns | 1.094 ns | 0.37 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 444.79 ns | 5.094 ns | 4.254 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 768 | 162.92 ns | 1.046 ns | 0.873 ns | 0.37 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 768 | 166.22 ns | 1.728 ns | 1.443 ns | 0.37 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 882.51 ns | 6.936 ns | 5.792 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 1536 | 309.72 ns | 3.777 ns | 3.533 ns | 0.35 | 0.01 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 323.18 ns | 4.079 ns | 3.816 ns | 0.37 | 0.00 | - | - | - | - | } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle4Slice3Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle4Slice3Channel.cs new file mode 100644 index 0000000000..b64379959d --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle4Slice3Channel.cs @@ -0,0 +1,68 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using BenchmarkDotNet.Attributes; + +namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk +{ + [Config(typeof(Config.HwIntrinsics_SSE_AVX))] + public class Shuffle4Slice3Channel + { + private static readonly byte Control = default(WXYZShuffle4).Control; + private byte[] source; + private byte[] destination; + + [GlobalSetup] + public void Setup() + { + this.source = new byte[this.Count]; + new Random(this.Count).NextBytes(this.source); + this.destination = new byte[(int)(this.Count * (3 / 4F))]; + } + + [Params(128, 256, 512, 1024, 2048)] + public int Count { get; set; } + + [Benchmark] + public void Shuffle4Slice3() + { + SimdUtils.Shuffle4Slice3(this.source, this.destination, Control); + } + } + + // 2020-10-29 + // ########## + // + // BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1) + // Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores + // .NET Core SDK=3.1.403 + // [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // + // Runtime=.NET Core 3.1 + // + // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Median | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |---------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|----------:|------:|--------:|------:|------:|------:|----------:| + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 50.09 ns | 1.018 ns | 1.460 ns | 49.16 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 128 | 35.28 ns | 0.106 ns | 0.089 ns | 35.30 ns | 0.69 | 0.02 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 128 | 35.13 ns | 0.247 ns | 0.231 ns | 35.22 ns | 0.69 | 0.02 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 101.48 ns | 0.875 ns | 0.819 ns | 101.60 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 256 | 53.25 ns | 0.518 ns | 0.433 ns | 53.21 ns | 0.52 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 256 | 57.21 ns | 0.508 ns | 0.451 ns | 57.38 ns | 0.56 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 202.53 ns | 0.884 ns | 0.827 ns | 202.40 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 512 | 82.55 ns | 0.418 ns | 0.391 ns | 82.59 ns | 0.41 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 512 | 82.89 ns | 1.057 ns | 0.989 ns | 82.48 ns | 0.41 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 398.79 ns | 7.807 ns | 6.921 ns | 395.67 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 1024 | 144.51 ns | 1.033 ns | 0.966 ns | 144.42 ns | 0.36 | 0.01 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 143.77 ns | 0.820 ns | 0.684 ns | 143.62 ns | 0.36 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 798.44 ns | 4.447 ns | 3.472 ns | 799.39 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 2048 | 277.12 ns | 1.723 ns | 1.612 ns | 276.93 ns | 0.35 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 275.70 ns | 1.796 ns | 1.500 ns | 275.51 ns | 0.35 | 0.00 | - | - | - | - || +} diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs index 4a2512fea2..86b1f766e1 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs @@ -10,7 +10,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Config(typeof(Config.HwIntrinsics_SSE_AVX))] public class ShuffleFloat4Channel { - private static readonly byte control = default(WXYZShuffle4).Control; + private static readonly byte Control = default(WXYZShuffle4).Control; private float[] source; private float[] destination; @@ -27,7 +27,7 @@ public void Setup() [Benchmark] public void Shuffle4Channel() { - SimdUtils.Shuffle4(this.source, this.destination, control); + SimdUtils.Shuffle4(this.source, this.destination, Control); } } diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs index 1c456e5a22..f801cd28b5 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs @@ -120,6 +120,29 @@ static void RunTest(string serialized) HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE); } + [Theory] + [MemberData(nameof(ArraySizesDivisibleBy4))] + public void BulkShuffle4Slice3Channel(int count) + { + static void RunTest(string serialized) + { + // No need to test multiple shuffle controls as the + // pipeline is always the same. + int size = FeatureTestRunner.Deserialize(serialized); + byte control = default(WZYXShuffle4).Control; + + TestShuffle4Slice3Channel( + size, + (s, d) => SimdUtils.Shuffle4Slice3(s.Span, d.Span, control), + control); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + count, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE); + } + private static void TestShuffleFloat4Channel( int count, Action, Memory> convert, @@ -210,6 +233,47 @@ private static void TestPad3Shuffle4Channel( convert(source, result); + for (int i = 0; i < expected.Length; i++) + { + Assert.Equal(expected[i], result[i]); + } + + Assert.Equal(expected, result); + } + + private static void TestShuffle4Slice3Channel( + int count, + Action, Memory> convert, + byte control) + { + byte[] source = new byte[count]; + new Random(count).NextBytes(source); + + var result = new byte[(int)(count * (3 / 4F))]; + + byte[] expected = new byte[result.Length]; + + SimdUtils.Shuffle.InverseMmShuffle( + control, + out int _, + out int p2, + out int p1, + out int p0); + + for (int i = 0, j = 0; i < expected.Length; i += 3, j += 4) + { + expected[i] = source[p0 + j]; + expected[i + 1] = source[p1 + j]; + expected[i + 2] = source[p2 + j]; + } + + convert(source, result); + + for (int i = 0; i < expected.Length; i++) + { + Assert.Equal(expected[i], result[i]); + } + Assert.Equal(expected, result); } } From 9f38d40221ed98dbd8a8d6e1e8d6d5c07e282b2a Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 30 Oct 2020 23:22:17 +0000 Subject: [PATCH 03/20] Cleanup --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 29b569a809..5cba631ec9 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -363,18 +363,6 @@ private static unsafe void Shuffle4Slice3( Shuffle.MmShuffleSpan(ref bytes, control); Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); - // var control = MmShuffle(3, 0, 1, 2); - // Span bytes = stackalloc byte[Vector128.Count]; - // MmShuffleSpan(ref bytes, control); - // Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); - // - // Vector128 s0 = Vector128.Create((byte)1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1).Dump("s0"); - // Vector128 padded = Ssse3.Shuffle(s0, padMask).Dump("padded"); - // - // padded = Sse3.Or(Vector128.Create(0xff000000u).AsByte(), padded).Dump("0r"); - // - // var shuffled = Ssse3.Shuffle(padded, vcm).Dump("shuffled"); - // var d0 = Ssse3.Shuffle(shuffled, sliceMask).Dump("d0"); fixed (byte* sBase = &source.GetPinnableReference()) fixed (byte* dBase = &dest.GetPinnableReference()) { From 1b85483d03b020b84ec89b0b4b061d509bd05918 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Sat, 31 Oct 2020 00:23:38 +0000 Subject: [PATCH 04/20] fix spans directly --- src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 5cba631ec9..d6e45026b7 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -327,8 +327,8 @@ private static unsafe void Pad3Shuffle4( Shuffle.MmShuffleSpan(ref bytes, control); Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); - fixed (byte* sBase = &source.GetPinnableReference()) - fixed (byte* dBase = &dest.GetPinnableReference()) + fixed (byte* sBase = source) + fixed (byte* dBase = dest) { byte* s = sBase; byte* d = dBase; @@ -363,8 +363,8 @@ private static unsafe void Shuffle4Slice3( Shuffle.MmShuffleSpan(ref bytes, control); Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); - fixed (byte* sBase = &source.GetPinnableReference()) - fixed (byte* dBase = &dest.GetPinnableReference()) + fixed (byte* sBase = source) + fixed (byte* dBase = dest) { byte* s = sBase; byte* d = dBase; From 21611e1911dda235509ffe02567a37ef0d1b4287 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Sat, 31 Oct 2020 18:58:59 +0000 Subject: [PATCH 05/20] Faster Pad3Shuffle4 --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 58 ++++++++++--------- .../Common/Helpers/SimdUtils.Shuffle.cs | 2 +- .../Color/Bulk/Pad3Shuffle4Channel.cs | 34 +++++------ .../Common/SimdUtilsTests.Shuffle.cs | 4 +- 4 files changed, 52 insertions(+), 46 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index d6e45026b7..5083a3c03d 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -101,20 +101,20 @@ public static unsafe void Pad3Shuffle4Reduce( { if (Ssse3.IsSupported) { - int remainder = ImageMaths.ModuloP2(source.Length, Vector128.Count); + int remainder = source.Length % (Vector128.Count * 3); - int adjustedCount = source.Length - remainder; - int sourceSlice = (int)(adjustedCount * (3 / 4F)); + int sourceCount = source.Length - remainder; + int destCount = (int)(sourceCount * (4 / 3D)); - if (adjustedCount > 0) + if (sourceCount > 0) { Pad3Shuffle4( - source.Slice(0, adjustedCount), - dest.Slice(0, adjustedCount), + source.Slice(0, sourceCount), + dest.Slice(0, destCount), control); - source = source.Slice(sourceSlice); - dest = dest.Slice(adjustedCount); + source = source.Slice(sourceCount); + dest = dest.Slice(destCount); } } } @@ -320,31 +320,37 @@ private static unsafe void Pad3Shuffle4( { if (Ssse3.IsSupported) { - Vector128 wMask = Vector128.Create(0xff000000u).AsByte(); - Vector128 padMask = Vector128.Create(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1).AsByte(); + Vector128 vmask = Vector128.Create(0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80).AsByte(); + Vector128 vfill = Vector128.Create(0xff000000ff000000ul).AsByte(); Span bytes = stackalloc byte[Vector128.Count]; Shuffle.MmShuffleSpan(ref bytes, control); - Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + Vector128 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); - fixed (byte* sBase = source) - fixed (byte* dBase = dest) + ref Vector128 sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + + ref Vector128 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + int n = source.Length / Vector128.Count; + + for (int i = 0, j = 0; i < n; i += 3, j += 4) { - byte* s = sBase; - byte* d = dBase; + ref Vector128 v0 = ref Unsafe.Add(ref sourceBase, i); + Vector128 v1 = Unsafe.Add(ref v0, 1); + Vector128 v2 = Unsafe.Add(ref v0, 2); + Vector128 v3 = Sse2.ShiftRightLogical128BitLane(v2, 4); - // TODO: Consider unrolling and shuffling 4 at a time using Ssse3.AlignRight - // See https://stackoverflow.com/questions/2973708/fast-24-bit-array-32-bit-array-conversion - for (int i = 0; i < source.Length; i += 16) - { - Vector128 vs0 = Sse2.LoadVector128(s); - Vector128 val = Sse2.Or(wMask, Ssse3.Shuffle(vs0, padMask)); - val = Ssse3.Shuffle(val, vcm); - Sse2.Store(d, val); + v2 = Ssse3.AlignRight(v2, v1, 8); + v1 = Ssse3.AlignRight(v1, v0, 12); - s += 12; - d += 16; - } + ref Vector128 vd = ref Unsafe.Add(ref destBase, j); + + vd = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v0, vmask), vfill), vshuffle); + Unsafe.Add(ref vd, 1) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v1, vmask), vfill), vshuffle); + Unsafe.Add(ref vd, 2) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v2, vmask), vfill), vshuffle); + Unsafe.Add(ref vd, 3) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v3, vmask), vfill), vshuffle); } } } diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index f3946361b1..54ca2a73ec 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -183,7 +183,7 @@ private static void VerifyPad3Shuffle4SpanInput(ReadOnlySpan source, Span< "Output span must be divisable by 4!"); DebugGuard.IsTrue( - source.Length == (int)(dest.Length * 3 / 4F), + source.Length == (int)(dest.Length * 3 / 4D), nameof(source), "Input span must be 3/4 the length of the output span!"); } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs index 8286fea0e5..9eb1e109be 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs @@ -44,21 +44,21 @@ public void Pad3Shuffle4() // // Runtime=.NET Core 3.1 // - // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | - // |------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:| - // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 62.91 ns | 1.240 ns | 1.569 ns | 1.00 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 2. AVX | Empty | 96 | 44.34 ns | 0.371 ns | 0.329 ns | 0.70 | 0.02 | - | - | - | - | - // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 96 | 44.46 ns | 0.617 ns | 0.515 ns | 0.70 | 0.02 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 247.93 ns | 2.640 ns | 2.470 ns | 1.00 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 2. AVX | Empty | 384 | 92.91 ns | 1.204 ns | 1.127 ns | 0.37 | 0.01 | - | - | - | - | - // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 384 | 91.42 ns | 1.234 ns | 1.094 ns | 0.37 | 0.01 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 444.79 ns | 5.094 ns | 4.254 ns | 1.00 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 2. AVX | Empty | 768 | 162.92 ns | 1.046 ns | 0.873 ns | 0.37 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 768 | 166.22 ns | 1.728 ns | 1.443 ns | 0.37 | 0.00 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 882.51 ns | 6.936 ns | 5.792 ns | 1.00 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 2. AVX | Empty | 1536 | 309.72 ns | 3.777 ns | 3.533 ns | 0.35 | 0.01 | - | - | - | - | - // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 323.18 ns | 4.079 ns | 3.816 ns | 0.37 | 0.00 | - | - | - | - | + // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |------------- |------------------- |-------------------------------------------------- |------ |----------:|----------:|---------:|------:|--------:|------:|------:|------:|----------:| + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 71.84 ns | 1.491 ns | 3.146 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 96 | 23.14 ns | 0.186 ns | 0.165 ns | 0.33 | 0.02 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 96 | 22.94 ns | 0.127 ns | 0.112 ns | 0.33 | 0.02 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 230.90 ns | 0.877 ns | 0.777 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 384 | 39.54 ns | 0.601 ns | 0.533 ns | 0.17 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 384 | 39.88 ns | 0.657 ns | 0.548 ns | 0.17 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 451.67 ns | 2.932 ns | 2.448 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 768 | 60.79 ns | 1.200 ns | 1.561 ns | 0.13 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 768 | 62.30 ns | 0.469 ns | 0.416 ns | 0.14 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 896.11 ns | 10.358 ns | 9.689 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 1536 | 109.72 ns | 2.149 ns | 2.300 ns | 0.12 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 108.70 ns | 0.994 ns | 0.881 ns | 0.12 | 0.00 | - | - | - | - | } diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs index f801cd28b5..26f85dd76c 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs @@ -212,7 +212,7 @@ private static void TestPad3Shuffle4Channel( byte[] source = new byte[count]; new Random(count).NextBytes(source); - var result = new byte[(int)(count * (4 / 3F))]; + var result = new byte[(int)(count * (4 / 3D))]; byte[] expected = new byte[result.Length]; @@ -249,7 +249,7 @@ private static void TestShuffle4Slice3Channel( byte[] source = new byte[count]; new Random(count).NextBytes(source); - var result = new byte[(int)(count * (3 / 4F))]; + var result = new byte[(int)(count * (3 / 4D))]; byte[] expected = new byte[result.Length]; From f462bfe7f02168002984dac415af46f780be6ff7 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Sat, 31 Oct 2020 21:35:23 +0000 Subject: [PATCH 06/20] Faster Shuffle4Slice3 --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 88 ++++++++++++------- .../Common/Helpers/SimdUtils.Shuffle.cs | 4 +- 2 files changed, 56 insertions(+), 36 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 5083a3c03d..abda6c4df6 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -104,7 +104,7 @@ public static unsafe void Pad3Shuffle4Reduce( int remainder = source.Length % (Vector128.Count * 3); int sourceCount = source.Length - remainder; - int destCount = (int)(sourceCount * (4 / 3D)); + int destCount = sourceCount * 4 / 3; if (sourceCount > 0) { @@ -134,20 +134,20 @@ public static unsafe void Shuffle4Slice3Reduce( { if (Ssse3.IsSupported) { - int remainder = ImageMaths.ModuloP2(dest.Length, Vector128.Count); + int remainder = source.Length % (Vector128.Count * 4); - int adjustedCount = dest.Length - remainder; - int destSlice = (int)(adjustedCount * (3 / 4F)); + int sourceCount = source.Length - remainder; + int destCount = sourceCount * 3 / 4; - if (adjustedCount > 0) + if (sourceCount > 0) { Shuffle4Slice3( - source.Slice(0, adjustedCount), - dest.Slice(0, adjustedCount), + source.Slice(0, sourceCount), + dest.Slice(0, destCount), control); - source = source.Slice(adjustedCount); - dest = dest.Slice(destSlice); + source = source.Slice(sourceCount); + dest = dest.Slice(destCount); } } } @@ -243,7 +243,7 @@ private static void Shuffle4( // We can add static ROS instances if need be in the future. Span bytes = stackalloc byte[Vector256.Count]; Shuffle.MmShuffleSpan(ref bytes, control); - Vector256 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + Vector256 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); ref Vector256 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); @@ -260,17 +260,17 @@ private static void Shuffle4( ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector256 vd0 = ref Unsafe.Add(ref destBase, i); - vd0 = Avx2.Shuffle(vs0, vcm); - Unsafe.Add(ref vd0, 1) = Avx2.Shuffle(Unsafe.Add(ref vs0, 1), vcm); - Unsafe.Add(ref vd0, 2) = Avx2.Shuffle(Unsafe.Add(ref vs0, 2), vcm); - Unsafe.Add(ref vd0, 3) = Avx2.Shuffle(Unsafe.Add(ref vs0, 3), vcm); + vd0 = Avx2.Shuffle(vs0, vshuffle); + Unsafe.Add(ref vd0, 1) = Avx2.Shuffle(Unsafe.Add(ref vs0, 1), vshuffle); + Unsafe.Add(ref vd0, 2) = Avx2.Shuffle(Unsafe.Add(ref vs0, 2), vshuffle); + Unsafe.Add(ref vd0, 3) = Avx2.Shuffle(Unsafe.Add(ref vs0, 3), vshuffle); } if (m > 0) { for (int i = u; i < n; i++) { - Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vcm); + Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vshuffle); } } } @@ -279,7 +279,7 @@ private static void Shuffle4( // Ssse3 Span bytes = stackalloc byte[Vector128.Count]; Shuffle.MmShuffleSpan(ref bytes, control); - Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + Vector128 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); ref Vector128 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); @@ -296,17 +296,17 @@ private static void Shuffle4( ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector128 vd0 = ref Unsafe.Add(ref destBase, i); - vd0 = Ssse3.Shuffle(vs0, vcm); - Unsafe.Add(ref vd0, 1) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 1), vcm); - Unsafe.Add(ref vd0, 2) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 2), vcm); - Unsafe.Add(ref vd0, 3) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 3), vcm); + vd0 = Ssse3.Shuffle(vs0, vshuffle); + Unsafe.Add(ref vd0, 1) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 1), vshuffle); + Unsafe.Add(ref vd0, 2) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 2), vshuffle); + Unsafe.Add(ref vd0, 3) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 3), vshuffle); } if (m > 0) { for (int i = u; i < n; i++) { - Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(Unsafe.Add(ref sourceBase, i), vcm); + Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(Unsafe.Add(ref sourceBase, i), vshuffle); } } } @@ -363,26 +363,46 @@ private static unsafe void Shuffle4Slice3( { if (Ssse3.IsSupported) { - Vector128 sliceMask = Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1).AsByte(); + Vector128 vmasko = Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15).AsByte(); + Vector128 vmaske = Ssse3.AlignRight(vmasko, vmasko, 12).AsByte(); Span bytes = stackalloc byte[Vector128.Count]; Shuffle.MmShuffleSpan(ref bytes, control); - Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + Vector128 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + + ref Vector128 sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); - fixed (byte* sBase = source) - fixed (byte* dBase = dest) + ref Vector128 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + int n = source.Length / Vector128.Count; + + for (int i = 0, j = 0; i < n; i += 4, j += 3) { - byte* s = sBase; - byte* d = dBase; + ref Vector128 v0 = ref Unsafe.Add(ref sourceBase, i); + Vector128 v1 = Unsafe.Add(ref v0, 1); + Vector128 v2 = Unsafe.Add(ref v0, 2); + Vector128 v3 = Unsafe.Add(ref v0, 3); - for (int i = 0; i < source.Length; i += 16) - { - Vector128 vs0 = Ssse3.Shuffle(Sse2.LoadVector128(s), vcm); - Sse2.Store(d, Ssse3.Shuffle(vs0, sliceMask)); + v0 = Ssse3.Shuffle(Ssse3.Shuffle(v0, vshuffle), vmaske); + v1 = Ssse3.Shuffle(Ssse3.Shuffle(v1, vshuffle), vmasko); + v2 = Ssse3.Shuffle(Ssse3.Shuffle(v2, vshuffle), vmaske); + v3 = Ssse3.Shuffle(Ssse3.Shuffle(v3, vshuffle), vmasko); - s += 16; - d += 12; - } + v0 = Ssse3.AlignRight(v1, v0, 4); + v3 = Ssse3.AlignRight(v3, v2, 12); + + v1 = Sse2.ShiftLeftLogical128BitLane(v1, 4); + v2 = Sse2.ShiftRightLogical128BitLane(v2, 4); + + v1 = Ssse3.AlignRight(v2, v1, 8); + + ref Vector128 vd = ref Unsafe.Add(ref destBase, j); + + vd = v0; + Unsafe.Add(ref vd, 1) = v1; + Unsafe.Add(ref vd, 2) = v3; } } } diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index 54ca2a73ec..61c1ce48e2 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -183,7 +183,7 @@ private static void VerifyPad3Shuffle4SpanInput(ReadOnlySpan source, Span< "Output span must be divisable by 4!"); DebugGuard.IsTrue( - source.Length == (int)(dest.Length * 3 / 4D), + source.Length == dest.Length * 3 / 4, nameof(source), "Input span must be 3/4 the length of the output span!"); } @@ -202,7 +202,7 @@ private static void VerifyShuffle4Slice3SpanInput(ReadOnlySpan source, Spa "Output span must be divisable by 3!"); DebugGuard.IsTrue( - source.Length == (int)(dest.Length * 4 / 3F), + source.Length == dest.Length * 4 / 3, nameof(source), "Output span must be 3/4 the length of the input span!"); } From 2d1f2cc2c75255a234206954ccb3936ca115bc9b Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Sat, 31 Oct 2020 21:38:45 +0000 Subject: [PATCH 07/20] Update benchmark --- .../Color/Bulk/Shuffle4Slice3Channel.cs | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle4Slice3Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle4Slice3Channel.cs index b64379959d..e0fbe1c0b3 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle4Slice3Channel.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle4Slice3Channel.cs @@ -44,25 +44,25 @@ public void Shuffle4Slice3() // // Runtime=.NET Core 3.1 // - // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Median | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | - // |---------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|----------:|------:|--------:|------:|------:|------:|----------:| - // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 50.09 ns | 1.018 ns | 1.460 ns | 49.16 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 2. AVX | Empty | 128 | 35.28 ns | 0.106 ns | 0.089 ns | 35.30 ns | 0.69 | 0.02 | - | - | - | - | - // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 128 | 35.13 ns | 0.247 ns | 0.231 ns | 35.22 ns | 0.69 | 0.02 | - | - | - | - | - // | | | | | | | | | | | | | | | - // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 101.48 ns | 0.875 ns | 0.819 ns | 101.60 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 2. AVX | Empty | 256 | 53.25 ns | 0.518 ns | 0.433 ns | 53.21 ns | 0.52 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 256 | 57.21 ns | 0.508 ns | 0.451 ns | 57.38 ns | 0.56 | 0.01 | - | - | - | - | - // | | | | | | | | | | | | | | | - // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 202.53 ns | 0.884 ns | 0.827 ns | 202.40 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 2. AVX | Empty | 512 | 82.55 ns | 0.418 ns | 0.391 ns | 82.59 ns | 0.41 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 512 | 82.89 ns | 1.057 ns | 0.989 ns | 82.48 ns | 0.41 | 0.00 | - | - | - | - | - // | | | | | | | | | | | | | | | - // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 398.79 ns | 7.807 ns | 6.921 ns | 395.67 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 2. AVX | Empty | 1024 | 144.51 ns | 1.033 ns | 0.966 ns | 144.42 ns | 0.36 | 0.01 | - | - | - | - | - // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 143.77 ns | 0.820 ns | 0.684 ns | 143.62 ns | 0.36 | 0.01 | - | - | - | - | - // | | | | | | | | | | | | | | | - // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 798.44 ns | 4.447 ns | 3.472 ns | 799.39 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 2. AVX | Empty | 2048 | 277.12 ns | 1.723 ns | 1.612 ns | 276.93 ns | 0.35 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 275.70 ns | 1.796 ns | 1.500 ns | 275.51 ns | 0.35 | 0.00 | - | - | - | - || + // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |--------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|------:|------:|------:|------:|----------:| + // | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 52.24 ns | 1.081 ns | 1.062 ns | 1.00 | - | - | - | - | + // | Shuffle4Slice3 | 2. AVX | Empty | 128 | 25.52 ns | 0.189 ns | 0.158 ns | 0.49 | - | - | - | - | + // | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 128 | 26.11 ns | 0.524 ns | 0.644 ns | 0.50 | - | - | - | - | + // | | | | | | | | | | | | | + // | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 101.09 ns | 0.733 ns | 0.612 ns | 1.00 | - | - | - | - | + // | Shuffle4Slice3 | 2. AVX | Empty | 256 | 32.65 ns | 0.674 ns | 1.198 ns | 0.33 | - | - | - | - | + // | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 256 | 32.76 ns | 0.656 ns | 0.853 ns | 0.32 | - | - | - | - | + // | | | | | | | | | | | | | + // | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 209.58 ns | 3.826 ns | 5.957 ns | 1.00 | - | - | - | - | + // | Shuffle4Slice3 | 2. AVX | Empty | 512 | 46.32 ns | 0.729 ns | 1.296 ns | 0.22 | - | - | - | - | + // | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 512 | 46.97 ns | 0.196 ns | 0.183 ns | 0.22 | - | - | - | - | + // | | | | | | | | | | | | | + // | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 406.39 ns | 7.493 ns | 6.257 ns | 1.00 | - | - | - | - | + // | Shuffle4Slice3 | 2. AVX | Empty | 1024 | 74.53 ns | 1.509 ns | 1.678 ns | 0.18 | - | - | - | - | + // | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 74.04 ns | 0.703 ns | 0.657 ns | 0.18 | - | - | - | - | + // | | | | | | | | | | | | | + // | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 796.80 ns | 6.476 ns | 5.741 ns | 1.00 | - | - | - | - | + // | Shuffle4Slice3 | 2. AVX | Empty | 2048 | 130.70 ns | 2.512 ns | 2.227 ns | 0.16 | - | - | - | - | + // | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 129.42 ns | 2.555 ns | 2.133 ns | 0.16 | - | - | - | - | } From d5b257786ebdb20722dc4bd2161ee1d0056f7880 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Sun, 1 Nov 2020 23:36:12 +0000 Subject: [PATCH 08/20] Fast fallbacks --- .../{ => Shuffle}/IComponentShuffle.cs | 40 ++-- .../Common/Helpers/Shuffle/IPad3Shuffle4.cs | 96 ++++++++++ .../Common/Helpers/Shuffle/IShuffle4Slice3.cs | 74 ++++++++ .../Common/Helpers/SimdUtils.Shuffle.cs | 69 +++---- .../PixelFormats/Utils/PixelConverter.cs | 118 ++++++++++++ .../Color/Bulk/Pad3Shuffle4Channel.cs | 61 ++++-- .../Color/Bulk/Shuffle4Slice3Channel.cs | 71 ++++--- .../Common/SimdUtilsTests.Shuffle.cs | 173 +++++++++++------- 8 files changed, 541 insertions(+), 161 deletions(-) rename src/ImageSharp/Common/Helpers/{ => Shuffle}/IComponentShuffle.cs (85%) create mode 100644 src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs create mode 100644 src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs diff --git a/src/ImageSharp/Common/Helpers/IComponentShuffle.cs b/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs similarity index 85% rename from src/ImageSharp/Common/Helpers/IComponentShuffle.cs rename to src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs index e354a57b00..803321d06d 100644 --- a/src/ImageSharp/Common/Helpers/IComponentShuffle.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs @@ -30,13 +30,20 @@ internal interface IComponentShuffle internal readonly struct DefaultShuffle4 : IComponentShuffle { + private readonly byte p3; + private readonly byte p2; + private readonly byte p1; + private readonly byte p0; + public DefaultShuffle4(byte p3, byte p2, byte p1, byte p0) - : this(SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0)) { + this.p3 = p3; + this.p2 = p2; + this.p1 = p1; + this.p0 = p0; + this.Control = SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0); } - public DefaultShuffle4(byte control) => this.Control = control; - public byte Control { get; } [MethodImpl(InliningOptions.ShortMethod)] @@ -44,12 +51,11 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) { ref byte sBase = ref MemoryMarshal.GetReference(source); ref byte dBase = ref MemoryMarshal.GetReference(dest); - SimdUtils.Shuffle.InverseMmShuffle( - this.Control, - out int p3, - out int p2, - out int p1, - out int p0); + + int p3 = this.p3; + int p2 = this.p2; + int p1 = this.p1; + int p0 = this.p0; for (int i = 0; i < source.Length; i += 4) { @@ -63,7 +69,9 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) internal readonly struct WXYZShuffle4 : IComponentShuffle { - public byte Control => SimdUtils.Shuffle.MmShuffle(2, 1, 0, 3); + private static readonly byte WXYZ = SimdUtils.Shuffle.MmShuffle(2, 1, 0, 3); + + public byte Control => WXYZ; [MethodImpl(InliningOptions.ShortMethod)] public void RunFallbackShuffle(ReadOnlySpan source, Span dest) @@ -89,7 +97,9 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) internal readonly struct WZYXShuffle4 : IComponentShuffle { - public byte Control => SimdUtils.Shuffle.MmShuffle(0, 1, 2, 3); + private static readonly byte WZYX = SimdUtils.Shuffle.MmShuffle(0, 1, 2, 3); + + public byte Control => WZYX; [MethodImpl(InliningOptions.ShortMethod)] public void RunFallbackShuffle(ReadOnlySpan source, Span dest) @@ -112,7 +122,9 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) internal readonly struct YZWXShuffle4 : IComponentShuffle { - public byte Control => SimdUtils.Shuffle.MmShuffle(0, 3, 2, 1); + private static readonly byte YZWX = SimdUtils.Shuffle.MmShuffle(0, 3, 2, 1); + + public byte Control => YZWX; [MethodImpl(InliningOptions.ShortMethod)] public void RunFallbackShuffle(ReadOnlySpan source, Span dest) @@ -135,7 +147,9 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) internal readonly struct ZYXWShuffle4 : IComponentShuffle { - public byte Control => SimdUtils.Shuffle.MmShuffle(3, 0, 1, 2); + private static readonly byte ZYXW = SimdUtils.Shuffle.MmShuffle(3, 0, 1, 2); + + public byte Control => ZYXW; [MethodImpl(InliningOptions.ShortMethod)] public void RunFallbackShuffle(ReadOnlySpan source, Span dest) diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs new file mode 100644 index 0000000000..97bd5aa725 --- /dev/null +++ b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs @@ -0,0 +1,96 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace SixLabors.ImageSharp +{ + /// + internal interface IPad3Shuffle4 : IComponentShuffle + { + } + + internal readonly struct DefaultPad3Shuffle4 : IPad3Shuffle4 + { + private readonly byte p3; + private readonly byte p2; + private readonly byte p1; + private readonly byte p0; + + public DefaultPad3Shuffle4(byte p3, byte p2, byte p1, byte p0) + { + this.p3 = p3; + this.p2 = p2; + this.p1 = p1; + this.p0 = p0; + this.Control = SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0); + } + + public byte Control { get; } + + [MethodImpl(InliningOptions.ShortMethod)] + public void RunFallbackShuffle(ReadOnlySpan source, Span dest) + { + ref byte sBase = ref MemoryMarshal.GetReference(source); + ref byte dBase = ref MemoryMarshal.GetReference(dest); + + int p3 = this.p3; + int p2 = this.p2; + int p1 = this.p1; + int p0 = this.p0; + + Span temp = stackalloc byte[4]; + ref byte t = ref MemoryMarshal.GetReference(temp); + ref uint tu = ref Unsafe.As(ref t); + + for (int i = 0, j = 0; i < source.Length; i += 3, j += 4) + { + ref var s = ref Unsafe.Add(ref sBase, i); + tu = Unsafe.As(ref s) | 0xFF000000; + + Unsafe.Add(ref dBase, j) = Unsafe.Add(ref t, p0); + Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref t, p1); + Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref t, p2); + Unsafe.Add(ref dBase, j + 3) = Unsafe.Add(ref t, p3); + } + } + } + + internal readonly struct XYZWPad3Shuffle4 : IPad3Shuffle4 + { + private static readonly byte XYZW = SimdUtils.Shuffle.MmShuffle(3, 2, 1, 0); + + public byte Control => XYZW; + + [MethodImpl(InliningOptions.ShortMethod)] + public void RunFallbackShuffle(ReadOnlySpan source, Span dest) + { + ref byte rs = ref MemoryMarshal.GetReference(source); + ref byte rd = ref MemoryMarshal.GetReference(dest); + + ref byte rsEnd = ref Unsafe.Add(ref rs, source.Length); + ref byte rsLoopEnd = ref Unsafe.Subtract(ref rsEnd, 4); + + while (Unsafe.IsAddressLessThan(ref rs, ref rsLoopEnd)) + { + Unsafe.As(ref rd) = Unsafe.As(ref rs) | 0xFF000000; + + rs = ref Unsafe.Add(ref rs, 3); + rd = ref Unsafe.Add(ref rd, 4); + } + + while (Unsafe.IsAddressLessThan(ref rs, ref rsEnd)) + { + Unsafe.Add(ref rd, 0) = Unsafe.Add(ref rs, 0); + Unsafe.Add(ref rd, 1) = Unsafe.Add(ref rs, 1); + Unsafe.Add(ref rd, 2) = Unsafe.Add(ref rs, 2); + Unsafe.Add(ref rd, 3) = byte.MaxValue; + + rs = ref Unsafe.Add(ref rs, 3); + rd = ref Unsafe.Add(ref rd, 4); + } + } + } +} diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs new file mode 100644 index 0000000000..c65c50f684 --- /dev/null +++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs @@ -0,0 +1,74 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace SixLabors.ImageSharp +{ + /// + internal interface IShuffle4Slice3 : IComponentShuffle + { + } + + internal readonly struct DefaultShuffle4Slice3 : IShuffle4Slice3 + { + private readonly byte p2; + private readonly byte p1; + private readonly byte p0; + + public DefaultShuffle4Slice3(byte p3, byte p2, byte p1, byte p0) + { + this.p2 = p2; + this.p1 = p1; + this.p0 = p0; + this.Control = SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0); + } + + public byte Control { get; } + + [MethodImpl(InliningOptions.ShortMethod)] + public void RunFallbackShuffle(ReadOnlySpan source, Span dest) + { + ref byte sBase = ref MemoryMarshal.GetReference(source); + ref byte dBase = ref MemoryMarshal.GetReference(dest); + + int p2 = this.p2; + int p1 = this.p1; + int p0 = this.p0; + + for (int i = 0, j = 0; i < dest.Length; i += 3, j += 4) + { + Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + j); + Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + j); + Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + j); + } + } + } + + internal readonly struct XYZWShuffle4Slice3 : IShuffle4Slice3 + { + private static readonly byte XYZW = SimdUtils.Shuffle.MmShuffle(3, 2, 1, 0); + + public byte Control => XYZW; + + [MethodImpl(InliningOptions.ShortMethod)] + public void RunFallbackShuffle(ReadOnlySpan source, Span dest) + { + ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); + ref byte dBase = ref MemoryMarshal.GetReference(dest); + + int n = source.Length / 4; + for (int i = 0, j = 0; i < n; i++, j += 3) + { + Unsafe.As(ref Unsafe.Add(ref dBase, j)) = Unsafe.As(ref Unsafe.Add(ref sBase, i)); + } + } + } + + [StructLayout(LayoutKind.Explicit, Size = 3)] + internal readonly struct Xyz24 + { + } +} diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index 61c1ce48e2..7ef3be6fe3 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -63,45 +63,61 @@ public static void Shuffle4( } } + /// + /// Pads then shuffles 8-bit integers within 128-bit lanes in + /// using the control and store the results in . + /// + /// The source span of bytes. + /// The destination span of bytes. + /// The type of shuffle to perform. [MethodImpl(InliningOptions.ShortMethod)] - public static void Pad3Shuffle4( + public static void Pad3Shuffle4( ReadOnlySpan source, Span dest, - byte control) + TShuffle shuffle) + where TShuffle : struct, IPad3Shuffle4 { VerifyPad3Shuffle4SpanInput(source, dest); #if SUPPORTS_RUNTIME_INTRINSICS - HwIntrinsics.Pad3Shuffle4Reduce(ref source, ref dest, control); + HwIntrinsics.Pad3Shuffle4Reduce(ref source, ref dest, shuffle.Control); #endif // Deal with the remainder: if (source.Length > 0) { - Pad3Shuffle4Remainder(source, dest, control); + shuffle.RunFallbackShuffle(source, dest); } } + /// + /// Shuffles then slices 8-bit integers within 128-bit lanes in + /// using the control and store the results in . + /// + /// The source span of bytes. + /// The destination span of bytes. + /// The type of shuffle to perform. [MethodImpl(InliningOptions.ShortMethod)] - public static void Shuffle4Slice3( + public static void Shuffle4Slice3( ReadOnlySpan source, Span dest, - byte control) + TShuffle shuffle) + where TShuffle : struct, IShuffle4Slice3 { VerifyShuffle4Slice3SpanInput(source, dest); #if SUPPORTS_RUNTIME_INTRINSICS - HwIntrinsics.Shuffle4Slice3Reduce(ref source, ref dest, control); + HwIntrinsics.Shuffle4Slice3Reduce(ref source, ref dest, shuffle.Control); #endif // Deal with the remainder: if (source.Length > 0) { - Shuffle4Slice3Remainder(source, dest, control); + shuffle.RunFallbackShuffle(source, dest); } } - public static void Shuffle4Remainder( + private static void Shuffle4Remainder( ReadOnlySpan source, Span dest, byte control) @@ -119,41 +135,6 @@ public static void Shuffle4Remainder( } } - public static void Pad3Shuffle4Remainder( - ReadOnlySpan source, - Span dest, - byte control) - { - ref byte sBase = ref MemoryMarshal.GetReference(source); - ref byte dBase = ref MemoryMarshal.GetReference(dest); - Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0); - - for (int i = 0, j = 0; i < dest.Length; i += 4, j += 3) - { - Unsafe.Add(ref dBase, p0 + i) = Unsafe.Add(ref sBase, j); - Unsafe.Add(ref dBase, p1 + i) = Unsafe.Add(ref sBase, j + 1); - Unsafe.Add(ref dBase, p2 + i) = Unsafe.Add(ref sBase, j + 2); - Unsafe.Add(ref dBase, p3 + i) = byte.MaxValue; - } - } - - public static void Shuffle4Slice3Remainder( - ReadOnlySpan source, - Span dest, - byte control) - { - ref byte sBase = ref MemoryMarshal.GetReference(source); - ref byte dBase = ref MemoryMarshal.GetReference(dest); - Shuffle.InverseMmShuffle(control, out int _, out int p2, out int p1, out int p0); - - for (int i = 0, j = 0; i < dest.Length; i += 3, j += 4) - { - Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + j); - Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + j); - Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + j); - } - } - [Conditional("DEBUG")] private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span dest) where T : struct diff --git a/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs b/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs index 5afd369be3..c5f92648c0 100644 --- a/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs +++ b/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs @@ -37,6 +37,24 @@ public static void ToArgb32(ReadOnlySpan source, Span dest) [MethodImpl(InliningOptions.ShortMethod)] public static void ToBgra32(ReadOnlySpan source, Span dest) => SimdUtils.Shuffle4(source, dest, default); + + /// + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static void ToRgb24(ReadOnlySpan source, Span dest) + => SimdUtils.Shuffle4Slice3(source, dest, default); + + /// + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static void ToBgr24(ReadOnlySpan source, Span dest) + => SimdUtils.Shuffle4Slice3(source, dest, new DefaultShuffle4Slice3(3, 0, 1, 2)); } public static class FromArgb32 @@ -58,6 +76,24 @@ public static void ToRgba32(ReadOnlySpan source, Span dest) [MethodImpl(InliningOptions.ShortMethod)] public static void ToBgra32(ReadOnlySpan source, Span dest) => SimdUtils.Shuffle4(source, dest, default); + + /// + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static void ToRgb24(ReadOnlySpan source, Span dest) + => SimdUtils.Shuffle4Slice3(source, dest, new DefaultShuffle4Slice3(0, 3, 2, 1)); + + /// + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static void ToBgr24(ReadOnlySpan source, Span dest) + => SimdUtils.Shuffle4Slice3(source, dest, new DefaultShuffle4Slice3(0, 1, 2, 3)); } public static class FromBgra32 @@ -79,6 +115,88 @@ public static void ToArgb32(ReadOnlySpan source, Span dest) [MethodImpl(InliningOptions.ShortMethod)] public static void ToRgba32(ReadOnlySpan source, Span dest) => SimdUtils.Shuffle4(source, dest, default); + + /// + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static void ToRgb24(ReadOnlySpan source, Span dest) + => SimdUtils.Shuffle4Slice3(source, dest, new DefaultShuffle4Slice3(3, 0, 1, 2)); + + /// + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static void ToBgr24(ReadOnlySpan source, Span dest) + => SimdUtils.Shuffle4Slice3(source, dest, default); + } + + public static class FromRgb24 + { + /// + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static void ToRgba32(ReadOnlySpan source, Span dest) + => SimdUtils.Pad3Shuffle4(source, dest, default); + + /// + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static void ToArgb32(ReadOnlySpan source, Span dest) + => SimdUtils.Pad3Shuffle4(source, dest, new DefaultPad3Shuffle4(2, 1, 0, 3)); + + /// + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static void ToBgra32(ReadOnlySpan source, Span dest) + => SimdUtils.Pad3Shuffle4(source, dest, new DefaultPad3Shuffle4(3, 0, 1, 2)); + + // TODO: Bgr24 + } + + public static class FromBgr24 + { + /// + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static void ToArgb32(ReadOnlySpan source, Span dest) + => SimdUtils.Pad3Shuffle4(source, dest, new DefaultPad3Shuffle4(0, 1, 2, 3)); + + /// + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static void ToRgba32(ReadOnlySpan source, Span dest) + => SimdUtils.Pad3Shuffle4(source, dest, new DefaultPad3Shuffle4(3, 0, 1, 2)); + + /// + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static void ToBgra32(ReadOnlySpan source, Span dest) + => SimdUtils.Pad3Shuffle4(source, dest, default); + + // TODO: Rgb24 } } } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs index 9eb1e109be..4af0286054 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs @@ -9,7 +9,8 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Config(typeof(Config.HwIntrinsics_SSE_AVX))] public class Pad3Shuffle4Channel { - private static readonly byte Control = default(WXYZShuffle4).Control; + private static readonly DefaultPad3Shuffle4 Control = new DefaultPad3Shuffle4(1, 0, 3, 2); + private static readonly XYZWPad3Shuffle4 ControlFast = default; private byte[] source; private byte[] destination; @@ -18,7 +19,7 @@ public void Setup() { this.source = new byte[this.Count]; new Random(this.Count).NextBytes(this.source); - this.destination = new byte[(int)(this.Count * (4 / 3F))]; + this.destination = new byte[this.Count * 4 / 3]; } [Params(96, 384, 768, 1536)] @@ -29,6 +30,12 @@ public void Pad3Shuffle4() { SimdUtils.Pad3Shuffle4(this.source, this.destination, Control); } + + [Benchmark] + public void Pad3Shuffle4FastFallback() + { + SimdUtils.Pad3Shuffle4(this.source, this.destination, ControlFast); + } } // 2020-10-30 @@ -44,21 +51,37 @@ public void Pad3Shuffle4() // // Runtime=.NET Core 3.1 // - // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | - // |------------- |------------------- |-------------------------------------------------- |------ |----------:|----------:|---------:|------:|--------:|------:|------:|------:|----------:| - // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 71.84 ns | 1.491 ns | 3.146 ns | 1.00 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 2. AVX | Empty | 96 | 23.14 ns | 0.186 ns | 0.165 ns | 0.33 | 0.02 | - | - | - | - | - // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 96 | 22.94 ns | 0.127 ns | 0.112 ns | 0.33 | 0.02 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 230.90 ns | 0.877 ns | 0.777 ns | 1.00 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 2. AVX | Empty | 384 | 39.54 ns | 0.601 ns | 0.533 ns | 0.17 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 384 | 39.88 ns | 0.657 ns | 0.548 ns | 0.17 | 0.00 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 451.67 ns | 2.932 ns | 2.448 ns | 1.00 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 2. AVX | Empty | 768 | 60.79 ns | 1.200 ns | 1.561 ns | 0.13 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 768 | 62.30 ns | 0.469 ns | 0.416 ns | 0.14 | 0.00 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 896.11 ns | 10.358 ns | 9.689 ns | 1.00 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 2. AVX | Empty | 1536 | 109.72 ns | 2.149 ns | 2.300 ns | 0.12 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 108.70 ns | 0.994 ns | 0.881 ns | 0.12 | 0.00 | - | - | - | - | + // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Median | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |------------------------- |------------------- |-------------------------------------------------- |------ |------------:|----------:|----------:|------------:|------:|--------:|------:|------:|------:|----------:| + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 120.64 ns | 7.190 ns | 21.200 ns | 114.26 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 96 | 23.63 ns | 0.175 ns | 0.155 ns | 23.65 ns | 0.15 | 0.01 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 96 | 25.25 ns | 0.356 ns | 0.298 ns | 25.27 ns | 0.17 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Pad3Shuffle4FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 14.80 ns | 0.358 ns | 1.032 ns | 14.64 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4FastFallback | 2. AVX | Empty | 96 | 24.84 ns | 0.376 ns | 0.333 ns | 24.74 ns | 1.57 | 0.06 | - | - | - | - | + // | Pad3Shuffle4FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 96 | 24.58 ns | 0.471 ns | 0.704 ns | 24.38 ns | 1.60 | 0.09 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 258.92 ns | 4.873 ns | 4.069 ns | 257.95 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 384 | 41.41 ns | 0.859 ns | 1.204 ns | 41.33 ns | 0.16 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 384 | 40.74 ns | 0.848 ns | 0.793 ns | 40.48 ns | 0.16 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Pad3Shuffle4FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 74.50 ns | 0.490 ns | 0.383 ns | 74.49 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4FastFallback | 2. AVX | Empty | 384 | 40.74 ns | 0.624 ns | 0.584 ns | 40.72 ns | 0.55 | 0.01 | - | - | - | - | + // | Pad3Shuffle4FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 384 | 38.28 ns | 0.534 ns | 0.417 ns | 38.22 ns | 0.51 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 503.91 ns | 6.466 ns | 6.048 ns | 501.58 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 768 | 62.86 ns | 0.332 ns | 0.277 ns | 62.80 ns | 0.12 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 768 | 64.59 ns | 0.469 ns | 0.415 ns | 64.62 ns | 0.13 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Pad3Shuffle4FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 110.51 ns | 0.592 ns | 0.554 ns | 110.33 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4FastFallback | 2. AVX | Empty | 768 | 64.72 ns | 1.306 ns | 1.090 ns | 64.51 ns | 0.59 | 0.01 | - | - | - | - | + // | Pad3Shuffle4FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 768 | 62.11 ns | 0.816 ns | 0.682 ns | 61.98 ns | 0.56 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 1,005.84 ns | 13.176 ns | 12.325 ns | 1,004.70 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 1536 | 110.05 ns | 0.256 ns | 0.214 ns | 110.04 ns | 0.11 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 110.23 ns | 0.545 ns | 0.483 ns | 110.09 ns | 0.11 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Pad3Shuffle4FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 220.37 ns | 1.601 ns | 1.419 ns | 220.13 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4FastFallback | 2. AVX | Empty | 1536 | 111.54 ns | 2.173 ns | 2.901 ns | 111.27 ns | 0.51 | 0.01 | - | - | - | - | + // | Pad3Shuffle4FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 110.23 ns | 0.456 ns | 0.427 ns | 110.25 ns | 0.50 | 0.00 | - | - | - | - | } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle4Slice3Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle4Slice3Channel.cs index e0fbe1c0b3..9cf24ccd69 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle4Slice3Channel.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle4Slice3Channel.cs @@ -9,7 +9,8 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Config(typeof(Config.HwIntrinsics_SSE_AVX))] public class Shuffle4Slice3Channel { - private static readonly byte Control = default(WXYZShuffle4).Control; + private static readonly DefaultShuffle4Slice3 Control = new DefaultShuffle4Slice3(1, 0, 3, 2); + private static readonly XYZWShuffle4Slice3 ControlFast = default; private byte[] source; private byte[] destination; @@ -29,6 +30,12 @@ public void Shuffle4Slice3() { SimdUtils.Shuffle4Slice3(this.source, this.destination, Control); } + + [Benchmark] + public void Shuffle4Slice3FastFallback() + { + SimdUtils.Shuffle4Slice3(this.source, this.destination, ControlFast); + } } // 2020-10-29 @@ -44,25 +51,45 @@ public void Shuffle4Slice3() // // Runtime=.NET Core 3.1 // - // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated | - // |--------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|------:|------:|------:|------:|----------:| - // | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 52.24 ns | 1.081 ns | 1.062 ns | 1.00 | - | - | - | - | - // | Shuffle4Slice3 | 2. AVX | Empty | 128 | 25.52 ns | 0.189 ns | 0.158 ns | 0.49 | - | - | - | - | - // | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 128 | 26.11 ns | 0.524 ns | 0.644 ns | 0.50 | - | - | - | - | - // | | | | | | | | | | | | | - // | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 101.09 ns | 0.733 ns | 0.612 ns | 1.00 | - | - | - | - | - // | Shuffle4Slice3 | 2. AVX | Empty | 256 | 32.65 ns | 0.674 ns | 1.198 ns | 0.33 | - | - | - | - | - // | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 256 | 32.76 ns | 0.656 ns | 0.853 ns | 0.32 | - | - | - | - | - // | | | | | | | | | | | | | - // | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 209.58 ns | 3.826 ns | 5.957 ns | 1.00 | - | - | - | - | - // | Shuffle4Slice3 | 2. AVX | Empty | 512 | 46.32 ns | 0.729 ns | 1.296 ns | 0.22 | - | - | - | - | - // | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 512 | 46.97 ns | 0.196 ns | 0.183 ns | 0.22 | - | - | - | - | - // | | | | | | | | | | | | | - // | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 406.39 ns | 7.493 ns | 6.257 ns | 1.00 | - | - | - | - | - // | Shuffle4Slice3 | 2. AVX | Empty | 1024 | 74.53 ns | 1.509 ns | 1.678 ns | 0.18 | - | - | - | - | - // | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 74.04 ns | 0.703 ns | 0.657 ns | 0.18 | - | - | - | - | - // | | | | | | | | | | | | | - // | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 796.80 ns | 6.476 ns | 5.741 ns | 1.00 | - | - | - | - | - // | Shuffle4Slice3 | 2. AVX | Empty | 2048 | 130.70 ns | 2.512 ns | 2.227 ns | 0.16 | - | - | - | - | - // | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 129.42 ns | 2.555 ns | 2.133 ns | 0.16 | - | - | - | - | + // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Median | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |--------------------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|----------:|------:|--------:|------:|------:|------:|----------:| + // | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 56.44 ns | 2.843 ns | 8.382 ns | 56.70 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Slice3 | 2. AVX | Empty | 128 | 27.15 ns | 0.556 ns | 0.762 ns | 27.34 ns | 0.41 | 0.03 | - | - | - | - | + // | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 128 | 26.36 ns | 0.321 ns | 0.268 ns | 26.26 ns | 0.38 | 0.02 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Slice3FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 25.85 ns | 0.494 ns | 0.462 ns | 25.84 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Slice3FastFallback | 2. AVX | Empty | 128 | 26.15 ns | 0.113 ns | 0.106 ns | 26.16 ns | 1.01 | 0.02 | - | - | - | - | + // | Shuffle4Slice3FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 128 | 25.57 ns | 0.078 ns | 0.061 ns | 25.56 ns | 0.99 | 0.02 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 97.47 ns | 0.327 ns | 0.289 ns | 97.35 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Slice3 | 2. AVX | Empty | 256 | 32.61 ns | 0.107 ns | 0.095 ns | 32.62 ns | 0.33 | 0.00 | - | - | - | - | + // | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 256 | 33.21 ns | 0.169 ns | 0.150 ns | 33.15 ns | 0.34 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Slice3FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 52.34 ns | 0.779 ns | 0.729 ns | 51.94 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Slice3FastFallback | 2. AVX | Empty | 256 | 32.16 ns | 0.111 ns | 0.104 ns | 32.16 ns | 0.61 | 0.01 | - | - | - | - | + // | Shuffle4Slice3FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 256 | 33.61 ns | 0.342 ns | 0.319 ns | 33.62 ns | 0.64 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 210.74 ns | 3.825 ns | 5.956 ns | 207.70 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Slice3 | 2. AVX | Empty | 512 | 51.03 ns | 0.535 ns | 0.501 ns | 51.18 ns | 0.24 | 0.01 | - | - | - | - | + // | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 512 | 66.60 ns | 1.313 ns | 1.613 ns | 65.93 ns | 0.31 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Slice3FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 119.12 ns | 1.905 ns | 1.689 ns | 118.52 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Slice3FastFallback | 2. AVX | Empty | 512 | 50.33 ns | 0.382 ns | 0.339 ns | 50.41 ns | 0.42 | 0.01 | - | - | - | - | + // | Shuffle4Slice3FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 512 | 49.25 ns | 0.555 ns | 0.492 ns | 49.26 ns | 0.41 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 423.55 ns | 4.891 ns | 4.336 ns | 423.27 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Slice3 | 2. AVX | Empty | 1024 | 77.13 ns | 1.355 ns | 2.264 ns | 76.19 ns | 0.19 | 0.01 | - | - | - | - | + // | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 79.39 ns | 0.103 ns | 0.086 ns | 79.37 ns | 0.19 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Slice3FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 226.57 ns | 2.930 ns | 2.598 ns | 226.10 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Slice3FastFallback | 2. AVX | Empty | 1024 | 80.25 ns | 1.647 ns | 2.082 ns | 80.98 ns | 0.35 | 0.01 | - | - | - | - | + // | Shuffle4Slice3FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 84.99 ns | 1.234 ns | 1.155 ns | 85.60 ns | 0.38 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Slice3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 794.96 ns | 1.735 ns | 1.538 ns | 795.15 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Slice3 | 2. AVX | Empty | 2048 | 128.41 ns | 0.417 ns | 0.390 ns | 128.24 ns | 0.16 | 0.00 | - | - | - | - | + // | Shuffle4Slice3 | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 127.24 ns | 0.294 ns | 0.229 ns | 127.23 ns | 0.16 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Slice3FastFallback | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 382.97 ns | 1.064 ns | 0.831 ns | 382.87 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Slice3FastFallback | 2. AVX | Empty | 2048 | 126.93 ns | 0.382 ns | 0.339 ns | 126.94 ns | 0.33 | 0.00 | - | - | - | - | + // | Shuffle4Slice3FastFallback | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 149.36 ns | 1.875 ns | 1.754 ns | 149.33 ns | 0.39 | 0.00 | - | - | - | - | } diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs index 26f85dd76c..29f3925fc9 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs @@ -39,56 +39,51 @@ public void BulkShuffleByte4Channel(int count) static void RunTest(string serialized) { int size = FeatureTestRunner.Deserialize(serialized); - foreach (var item in ArraySizesDivisibleBy4) - { - // These cannot be expressed as a theory as you cannot - // use RemoteExecutor within generic methods nor pass - // IComponentShuffle to the generic utils method. - foreach (var count in item) - { - WXYZShuffle4 wxyz = default; - TestShuffleByte4Channel( - size, - (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, wxyz), - wxyz.Control); - - WZYXShuffle4 wzyx = default; - TestShuffleByte4Channel( - size, - (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, wzyx), - wzyx.Control); - - YZWXShuffle4 yzwx = default; - TestShuffleByte4Channel( - size, - (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, yzwx), - yzwx.Control); - - ZYXWShuffle4 zyxw = default; - TestShuffleByte4Channel( - size, - (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, zyxw), - zyxw.Control); - - var xwyz = new DefaultShuffle4(2, 1, 3, 0); - TestShuffleByte4Channel( - size, - (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, xwyz), - xwyz.Control); - - var yyyy = new DefaultShuffle4(1, 1, 1, 1); - TestShuffleByte4Channel( - size, - (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, yyyy), - yyyy.Control); - - var wwww = new DefaultShuffle4(3, 3, 3, 3); - TestShuffleByte4Channel( - size, - (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, wwww), - wwww.Control); - } - } + + // These cannot be expressed as a theory as you cannot + // use RemoteExecutor within generic methods nor pass + // IComponentShuffle to the generic utils method. + WXYZShuffle4 wxyz = default; + TestShuffleByte4Channel( + size, + (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, wxyz), + wxyz.Control); + + WZYXShuffle4 wzyx = default; + TestShuffleByte4Channel( + size, + (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, wzyx), + wzyx.Control); + + YZWXShuffle4 yzwx = default; + TestShuffleByte4Channel( + size, + (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, yzwx), + yzwx.Control); + + ZYXWShuffle4 zyxw = default; + TestShuffleByte4Channel( + size, + (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, zyxw), + zyxw.Control); + + var xwyz = new DefaultShuffle4(2, 1, 3, 0); + TestShuffleByte4Channel( + size, + (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, xwyz), + xwyz.Control); + + var yyyy = new DefaultShuffle4(1, 1, 1, 1); + TestShuffleByte4Channel( + size, + (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, yyyy), + yyyy.Control); + + var wwww = new DefaultShuffle4(3, 3, 3, 3); + TestShuffleByte4Channel( + size, + (s, d) => SimdUtils.Shuffle4(s.Span, d.Span, wwww), + wwww.Control); } FeatureTestRunner.RunWithHwIntrinsicsFeature( @@ -103,21 +98,40 @@ public void BulkPad3Shuffle4Channel(int count) { static void RunTest(string serialized) { - // No need to test multiple shuffle controls as the - // pipeline is always the same. int size = FeatureTestRunner.Deserialize(serialized); - byte control = default(WZYXShuffle4).Control; + // These cannot be expressed as a theory as you cannot + // use RemoteExecutor within generic methods nor pass + // IComponentShuffle to the generic utils method. + XYZWPad3Shuffle4 xyzw = default; TestPad3Shuffle4Channel( size, - (s, d) => SimdUtils.Pad3Shuffle4(s.Span, d.Span, control), - control); + (s, d) => SimdUtils.Pad3Shuffle4(s.Span, d.Span, xyzw), + xyzw.Control); + + var xwyz = new DefaultPad3Shuffle4(2, 1, 3, 0); + TestPad3Shuffle4Channel( + size, + (s, d) => SimdUtils.Pad3Shuffle4(s.Span, d.Span, xwyz), + xwyz.Control); + + var yyyy = new DefaultPad3Shuffle4(1, 1, 1, 1); + TestPad3Shuffle4Channel( + size, + (s, d) => SimdUtils.Pad3Shuffle4(s.Span, d.Span, yyyy), + yyyy.Control); + + var wwww = new DefaultPad3Shuffle4(3, 3, 3, 3); + TestPad3Shuffle4Channel( + size, + (s, d) => SimdUtils.Pad3Shuffle4(s.Span, d.Span, wwww), + wwww.Control); } FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, count, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE); + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE); } [Theory] @@ -126,15 +140,34 @@ public void BulkShuffle4Slice3Channel(int count) { static void RunTest(string serialized) { - // No need to test multiple shuffle controls as the - // pipeline is always the same. int size = FeatureTestRunner.Deserialize(serialized); - byte control = default(WZYXShuffle4).Control; + // These cannot be expressed as a theory as you cannot + // use RemoteExecutor within generic methods nor pass + // IComponentShuffle to the generic utils method. + XYZWShuffle4Slice3 xyzw = default; TestShuffle4Slice3Channel( size, - (s, d) => SimdUtils.Shuffle4Slice3(s.Span, d.Span, control), - control); + (s, d) => SimdUtils.Shuffle4Slice3(s.Span, d.Span, xyzw), + xyzw.Control); + + var xwyz = new DefaultShuffle4Slice3(2, 1, 3, 0); + TestShuffle4Slice3Channel( + size, + (s, d) => SimdUtils.Shuffle4Slice3(s.Span, d.Span, xwyz), + xwyz.Control); + + var yyyy = new DefaultShuffle4Slice3(1, 1, 1, 1); + TestShuffle4Slice3Channel( + size, + (s, d) => SimdUtils.Shuffle4Slice3(s.Span, d.Span, yyyy), + yyyy.Control); + + var wwww = new DefaultShuffle4Slice3(3, 3, 3, 3); + TestShuffle4Slice3Channel( + size, + (s, d) => SimdUtils.Shuffle4Slice3(s.Span, d.Span, wwww), + wwww.Control); } FeatureTestRunner.RunWithHwIntrinsicsFeature( @@ -212,7 +245,7 @@ private static void TestPad3Shuffle4Channel( byte[] source = new byte[count]; new Random(count).NextBytes(source); - var result = new byte[(int)(count * (4 / 3D))]; + var result = new byte[count * 4 / 3]; byte[] expected = new byte[result.Length]; @@ -231,6 +264,20 @@ private static void TestPad3Shuffle4Channel( expected[p3 + i] = byte.MaxValue; } + Span temp = stackalloc byte[4]; + for (int i = 0, j = 0; i < expected.Length; i += 4, j += 3) + { + temp[0] = source[j]; + temp[1] = source[j + 1]; + temp[2] = source[j + 2]; + temp[3] = byte.MaxValue; + + expected[i] = temp[p0]; + expected[i + 1] = temp[p1]; + expected[i + 2] = temp[p2]; + expected[i + 3] = temp[p3]; + } + convert(source, result); for (int i = 0; i < expected.Length; i++) @@ -249,7 +296,7 @@ private static void TestShuffle4Slice3Channel( byte[] source = new byte[count]; new Random(count).NextBytes(source); - var result = new byte[(int)(count * (3 / 4D))]; + var result = new byte[count * 3 / 4]; byte[] expected = new byte[result.Length]; From 893bfdd17d29b31c9297eb81d3099835cd856bcc Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Sun, 1 Nov 2020 23:51:38 +0000 Subject: [PATCH 09/20] Don't cast full spans --- .../Helpers/Shuffle/IComponentShuffle.cs | 44 +++++++++---------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs b/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs index 803321d06d..3f045a5799 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs @@ -6,6 +6,9 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +// The JIT can detect and optimize rotation idioms ROTL (Rotate Left) +// and ROTR (Rotate Right) emitting efficient CPU instructions: +// https://github.com/dotnet/coreclr/pull/1830 namespace SixLabors.ImageSharp { /// @@ -76,15 +79,11 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) [MethodImpl(InliningOptions.ShortMethod)] public void RunFallbackShuffle(ReadOnlySpan source, Span dest) { - ReadOnlySpan s = MemoryMarshal.Cast(source); - Span d = MemoryMarshal.Cast(dest); - ref uint sBase = ref MemoryMarshal.GetReference(s); - ref uint dBase = ref MemoryMarshal.GetReference(d); - - // The JIT can detect and optimize rotation idioms ROTL (Rotate Left) - // and ROTR (Rotate Right) emitting efficient CPU instructions: - // https://github.com/dotnet/coreclr/pull/1830 - for (int i = 0; i < s.Length; i++) + ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); + ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + int n = source.Length / 4; + + for (int i = 0; i < n; i++) { uint packed = Unsafe.Add(ref sBase, i); @@ -104,12 +103,11 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) [MethodImpl(InliningOptions.ShortMethod)] public void RunFallbackShuffle(ReadOnlySpan source, Span dest) { - ReadOnlySpan s = MemoryMarshal.Cast(source); - Span d = MemoryMarshal.Cast(dest); - ref uint sBase = ref MemoryMarshal.GetReference(s); - ref uint dBase = ref MemoryMarshal.GetReference(d); + ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); + ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + int n = source.Length / 4; - for (int i = 0; i < s.Length; i++) + for (int i = 0; i < n; i++) { uint packed = Unsafe.Add(ref sBase, i); @@ -129,12 +127,11 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) [MethodImpl(InliningOptions.ShortMethod)] public void RunFallbackShuffle(ReadOnlySpan source, Span dest) { - ReadOnlySpan s = MemoryMarshal.Cast(source); - Span d = MemoryMarshal.Cast(dest); - ref uint sBase = ref MemoryMarshal.GetReference(s); - ref uint dBase = ref MemoryMarshal.GetReference(d); + ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); + ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + int n = source.Length / 4; - for (int i = 0; i < s.Length; i++) + for (int i = 0; i < n; i++) { uint packed = Unsafe.Add(ref sBase, i); @@ -154,12 +151,11 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) [MethodImpl(InliningOptions.ShortMethod)] public void RunFallbackShuffle(ReadOnlySpan source, Span dest) { - ReadOnlySpan s = MemoryMarshal.Cast(source); - Span d = MemoryMarshal.Cast(dest); - ref uint sBase = ref MemoryMarshal.GetReference(s); - ref uint dBase = ref MemoryMarshal.GetReference(d); + ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); + ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + int n = source.Length / 4; - for (int i = 0; i < s.Length; i++) + for (int i = 0; i < n; i++) { uint packed = Unsafe.Add(ref sBase, i); From 76d52772b6ed69939ad82d87ca8a6a16b5d0579c Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 2 Nov 2020 01:52:06 +0000 Subject: [PATCH 10/20] Shuffle3 + Tests --- .../Helpers/Shuffle/IComponentShuffle.cs | 5 + .../Common/Helpers/Shuffle/IPad3Shuffle4.cs | 5 + .../Common/Helpers/Shuffle/IShuffle3.cs | 90 ++++++++++++++++ .../Common/Helpers/Shuffle/IShuffle4Slice3.cs | 12 +-- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 102 +++++++++++++++++- .../Common/Helpers/SimdUtils.Shuffle.cs | 48 ++++++++- .../Common/SimdUtilsTests.Shuffle.cs | 76 ++++++++++++- 7 files changed, 323 insertions(+), 15 deletions(-) create mode 100644 src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs b/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs index 3f045a5799..1a4c6ab446 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs @@ -40,6 +40,11 @@ internal interface IComponentShuffle public DefaultShuffle4(byte p3, byte p2, byte p1, byte p0) { + Guard.MustBeBetweenOrEqualTo(p3, 0, 3, nameof(p3)); + Guard.MustBeBetweenOrEqualTo(p2, 0, 3, nameof(p2)); + Guard.MustBeBetweenOrEqualTo(p1, 0, 3, nameof(p1)); + Guard.MustBeBetweenOrEqualTo(p0, 0, 3, nameof(p0)); + this.p3 = p3; this.p2 = p2; this.p1 = p1; diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs index 97bd5aa725..b223a6bc27 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs @@ -21,6 +21,11 @@ internal interface IPad3Shuffle4 : IComponentShuffle public DefaultPad3Shuffle4(byte p3, byte p2, byte p1, byte p0) { + Guard.MustBeBetweenOrEqualTo(p3, 0, 3, nameof(p3)); + Guard.MustBeBetweenOrEqualTo(p2, 0, 3, nameof(p2)); + Guard.MustBeBetweenOrEqualTo(p1, 0, 3, nameof(p1)); + Guard.MustBeBetweenOrEqualTo(p0, 0, 3, nameof(p0)); + this.p3 = p3; this.p2 = p2; this.p1 = p1; diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs new file mode 100644 index 0000000000..fa4260e63d --- /dev/null +++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs @@ -0,0 +1,90 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace SixLabors.ImageSharp +{ + /// + internal interface IShuffle3 : IComponentShuffle + { + } + + internal readonly struct DefaultShuffle3 : IShuffle3 + { + private readonly byte p2; + private readonly byte p1; + private readonly byte p0; + + public DefaultShuffle3(byte p2, byte p1, byte p0) + { + Guard.MustBeBetweenOrEqualTo(p2, 0, 2, nameof(p2)); + Guard.MustBeBetweenOrEqualTo(p1, 0, 2, nameof(p1)); + Guard.MustBeBetweenOrEqualTo(p0, 0, 2, nameof(p0)); + + this.p2 = p2; + this.p1 = p1; + this.p0 = p0; + this.Control = SimdUtils.Shuffle.MmShuffle(3, p2, p1, p0); + } + + public byte Control { get; } + + [MethodImpl(InliningOptions.ShortMethod)] + public void RunFallbackShuffle(ReadOnlySpan source, Span dest) + { + ref byte sBase = ref MemoryMarshal.GetReference(source); + ref byte dBase = ref MemoryMarshal.GetReference(dest); + + int p2 = this.p2; + int p1 = this.p1; + int p0 = this.p0; + + for (int i = 0; i < source.Length; i += 3) + { + Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + i); + Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i); + Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i); + } + } + } + + internal readonly struct ZYXShuffle3 : IShuffle3 + { + private static readonly byte ZYX = SimdUtils.Shuffle.MmShuffle(3, 0, 1, 2); + + public byte Control => ZYX; + + [MethodImpl(InliningOptions.ShortMethod)] + public void RunFallbackShuffle(ReadOnlySpan source, Span dest) + { + ref Byte3 sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); + ref Byte3 dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); + int n = source.Length / 3; + + for (int i = 0; i < n; i++) + { + uint packed = Unsafe.As(ref Unsafe.Add(ref sBase, i)); + + // packed = [W Z Y X] + // tmp1 = [W 0 Y 0] + // tmp2 = [0 Z 0 X] + // tmp3=ROTL(16, tmp2) = [0 X 0 Z] + // tmp1 + tmp3 = [W X Y Z] + uint tmp1 = packed & 0xFF00FF00; + uint tmp2 = packed & 0x00FF00FF; + uint tmp3 = (tmp2 << 16) | (tmp2 >> 16); + packed = tmp1 + tmp3; + + Unsafe.Add(ref dBase, i) = Unsafe.As(ref packed); + } + } + } + + [StructLayout(LayoutKind.Explicit, Size = 3)] + internal readonly struct Byte3 + { + } +} diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs index c65c50f684..1ceb38f1a4 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs @@ -20,6 +20,11 @@ internal interface IShuffle4Slice3 : IComponentShuffle public DefaultShuffle4Slice3(byte p3, byte p2, byte p1, byte p0) { + Guard.MustBeBetweenOrEqualTo(p3, 0, 3, nameof(p3)); + Guard.MustBeBetweenOrEqualTo(p2, 0, 3, nameof(p2)); + Guard.MustBeBetweenOrEqualTo(p1, 0, 3, nameof(p1)); + Guard.MustBeBetweenOrEqualTo(p0, 0, 3, nameof(p0)); + this.p2 = p2; this.p1 = p1; this.p0 = p0; @@ -62,13 +67,8 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) int n = source.Length / 4; for (int i = 0, j = 0; i < n; i++, j += 3) { - Unsafe.As(ref Unsafe.Add(ref dBase, j)) = Unsafe.As(ref Unsafe.Add(ref sBase, i)); + Unsafe.As(ref Unsafe.Add(ref dBase, j)) = Unsafe.As(ref Unsafe.Add(ref sBase, i)); } } } - - [StructLayout(LayoutKind.Explicit, Size = 3)] - internal readonly struct Xyz24 - { - } } diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index abda6c4df6..974516c3e5 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -86,6 +86,38 @@ public static void Shuffle4Reduce( } } + /// + /// Shuffles 8-bit integer triplets within 128-bit lanes in + /// using the control and store the results in . + /// + /// The source span of bytes. + /// The destination span of bytes. + /// The byte control. + [MethodImpl(InliningOptions.ShortMethod)] + public static void Shuffle3Reduce( + ref ReadOnlySpan source, + ref Span dest, + byte control) + { + if (Ssse3.IsSupported) + { + int remainder = source.Length % (Vector128.Count * 3); + + int adjustedCount = source.Length - remainder; + + if (adjustedCount > 0) + { + Shuffle3( + source.Slice(0, adjustedCount), + dest.Slice(0, adjustedCount), + control); + + source = source.Slice(adjustedCount); + dest = dest.Slice(adjustedCount); + } + } + } + /// /// Pads then shuffles 8-bit integers within 128-bit lanes in /// using the control and store the results in . @@ -94,7 +126,7 @@ public static void Shuffle4Reduce( /// The destination span of bytes. /// The byte control. [MethodImpl(InliningOptions.ShortMethod)] - public static unsafe void Pad3Shuffle4Reduce( + public static void Pad3Shuffle4Reduce( ref ReadOnlySpan source, ref Span dest, byte control) @@ -127,7 +159,7 @@ public static unsafe void Pad3Shuffle4Reduce( /// The destination span of bytes. /// The byte control. [MethodImpl(InliningOptions.ShortMethod)] - public static unsafe void Shuffle4Slice3Reduce( + public static void Shuffle4Slice3Reduce( ref ReadOnlySpan source, ref Span dest, byte control) @@ -313,7 +345,69 @@ private static void Shuffle4( } [MethodImpl(InliningOptions.ShortMethod)] - private static unsafe void Pad3Shuffle4( + private static void Shuffle3( + ReadOnlySpan source, + Span dest, + byte control) + { + if (Ssse3.IsSupported) + { + Vector128 vmask = Vector128.Create(0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80).AsByte(); + Vector128 vfill = Vector128.Create(0xff000000ff000000ul).AsByte(); + Vector128 vmasko = Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15).AsByte(); + Vector128 vmaske = Ssse3.AlignRight(vmasko, vmasko, 12).AsByte(); + + Span bytes = stackalloc byte[Vector128.Count]; + Shuffle.MmShuffleSpan(ref bytes, control); + Vector128 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + + ref Vector128 sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + + ref Vector128 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + int n = source.Length / Vector128.Count; + + for (int i = 0; i < n; i += 3) + { + ref Vector128 v0 = ref Unsafe.Add(ref sourceBase, i); + Vector128 v1 = Unsafe.Add(ref v0, 1); + Vector128 v2 = Unsafe.Add(ref v0, 2); + Vector128 v3 = Sse2.ShiftRightLogical128BitLane(v2, 4); + + v2 = Ssse3.AlignRight(v2, v1, 8); + v1 = Ssse3.AlignRight(v1, v0, 12); + + v0 = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v0, vmask), vfill), vshuffle); + v1 = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v1, vmask), vfill), vshuffle); + v2 = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v2, vmask), vfill), vshuffle); + v3 = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v3, vmask), vfill), vshuffle); + + v0 = Ssse3.Shuffle(v0, vmaske); + v1 = Ssse3.Shuffle(v1, vmasko); + v2 = Ssse3.Shuffle(v2, vmaske); + v3 = Ssse3.Shuffle(v3, vmasko); + + v0 = Ssse3.AlignRight(v1, v0, 4); + v3 = Ssse3.AlignRight(v3, v2, 12); + + v1 = Sse2.ShiftLeftLogical128BitLane(v1, 4); + v2 = Sse2.ShiftRightLogical128BitLane(v2, 4); + + v1 = Ssse3.AlignRight(v2, v1, 8); + + ref Vector128 vd = ref Unsafe.Add(ref destBase, i); + + vd = v0; + Unsafe.Add(ref vd, 1) = v1; + Unsafe.Add(ref vd, 2) = v3; + } + } + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static void Pad3Shuffle4( ReadOnlySpan source, Span dest, byte control) @@ -356,7 +450,7 @@ private static unsafe void Pad3Shuffle4( } [MethodImpl(InliningOptions.ShortMethod)] - private static unsafe void Shuffle4Slice3( + private static void Shuffle4Slice3( ReadOnlySpan source, Span dest, byte control) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index 7ef3be6fe3..79cb0da372 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -23,7 +23,7 @@ public static void Shuffle4( Span dest, byte control) { - VerifyShuffleSpanInput(source, dest); + VerifyShuffle4SpanInput(source, dest); #if SUPPORTS_RUNTIME_INTRINSICS HwIntrinsics.Shuffle4Reduce(ref source, ref dest, control); @@ -50,7 +50,7 @@ public static void Shuffle4( TShuffle shuffle) where TShuffle : struct, IComponentShuffle { - VerifyShuffleSpanInput(source, dest); + VerifyShuffle4SpanInput(source, dest); #if SUPPORTS_RUNTIME_INTRINSICS HwIntrinsics.Shuffle4Reduce(ref source, ref dest, shuffle.Control); @@ -63,6 +63,33 @@ public static void Shuffle4( } } + /// + /// Shuffle 8-bit integer triplets within 128-bit lanes in + /// using the control and store the results in . + /// + /// The source span of bytes. + /// The destination span of bytes. + /// The type of shuffle to perform. + [MethodImpl(InliningOptions.ShortMethod)] + public static void Shuffle3( + ReadOnlySpan source, + Span dest, + TShuffle shuffle) + where TShuffle : struct, IShuffle3 + { + VerifyShuffle3SpanInput(source, dest); + +#if SUPPORTS_RUNTIME_INTRINSICS + HwIntrinsics.Shuffle3Reduce(ref source, ref dest, shuffle.Control); +#endif + + // Deal with the remainder: + if (source.Length > 0) + { + shuffle.RunFallbackShuffle(source, dest); + } + } + /// /// Pads then shuffles 8-bit integers within 128-bit lanes in /// using the control and store the results in . @@ -136,7 +163,7 @@ private static void Shuffle4Remainder( } [Conditional("DEBUG")] - private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span dest) + private static void VerifyShuffle4SpanInput(ReadOnlySpan source, Span dest) where T : struct { DebugGuard.IsTrue( @@ -150,6 +177,21 @@ private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span de "Input spans must be divisable by 4!"); } + [Conditional("DEBUG")] + private static void VerifyShuffle3SpanInput(ReadOnlySpan source, Span dest) + where T : struct + { + DebugGuard.IsTrue( + source.Length == dest.Length, + nameof(source), + "Input spans must be of same length!"); + + DebugGuard.IsTrue( + source.Length % 3 == 0, + nameof(source), + "Input spans must be divisable by 3!"); + } + [Conditional("DEBUG")] private static void VerifyPad3Shuffle4SpanInput(ReadOnlySpan source, Span dest) { diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs index 29f3925fc9..75d7c87299 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs @@ -92,6 +92,48 @@ static void RunTest(string serialized) HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE); } + [Theory] + [MemberData(nameof(ArraySizesDivisibleBy3))] + public void BulkShuffleByte3Channel(int count) + { + static void RunTest(string serialized) + { + int size = FeatureTestRunner.Deserialize(serialized); + + // These cannot be expressed as a theory as you cannot + // use RemoteExecutor within generic methods nor pass + // IShuffle3 to the generic utils method. + ZYXShuffle3 zyx = default; + TestShuffleByte3Channel( + size, + (s, d) => SimdUtils.Shuffle3(s.Span, d.Span, zyx), + zyx.Control); + + var xyz = new DefaultShuffle3(2, 1, 0); + TestShuffleByte3Channel( + size, + (s, d) => SimdUtils.Shuffle3(s.Span, d.Span, xyz), + xyz.Control); + + var yyy = new DefaultShuffle3(1, 1, 1); + TestShuffleByte3Channel( + size, + (s, d) => SimdUtils.Shuffle3(s.Span, d.Span, yyy), + yyy.Control); + + var zzz = new DefaultShuffle3(2, 2, 2); + TestShuffleByte3Channel( + size, + (s, d) => SimdUtils.Shuffle3(s.Span, d.Span, zzz), + zzz.Control); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + count, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE); + } + [Theory] [MemberData(nameof(ArraySizesDivisibleBy3))] public void BulkPad3Shuffle4Channel(int count) @@ -102,7 +144,7 @@ static void RunTest(string serialized) // These cannot be expressed as a theory as you cannot // use RemoteExecutor within generic methods nor pass - // IComponentShuffle to the generic utils method. + // IPad3Shuffle4 to the generic utils method. XYZWPad3Shuffle4 xyzw = default; TestPad3Shuffle4Channel( size, @@ -144,7 +186,7 @@ static void RunTest(string serialized) // These cannot be expressed as a theory as you cannot // use RemoteExecutor within generic methods nor pass - // IComponentShuffle to the generic utils method. + // IShuffle4Slice3 to the generic utils method. XYZWShuffle4Slice3 xyzw = default; TestShuffle4Slice3Channel( size, @@ -237,6 +279,36 @@ private static void TestShuffleByte4Channel( Assert.Equal(expected, result); } + private static void TestShuffleByte3Channel( + int count, + Action, Memory> convert, + byte control) + { + byte[] source = new byte[count]; + new Random(count).NextBytes(source); + var result = new byte[count]; + + byte[] expected = new byte[count]; + + SimdUtils.Shuffle.InverseMmShuffle( + control, + out int _, + out int p2, + out int p1, + out int p0); + + for (int i = 0; i < expected.Length; i += 3) + { + expected[i] = source[p0 + i]; + expected[i + 1] = source[p1 + i]; + expected[i + 2] = source[p2 + i]; + } + + convert(source, result); + + Assert.Equal(expected, result); + } + private static void TestPad3Shuffle4Channel( int count, Action, Memory> convert, From 49062c4ee506c2a9a85ec7ffc34ae2df9f323bb6 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 2 Nov 2020 11:46:49 +0000 Subject: [PATCH 11/20] Cleanup and fix tests --- .../Helpers/Shuffle/IComponentShuffle.cs | 23 ++++--- .../Common/Helpers/Shuffle/IPad3Shuffle4.cs | 8 +-- .../Common/Helpers/Shuffle/IShuffle3.cs | 43 +------------ .../Common/Helpers/Shuffle/IShuffle4Slice3.cs | 13 ++-- .../Common/Helpers/SimdUtils.Shuffle.cs | 2 +- .../PixelFormats/Utils/PixelConverter.cs | 18 +++++- .../Color/Bulk/Shuffle3Channel.cs | 64 +++++++++++++++++++ .../Common/SimdUtilsTests.Shuffle.cs | 4 +- 8 files changed, 113 insertions(+), 62 deletions(-) create mode 100644 tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle3Channel.cs diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs b/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs index 1a4c6ab446..2056075e7c 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs @@ -31,7 +31,12 @@ internal interface IComponentShuffle void RunFallbackShuffle(ReadOnlySpan source, Span dest); } - internal readonly struct DefaultShuffle4 : IComponentShuffle + /// + internal interface IShuffle4 : IComponentShuffle + { + } + + internal readonly struct DefaultShuffle4 : IShuffle4 { private readonly byte p3; private readonly byte p2; @@ -40,10 +45,10 @@ internal interface IComponentShuffle public DefaultShuffle4(byte p3, byte p2, byte p1, byte p0) { - Guard.MustBeBetweenOrEqualTo(p3, 0, 3, nameof(p3)); - Guard.MustBeBetweenOrEqualTo(p2, 0, 3, nameof(p2)); - Guard.MustBeBetweenOrEqualTo(p1, 0, 3, nameof(p1)); - Guard.MustBeBetweenOrEqualTo(p0, 0, 3, nameof(p0)); + DebugGuard.MustBeBetweenOrEqualTo(p3, 0, 3, nameof(p3)); + DebugGuard.MustBeBetweenOrEqualTo(p2, 0, 3, nameof(p2)); + DebugGuard.MustBeBetweenOrEqualTo(p1, 0, 3, nameof(p1)); + DebugGuard.MustBeBetweenOrEqualTo(p0, 0, 3, nameof(p0)); this.p3 = p3; this.p2 = p2; @@ -75,7 +80,7 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) } } - internal readonly struct WXYZShuffle4 : IComponentShuffle + internal readonly struct WXYZShuffle4 : IShuffle4 { private static readonly byte WXYZ = SimdUtils.Shuffle.MmShuffle(2, 1, 0, 3); @@ -99,7 +104,7 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) } } - internal readonly struct WZYXShuffle4 : IComponentShuffle + internal readonly struct WZYXShuffle4 : IShuffle4 { private static readonly byte WZYX = SimdUtils.Shuffle.MmShuffle(0, 1, 2, 3); @@ -123,7 +128,7 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) } } - internal readonly struct YZWXShuffle4 : IComponentShuffle + internal readonly struct YZWXShuffle4 : IShuffle4 { private static readonly byte YZWX = SimdUtils.Shuffle.MmShuffle(0, 3, 2, 1); @@ -147,7 +152,7 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) } } - internal readonly struct ZYXWShuffle4 : IComponentShuffle + internal readonly struct ZYXWShuffle4 : IShuffle4 { private static readonly byte ZYXW = SimdUtils.Shuffle.MmShuffle(3, 0, 1, 2); diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs index b223a6bc27..1f3fce541d 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs @@ -21,10 +21,10 @@ internal interface IPad3Shuffle4 : IComponentShuffle public DefaultPad3Shuffle4(byte p3, byte p2, byte p1, byte p0) { - Guard.MustBeBetweenOrEqualTo(p3, 0, 3, nameof(p3)); - Guard.MustBeBetweenOrEqualTo(p2, 0, 3, nameof(p2)); - Guard.MustBeBetweenOrEqualTo(p1, 0, 3, nameof(p1)); - Guard.MustBeBetweenOrEqualTo(p0, 0, 3, nameof(p0)); + DebugGuard.MustBeBetweenOrEqualTo(p3, 0, 3, nameof(p3)); + DebugGuard.MustBeBetweenOrEqualTo(p2, 0, 3, nameof(p2)); + DebugGuard.MustBeBetweenOrEqualTo(p1, 0, 3, nameof(p1)); + DebugGuard.MustBeBetweenOrEqualTo(p0, 0, 3, nameof(p0)); this.p3 = p3; this.p2 = p2; diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs index fa4260e63d..61e99890e7 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs @@ -20,9 +20,9 @@ internal interface IShuffle3 : IComponentShuffle public DefaultShuffle3(byte p2, byte p1, byte p0) { - Guard.MustBeBetweenOrEqualTo(p2, 0, 2, nameof(p2)); - Guard.MustBeBetweenOrEqualTo(p1, 0, 2, nameof(p1)); - Guard.MustBeBetweenOrEqualTo(p0, 0, 2, nameof(p0)); + DebugGuard.MustBeBetweenOrEqualTo(p2, 0, 2, nameof(p2)); + DebugGuard.MustBeBetweenOrEqualTo(p1, 0, 2, nameof(p1)); + DebugGuard.MustBeBetweenOrEqualTo(p0, 0, 2, nameof(p0)); this.p2 = p2; this.p1 = p1; @@ -50,41 +50,4 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) } } } - - internal readonly struct ZYXShuffle3 : IShuffle3 - { - private static readonly byte ZYX = SimdUtils.Shuffle.MmShuffle(3, 0, 1, 2); - - public byte Control => ZYX; - - [MethodImpl(InliningOptions.ShortMethod)] - public void RunFallbackShuffle(ReadOnlySpan source, Span dest) - { - ref Byte3 sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); - ref Byte3 dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); - int n = source.Length / 3; - - for (int i = 0; i < n; i++) - { - uint packed = Unsafe.As(ref Unsafe.Add(ref sBase, i)); - - // packed = [W Z Y X] - // tmp1 = [W 0 Y 0] - // tmp2 = [0 Z 0 X] - // tmp3=ROTL(16, tmp2) = [0 X 0 Z] - // tmp1 + tmp3 = [W X Y Z] - uint tmp1 = packed & 0xFF00FF00; - uint tmp2 = packed & 0x00FF00FF; - uint tmp3 = (tmp2 << 16) | (tmp2 >> 16); - packed = tmp1 + tmp3; - - Unsafe.Add(ref dBase, i) = Unsafe.As(ref packed); - } - } - } - - [StructLayout(LayoutKind.Explicit, Size = 3)] - internal readonly struct Byte3 - { - } } diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs index 1ceb38f1a4..c9b51aba2c 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs @@ -20,10 +20,10 @@ internal interface IShuffle4Slice3 : IComponentShuffle public DefaultShuffle4Slice3(byte p3, byte p2, byte p1, byte p0) { - Guard.MustBeBetweenOrEqualTo(p3, 0, 3, nameof(p3)); - Guard.MustBeBetweenOrEqualTo(p2, 0, 3, nameof(p2)); - Guard.MustBeBetweenOrEqualTo(p1, 0, 3, nameof(p1)); - Guard.MustBeBetweenOrEqualTo(p0, 0, 3, nameof(p0)); + DebugGuard.MustBeBetweenOrEqualTo(p3, 0, 3, nameof(p3)); + DebugGuard.MustBeBetweenOrEqualTo(p2, 0, 3, nameof(p2)); + DebugGuard.MustBeBetweenOrEqualTo(p1, 0, 3, nameof(p1)); + DebugGuard.MustBeBetweenOrEqualTo(p0, 0, 3, nameof(p0)); this.p2 = p2; this.p1 = p1; @@ -71,4 +71,9 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) } } } + + [StructLayout(LayoutKind.Explicit, Size = 3)] + internal readonly struct Byte3 + { + } } diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index 79cb0da372..8fd6bcce68 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -48,7 +48,7 @@ public static void Shuffle4( ReadOnlySpan source, Span dest, TShuffle shuffle) - where TShuffle : struct, IComponentShuffle + where TShuffle : struct, IShuffle4 { VerifyShuffle4SpanInput(source, dest); diff --git a/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs b/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs index c5f92648c0..7215fa860b 100644 --- a/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs +++ b/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs @@ -164,7 +164,14 @@ public static void ToArgb32(ReadOnlySpan source, Span dest) public static void ToBgra32(ReadOnlySpan source, Span dest) => SimdUtils.Pad3Shuffle4(source, dest, new DefaultPad3Shuffle4(3, 0, 1, 2)); - // TODO: Bgr24 + /// + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static void ToBgr24(ReadOnlySpan source, Span dest) + => SimdUtils.Shuffle3(source, dest, new DefaultShuffle3(0, 1, 2)); } public static class FromBgr24 @@ -196,7 +203,14 @@ public static void ToRgba32(ReadOnlySpan source, Span dest) public static void ToBgra32(ReadOnlySpan source, Span dest) => SimdUtils.Pad3Shuffle4(source, dest, default); - // TODO: Rgb24 + /// + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. + /// + [MethodImpl(InliningOptions.ShortMethod)] + public static void ToRgb24(ReadOnlySpan source, Span dest) + => SimdUtils.Shuffle3(source, dest, new DefaultShuffle3(0, 1, 2)); } } } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle3Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle3Channel.cs new file mode 100644 index 0000000000..3667b973ef --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle3Channel.cs @@ -0,0 +1,64 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using BenchmarkDotNet.Attributes; + +namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk +{ + [Config(typeof(Config.HwIntrinsics_SSE_AVX))] + public class Shuffle3Channel + { + private static readonly DefaultShuffle3 Control = new DefaultShuffle3(1, 0, 2); + private byte[] source; + private byte[] destination; + + [GlobalSetup] + public void Setup() + { + this.source = new byte[this.Count]; + new Random(this.Count).NextBytes(this.source); + this.destination = new byte[this.Count]; + } + + [Params(96, 384, 768, 1536)] + public int Count { get; set; } + + [Benchmark] + public void Shuffle3() + { + SimdUtils.Shuffle3(this.source, this.destination, Control); + } + } + + // 2020-11-02 + // ########## + // + // BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1) + // Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores + // .NET Core SDK=3.1.403 + // [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // + // Runtime=.NET Core 3.1 + // + // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Median | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |--------------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|----------:|------:|--------:|------:|------:|------:|----------:| + // | Shuffle3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 48.46 ns | 1.034 ns | 2.438 ns | 47.46 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle3 | 2. AVX | Empty | 96 | 32.42 ns | 0.537 ns | 0.476 ns | 32.34 ns | 0.66 | 0.04 | - | - | - | - | + // | Shuffle3 | 3. SSE | COMPlus_EnableAVX=0 | 96 | 32.51 ns | 0.373 ns | 0.349 ns | 32.56 ns | 0.66 | 0.03 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 199.04 ns | 1.512 ns | 1.180 ns | 199.17 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle3 | 2. AVX | Empty | 384 | 71.20 ns | 2.654 ns | 7.784 ns | 69.60 ns | 0.41 | 0.02 | - | - | - | - | + // | Shuffle3 | 3. SSE | COMPlus_EnableAVX=0 | 384 | 63.23 ns | 0.569 ns | 0.505 ns | 63.21 ns | 0.32 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 391.28 ns | 5.087 ns | 3.972 ns | 391.22 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle3 | 2. AVX | Empty | 768 | 109.12 ns | 2.149 ns | 2.010 ns | 108.66 ns | 0.28 | 0.01 | - | - | - | - | + // | Shuffle3 | 3. SSE | COMPlus_EnableAVX=0 | 768 | 106.51 ns | 0.734 ns | 0.613 ns | 106.56 ns | 0.27 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle3 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 773.70 ns | 5.516 ns | 4.890 ns | 772.96 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle3 | 2. AVX | Empty | 1536 | 190.41 ns | 1.090 ns | 0.851 ns | 190.38 ns | 0.25 | 0.00 | - | - | - | - | + // | Shuffle3 | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 190.94 ns | 0.985 ns | 0.769 ns | 190.85 ns | 0.25 | 0.00 | - | - | - | - | +} diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs index 75d7c87299..f1bfaa4ad4 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs @@ -42,7 +42,7 @@ static void RunTest(string serialized) // These cannot be expressed as a theory as you cannot // use RemoteExecutor within generic methods nor pass - // IComponentShuffle to the generic utils method. + // IShuffle4 to the generic utils method. WXYZShuffle4 wxyz = default; TestShuffleByte4Channel( size, @@ -103,7 +103,7 @@ static void RunTest(string serialized) // These cannot be expressed as a theory as you cannot // use RemoteExecutor within generic methods nor pass // IShuffle3 to the generic utils method. - ZYXShuffle3 zyx = default; + var zyx = new DefaultShuffle3(0, 1, 2); TestShuffleByte3Channel( size, (s, d) => SimdUtils.Shuffle3(s.Span, d.Span, zyx), From 8c3246966606cc98319662cbb58ab98fea73f090 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 2 Nov 2020 16:24:04 +0000 Subject: [PATCH 12/20] Fix Shuffle4Slice3, wire up shuffles. --- .../Common/Helpers/Shuffle/IShuffle4Slice3.cs | 6 +- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 18 ++- .../Argb32.PixelOperations.Generated.cs | 72 +++++---- .../Bgr24.PixelOperations.Generated.cs | 140 +++++++++++------- .../Bgra32.PixelOperations.Generated.cs | 72 +++++---- .../Rgb24.PixelOperations.Generated.cs | 129 ++++++++++------ .../Rgba32.PixelOperations.Generated.cs | 72 +++++---- .../Generated/_Common.ttinclude | 6 +- 8 files changed, 323 insertions(+), 192 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs index c9b51aba2c..70800f9de2 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs @@ -62,12 +62,12 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) public void RunFallbackShuffle(ReadOnlySpan source, Span dest) { ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); - ref byte dBase = ref MemoryMarshal.GetReference(dest); + ref Byte3 dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); int n = source.Length / 4; - for (int i = 0, j = 0; i < n; i++, j += 3) + for (int i = 0; i < n; i++) { - Unsafe.As(ref Unsafe.Add(ref dBase, j)) = Unsafe.As(ref Unsafe.Add(ref sBase, i)); + Unsafe.Add(ref dBase, i) = Unsafe.As(ref Unsafe.Add(ref sBase, i)); } } } diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 974516c3e5..296970ddcc 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -371,9 +371,11 @@ private static void Shuffle3( for (int i = 0; i < n; i += 3) { - ref Vector128 v0 = ref Unsafe.Add(ref sourceBase, i); - Vector128 v1 = Unsafe.Add(ref v0, 1); - Vector128 v2 = Unsafe.Add(ref v0, 2); + ref Vector128 vs = ref Unsafe.Add(ref sourceBase, i); + + Vector128 v0 = vs; + Vector128 v1 = Unsafe.Add(ref vs, 1); + Vector128 v2 = Unsafe.Add(ref vs, 2); Vector128 v3 = Sse2.ShiftRightLogical128BitLane(v2, 4); v2 = Ssse3.AlignRight(v2, v1, 8); @@ -474,10 +476,12 @@ private static void Shuffle4Slice3( for (int i = 0, j = 0; i < n; i += 4, j += 3) { - ref Vector128 v0 = ref Unsafe.Add(ref sourceBase, i); - Vector128 v1 = Unsafe.Add(ref v0, 1); - Vector128 v2 = Unsafe.Add(ref v0, 2); - Vector128 v3 = Unsafe.Add(ref v0, 3); + ref Vector128 vs = ref Unsafe.Add(ref sourceBase, i); + + Vector128 v0 = vs; + Vector128 v1 = Unsafe.Add(ref vs, 1); + Vector128 v2 = Unsafe.Add(ref vs, 2); + Vector128 v3 = Unsafe.Add(ref vs, 3); v0 = Ssse3.Shuffle(Ssse3.Shuffle(v0, vshuffle), vmaske); v1 = Ssse3.Shuffle(Ssse3.Shuffle(v1, vshuffle), vmasko); diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Argb32.PixelOperations.Generated.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Argb32.PixelOperations.Generated.cs index 3f48d2acca..d30616997c 100644 --- a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Argb32.PixelOperations.Generated.cs +++ b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Argb32.PixelOperations.Generated.cs @@ -106,23 +106,59 @@ public override void FromBgra32( Span dest = MemoryMarshal.Cast(destinationPixels); PixelConverter.FromBgra32.ToArgb32(source, dest); } + /// + public override void ToRgb24( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); + + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromArgb32.ToRgb24(source, dest); + } /// - public override void ToBgr24(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void FromRgb24( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref Argb32 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref Bgr24 destRef = ref MemoryMarshal.GetReference(destinationPixels); + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromRgb24.ToArgb32(source, dest); + } + /// + public override void ToBgr24( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - for (int i = 0; i < sourcePixels.Length; i++) - { - ref Argb32 sp = ref Unsafe.Add(ref sourceRef, i); - ref Bgr24 dp = ref Unsafe.Add(ref destRef, i); + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromArgb32.ToBgr24(source, dest); + } - dp.FromArgb32(sp); - } + /// + public override void FromBgr24( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); + + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromBgr24.ToArgb32(source, dest); } /// @@ -197,24 +233,6 @@ public override void ToLa32(Configuration configuration, ReadOnlySpan so } } - /// - public override void ToRgb24(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) - { - Guard.NotNull(configuration, nameof(configuration)); - Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - - ref Argb32 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref Rgb24 destRef = ref MemoryMarshal.GetReference(destinationPixels); - - for (int i = 0; i < sourcePixels.Length; i++) - { - ref Argb32 sp = ref Unsafe.Add(ref sourceRef, i); - ref Rgb24 dp = ref Unsafe.Add(ref destRef, i); - - dp.FromArgb32(sp); - } - } - /// public override void ToRgb48(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) { diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Bgr24.PixelOperations.Generated.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Bgr24.PixelOperations.Generated.cs index b73bb8b831..50d4942ecb 100644 --- a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Bgr24.PixelOperations.Generated.cs +++ b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Bgr24.PixelOperations.Generated.cs @@ -52,146 +52,182 @@ public override void ToVector4(Configuration configuration, ReadOnlySpan { Vector4Converters.RgbaCompatible.ToVector4(configuration, this, sourcePixels, destVectors, modifiers.Remove(PixelConversionModifiers.Scale | PixelConversionModifiers.Premultiply)); } - /// - public override void ToArgb32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void ToRgba32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref Bgr24 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref Argb32 destRef = ref MemoryMarshal.GetReference(destinationPixels); + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromBgr24.ToRgba32(source, dest); + } - for (int i = 0; i < sourcePixels.Length; i++) - { - ref Bgr24 sp = ref Unsafe.Add(ref sourceRef, i); - ref Argb32 dp = ref Unsafe.Add(ref destRef, i); + /// + public override void FromRgba32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - dp.FromBgr24(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromRgba32.ToBgr24(source, dest); } - /// - public override void ToBgra32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void ToArgb32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref Bgr24 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref Bgra32 destRef = ref MemoryMarshal.GetReference(destinationPixels); + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromBgr24.ToArgb32(source, dest); + } - for (int i = 0; i < sourcePixels.Length; i++) - { - ref Bgr24 sp = ref Unsafe.Add(ref sourceRef, i); - ref Bgra32 dp = ref Unsafe.Add(ref destRef, i); + /// + public override void FromArgb32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - dp.FromBgr24(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromArgb32.ToBgr24(source, dest); } - /// - public override void ToL8(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void ToBgra32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref Bgr24 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref L8 destRef = ref MemoryMarshal.GetReference(destinationPixels); + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromBgr24.ToBgra32(source, dest); + } - for (int i = 0; i < sourcePixels.Length; i++) - { - ref Bgr24 sp = ref Unsafe.Add(ref sourceRef, i); - ref L8 dp = ref Unsafe.Add(ref destRef, i); + /// + public override void FromBgra32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - dp.FromBgr24(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromBgra32.ToBgr24(source, dest); } - /// - public override void ToL16(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void ToRgb24( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref Bgr24 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref L16 destRef = ref MemoryMarshal.GetReference(destinationPixels); + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromBgr24.ToRgb24(source, dest); + } - for (int i = 0; i < sourcePixels.Length; i++) - { - ref Bgr24 sp = ref Unsafe.Add(ref sourceRef, i); - ref L16 dp = ref Unsafe.Add(ref destRef, i); + /// + public override void FromRgb24( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - dp.FromBgr24(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromRgb24.ToBgr24(source, dest); } /// - public override void ToLa16(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void ToL8(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); ref Bgr24 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref La16 destRef = ref MemoryMarshal.GetReference(destinationPixels); + ref L8 destRef = ref MemoryMarshal.GetReference(destinationPixels); for (int i = 0; i < sourcePixels.Length; i++) { ref Bgr24 sp = ref Unsafe.Add(ref sourceRef, i); - ref La16 dp = ref Unsafe.Add(ref destRef, i); + ref L8 dp = ref Unsafe.Add(ref destRef, i); dp.FromBgr24(sp); } } /// - public override void ToLa32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void ToL16(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); ref Bgr24 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref La32 destRef = ref MemoryMarshal.GetReference(destinationPixels); + ref L16 destRef = ref MemoryMarshal.GetReference(destinationPixels); for (int i = 0; i < sourcePixels.Length; i++) { ref Bgr24 sp = ref Unsafe.Add(ref sourceRef, i); - ref La32 dp = ref Unsafe.Add(ref destRef, i); + ref L16 dp = ref Unsafe.Add(ref destRef, i); dp.FromBgr24(sp); } } /// - public override void ToRgb24(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void ToLa16(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); ref Bgr24 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref Rgb24 destRef = ref MemoryMarshal.GetReference(destinationPixels); + ref La16 destRef = ref MemoryMarshal.GetReference(destinationPixels); for (int i = 0; i < sourcePixels.Length; i++) { ref Bgr24 sp = ref Unsafe.Add(ref sourceRef, i); - ref Rgb24 dp = ref Unsafe.Add(ref destRef, i); + ref La16 dp = ref Unsafe.Add(ref destRef, i); dp.FromBgr24(sp); } } /// - public override void ToRgba32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void ToLa32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); ref Bgr24 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref Rgba32 destRef = ref MemoryMarshal.GetReference(destinationPixels); + ref La32 destRef = ref MemoryMarshal.GetReference(destinationPixels); for (int i = 0; i < sourcePixels.Length; i++) { ref Bgr24 sp = ref Unsafe.Add(ref sourceRef, i); - ref Rgba32 dp = ref Unsafe.Add(ref destRef, i); + ref La32 dp = ref Unsafe.Add(ref destRef, i); dp.FromBgr24(sp); } diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Bgra32.PixelOperations.Generated.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Bgra32.PixelOperations.Generated.cs index 8cf2d5850a..b38e5f19d6 100644 --- a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Bgra32.PixelOperations.Generated.cs +++ b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Bgra32.PixelOperations.Generated.cs @@ -106,23 +106,59 @@ public override void FromArgb32( Span dest = MemoryMarshal.Cast(destinationPixels); PixelConverter.FromArgb32.ToBgra32(source, dest); } + /// + public override void ToRgb24( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); + + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromBgra32.ToRgb24(source, dest); + } /// - public override void ToBgr24(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void FromRgb24( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref Bgra32 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref Bgr24 destRef = ref MemoryMarshal.GetReference(destinationPixels); + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromRgb24.ToBgra32(source, dest); + } + /// + public override void ToBgr24( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - for (int i = 0; i < sourcePixels.Length; i++) - { - ref Bgra32 sp = ref Unsafe.Add(ref sourceRef, i); - ref Bgr24 dp = ref Unsafe.Add(ref destRef, i); + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromBgra32.ToBgr24(source, dest); + } - dp.FromBgra32(sp); - } + /// + public override void FromBgr24( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); + + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromBgr24.ToBgra32(source, dest); } /// @@ -197,24 +233,6 @@ public override void ToLa32(Configuration configuration, ReadOnlySpan so } } - /// - public override void ToRgb24(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) - { - Guard.NotNull(configuration, nameof(configuration)); - Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - - ref Bgra32 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref Rgb24 destRef = ref MemoryMarshal.GetReference(destinationPixels); - - for (int i = 0; i < sourcePixels.Length; i++) - { - ref Bgra32 sp = ref Unsafe.Add(ref sourceRef, i); - ref Rgb24 dp = ref Unsafe.Add(ref destRef, i); - - dp.FromBgra32(sp); - } - } - /// public override void ToRgb48(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) { diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Rgb24.PixelOperations.Generated.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Rgb24.PixelOperations.Generated.cs index 332683fc7f..9a4173892e 100644 --- a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Rgb24.PixelOperations.Generated.cs +++ b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Rgb24.PixelOperations.Generated.cs @@ -52,59 +52,114 @@ public override void ToVector4(Configuration configuration, ReadOnlySpan { Vector4Converters.RgbaCompatible.ToVector4(configuration, this, sourcePixels, destVectors, modifiers.Remove(PixelConversionModifiers.Scale | PixelConversionModifiers.Premultiply)); } - /// - public override void ToArgb32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void ToRgba32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref Rgb24 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref Argb32 destRef = ref MemoryMarshal.GetReference(destinationPixels); + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromRgb24.ToRgba32(source, dest); + } - for (int i = 0; i < sourcePixels.Length; i++) - { - ref Rgb24 sp = ref Unsafe.Add(ref sourceRef, i); - ref Argb32 dp = ref Unsafe.Add(ref destRef, i); + /// + public override void FromRgba32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - dp.FromRgb24(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromRgba32.ToRgb24(source, dest); } /// - public override void ToBgr24(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void ToArgb32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref Rgb24 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref Bgr24 destRef = ref MemoryMarshal.GetReference(destinationPixels); + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromRgb24.ToArgb32(source, dest); + } - for (int i = 0; i < sourcePixels.Length; i++) - { - ref Rgb24 sp = ref Unsafe.Add(ref sourceRef, i); - ref Bgr24 dp = ref Unsafe.Add(ref destRef, i); + /// + public override void FromArgb32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - dp.FromRgb24(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromArgb32.ToRgb24(source, dest); + } + /// + public override void ToBgra32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); + + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromRgb24.ToBgra32(source, dest); } /// - public override void ToBgra32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void FromBgra32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref Rgb24 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref Bgra32 destRef = ref MemoryMarshal.GetReference(destinationPixels); + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromBgra32.ToRgb24(source, dest); + } + /// + public override void ToBgr24( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - for (int i = 0; i < sourcePixels.Length; i++) - { - ref Rgb24 sp = ref Unsafe.Add(ref sourceRef, i); - ref Bgra32 dp = ref Unsafe.Add(ref destRef, i); + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromRgb24.ToBgr24(source, dest); + } - dp.FromRgb24(sp); - } + /// + public override void FromBgr24( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); + + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromBgr24.ToRgb24(source, dest); } /// @@ -179,24 +234,6 @@ public override void ToLa32(Configuration configuration, ReadOnlySpan sou } } - /// - public override void ToRgba32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) - { - Guard.NotNull(configuration, nameof(configuration)); - Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - - ref Rgb24 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref Rgba32 destRef = ref MemoryMarshal.GetReference(destinationPixels); - - for (int i = 0; i < sourcePixels.Length; i++) - { - ref Rgb24 sp = ref Unsafe.Add(ref sourceRef, i); - ref Rgba32 dp = ref Unsafe.Add(ref destRef, i); - - dp.FromRgb24(sp); - } - } - /// public override void ToRgb48(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) { diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Rgba32.PixelOperations.Generated.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Rgba32.PixelOperations.Generated.cs index 9a36ec29a4..5b60ec10e3 100644 --- a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Rgba32.PixelOperations.Generated.cs +++ b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Rgba32.PixelOperations.Generated.cs @@ -95,23 +95,59 @@ public override void FromBgra32( Span dest = MemoryMarshal.Cast(destinationPixels); PixelConverter.FromBgra32.ToRgba32(source, dest); } + /// + public override void ToRgb24( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); + + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromRgba32.ToRgb24(source, dest); + } /// - public override void ToBgr24(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void FromRgb24( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref Rgba32 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref Bgr24 destRef = ref MemoryMarshal.GetReference(destinationPixels); + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromRgb24.ToRgba32(source, dest); + } + /// + public override void ToBgr24( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - for (int i = 0; i < sourcePixels.Length; i++) - { - ref Rgba32 sp = ref Unsafe.Add(ref sourceRef, i); - ref Bgr24 dp = ref Unsafe.Add(ref destRef, i); + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromRgba32.ToBgr24(source, dest); + } - dp.FromRgba32(sp); - } + /// + public override void FromBgr24( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) + { + Guard.NotNull(configuration, nameof(configuration)); + Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); + + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromBgr24.ToRgba32(source, dest); } /// @@ -186,24 +222,6 @@ public override void ToLa32(Configuration configuration, ReadOnlySpan so } } - /// - public override void ToRgb24(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) - { - Guard.NotNull(configuration, nameof(configuration)); - Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - - ref Rgba32 sourceRef = ref MemoryMarshal.GetReference(sourcePixels); - ref Rgb24 destRef = ref MemoryMarshal.GetReference(destinationPixels); - - for (int i = 0; i < sourcePixels.Length; i++) - { - ref Rgba32 sp = ref Unsafe.Add(ref sourceRef, i); - ref Rgb24 dp = ref Unsafe.Add(ref destRef, i); - - dp.FromRgba32(sp); - } - } - /// public override void ToRgb48(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) { diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/_Common.ttinclude b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/_Common.ttinclude index d8b5286cd7..b728b01152 100644 --- a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/_Common.ttinclude +++ b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/_Common.ttinclude @@ -17,7 +17,7 @@ using System.Runtime.InteropServices; <#+ static readonly string[] CommonPixelTypes = { "Argb32", "Bgr24", "Bgra32", "L8", "L16", "La16", "La32", "Rgb24", "Rgba32", "Rgb48", "Rgba64", "Bgra5551" }; - static readonly string[] Optimized32BitTypes = { "Rgba32", "Argb32", "Bgra32" }; + static readonly string[] OptimizedPixelTypes = { "Rgba32", "Argb32", "Bgra32", "Rgb24", "Bgr24" }; // Types with Rgba32-combatible to/from Vector4 conversion static readonly string[] Rgba32CompatibleTypes = { "Argb32", "Bgra32", "Rgb24", "Bgr24" }; @@ -148,8 +148,8 @@ using System.Runtime.InteropServices; GenerateRgba32CompatibleVector4ConversionMethods(pixelType, pixelType.EndsWith("32")); } - var matching32BitTypes = Optimized32BitTypes.Contains(pixelType) ? - Optimized32BitTypes.Where(p => p != pixelType) : + var matching32BitTypes = OptimizedPixelTypes.Contains(pixelType) ? + OptimizedPixelTypes.Where(p => p != pixelType) : Enumerable.Empty(); foreach (string destPixelType in matching32BitTypes) From 1f73b219207ab35ee18f771b29c30346cdca1ba2 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 2 Nov 2020 18:14:11 +0000 Subject: [PATCH 13/20] Add Rgb24 <==> Vector4 benchmarks --- .../Color/Bulk/FromVector4.cs | 4 +- .../Color/Bulk/FromVector4_Rgb24.cs | 55 ++++++++++++++++ .../Color/Bulk/ToVector4_Rgb24.cs | 65 +++++++++++++++++++ 3 files changed, 122 insertions(+), 2 deletions(-) create mode 100644 tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4_Rgb24.cs create mode 100644 tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4_Rgb24.cs diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs index dc030e07a7..1db9147ad2 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs @@ -30,7 +30,7 @@ public abstract class FromVector4 protected Configuration Configuration => Configuration.Default; // [Params(64, 2048)] - [Params(1024)] + [Params(64, 256, 2048)] public int Count { get; set; } [GlobalSetup] @@ -58,7 +58,7 @@ public void PerElement() } } - [Benchmark] + [Benchmark(Baseline = true)] public void PixelOperations_Base() { new PixelOperations().FromVector4Destructive(this.Configuration, this.source.GetSpan(), this.destination.GetSpan()); diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4_Rgb24.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4_Rgb24.cs new file mode 100644 index 0000000000..5da6edc6b5 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4_Rgb24.cs @@ -0,0 +1,55 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.PixelFormats; + +namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk +{ + [Config(typeof(Config.ShortClr))] + public class FromVector4_Rgb24 : FromVector4 + { + } +} + +// 2020-11-02 +// ########## +// +// BenchmarkDotNet = v0.12.1, OS = Windows 10.0.19041.572(2004 /?/ 20H1) +// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores +// .NET Core SDK=3.1.403 +// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT +// Job-XYEQXL : .NET Framework 4.8 (4.8.4250.0), X64 RyuJIT +// Job-HSXNJV : .NET Core 2.1.23 (CoreCLR 4.6.29321.03, CoreFX 4.6.29321.01), X64 RyuJIT +// Job-YUREJO : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT +// +// IterationCount=3 LaunchCount=1 WarmupCount=3 +// +// | Method | Job | Runtime | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | +// |---------------------------- |----------- |-------------- |------ |-----------:|------------:|----------:|------:|--------:|-------:|------:|------:|----------:| +// | PixelOperations_Base | Job-XYEQXL | .NET 4.7.2 | 64 | 343.2 ns | 305.91 ns | 16.77 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B | +// | PixelOperations_Specialized | Job-XYEQXL | .NET 4.7.2 | 64 | 320.8 ns | 19.93 ns | 1.09 ns | 0.94 | 0.05 | - | - | - | - | +// | | | | | | | | | | | | | | +// | PixelOperations_Base | Job-HSXNJV | .NET Core 2.1 | 64 | 234.3 ns | 17.98 ns | 0.99 ns | 1.00 | 0.00 | 0.0052 | - | - | 24 B | +// | PixelOperations_Specialized | Job-HSXNJV | .NET Core 2.1 | 64 | 246.0 ns | 82.34 ns | 4.51 ns | 1.05 | 0.02 | - | - | - | - | +// | | | | | | | | | | | | | | +// | PixelOperations_Base | Job-YUREJO | .NET Core 3.1 | 64 | 222.3 ns | 39.46 ns | 2.16 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B | +// | PixelOperations_Specialized | Job-YUREJO | .NET Core 3.1 | 64 | 243.4 ns | 33.58 ns | 1.84 ns | 1.09 | 0.01 | - | - | - | - | +// | | | | | | | | | | | | | | +// | PixelOperations_Base | Job-XYEQXL | .NET 4.7.2 | 256 | 824.9 ns | 32.77 ns | 1.80 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B | +// | PixelOperations_Specialized | Job-XYEQXL | .NET 4.7.2 | 256 | 967.0 ns | 39.09 ns | 2.14 ns | 1.17 | 0.01 | 0.0172 | - | - | 72 B | +// | | | | | | | | | | | | | | +// | PixelOperations_Base | Job-HSXNJV | .NET Core 2.1 | 256 | 756.9 ns | 94.43 ns | 5.18 ns | 1.00 | 0.00 | 0.0048 | - | - | 24 B | +// | PixelOperations_Specialized | Job-HSXNJV | .NET Core 2.1 | 256 | 1,003.3 ns | 3,192.09 ns | 174.97 ns | 1.32 | 0.22 | 0.0172 | - | - | 72 B | +// | | | | | | | | | | | | | | +// | PixelOperations_Base | Job-YUREJO | .NET Core 3.1 | 256 | 748.6 ns | 248.03 ns | 13.60 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B | +// | PixelOperations_Specialized | Job-YUREJO | .NET Core 3.1 | 256 | 437.0 ns | 36.48 ns | 2.00 ns | 0.58 | 0.01 | 0.0172 | - | - | 72 B | +// | | | | | | | | | | | | | | +// | PixelOperations_Base | Job-XYEQXL | .NET 4.7.2 | 2048 | 5,751.6 ns | 704.24 ns | 38.60 ns | 1.00 | 0.00 | - | - | - | 24 B | +// | PixelOperations_Specialized | Job-XYEQXL | .NET 4.7.2 | 2048 | 4,391.6 ns | 718.17 ns | 39.37 ns | 0.76 | 0.00 | 0.0153 | - | - | 72 B | +// | | | | | | | | | | | | | | +// | PixelOperations_Base | Job-HSXNJV | .NET Core 2.1 | 2048 | 6,202.0 ns | 1,815.18 ns | 99.50 ns | 1.00 | 0.00 | - | - | - | 24 B | +// | PixelOperations_Specialized | Job-HSXNJV | .NET Core 2.1 | 2048 | 4,225.6 ns | 1,004.03 ns | 55.03 ns | 0.68 | 0.01 | 0.0153 | - | - | 72 B | +// | | | | | | | | | | | | | | +// | PixelOperations_Base | Job-YUREJO | .NET Core 3.1 | 2048 | 6,157.1 ns | 2,516.98 ns | 137.96 ns | 1.00 | 0.00 | - | - | - | 24 B | +// | PixelOperations_Specialized | Job-YUREJO | .NET Core 3.1 | 2048 | 1,822.7 ns | 1,764.43 ns | 96.71 ns | 0.30 | 0.02 | 0.0172 | - | - | 72 B | diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4_Rgb24.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4_Rgb24.cs new file mode 100644 index 0000000000..aecd418316 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4_Rgb24.cs @@ -0,0 +1,65 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using BenchmarkDotNet.Attributes; + +using SixLabors.ImageSharp.Memory; +using SixLabors.ImageSharp.PixelFormats; + +namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk +{ + [Config(typeof(Config.ShortClr))] + public class ToVector4_Rgb24 : ToVector4 + { + [Benchmark(Baseline = true)] + public void PixelOperations_Base() + { + new PixelOperations().ToVector4( + this.Configuration, + this.source.GetSpan(), + this.destination.GetSpan()); + } + } +} + +// 2020-11-02 +// ########## +// +// BenchmarkDotNet = v0.12.1, OS = Windows 10.0.19041.572(2004 /?/ 20H1) +// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores +// .NET Core SDK=3.1.403 +// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT +// Job-XYEQXL : .NET Framework 4.8 (4.8.4250.0), X64 RyuJIT +// Job-HSXNJV : .NET Core 2.1.23 (CoreCLR 4.6.29321.03, CoreFX 4.6.29321.01), X64 RyuJIT +// Job-YUREJO : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT +// +// IterationCount=3 LaunchCount=1 WarmupCount=3 +// +// | Method | Job | Runtime | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | +// |---------------------------- |----------- |-------------- |------ |-----------:|------------:|----------:|------:|--------:|-------:|------:|------:|----------:| +// | PixelOperations_Base | Job-OIBEDX | .NET 4.7.2 | 64 | 298.4 ns | 33.63 ns | 1.84 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B | +// | PixelOperations_Specialized | Job-OIBEDX | .NET 4.7.2 | 64 | 355.5 ns | 908.51 ns | 49.80 ns | 1.19 | 0.17 | - | - | - | - | +// | | | | | | | | | | | | | | +// | PixelOperations_Base | Job-OPAORC | .NET Core 2.1 | 64 | 220.1 ns | 13.77 ns | 0.75 ns | 1.00 | 0.00 | 0.0055 | - | - | 24 B | +// | PixelOperations_Specialized | Job-OPAORC | .NET Core 2.1 | 64 | 228.5 ns | 41.41 ns | 2.27 ns | 1.04 | 0.01 | - | - | - | - | +// | | | | | | | | | | | | | | +// | PixelOperations_Base | Job-VPSIRL | .NET Core 3.1 | 64 | 213.6 ns | 12.47 ns | 0.68 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B | +// | PixelOperations_Specialized | Job-VPSIRL | .NET Core 3.1 | 64 | 217.0 ns | 9.95 ns | 0.55 ns | 1.02 | 0.01 | - | - | - | - | +// | | | | | | | | | | | | | | +// | PixelOperations_Base | Job-OIBEDX | .NET 4.7.2 | 256 | 829.0 ns | 242.93 ns | 13.32 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B | +// | PixelOperations_Specialized | Job-OIBEDX | .NET 4.7.2 | 256 | 448.9 ns | 4.04 ns | 0.22 ns | 0.54 | 0.01 | - | - | - | - | +// | | | | | | | | | | | | | | +// | PixelOperations_Base | Job-OPAORC | .NET Core 2.1 | 256 | 863.0 ns | 1,253.26 ns | 68.70 ns | 1.00 | 0.00 | 0.0048 | - | - | 24 B | +// | PixelOperations_Specialized | Job-OPAORC | .NET Core 2.1 | 256 | 309.2 ns | 66.16 ns | 3.63 ns | 0.36 | 0.03 | - | - | - | - | +// | | | | | | | | | | | | | | +// | PixelOperations_Base | Job-VPSIRL | .NET Core 3.1 | 256 | 737.0 ns | 253.90 ns | 13.92 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B | +// | PixelOperations_Specialized | Job-VPSIRL | .NET Core 3.1 | 256 | 212.3 ns | 1.07 ns | 0.06 ns | 0.29 | 0.01 | - | - | - | - | +// | | | | | | | | | | | | | | +// | PixelOperations_Base | Job-OIBEDX | .NET 4.7.2 | 2048 | 5,625.6 ns | 404.35 ns | 22.16 ns | 1.00 | 0.00 | - | - | - | 24 B | +// | PixelOperations_Specialized | Job-OIBEDX | .NET 4.7.2 | 2048 | 1,974.1 ns | 229.84 ns | 12.60 ns | 0.35 | 0.00 | - | - | - | - | +// | | | | | | | | | | | | | | +// | PixelOperations_Base | Job-OPAORC | .NET Core 2.1 | 2048 | 5,467.2 ns | 537.29 ns | 29.45 ns | 1.00 | 0.00 | - | - | - | 24 B | +// | PixelOperations_Specialized | Job-OPAORC | .NET Core 2.1 | 2048 | 1,985.5 ns | 4,714.23 ns | 258.40 ns | 0.36 | 0.05 | - | - | - | - | +// | | | | | | | | | | | | | | +// | PixelOperations_Base | Job-VPSIRL | .NET Core 3.1 | 2048 | 5,888.2 ns | 1,622.23 ns | 88.92 ns | 1.00 | 0.00 | - | - | - | 24 B | +// | PixelOperations_Specialized | Job-VPSIRL | .NET Core 3.1 | 2048 | 1,165.0 ns | 191.71 ns | 10.51 ns | 0.20 | 0.00 | - | - | - | - | From a08f9062a15dd35f2f5a13e2646b66cb38b3fc9f Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 4 Nov 2020 14:10:18 +0000 Subject: [PATCH 14/20] Unroll XYZWShuffle4Slice3 --- .../Common/Helpers/Shuffle/IShuffle4Slice3.cs | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs index 70800f9de2..083b0b2a96 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs @@ -65,9 +65,27 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) ref Byte3 dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); int n = source.Length / 4; - for (int i = 0; i < n; i++) + int m = ImageMaths.Modulo4(n); + int u = n - m; + + ref Byte3 dEnd = ref Unsafe.Add(ref dBase, u); + + while (Unsafe.IsAddressLessThan(ref dBase, ref dEnd)) + { + Unsafe.Add(ref dBase, 0) = Unsafe.As(ref Unsafe.Add(ref sBase, 0)); + Unsafe.Add(ref dBase, 1) = Unsafe.As(ref Unsafe.Add(ref sBase, 1)); + Unsafe.Add(ref dBase, 2) = Unsafe.As(ref Unsafe.Add(ref sBase, 2)); + Unsafe.Add(ref dBase, 3) = Unsafe.As(ref Unsafe.Add(ref sBase, 3)); + dBase = ref Unsafe.Add(ref dBase, 4); + sBase = ref Unsafe.Add(ref sBase, 4); + } + + if (m > 0) { - Unsafe.Add(ref dBase, i) = Unsafe.As(ref Unsafe.Add(ref sBase, i)); + for (int i = u; i < n; i++) + { + Unsafe.Add(ref dBase, i) = Unsafe.As(ref Unsafe.Add(ref sBase, i)); + } } } } From 4416d3d95126b926c34860b979508dec76769131 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 4 Nov 2020 16:55:17 +0000 Subject: [PATCH 15/20] Fix shuffle +m slice fallback --- .../Common/Helpers/Shuffle/IPad3Shuffle4.cs | 30 +++++++++---------- .../Common/Helpers/Shuffle/IShuffle4Slice3.cs | 18 ++++++----- .../Color/Bulk/FromVector4.cs | 2 +- 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs index 1f3fce541d..fbd4a343db 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs @@ -72,29 +72,29 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) [MethodImpl(InliningOptions.ShortMethod)] public void RunFallbackShuffle(ReadOnlySpan source, Span dest) { - ref byte rs = ref MemoryMarshal.GetReference(source); - ref byte rd = ref MemoryMarshal.GetReference(dest); + ref byte sBase = ref MemoryMarshal.GetReference(source); + ref byte dBase = ref MemoryMarshal.GetReference(dest); - ref byte rsEnd = ref Unsafe.Add(ref rs, source.Length); - ref byte rsLoopEnd = ref Unsafe.Subtract(ref rsEnd, 4); + ref byte sEnd = ref Unsafe.Add(ref sBase, source.Length); + ref byte sLoopEnd = ref Unsafe.Subtract(ref sEnd, 4); - while (Unsafe.IsAddressLessThan(ref rs, ref rsLoopEnd)) + while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd)) { - Unsafe.As(ref rd) = Unsafe.As(ref rs) | 0xFF000000; + Unsafe.As(ref dBase) = Unsafe.As(ref sBase) | 0xFF000000; - rs = ref Unsafe.Add(ref rs, 3); - rd = ref Unsafe.Add(ref rd, 4); + sBase = ref Unsafe.Add(ref sBase, 3); + dBase = ref Unsafe.Add(ref dBase, 4); } - while (Unsafe.IsAddressLessThan(ref rs, ref rsEnd)) + while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd)) { - Unsafe.Add(ref rd, 0) = Unsafe.Add(ref rs, 0); - Unsafe.Add(ref rd, 1) = Unsafe.Add(ref rs, 1); - Unsafe.Add(ref rd, 2) = Unsafe.Add(ref rs, 2); - Unsafe.Add(ref rd, 3) = byte.MaxValue; + Unsafe.Add(ref dBase, 0) = Unsafe.Add(ref sBase, 0); + Unsafe.Add(ref dBase, 1) = Unsafe.Add(ref sBase, 1); + Unsafe.Add(ref dBase, 2) = Unsafe.Add(ref sBase, 2); + Unsafe.Add(ref dBase, 3) = byte.MaxValue; - rs = ref Unsafe.Add(ref rs, 3); - rd = ref Unsafe.Add(ref rd, 4); + sBase = ref Unsafe.Add(ref sBase, 3); + dBase = ref Unsafe.Add(ref dBase, 4); } } } diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs index 083b0b2a96..e11956c105 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs @@ -68,24 +68,26 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) int m = ImageMaths.Modulo4(n); int u = n - m; - ref Byte3 dEnd = ref Unsafe.Add(ref dBase, u); + ref uint sLoopEnd = ref Unsafe.Add(ref sBase, u); + ref uint sEnd = ref Unsafe.Add(ref sBase, n); - while (Unsafe.IsAddressLessThan(ref dBase, ref dEnd)) + while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd)) { Unsafe.Add(ref dBase, 0) = Unsafe.As(ref Unsafe.Add(ref sBase, 0)); Unsafe.Add(ref dBase, 1) = Unsafe.As(ref Unsafe.Add(ref sBase, 1)); Unsafe.Add(ref dBase, 2) = Unsafe.As(ref Unsafe.Add(ref sBase, 2)); Unsafe.Add(ref dBase, 3) = Unsafe.As(ref Unsafe.Add(ref sBase, 3)); - dBase = ref Unsafe.Add(ref dBase, 4); + sBase = ref Unsafe.Add(ref sBase, 4); + dBase = ref Unsafe.Add(ref dBase, 4); } - if (m > 0) + while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd)) { - for (int i = u; i < n; i++) - { - Unsafe.Add(ref dBase, i) = Unsafe.As(ref Unsafe.Add(ref sBase, i)); - } + Unsafe.Add(ref dBase, 0) = Unsafe.As(ref Unsafe.Add(ref sBase, 0)); + + sBase = ref Unsafe.Add(ref sBase, 1); + dBase = ref Unsafe.Add(ref dBase, 1); } } } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs index 1db9147ad2..04ca8cd652 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/FromVector4.cs @@ -91,7 +91,7 @@ public void BasicIntrinsics256() SimdUtils.BasicIntrinsics256.NormalizedFloatToByteSaturate(sBytes, dFloats); } - [Benchmark(Baseline = true)] + [Benchmark] public void ExtendedIntrinsic() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); From 11cc6af36eedad8fbb310c0d7f38699d39e04f57 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 4 Nov 2020 17:31:27 +0000 Subject: [PATCH 16/20] Inline controls as constants --- .../Helpers/Shuffle/IComponentShuffle.cs | 32 ++++++++++++------- .../Common/Helpers/Shuffle/IPad3Shuffle4.cs | 8 +++-- .../Common/Helpers/Shuffle/IShuffle4Slice3.cs | 8 +++-- 3 files changed, 30 insertions(+), 18 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs b/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs index 2056075e7c..7687a5b95f 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs @@ -82,9 +82,11 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) internal readonly struct WXYZShuffle4 : IShuffle4 { - private static readonly byte WXYZ = SimdUtils.Shuffle.MmShuffle(2, 1, 0, 3); - - public byte Control => WXYZ; + public byte Control + { + [MethodImpl(InliningOptions.ShortMethod)] + get => SimdUtils.Shuffle.MmShuffle(2, 1, 0, 3); + } [MethodImpl(InliningOptions.ShortMethod)] public void RunFallbackShuffle(ReadOnlySpan source, Span dest) @@ -106,9 +108,11 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) internal readonly struct WZYXShuffle4 : IShuffle4 { - private static readonly byte WZYX = SimdUtils.Shuffle.MmShuffle(0, 1, 2, 3); - - public byte Control => WZYX; + public byte Control + { + [MethodImpl(InliningOptions.ShortMethod)] + get => SimdUtils.Shuffle.MmShuffle(0, 1, 2, 3); + } [MethodImpl(InliningOptions.ShortMethod)] public void RunFallbackShuffle(ReadOnlySpan source, Span dest) @@ -130,9 +134,11 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) internal readonly struct YZWXShuffle4 : IShuffle4 { - private static readonly byte YZWX = SimdUtils.Shuffle.MmShuffle(0, 3, 2, 1); - - public byte Control => YZWX; + public byte Control + { + [MethodImpl(InliningOptions.ShortMethod)] + get => SimdUtils.Shuffle.MmShuffle(0, 3, 2, 1); + } [MethodImpl(InliningOptions.ShortMethod)] public void RunFallbackShuffle(ReadOnlySpan source, Span dest) @@ -154,9 +160,11 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) internal readonly struct ZYXWShuffle4 : IShuffle4 { - private static readonly byte ZYXW = SimdUtils.Shuffle.MmShuffle(3, 0, 1, 2); - - public byte Control => ZYXW; + public byte Control + { + [MethodImpl(InliningOptions.ShortMethod)] + get => SimdUtils.Shuffle.MmShuffle(3, 0, 1, 2); + } [MethodImpl(InliningOptions.ShortMethod)] public void RunFallbackShuffle(ReadOnlySpan source, Span dest) diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs index fbd4a343db..0c2b1d5082 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs @@ -65,9 +65,11 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) internal readonly struct XYZWPad3Shuffle4 : IPad3Shuffle4 { - private static readonly byte XYZW = SimdUtils.Shuffle.MmShuffle(3, 2, 1, 0); - - public byte Control => XYZW; + public byte Control + { + [MethodImpl(InliningOptions.ShortMethod)] + get => SimdUtils.Shuffle.MmShuffle(3, 2, 1, 0); + } [MethodImpl(InliningOptions.ShortMethod)] public void RunFallbackShuffle(ReadOnlySpan source, Span dest) diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs index e11956c105..86e4174f11 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs @@ -54,9 +54,11 @@ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) internal readonly struct XYZWShuffle4Slice3 : IShuffle4Slice3 { - private static readonly byte XYZW = SimdUtils.Shuffle.MmShuffle(3, 2, 1, 0); - - public byte Control => XYZW; + public byte Control + { + [MethodImpl(InliningOptions.ShortMethod)] + get => SimdUtils.Shuffle.MmShuffle(3, 2, 1, 0); + } [MethodImpl(InliningOptions.ShortMethod)] public void RunFallbackShuffle(ReadOnlySpan source, Span dest) From 1ad9fcdad63259ee628b04c81b85108e9948bbfb Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 6 Nov 2020 19:49:52 +0000 Subject: [PATCH 17/20] Handle Bmp encoder padding. --- src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs | 4 ++-- tests/ImageSharp.Tests/Common/SimdUtilsTests.cs | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index 8fd6bcce68..07744566a3 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -225,9 +225,9 @@ private static void VerifyShuffle4Slice3SpanInput(ReadOnlySpan source, Spa "Output span must be divisable by 3!"); DebugGuard.IsTrue( - source.Length == dest.Length * 4 / 3, + dest.Length >= source.Length * 3 / 4, nameof(source), - "Output span must be 3/4 the length of the input span!"); + "Output span must be at least 3/4 the length of the input span!"); } public static class Shuffle diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index fe432107a2..ec09e43e57 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -164,8 +164,6 @@ public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithNonRoundedDa public static readonly TheoryData ArraySizesDivisibleBy8 = new TheoryData { 0, 8, 16, 1024 }; public static readonly TheoryData ArraySizesDivisibleBy4 = new TheoryData { 0, 4, 8, 28, 1020 }; public static readonly TheoryData ArraySizesDivisibleBy3 = new TheoryData { 0, 3, 9, 36, 957 }; - - public static readonly TheoryData ArraySizesDivisibleBy32 = new TheoryData { 0, 32, 512 }; public static readonly TheoryData ArbitraryArraySizes = From e1168ad5509bfc48013beef7ce2a57a7101002f4 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 6 Nov 2020 19:50:29 +0000 Subject: [PATCH 18/20] Update src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs Co-authored-by: Clinton Ingram --- src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 296970ddcc..86ba074280 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -489,12 +489,8 @@ private static void Shuffle4Slice3( v3 = Ssse3.Shuffle(Ssse3.Shuffle(v3, vshuffle), vmasko); v0 = Ssse3.AlignRight(v1, v0, 4); - v3 = Ssse3.AlignRight(v3, v2, 12); - - v1 = Sse2.ShiftLeftLogical128BitLane(v1, 4); - v2 = Sse2.ShiftRightLogical128BitLane(v2, 4); - - v1 = Ssse3.AlignRight(v2, v1, 8); + v1 = Sse2.Or(Sse2.ShiftRightLogical128BitLane(v1, 4), Sse2.ShiftLeftLogical128BitLane(v2, 4)); + v2 = Ssse3.AlignRight(v3, v2, 12); ref Vector128 vd = ref Unsafe.Add(ref destBase, j); From 74dd8cdc49067e9872e9e10a89d80ad4521e6d60 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 6 Nov 2020 20:25:28 +0000 Subject: [PATCH 19/20] Use ROS trick all round and optimize Shuffle3 --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 86ba074280..51c81be066 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -18,6 +18,10 @@ public static class HwIntrinsics public static ReadOnlySpan PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 }; + private static ReadOnlySpan ShuffleMaskPad4Nx16 => new byte[] { 0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80 }; + + private static ReadOnlySpan ShuffleMaskSlice4Nx16 => new byte[] { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0x80, 0x80, 0x80, 0x80 }; + /// /// Shuffle single-precision (32-bit) floating-point elements in /// using the control and store the results in . @@ -352,10 +356,12 @@ private static void Shuffle3( { if (Ssse3.IsSupported) { - Vector128 vmask = Vector128.Create(0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80).AsByte(); + ref byte vmaskBase = ref MemoryMarshal.GetReference(ShuffleMaskPad4Nx16); + Vector128 vmask = Unsafe.As>(ref vmaskBase); Vector128 vfill = Vector128.Create(0xff000000ff000000ul).AsByte(); - Vector128 vmasko = Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15).AsByte(); - Vector128 vmaske = Ssse3.AlignRight(vmasko, vmasko, 12).AsByte(); + ref byte vmaskoBase = ref MemoryMarshal.GetReference(ShuffleMaskSlice4Nx16); + Vector128 vmasko = Unsafe.As>(ref vmaskoBase); + Vector128 vmaske = Ssse3.AlignRight(vmasko, vmasko, 12); Span bytes = stackalloc byte[Vector128.Count]; Shuffle.MmShuffleSpan(ref bytes, control); @@ -381,10 +387,10 @@ private static void Shuffle3( v2 = Ssse3.AlignRight(v2, v1, 8); v1 = Ssse3.AlignRight(v1, v0, 12); - v0 = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v0, vmask), vfill), vshuffle); - v1 = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v1, vmask), vfill), vshuffle); - v2 = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v2, vmask), vfill), vshuffle); - v3 = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v3, vmask), vfill), vshuffle); + v0 = Ssse3.Shuffle(Ssse3.Shuffle(v0, vmask), vshuffle); + v1 = Ssse3.Shuffle(Ssse3.Shuffle(v1, vmask), vshuffle); + v2 = Ssse3.Shuffle(Ssse3.Shuffle(v2, vmask), vshuffle); + v3 = Ssse3.Shuffle(Ssse3.Shuffle(v3, vmask), vshuffle); v0 = Ssse3.Shuffle(v0, vmaske); v1 = Ssse3.Shuffle(v1, vmasko); @@ -392,12 +398,8 @@ private static void Shuffle3( v3 = Ssse3.Shuffle(v3, vmasko); v0 = Ssse3.AlignRight(v1, v0, 4); - v3 = Ssse3.AlignRight(v3, v2, 12); - - v1 = Sse2.ShiftLeftLogical128BitLane(v1, 4); - v2 = Sse2.ShiftRightLogical128BitLane(v2, 4); - - v1 = Ssse3.AlignRight(v2, v1, 8); + v1 = Sse2.Or(Sse2.ShiftRightLogical128BitLane(v1, 4), Sse2.ShiftLeftLogical128BitLane(v2, 4)); + v2 = Ssse3.AlignRight(v3, v2, 12); ref Vector128 vd = ref Unsafe.Add(ref destBase, i); @@ -416,7 +418,8 @@ private static void Pad3Shuffle4( { if (Ssse3.IsSupported) { - Vector128 vmask = Vector128.Create(0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80).AsByte(); + ref byte vmaskBase = ref MemoryMarshal.GetReference(ShuffleMaskPad4Nx16); + Vector128 vmask = Unsafe.As>(ref vmaskBase); Vector128 vfill = Vector128.Create(0xff000000ff000000ul).AsByte(); Span bytes = stackalloc byte[Vector128.Count]; @@ -459,8 +462,9 @@ private static void Shuffle4Slice3( { if (Ssse3.IsSupported) { - Vector128 vmasko = Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15).AsByte(); - Vector128 vmaske = Ssse3.AlignRight(vmasko, vmasko, 12).AsByte(); + ref byte vmaskoBase = ref MemoryMarshal.GetReference(ShuffleMaskSlice4Nx16); + Vector128 vmasko = Unsafe.As>(ref vmaskoBase); + Vector128 vmaske = Ssse3.AlignRight(vmasko, vmasko, 12); Span bytes = stackalloc byte[Vector128.Count]; Shuffle.MmShuffleSpan(ref bytes, control); From 3cda066270912f3289de3085b6719f03cb0fbf3c Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 6 Nov 2020 20:45:39 +0000 Subject: [PATCH 20/20] Fix shuffle --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 51c81be066..2ea7f2c9bd 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -358,7 +358,6 @@ private static void Shuffle3( { ref byte vmaskBase = ref MemoryMarshal.GetReference(ShuffleMaskPad4Nx16); Vector128 vmask = Unsafe.As>(ref vmaskBase); - Vector128 vfill = Vector128.Create(0xff000000ff000000ul).AsByte(); ref byte vmaskoBase = ref MemoryMarshal.GetReference(ShuffleMaskSlice4Nx16); Vector128 vmasko = Unsafe.As>(ref vmaskoBase); Vector128 vmaske = Ssse3.AlignRight(vmasko, vmasko, 12); @@ -398,8 +397,12 @@ private static void Shuffle3( v3 = Ssse3.Shuffle(v3, vmasko); v0 = Ssse3.AlignRight(v1, v0, 4); - v1 = Sse2.Or(Sse2.ShiftRightLogical128BitLane(v1, 4), Sse2.ShiftLeftLogical128BitLane(v2, 4)); - v2 = Ssse3.AlignRight(v3, v2, 12); + v3 = Ssse3.AlignRight(v3, v2, 12); + + v1 = Sse2.ShiftLeftLogical128BitLane(v1, 4); + v2 = Sse2.ShiftRightLogical128BitLane(v2, 4); + + v1 = Ssse3.AlignRight(v2, v1, 8); ref Vector128 vd = ref Unsafe.Add(ref destBase, i); @@ -493,8 +496,12 @@ private static void Shuffle4Slice3( v3 = Ssse3.Shuffle(Ssse3.Shuffle(v3, vshuffle), vmasko); v0 = Ssse3.AlignRight(v1, v0, 4); - v1 = Sse2.Or(Sse2.ShiftRightLogical128BitLane(v1, 4), Sse2.ShiftLeftLogical128BitLane(v2, 4)); - v2 = Ssse3.AlignRight(v3, v2, 12); + v3 = Ssse3.AlignRight(v3, v2, 12); + + v1 = Sse2.ShiftLeftLogical128BitLane(v1, 4); + v2 = Sse2.ShiftRightLogical128BitLane(v2, 4); + + v1 = Ssse3.AlignRight(v2, v1, 8); ref Vector128 vd = ref Unsafe.Add(ref destBase, j);