Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
49e9364
Initial 3padshuffle4
JimBobSquarePants Oct 30, 2020
1d21dc9
Add Shuffle4Slice3
JimBobSquarePants Oct 30, 2020
9f38d40
Cleanup
JimBobSquarePants Oct 30, 2020
2421a56
Merge branch 'master' into js/Shuffle3Channel
JimBobSquarePants Oct 30, 2020
1b85483
fix spans directly
JimBobSquarePants Oct 31, 2020
21611e1
Faster Pad3Shuffle4
JimBobSquarePants Oct 31, 2020
f462bfe
Faster Shuffle4Slice3
JimBobSquarePants Oct 31, 2020
2d1f2cc
Update benchmark
JimBobSquarePants Oct 31, 2020
d5b2577
Fast fallbacks
JimBobSquarePants Nov 1, 2020
893bfdd
Don't cast full spans
JimBobSquarePants Nov 1, 2020
76d5277
Shuffle3 + Tests
JimBobSquarePants Nov 2, 2020
49062c4
Cleanup and fix tests
JimBobSquarePants Nov 2, 2020
8c32469
Fix Shuffle4Slice3, wire up shuffles.
JimBobSquarePants Nov 2, 2020
1f73b21
Add Rgb24 <==> Vector4 benchmarks
JimBobSquarePants Nov 2, 2020
a08f906
Unroll XYZWShuffle4Slice3
JimBobSquarePants Nov 4, 2020
4416d3d
Fix shuffle +m slice fallback
JimBobSquarePants Nov 4, 2020
11cc6af
Inline controls as constants
JimBobSquarePants Nov 4, 2020
1ad9fcd
Handle Bmp encoder padding.
JimBobSquarePants Nov 6, 2020
e1168ad
Update src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
JimBobSquarePants Nov 6, 2020
a46fb9b
Merge branch 'js/Shuffle3Channel' of https://github.com/SixLabors/Ima…
JimBobSquarePants Nov 6, 2020
74dd8cd
Use ROS trick all round and optimize Shuffle3
JimBobSquarePants Nov 6, 2020
56cfd96
Merge branch 'master' into js/Shuffle3Channel
JimBobSquarePants Nov 6, 2020
3cda066
Fix shuffle
JimBobSquarePants Nov 6, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 142 additions & 7 deletions src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public static class HwIntrinsics
/// <param name="dest">The destination span of floats.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4ChannelReduce(
public static void Shuffle4Reduce(
ref ReadOnlySpan<float> source,
ref Span<float> dest,
byte control)
Expand All @@ -41,7 +41,7 @@ public static void Shuffle4ChannelReduce(

if (adjustedCount > 0)
{
Shuffle4Channel(
Shuffle4(
source.Slice(0, adjustedCount),
dest.Slice(0, adjustedCount),
control);
Expand All @@ -53,14 +53,14 @@ public static void Shuffle4ChannelReduce(
}

/// <summary>
/// Shuffle 8-bit integers in a within 128-bit lanes in <paramref name="source"/>
/// Shuffle 8-bit integers within 128-bit lanes in <paramref name="source"/>
/// using the control and store the results in <paramref name="dest"/>.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4ChannelReduce(
public static void Shuffle4Reduce(
ref ReadOnlySpan<byte> source,
ref Span<byte> dest,
byte control)
Expand All @@ -75,7 +75,7 @@ public static void Shuffle4ChannelReduce(

if (adjustedCount > 0)
{
Shuffle4Channel(
Shuffle4(
source.Slice(0, adjustedCount),
dest.Slice(0, adjustedCount),
control);
Expand All @@ -86,8 +86,74 @@ public static void Shuffle4ChannelReduce(
}
}

/// <summary>
/// Pads then shuffles 8-bit integers within 128-bit lanes in <paramref name="source"/>
/// using the control and stores the results in <paramref name="dest"/>.
/// Only the part of the input that can be handled by the vectorized routine is processed;
/// on return both spans have been sliced so that they reference the unprocessed remainder.
/// When SSSE3 is not supported the spans are left untouched.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static unsafe void Pad3Shuffle4Reduce(
    ref ReadOnlySpan<byte> source,
    ref Span<byte> dest,
    byte control)
{
    if (Ssse3.IsSupported)
    {
        int remainder = ImageMaths.ModuloP2(source.Length, Vector128<byte>.Count);

        // The vectorized routine advances 16 destination bytes (and 12 source bytes)
        // per iteration and derives its iteration count from the length of the span
        // it is given, so 'adjustedCount' is the number of destination bytes written.
        int adjustedCount = source.Length - remainder;

        // Source bytes consumed by the vectorized pass: 3 for every 4 bytes written.
        // Computed with 64-bit integer math; the previous single precision
        // calculation (adjustedCount * 0.75F) cannot represent every int exactly
        // and could yield a wrong offset for very large spans.
        int sourceSlice = (int)((long)adjustedCount * 3 / 4);

        if (adjustedCount > 0)
        {
            Pad3Shuffle4(
                source.Slice(0, adjustedCount),
                dest.Slice(0, adjustedCount),
                control);

            source = source.Slice(sourceSlice);
            dest = dest.Slice(adjustedCount);
        }
    }
}

/// <summary>
/// Shuffles then slices 8-bit integers within 128-bit lanes in <paramref name="source"/>
/// using the control and stores the results in <paramref name="dest"/>.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Shuffle4Channel(
public static unsafe void Shuffle4Slice3Reduce(
    ref ReadOnlySpan<byte> source,
    ref Span<byte> dest,
    byte control)
{
    // Processes the part of the input that the vectorized routine can handle and
    // slices both spans so that, on return, they reference the unprocessed remainder.
    // When SSSE3 is not supported the spans are left untouched.
    if (Ssse3.IsSupported)
    {
        int remainder = ImageMaths.ModuloP2(dest.Length, Vector128<byte>.Count);

        // The vectorized routine advances 16 source bytes (and 12 destination bytes)
        // per iteration and derives its iteration count from the length of the span
        // it is given, so 'adjustedCount' is the number of source bytes consumed.
        int adjustedCount = dest.Length - remainder;

        // Destination bytes produced by the vectorized pass: 3 for every 4 bytes read.
        // Computed with 64-bit integer math; the previous single precision
        // calculation (adjustedCount * 0.75F) cannot represent every int exactly
        // and could yield a wrong offset for very large spans.
        int destSlice = (int)((long)adjustedCount * 3 / 4);

        if (adjustedCount > 0)
        {
            Shuffle4Slice3(
                source.Slice(0, adjustedCount),
                dest.Slice(0, adjustedCount),
                control);

            source = source.Slice(adjustedCount);
            dest = dest.Slice(destSlice);
        }
    }
}

[MethodImpl(InliningOptions.ShortMethod)]
private static void Shuffle4(
ReadOnlySpan<float> source,
Span<float> dest,
byte control)
Expand Down Expand Up @@ -165,7 +231,7 @@ private static void Shuffle4Channel(
}

[MethodImpl(InliningOptions.ShortMethod)]
private static void Shuffle4Channel(
private static void Shuffle4(
ReadOnlySpan<byte> source,
Span<byte> dest,
byte control)
Expand Down Expand Up @@ -246,6 +312,75 @@ private static void Shuffle4Channel(
}
}

[MethodImpl(InliningOptions.ShortMethod)]
private static unsafe void Pad3Shuffle4(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    byte control)
{
    // Vectorized kernel: every iteration loads 16 bytes from the source, expands the
    // first 12 of them into four 4-byte groups, shuffles each group using the control
    // and stores 16 bytes to the destination. Does nothing when SSSE3 is unavailable.
    if (!Ssse3.IsSupported)
    {
        return;
    }

    // Moves each 3-byte group into the low three bytes of a 4-byte group.
    // A negative index makes the byte shuffle emit zero for that position.
    Vector128<byte> expandMask = Vector128.Create(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1).AsByte();

    // 0xFF in the highest byte of every 32-bit element, OR-ed into the padding byte.
    Vector128<byte> fillMask = Vector128.Create(0xff000000u).AsByte();

    // Expand the control byte into a 16-byte shuffle mask.
    Span<byte> controlBytes = stackalloc byte[Vector128<byte>.Count];
    Shuffle.MmShuffleSpan(ref controlBytes, control);
    Vector128<byte> shuffleMask = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(controlBytes));

    int length = source.Length;

    fixed (byte* sourcePtr = source)
    fixed (byte* destPtr = dest)
    {
        byte* src = sourcePtr;
        byte* dst = destPtr;

        // TODO: Consider unrolling and shuffling 4 at a time using Ssse3.AlignRight
        // See https://stackoverflow.com/questions/2973708/fast-24-bit-array-32-bit-array-conversion
        for (int offset = 0; offset < length; offset += 16)
        {
            Vector128<byte> packed = Sse2.LoadVector128(src);
            Vector128<byte> padded = Ssse3.Shuffle(packed, expandMask);
            Vector128<byte> filled = Sse2.Or(fillMask, padded);
            Sse2.Store(dst, Ssse3.Shuffle(filled, shuffleMask));

            // Only 12 of the 16 loaded bytes were consumed.
            src += 12;
            dst += 16;
        }
    }
}

[MethodImpl(InliningOptions.ShortMethod)]
private static unsafe void Shuffle4Slice3(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    byte control)
{
    // Vectorized kernel: every iteration loads 16 bytes from the source, shuffles each
    // 4-byte group using the control, packs the first three bytes of every group
    // together and stores the vector to the destination, advancing it by 12 bytes.
    // Does nothing when SSSE3 is unavailable.
    if (!Ssse3.IsSupported)
    {
        return;
    }

    // Keeps bytes 0-2 of each 4-byte group and packs them into the low 12 bytes.
    // A negative index makes the byte shuffle emit zero for that position.
    Vector128<byte> packMask = Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1).AsByte();

    // Expand the control byte into a 16-byte shuffle mask.
    Span<byte> controlBytes = stackalloc byte[Vector128<byte>.Count];
    Shuffle.MmShuffleSpan(ref controlBytes, control);
    Vector128<byte> shuffleMask = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(controlBytes));

    int length = source.Length;

    fixed (byte* sourcePtr = source)
    fixed (byte* destPtr = dest)
    {
        byte* src = sourcePtr;
        byte* dst = destPtr;

        for (int offset = 0; offset < length; offset += 16)
        {
            Vector128<byte> loaded = Sse2.LoadVector128(src);
            Vector128<byte> shuffled = Ssse3.Shuffle(loaded, shuffleMask);
            Vector128<byte> packed = Ssse3.Shuffle(shuffled, packMask);

            // A full 16-byte vector is written although the destination
            // pointer only moves forward by 12 bytes.
            Sse2.Store(dst, packed);

            src += 16;
            dst += 12;
        }
    }
}

/// <summary>
/// Performs a multiplication and an addition of the <see cref="Vector256{T}"/>.
/// </summary>
Expand Down
125 changes: 118 additions & 7 deletions src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,21 @@ internal static partial class SimdUtils
/// <param name="dest">The destination span of floats.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Channel(
public static void Shuffle4(
ReadOnlySpan<float> source,
Span<float> dest,
byte control)
{
VerifyShuffleSpanInput(source, dest);

#if SUPPORTS_RUNTIME_INTRINSICS
HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, control);
HwIntrinsics.Shuffle4Reduce(ref source, ref dest, control);
#endif

// Deal with the remainder:
if (source.Length > 0)
{
ShuffleRemainder4Channel(source, dest, control);
Shuffle4Remainder(source, dest, control);
}
}

Expand All @@ -44,7 +44,7 @@ public static void Shuffle4Channel(
/// <param name="dest">The destination span of bytes.</param>
/// <param name="shuffle">The type of shuffle to perform.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Channel<TShuffle>(
public static void Shuffle4<TShuffle>(
ReadOnlySpan<byte> source,
Span<byte> dest,
TShuffle shuffle)
Expand All @@ -53,7 +53,7 @@ public static void Shuffle4Channel<TShuffle>(
VerifyShuffleSpanInput(source, dest);

#if SUPPORTS_RUNTIME_INTRINSICS
HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, shuffle.Control);
HwIntrinsics.Shuffle4Reduce(ref source, ref dest, shuffle.Control);
#endif

// Deal with the remainder:
Expand All @@ -63,7 +63,45 @@ public static void Shuffle4Channel<TShuffle>(
}
}

public static void ShuffleRemainder4Channel(
/// <summary>
/// Pads each group of 3 bytes in <paramref name="source"/> to 4 bytes, shuffles the
/// group using the control and stores the results in <paramref name="dest"/>.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Pad3Shuffle4(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    byte control)
{
    VerifyPad3Shuffle4SpanInput(source, dest);

#if SUPPORTS_RUNTIME_INTRINSICS
    // Slices both spans past whatever the hardware accelerated path processed.
    HwIntrinsics.Pad3Shuffle4Reduce(ref source, ref dest, control);
#endif

    if (source.IsEmpty)
    {
        // Nothing left for the scalar fallback.
        return;
    }

    Pad3Shuffle4Remainder(source, dest, control);
}

/// <summary>
/// Shuffles each group of 4 bytes in <paramref name="source"/> using the control
/// and stores the first 3 bytes of every shuffled group in <paramref name="dest"/>.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Slice3(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    byte control)
{
    VerifyShuffle4Slice3SpanInput(source, dest);

#if SUPPORTS_RUNTIME_INTRINSICS
    // Slices both spans past whatever the hardware accelerated path processed.
    HwIntrinsics.Shuffle4Slice3Reduce(ref source, ref dest, control);
#endif

    if (source.IsEmpty)
    {
        // Nothing left for the scalar fallback.
        return;
    }

    Shuffle4Slice3Remainder(source, dest, control);
}

public static void Shuffle4Remainder(
ReadOnlySpan<float> source,
Span<float> dest,
byte control)
Expand All @@ -81,6 +119,41 @@ public static void ShuffleRemainder4Channel(
}
}

/// <summary>
/// Scalar fallback that expands every 3 bytes of <paramref name="source"/> into
/// 4 bytes of <paramref name="dest"/>. The three source bytes are written to the
/// offsets produced by <c>Shuffle.InverseMmShuffle</c> for the control and the
/// remaining offset is set to <see cref="byte.MaxValue"/>.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
public static void Pad3Shuffle4Remainder(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    byte control)
{
    Shuffle.InverseMmShuffle(control, out int idx3, out int idx2, out int idx1, out int idx0);

    ref byte sourceRef = ref MemoryMarshal.GetReference(source);
    ref byte destRef = ref MemoryMarshal.GetReference(dest);

    int sourceOffset = 0;
    for (int destOffset = 0; destOffset < dest.Length; destOffset += 4)
    {
        // Start of the current 4-byte destination group.
        ref byte group = ref Unsafe.Add(ref destRef, destOffset);

        Unsafe.Add(ref group, idx0) = Unsafe.Add(ref sourceRef, sourceOffset);
        Unsafe.Add(ref group, idx1) = Unsafe.Add(ref sourceRef, sourceOffset + 1);
        Unsafe.Add(ref group, idx2) = Unsafe.Add(ref sourceRef, sourceOffset + 2);
        Unsafe.Add(ref group, idx3) = byte.MaxValue;

        sourceOffset += 3;
    }
}

/// <summary>
/// Scalar fallback that reduces every 4 bytes of <paramref name="source"/> to
/// 3 bytes of <paramref name="dest"/>. The bytes are read from the offsets
/// produced by <c>Shuffle.InverseMmShuffle</c> for the control; the first
/// offset it yields is not used.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
/// <param name="control">The byte control.</param>
public static void Shuffle4Slice3Remainder(
    ReadOnlySpan<byte> source,
    Span<byte> dest,
    byte control)
{
    Shuffle.InverseMmShuffle(control, out int _, out int idx2, out int idx1, out int idx0);

    ref byte sourceRef = ref MemoryMarshal.GetReference(source);
    ref byte destRef = ref MemoryMarshal.GetReference(dest);

    int sourceOffset = 0;
    for (int destOffset = 0; destOffset < dest.Length; destOffset += 3)
    {
        // Start of the current 4-byte source group.
        ref byte group = ref Unsafe.Add(ref sourceRef, sourceOffset);

        Unsafe.Add(ref destRef, destOffset) = Unsafe.Add(ref group, idx0);
        Unsafe.Add(ref destRef, destOffset + 1) = Unsafe.Add(ref group, idx1);
        Unsafe.Add(ref destRef, destOffset + 2) = Unsafe.Add(ref group, idx2);

        sourceOffset += 4;
    }
}

[Conditional("DEBUG")]
private static void VerifyShuffleSpanInput<T>(ReadOnlySpan<T> source, Span<T> dest)
where T : struct
Expand All @@ -93,7 +166,45 @@ private static void VerifyShuffleSpanInput<T>(ReadOnlySpan<T> source, Span<T> de
DebugGuard.IsTrue(
source.Length % 4 == 0,
nameof(source),
"Input spans must be divisiable by 4!");
"Input spans must be divisable by 4!");
}

/// <summary>
/// Debug-only validation of the spans passed to the pad-3-shuffle-4 routines:
/// the source length must be a multiple of 3, the destination length a multiple
/// of 4, and the source must be exactly 3/4 the length of the destination.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
[Conditional("DEBUG")]
private static void VerifyPad3Shuffle4SpanInput(ReadOnlySpan<byte> source, Span<byte> dest)
{
    DebugGuard.IsTrue(
        source.Length % 3 == 0,
        nameof(source),
        "Input span must be divisable by 3!");

    DebugGuard.IsTrue(
        dest.Length % 4 == 0,
        nameof(dest),
        "Output span must be divisable by 4!");

    // Compare the 3:4 ratio by cross-multiplying in 64-bit integers. The previous
    // check (dest.Length * 3 / 4F) could overflow the int multiplication and loses
    // precision in single precision floating point for large lengths.
    DebugGuard.IsTrue(
        (long)source.Length * 4 == (long)dest.Length * 3,
        nameof(source),
        "Input span must be 3/4 the length of the output span!");
}

/// <summary>
/// Debug-only validation of the spans passed to the shuffle-4-slice-3 routines:
/// the source length must be a multiple of 4, the destination length a multiple
/// of 3, and the destination must be exactly 3/4 the length of the source.
/// </summary>
/// <param name="source">The source span of bytes.</param>
/// <param name="dest">The destination span of bytes.</param>
[Conditional("DEBUG")]
private static void VerifyShuffle4Slice3SpanInput(ReadOnlySpan<byte> source, Span<byte> dest)
{
    DebugGuard.IsTrue(
        source.Length % 4 == 0,
        nameof(source),
        "Input span must be divisable by 4!");

    DebugGuard.IsTrue(
        dest.Length % 3 == 0,
        nameof(dest),
        "Output span must be divisable by 3!");

    // Compare the 4:3 ratio by cross-multiplying in 64-bit integers. The previous
    // check (dest.Length * 4 / 3F) could overflow the int multiplication and loses
    // precision in single precision floating point for large lengths.
    DebugGuard.IsTrue(
        (long)source.Length * 3 == (long)dest.Length * 4,
        nameof(source),
        "Output span must be 3/4 the length of the input span!");
}

public static class Shuffle
Expand Down
Loading