Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2374,9 +2374,8 @@ public static void Reverse(ref byte buf, nuint length)
buf = ref Unsafe.Add(ref buf, numIters * numElements);
length -= numIters * numElements * 2;
}
else if (Ssse3.IsSupported && (nuint)Vector128<byte>.Count * 2 <= length)
else if (Vector128.IsHardwareAccelerated && (nuint)Vector128<byte>.Count * 2 <= length)
{
Vector128<byte> reverseMask = Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
nuint numElements = (nuint)Vector128<byte>.Count;
nuint numIters = (length / numElements) / 2;
for (nuint i = 0; i < numIters; i++)
Expand All @@ -2396,8 +2395,10 @@ public static void Reverse(ref byte buf, nuint length)
// +---------------------------------------------------------------+
// | P | O | N | M | L | K | J | I | H | G | F | E | D | C | B | A |
// +---------------------------------------------------------------+
tempFirst = Ssse3.Shuffle(tempFirst, reverseMask);
tempLast = Ssse3.Shuffle(tempLast, reverseMask);
tempFirst = Vector128.Shuffle(tempFirst, Vector128.Create(
(byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
tempLast = Vector128.Shuffle(tempLast, Vector128.Create(
(byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));

// Store the reversed vectors
tempLast.StoreUnsafe(ref buf, firstOffset);
Expand Down
35 changes: 18 additions & 17 deletions src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2018,10 +2018,10 @@ private static int FindFirstMatchedLane(Vector128<ushort> compareResult)

public static void Reverse(ref char buf, nuint length)
{
ref byte bufByte = ref Unsafe.As<char, byte>(ref buf);
nuint byteLength = length * sizeof(char);
if (Avx2.IsSupported && (nuint)Vector256<short>.Count * 2 <= length)
{
ref byte bufByte = ref Unsafe.As<char, byte>(ref buf);
nuint byteLength = length * sizeof(char);
Vector256<byte> reverseMask = Vector256.Create(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The issue when you tried to replace Avx2.Shuffle with Vector256.Shuffle is that you didn't adjust the reverseMask.

You should change this:

Vector256.Create(
    (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,   // first 128-bit lane
          15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); // second 128-bit lane

To:

Vector256.Create(
    (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,   // first 128-bit lane
          31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16)); // second 128-bit lane

The Vector256 APIs operate as if it is 1x256-bit vector rather than as 2x128-bit vector lanes. This is consistent with how AVX-512, Arm64, WASM, Vector64, Vector128, and other types all operate.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you do this, then Vector256.Shuffle(tempFirst, Vector256.Create(...)) will work as expected and still be performant on AVX2 hardware where you don't want to cross lanes.

Copy link
Contributor Author

@SwapnilGaikwad SwapnilGaikwad Aug 1, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Vector256 APIs operate as if it is 1x256-bit vector

In this case, shouldn't we adjust the mask to

Vector256.Create(
    (byte)31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
          15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ));

I noticed the issue with the reverse mask but haven't been able to reproduce the failure yet. Does the runfo tool expect to be used on Windows only? The steps for using runfo to reproduce the pipeline failure seem to create a batch file to run on Windows.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

runfo

It can be used on Linux as well. You just need to pass the right job-id and work item, which you can find in AzDo.

image

The RunTests.cmd/sh is present in the zip folder you download.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ooh, right. Thanks Kunal.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we leave the AVX2 changes out of this patch? The patch is now self-contained to changes in the 128-bit variants.

I'm happy for the 256-bit part to be changed if it's obvious, but clearly it's going to require some extra debugging to get right and not break performance (given #72793). Let's avoid feature creep in this PR.

Copy link
Member

@EgorBo EgorBo Aug 2, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's fine to leave it untouched; the cross-platform Vector256 APIs are mostly there for consistency. They're not really cross-platform and are unlikely ever to be.

(byte)14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, // first 128-bit lane
14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); // second 128-bit lane
Expand Down Expand Up @@ -2060,20 +2060,22 @@ public static void Reverse(ref char buf, nuint length)
}
bufByte = ref Unsafe.Add(ref bufByte, numIters * numElements);
length -= numIters * (nuint)Vector256<short>.Count * 2;
// Store any remaining values one-by-one
buf = ref Unsafe.As<byte, char>(ref bufByte);
}
else if (Ssse3.IsSupported && (nuint)Vector128<short>.Count * 2 <= length)
else if (Vector128.IsHardwareAccelerated && (nuint)Vector128<short>.Count * 2 <= length)
{
Vector128<byte> reverseMask = Vector128.Create((byte)14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
nuint numElements = (nuint)Vector128<byte>.Count;
nuint numIters = ((length * sizeof(char)) / numElements) / 2;
ref short bufShort = ref Unsafe.As<char, short>(ref buf);
nuint numElements = (nuint)Vector128<short>.Count;
nuint numIters = (length / numElements) / 2;
for (nuint i = 0; i < numIters; i++)
{
nuint firstOffset = i * numElements;
nuint lastOffset = byteLength - ((1 + i) * numElements);
nuint lastOffset = length - ((1 + i) * numElements);

// Load in values from beginning and end of the array.
Vector128<byte> tempFirst = Vector128.LoadUnsafe(ref bufByte, firstOffset);
Vector128<byte> tempLast = Vector128.LoadUnsafe(ref bufByte, lastOffset);
Vector128<short> tempFirst = Vector128.LoadUnsafe(ref bufShort, firstOffset);
Vector128<short> tempLast = Vector128.LoadUnsafe(ref bufShort, lastOffset);

// Shuffle to reverse each vector:
// +-------------------------------+
Expand All @@ -2083,19 +2085,18 @@ public static void Reverse(ref char buf, nuint length)
// +-------------------------------+
// | H | G | F | E | D | C | B | A |
// +-------------------------------+
tempFirst = Ssse3.Shuffle(tempFirst, reverseMask);
tempLast = Ssse3.Shuffle(tempLast, reverseMask);
tempFirst = Vector128.Shuffle(tempFirst, Vector128.Create(7, 6, 5, 4, 3, 2, 1, 0));
tempLast = Vector128.Shuffle(tempLast, Vector128.Create(7, 6, 5, 4, 3, 2, 1, 0));

// Store the reversed vectors
tempLast.StoreUnsafe(ref bufByte, firstOffset);
tempFirst.StoreUnsafe(ref bufByte, lastOffset);
tempLast.StoreUnsafe(ref bufShort, firstOffset);
tempFirst.StoreUnsafe(ref bufShort, lastOffset);
}
bufByte = ref Unsafe.Add(ref bufByte, numIters * numElements);
bufShort = ref Unsafe.Add(ref bufShort, numIters * numElements);
length -= numIters * (nuint)Vector128<short>.Count * 2;
// Store any remaining values one-by-one
buf = ref Unsafe.As<short, char>(ref bufShort);
}

// Store any remaining values one-by-one
buf = ref Unsafe.As<byte, char>(ref bufByte);
ReverseInner(ref buf, length);
}
}
Expand Down
32 changes: 15 additions & 17 deletions src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;

namespace System
Expand Down Expand Up @@ -441,7 +442,7 @@ public static void Reverse(ref int buf, nuint length)
buf = ref Unsafe.Add(ref buf, numIters * numElements);
length -= numIters * numElements * 2;
}
else if (Sse2.IsSupported && (nuint)Vector128<int>.Count * 2 <= length)
else if (Vector128.IsHardwareAccelerated && (nuint)Vector128<int>.Count * 2 <= length)
{
nuint numElements = (nuint)Vector128<int>.Count;
nuint numIters = (length / numElements) / 2;
Expand All @@ -462,8 +463,8 @@ public static void Reverse(ref int buf, nuint length)
// +---------------+
// | D | C | B | A |
// +---------------+
tempFirst = Sse2.Shuffle(tempFirst, 0b00_01_10_11);
tempLast = Sse2.Shuffle(tempLast, 0b00_01_10_11);
tempFirst = Vector128.Shuffle(tempFirst, Vector128.Create(3, 2, 1, 0));
tempLast = Vector128.Shuffle(tempLast, Vector128.Create(3, 2, 1, 0));

// Store the values into final location
tempLast.StoreUnsafe(ref buf, firstOffset);
Expand Down Expand Up @@ -508,19 +509,17 @@ public static void Reverse(ref long buf, nuint length)
buf = ref Unsafe.Add(ref buf, numIters * numElements);
length -= numIters * numElements * 2;
}
else if (Sse2.IsSupported && (nuint)Vector128<long>.Count * 2 <= length)
else if (Vector128.IsHardwareAccelerated && (nuint)Vector128<long>.Count * 2 <= length)
{
ref int bufInt = ref Unsafe.As<long, int>(ref buf);
nuint intLength = length * (sizeof(long) / sizeof(int));
nuint numElements = (nuint)Vector128<int>.Count;
nuint numIters = (intLength / numElements) / 2;
nuint numElements = (nuint)Vector128<long>.Count;
nuint numIters = (length / numElements) / 2;
for (nuint i = 0; i < numIters; i++)
{
nuint firstOffset = i * numElements;
nuint lastOffset = intLength - ((1 + i) * numElements);
nuint lastOffset = length - ((1 + i) * numElements);
// Load the values into vectors
Vector128<int> tempFirst = Vector128.LoadUnsafe(ref bufInt, firstOffset);
Vector128<int> tempLast = Vector128.LoadUnsafe(ref bufInt, lastOffset);
Vector128<long> tempFirst = Vector128.LoadUnsafe(ref buf, firstOffset);
Vector128<long> tempLast = Vector128.LoadUnsafe(ref buf, lastOffset);

// Shuffle to reverse each vector:
// +-------+
Expand All @@ -530,15 +529,14 @@ public static void Reverse(ref long buf, nuint length)
// +-------+
// | B | A |
// +-------+
tempFirst = Sse2.Shuffle(tempFirst, 0b0100_1110);
tempLast = Sse2.Shuffle(tempLast, 0b0100_1110);
tempFirst = Vector128.Shuffle(tempFirst, Vector128.Create(1, 0));
tempLast = Vector128.Shuffle(tempLast, Vector128.Create(1, 0));

// Store the values into final location
tempLast.StoreUnsafe(ref bufInt, firstOffset);
tempFirst.StoreUnsafe(ref bufInt, lastOffset);
tempLast.StoreUnsafe(ref buf, firstOffset);
tempFirst.StoreUnsafe(ref buf, lastOffset);
}
bufInt = ref Unsafe.Add(ref bufInt, numIters * numElements);
buf = ref Unsafe.As<int, long>(ref bufInt);
buf = ref Unsafe.Add(ref buf, numIters * numElements);
length -= numIters * (nuint)Vector128<long>.Count * 2;
}

Expand Down
Loading