From 0e185af11b3f35e34ebdf9aa1282f2ad9e8b0788 Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Thu, 4 Aug 2022 14:58:48 +0200 Subject: [PATCH 1/3] port SpanHelpers.IndexOfAny(ref byte, byte, byte, int) to Vector128/256 --- .../src/System/SpanHelpers.Byte.cs | 97 ++++--------------- 1 file changed, 21 insertions(+), 76 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs index 06d29a5bba2846..854178c841aef2 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs @@ -808,7 +808,7 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations nuint lengthToExamine = (nuint)(uint)length; - if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Avx2 branch also operates on Sse2 sizes, so check is combined. nint vectorDiff = (nint)length - Vector128.Count; @@ -924,10 +924,10 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int // the end and forwards, which may overlap on an earlier compare. // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { - int matches; - if (Avx2.IsSupported) + uint matches; + if (Vector256.IsHardwareAccelerated) { Vector256 search; // Guard as we may only have a valid size for Vector128; when we will move to the Sse2 @@ -943,13 +943,10 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int // First time this checks again against 0, however we will move into final compare if it fails. 
while (lengthToExamine > offset) { - search = LoadVector256(ref searchSpace, offset); + search = Vector256.LoadUnsafe(ref searchSpace, offset); // Bitwise Or to combine the flagged matches for the second value to our match flags - matches = Avx2.MoveMask( - Avx2.Or( - Avx2.CompareEqual(values0, search), - Avx2.CompareEqual(values1, search))); - // Note that MoveMask has converted the equal vector elements into a set of bit flags, + matches = (Vector256.Equals(values0, search) | Vector256.Equals(values1, search)).ExtractMostSignificantBits(); + // Note that ExtractMostSignificantBits has converted the equal vector elements into a set of bit flags, // So the bit position in 'matches' corresponds to the element offset. if (matches == 0) { @@ -962,13 +959,10 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int } // Move to Vector length from end for final compare - search = LoadVector256(ref searchSpace, lengthToExamine); + search = Vector256.LoadUnsafe(ref searchSpace, lengthToExamine); offset = lengthToExamine; // Same as method as above - matches = Avx2.MoveMask( - Avx2.Or( - Avx2.CompareEqual(values0, search), - Avx2.CompareEqual(values1, search))); + matches = (Vector256.Equals(values0, search) | Vector256.Equals(values1, search)).ExtractMostSignificantBits(); if (matches == 0) { // None matched @@ -980,6 +974,7 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int } // Initial size check was done on method entry. + Vector128 compareResult; Debug.Assert(length >= Vector128.Count); { Vector128 search; @@ -988,37 +983,33 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int // First time this checks against 0 and we will move into final compare if it fails. 
while (lengthToExamine > offset) { - search = LoadVector128(ref searchSpace, offset); + search = Vector128.LoadUnsafe(ref searchSpace, offset); - matches = Sse2.MoveMask( - Sse2.Or( - Sse2.CompareEqual(values0, search), - Sse2.CompareEqual(values1, search)) - .AsByte()); - // Note that MoveMask has converted the equal vector elements into a set of bit flags, - // So the bit position in 'matches' corresponds to the element offset. - if (matches == 0) + compareResult = Vector128.Equals(values0, search) | Vector128.Equals(values1, search); + + if (compareResult == Vector128.Zero) { // None matched offset += (nuint)Vector128.Count; continue; } + matches = compareResult.ExtractMostSignificantBits(); goto IntrinsicsMatch; } // Move to Vector length from end for final compare - search = LoadVector128(ref searchSpace, lengthToExamine); + search = Vector128.LoadUnsafe(ref searchSpace, lengthToExamine); offset = lengthToExamine; // Same as method as above - matches = Sse2.MoveMask( - Sse2.Or( - Sse2.CompareEqual(values0, search), - Sse2.CompareEqual(values1, search))); - if (matches == 0) + compareResult = Vector128.Equals(values0, search) | Vector128.Equals(values1, search); + + if (compareResult == Vector128.Zero) { // None matched goto NotFound; } + + matches = compareResult.ExtractMostSignificantBits(); } IntrinsicsMatch: @@ -1026,52 +1017,6 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int offset += (nuint)BitOperations.TrailingZeroCount(matches); goto Found; } - else if (AdvSimd.Arm64.IsSupported) - { - Vector128 search; - Vector128 matches; - Vector128 values0 = Vector128.Create(value0); - Vector128 values1 = Vector128.Create(value1); - // First time this checks against 0 and we will move into final compare if it fails. 
- while (lengthToExamine > offset) - { - search = LoadVector128(ref searchSpace, offset); - - matches = AdvSimd.Or( - AdvSimd.CompareEqual(values0, search), - AdvSimd.CompareEqual(values1, search)); - - if (matches == Vector128.Zero) - { - offset += (nuint)Vector128.Count; - continue; - } - - // Find bitflag offset of first match and add to current offset - offset += FindFirstMatchedLane(matches); - - goto Found; - } - - // Move to Vector length from end for final compare - search = LoadVector128(ref searchSpace, lengthToExamine); - offset = lengthToExamine; - // Same as method as above - matches = AdvSimd.Or( - AdvSimd.CompareEqual(values0, search), - AdvSimd.CompareEqual(values1, search)); - - if (matches == Vector128.Zero) - { - // None matched - goto NotFound; - } - - // Find bitflag offset of first match and add to current offset - offset += FindFirstMatchedLane(matches); - - goto Found; - } else if (Vector.IsHardwareAccelerated) { Vector values0 = new Vector(value0); From 288bcfc4b3a65c4da386a6525257e19d234e0eca Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Tue, 9 Aug 2022 14:12:15 +0200 Subject: [PATCH 2/3] rewrite the code to get rid of gotos --- .../src/System/SpanHelpers.Byte.cs | 309 +++++------------- 1 file changed, 87 insertions(+), 222 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs index 9af0e6ac14b227..fc1c66473e93a1 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs @@ -774,264 +774,126 @@ public static int LastIndexOf(ref byte searchSpace, byte value, int length) [MethodImpl(MethodImplOptions.AggressiveOptimization)] public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int length) { - Debug.Assert(length >= 0); - - uint uValue0 = value0; // Use uint for comparisons to avoid unnecessary 8->32 extensions - 
uint uValue1 = value1; // Use uint for comparisons to avoid unnecessary 8->32 extensions - nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations - nuint lengthToExamine = (nuint)(uint)length; - - if (Vector128.IsHardwareAccelerated) + // Vector128 code path is executed for Vector256 when Vector128.Count <= length < Vector256.Count + if (!Vector.IsHardwareAccelerated || (length < Vector.Count && !(Vector128.IsHardwareAccelerated && length >= Vector128.Count))) { - // Avx2 branch also operates on Sse2 sizes, so check is combined. - nint vectorDiff = (nint)length - Vector128.Count; - if (vectorDiff >= 0) + // Use uint for comparisons to avoid unnecessary 8->32 extensions + uint uValue0 = value0, uValue1 = value1, lookUp; + // Use nuint for offset to avoid unnecessary 64->32->64 truncations + for (nuint offset = 0; offset < (uint)length; offset++) { - // >= Sse2 intrinsics are supported, and length is enough to use them so use that path. - // We jump forward to the intrinsics at the end of the method so a naive branch predict - // will choose the non-intrinsic path so short lengths which don't gain anything aren't - // overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths - // more than make this back from the intrinsics. 
- lengthToExamine = (nuint)vectorDiff; - goto IntrinsicsCompare; - } - } - else if (Vector.IsHardwareAccelerated) - { - // Calculate lengthToExamine here for test, as it is used later - nint vectorDiff = (nint)length - Vector.Count; - if (vectorDiff >= 0) - { - // Similar as above for Vector version - lengthToExamine = (nuint)vectorDiff; - goto IntrinsicsCompare; + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); + if (uValue0 == lookUp || uValue1 == lookUp) + { + return (int)offset; + } } } - - uint lookUp; - while (lengthToExamine >= 8) + else if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) { - lengthToExamine -= 8; - - lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); - if (uValue0 == lookUp || uValue1 == lookUp) - goto Found; - lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 1); - if (uValue0 == lookUp || uValue1 == lookUp) - goto Found1; - lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 2); - if (uValue0 == lookUp || uValue1 == lookUp) - goto Found2; - lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 3); - if (uValue0 == lookUp || uValue1 == lookUp) - goto Found3; - lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 4); - if (uValue0 == lookUp || uValue1 == lookUp) - goto Found4; - lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 5); - if (uValue0 == lookUp || uValue1 == lookUp) - goto Found5; - lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 6); - if (uValue0 == lookUp || uValue1 == lookUp) - goto Found6; - lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 7); - if (uValue0 == lookUp || uValue1 == lookUp) - goto Found7; - - offset += 8; - } - - if (lengthToExamine >= 4) - { - lengthToExamine -= 4; - - lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); - if (uValue0 == lookUp || uValue1 == lookUp) - goto Found; - lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 1); - if (uValue0 == lookUp || uValue1 == lookUp) - goto Found1; - lookUp = Unsafe.AddByteOffset(ref 
searchSpace, offset + 2); - if (uValue0 == lookUp || uValue1 == lookUp) - goto Found2; - lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 3); - if (uValue0 == lookUp || uValue1 == lookUp) - goto Found3; - - offset += 4; - } - - while (lengthToExamine > 0) - { - - lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); - if (uValue0 == lookUp || uValue1 == lookUp) - goto Found; - - offset += 1; - lengthToExamine -= 1; - } - - NotFound: - return -1; - Found: // Workaround for https://github.com/dotnet/runtime/issues/8795 - return (int)offset; - Found1: - return (int)(offset + 1); - Found2: - return (int)(offset + 2); - Found3: - return (int)(offset + 3); - Found4: - return (int)(offset + 4); - Found5: - return (int)(offset + 5); - Found6: - return (int)(offset + 6); - Found7: - return (int)(offset + 7); - - IntrinsicsCompare: - // When we move into a Vectorized block, we process everything of Vector size; - // and then for any remainder we do a final compare of Vector size but starting at - // the end and forwards, which may overlap on an earlier compare. + ref byte current = ref searchSpace; + ref byte oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector256.Count); + Vector256 values0 = Vector256.Create(value0), values1 = Vector256.Create(value1), search, compareResult; - // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. 
- if (Vector128.IsHardwareAccelerated) - { - uint matches; - if (Vector256.IsHardwareAccelerated) + do { - Vector256 search; - // Guard as we may only have a valid size for Vector128; when we will move to the Sse2 - // We have already subtracted Vector128.Count from lengthToExamine so compare against that - // to see if we have double the size for Vector256.Count - if (lengthToExamine >= (nuint)Vector128.Count) + search = Vector256.LoadUnsafe(ref current); + compareResult = Vector256.Equals(values0, search) | Vector256.Equals(values1, search); + if (compareResult == Vector256.Zero) { - Vector256 values0 = Vector256.Create(value0); - Vector256 values1 = Vector256.Create(value1); - - // Subtract Vector128.Count so we have now subtracted Vector256.Count - lengthToExamine -= (nuint)Vector128.Count; - // First time this checks again against 0, however we will move into final compare if it fails. - while (lengthToExamine > offset) - { - search = Vector256.LoadUnsafe(ref searchSpace, offset); - // Bitwise Or to combine the flagged matches for the second value to our match flags - matches = (Vector256.Equals(values0, search) | Vector256.Equals(values1, search)).ExtractMostSignificantBits(); - // Note that ExtractMostSignificantBits has converted the equal vector elements into a set of bit flags, - // So the bit position in 'matches' corresponds to the element offset. 
- if (matches == 0) - { - // None matched - offset += (nuint)Vector256.Count; - continue; - } - - goto IntrinsicsMatch; - } + current = ref Unsafe.Add(ref current, Vector256.Count); + continue; + } - // Move to Vector length from end for final compare - search = Vector256.LoadUnsafe(ref searchSpace, lengthToExamine); - offset = lengthToExamine; - // Same as method as above - matches = (Vector256.Equals(values0, search) | Vector256.Equals(values1, search)).ExtractMostSignificantBits(); - if (matches == 0) - { - // None matched - goto NotFound; - } + return GetIndexFromMask(ref searchSpace, ref current, compareResult.ExtractMostSignificantBits()); + } while (!Unsafe.IsAddressGreaterThan(ref current, ref oneVectorAwayFromEnd)); - goto IntrinsicsMatch; + // If any elements remain, process the last vector in the search space. + if ((uint)length % Vector256.Count != 0) + { + search = Vector256.LoadUnsafe(ref oneVectorAwayFromEnd); + compareResult = Vector256.Equals(values0, search) | Vector256.Equals(values1, search); + if (compareResult != Vector256.Zero) + { + return GetIndexFromMask(ref searchSpace, ref oneVectorAwayFromEnd, compareResult.ExtractMostSignificantBits()); } } + } + else if (Vector128.IsHardwareAccelerated) + { + ref byte current = ref searchSpace; + ref byte oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector128.Count); + Vector128 values0 = Vector128.Create(value0), values1 = Vector128.Create(value1), search, equals; - // Initial size check was done on method entry. - Vector128 compareResult; - Debug.Assert(length >= Vector128.Count); + do { - Vector128 search; - Vector128 values0 = Vector128.Create(value0); - Vector128 values1 = Vector128.Create(value1); - // First time this checks against 0 and we will move into final compare if it fails. 
- while (lengthToExamine > offset) + search = Vector128.LoadUnsafe(ref current); + equals = Vector128.Equals(values0, search) | Vector128.Equals(values1, search); + if (equals == Vector128.Zero) { - search = Vector128.LoadUnsafe(ref searchSpace, offset); - - compareResult = Vector128.Equals(values0, search) | Vector128.Equals(values1, search); - - if (compareResult == Vector128.Zero) - { - // None matched - offset += (nuint)Vector128.Count; - continue; - } - - matches = compareResult.ExtractMostSignificantBits(); - goto IntrinsicsMatch; + current = ref Unsafe.Add(ref current, Vector128.Count); + continue; } - // Move to Vector length from end for final compare - search = Vector128.LoadUnsafe(ref searchSpace, lengthToExamine); - offset = lengthToExamine; - // Same as method as above - compareResult = Vector128.Equals(values0, search) | Vector128.Equals(values1, search); - if (compareResult == Vector128.Zero) + return GetIndexFromMask(ref searchSpace, ref current, equals.ExtractMostSignificantBits()); + } while (!Unsafe.IsAddressGreaterThan(ref current, ref oneVectorAwayFromEnd)); + + // If any elements remain, process the last vector in the search space. 
+ if ((uint)length % Vector128.Count != 0) + { + search = Vector128.LoadUnsafe(ref oneVectorAwayFromEnd); + equals = Vector128.Equals(values0, search) | Vector128.Equals(values1, search); + if (equals != Vector128.Zero) { - // None matched - goto NotFound; + return GetIndexFromMask(ref searchSpace, ref oneVectorAwayFromEnd, equals.ExtractMostSignificantBits()); } - - matches = compareResult.ExtractMostSignificantBits(); } - - IntrinsicsMatch: - // Find bitflag offset of first difference and add to current offset - offset += (nuint)BitOperations.TrailingZeroCount(matches); - goto Found; } else if (Vector.IsHardwareAccelerated) { - Vector values0 = new Vector(value0); - Vector values1 = new Vector(value1); + ref byte current = ref searchSpace; + ref byte oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector.Count); + Vector values0 = new Vector(value0), values1 = new Vector(value1), search, equals; - Vector search; - // First time this checks against 0 and we will move into final compare if it fails. 
- while (lengthToExamine > offset) + do { - search = LoadVector(ref searchSpace, offset); - search = Vector.BitwiseOr( - Vector.Equals(search, values0), - Vector.Equals(search, values1)); + search = LoadVector(ref current); + equals = Vector.BitwiseOr(Vector.Equals(search, values0), Vector.Equals(search, values1)); if (Vector.Zero.Equals(search)) { - // None matched - offset += (nuint)Vector.Count; + current = ref Unsafe.Add(ref current, Vector.Count); continue; } - goto VectorMatch; - } + return GetIndexFromVector(ref searchSpace, ref current, equals); + } while (!Unsafe.IsAddressGreaterThan(ref current, ref oneVectorAwayFromEnd)); - // Move to Vector length from end for final compare - search = LoadVector(ref searchSpace, lengthToExamine); - offset = lengthToExamine; - search = Vector.BitwiseOr( - Vector.Equals(search, values0), - Vector.Equals(search, values1)); - if (Vector.Zero.Equals(search)) + // If any elements remain, process the last vector in the search space. + if ((uint)length % Vector.Count != 0) { - // None matched - goto NotFound; + search = LoadVector(ref oneVectorAwayFromEnd); + equals = Vector.BitwiseOr(Vector.Equals(search, values0), Vector.Equals(search, values1)); + if (!Vector.Zero.Equals(search)) + { + return GetIndexFromVector(ref searchSpace, ref oneVectorAwayFromEnd, equals); + } } + } - VectorMatch: - offset += (nuint)LocateFirstFoundByte(search); - goto Found; + return -1; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static int GetIndexFromMask(ref byte searchSpace, ref byte current, uint matches) + { + // Note that ExtractMostSignificantBits has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. 
+ // Find bitflag offset of first difference and add to current offset + return (int)Unsafe.ByteOffset(ref searchSpace, ref current) + BitOperations.TrailingZeroCount(matches); } - Debug.Fail("Unreachable"); - goto NotFound; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static int GetIndexFromVector(ref byte searchSpace, ref byte current, Vector search) + => (int)Unsafe.ByteOffset(ref searchSpace, ref current) + LocateFirstFoundByte(search); } [MethodImpl(MethodImplOptions.AggressiveOptimization)] @@ -2177,6 +2039,9 @@ private static nuint LoadNUInt(ref byte start) private static nuint LoadNUInt(ref byte start, nuint offset) => Unsafe.ReadUnaligned(ref Unsafe.AddByteOffset(ref start, offset)); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector LoadVector(ref byte start) => Unsafe.ReadUnaligned>(ref start); + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector LoadVector(ref byte start, nuint offset) => Unsafe.ReadUnaligned>(ref Unsafe.AddByteOffset(ref start, offset)); From 425b532d98d78156a3854587c1eba5c18f35ee87 Mon Sep 17 00:00:00 2001 From: Adam Sitnik Date: Tue, 9 Aug 2022 19:55:43 +0200 Subject: [PATCH 3/3] simplify the code --- .../src/System/SpanHelpers.Byte.cs | 125 ++++++------------ 1 file changed, 41 insertions(+), 84 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs index fc1c66473e93a1..c15ccba855a017 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs @@ -774,126 +774,86 @@ public static int LastIndexOf(ref byte searchSpace, byte value, int length) [MethodImpl(MethodImplOptions.AggressiveOptimization)] public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int length) { - // Vector128 code path is executed for Vector256 when Vector128.Count <= length < 
Vector256.Count - if (!Vector.IsHardwareAccelerated || (length < Vector.Count && !(Vector128.IsHardwareAccelerated && length >= Vector128.Count))) - { - // Use uint for comparisons to avoid unnecessary 8->32 extensions - uint uValue0 = value0, uValue1 = value1, lookUp; - // Use nuint for offset to avoid unnecessary 64->32->64 truncations - for (nuint offset = 0; offset < (uint)length; offset++) - { - lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); - if (uValue0 == lookUp || uValue1 == lookUp) - { - return (int)offset; - } - } - } - else if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) + nuint offset = 0; + + if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) { - ref byte current = ref searchSpace; - ref byte oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector256.Count); - Vector256 values0 = Vector256.Create(value0), values1 = Vector256.Create(value1), search, compareResult; + nuint lastVectorOffset = GetByteVector256SpanLength(offset, length); + Vector256 values0 = Vector256.Create(value0), values1 = Vector256.Create(value1), search, match; do { - search = Vector256.LoadUnsafe(ref current); - compareResult = Vector256.Equals(values0, search) | Vector256.Equals(values1, search); - if (compareResult == Vector256.Zero) + search = Vector256.LoadUnsafe(ref searchSpace, offset); + match = Vector256.Equals(values0, search) | Vector256.Equals(values1, search); + if (match == Vector256.Zero) { - current = ref Unsafe.Add(ref current, Vector256.Count); + offset += (nuint)Vector256.Count; continue; } - return GetIndexFromMask(ref searchSpace, ref current, compareResult.ExtractMostSignificantBits()); - } while (!Unsafe.IsAddressGreaterThan(ref current, ref oneVectorAwayFromEnd)); - - // If any elements remain, process the last vector in the search space. 
- if ((uint)length % Vector256.Count != 0) - { - search = Vector256.LoadUnsafe(ref oneVectorAwayFromEnd); - compareResult = Vector256.Equals(values0, search) | Vector256.Equals(values1, search); - if (compareResult != Vector256.Zero) - { - return GetIndexFromMask(ref searchSpace, ref oneVectorAwayFromEnd, compareResult.ExtractMostSignificantBits()); - } - } + return GetIndexFromMask(offset, match.ExtractMostSignificantBits()); + } while (offset < lastVectorOffset); } - else if (Vector128.IsHardwareAccelerated) + // Vector128 code path is executed for Vector256 when Vector128.Count <= length < Vector256.Count + else if (Vector128.IsHardwareAccelerated && length >= Vector128.Count) { - ref byte current = ref searchSpace; - ref byte oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector128.Count); - Vector128 values0 = Vector128.Create(value0), values1 = Vector128.Create(value1), search, equals; + nuint lastVectorOffset = GetByteVector128SpanLength(offset, length); + Vector128 values0 = Vector128.Create(value0), values1 = Vector128.Create(value1), search, match; do { - search = Vector128.LoadUnsafe(ref current); - equals = Vector128.Equals(values0, search) | Vector128.Equals(values1, search); - if (equals == Vector128.Zero) + search = Vector128.LoadUnsafe(ref searchSpace, offset); + match = Vector128.Equals(values0, search) | Vector128.Equals(values1, search); + if (match == Vector128.Zero) { - current = ref Unsafe.Add(ref current, Vector128.Count); + offset += (nuint)Vector128.Count; continue; } - return GetIndexFromMask(ref searchSpace, ref current, equals.ExtractMostSignificantBits()); - } while (!Unsafe.IsAddressGreaterThan(ref current, ref oneVectorAwayFromEnd)); - - // If any elements remain, process the last vector in the search space. 
- if ((uint)length % Vector128.Count != 0) - { - search = Vector128.LoadUnsafe(ref oneVectorAwayFromEnd); - equals = Vector128.Equals(values0, search) | Vector128.Equals(values1, search); - if (equals != Vector128.Zero) - { - return GetIndexFromMask(ref searchSpace, ref oneVectorAwayFromEnd, equals.ExtractMostSignificantBits()); - } - } + return GetIndexFromMask(offset, match.ExtractMostSignificantBits()); + } while (offset < lastVectorOffset); } - else if (Vector.IsHardwareAccelerated) + else if (Vector.IsHardwareAccelerated && length >= Vector.Count) { - ref byte current = ref searchSpace; - ref byte oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, length - Vector.Count); - Vector values0 = new Vector(value0), values1 = new Vector(value1), search, equals; + nuint lastVectorOffset = GetByteVectorSpanLength(offset, length); + Vector values0 = new Vector(value0), values1 = new Vector(value1), search, match; do { - search = LoadVector(ref current); - equals = Vector.BitwiseOr(Vector.Equals(search, values0), Vector.Equals(search, values1)); - if (Vector.Zero.Equals(search)) + search = LoadVector(ref searchSpace, offset); + match = Vector.BitwiseOr(Vector.Equals(search, values0), Vector.Equals(search, values1)); + if (Vector.Zero.Equals(match)) { - current = ref Unsafe.Add(ref current, Vector.Count); + offset += (nuint)Vector.Count; continue; } - return GetIndexFromVector(ref searchSpace, ref current, equals); - } while (!Unsafe.IsAddressGreaterThan(ref current, ref oneVectorAwayFromEnd)); + return (int)offset + LocateFirstFoundByte(match); // 'match' is the per-lane comparison mask; 'search' holds only the raw loaded bytes + } while (offset < lastVectorOffset); + } - // If any elements remain, process the last vector in the search space.
- if ((uint)length % Vector.Count != 0) + // Use uint for comparisons to avoid unnecessary 8->32 extensions + uint uValue0 = value0, uValue1 = value1, lookUp; + // Use nuint for offset to avoid unnecessary 64->32->64 truncations + for (; offset < (uint)length; offset++) + { + lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); + if (uValue0 == lookUp || uValue1 == lookUp) { - search = LoadVector(ref oneVectorAwayFromEnd); - equals = Vector.BitwiseOr(Vector.Equals(search, values0), Vector.Equals(search, values1)); - if (!Vector.Zero.Equals(search)) - { - return GetIndexFromVector(ref searchSpace, ref oneVectorAwayFromEnd, equals); - } + return (int)offset; } } return -1; [MethodImpl(MethodImplOptions.AggressiveInlining)] - static int GetIndexFromMask(ref byte searchSpace, ref byte current, uint matches) + static int GetIndexFromMask(nuint offset, uint matches) { // Note that ExtractMostSignificantBits has converted the equal vector elements into a set of bit flags, // So the bit position in 'matches' corresponds to the element offset. 
// Find bitflag offset of first difference and add to current offset - return (int)Unsafe.ByteOffset(ref searchSpace, ref current) + BitOperations.TrailingZeroCount(matches); + return (int)offset + BitOperations.TrailingZeroCount(matches); } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static int GetIndexFromVector(ref byte searchSpace, ref byte current, Vector search) - => (int)Unsafe.ByteOffset(ref searchSpace, ref current) + LocateFirstFoundByte(search); } [MethodImpl(MethodImplOptions.AggressiveOptimization)] @@ -2039,9 +1999,6 @@ private static nuint LoadNUInt(ref byte start) private static nuint LoadNUInt(ref byte start, nuint offset) => Unsafe.ReadUnaligned(ref Unsafe.AddByteOffset(ref start, offset)); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector LoadVector(ref byte start) => Unsafe.ReadUnaligned>(ref start); - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector LoadVector(ref byte start, nuint offset) => Unsafe.ReadUnaligned>(ref Unsafe.AddByteOffset(ref start, offset));