From 510b767a0f175b50dded6700785999a219623beb Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Fri, 12 Jun 2020 13:06:24 -0700 Subject: [PATCH] Optimize SpanHelpers.IndexOfAny() for byte --- .../src/System/SpanHelpers.Byte.cs | 91 ++++++++++++++++++- 1 file changed, 88 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs index 12e0ea00d2bc54..e031af7ee8cd05 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs @@ -43,7 +43,7 @@ public static int IndexOf(ref byte searchSpace, int searchSpaceLength, ref byte break; // The unsearched portion is now shorter than the sequence we're looking for. So it can't be there. // Found the first element of "value". See if the tail matches. - if (SequenceEqual(ref Unsafe.Add(ref searchSpace, offset + 1), ref valueTail, (nuint)valueTailLength)) // The (nunit)-cast is necessary to pick the correct overload + if (SequenceEqual(ref Unsafe.Add(ref searchSpace, offset + 1), ref valueTail, (nuint)valueTailLength)) // The (nuint)-cast is necessary to pick the correct overload return offset; // The tail matched. Return a successful find. remainingSearchSpaceLength--; @@ -601,7 +601,7 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations nuint lengthToExamine = (nuint)(uint)length; - if (Sse2.IsSupported) + if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported) { // Avx2 branch also operates on Sse2 sizes, so check is combined. if (length >= Vector128.Count * 2) @@ -780,6 +780,47 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int } } } + else if (AdvSimd.Arm64.IsSupported) + { + if (offset < (nuint)(uint)length) + { + lengthToExamine = GetByteVector128SpanLength(offset, length); + + // Mask to help find the first lane in compareResult that is set. + // LSB 0x01 corresponds to lane 0, 0x10 - to lane 1, and so on. + Vector128 mask = Vector128.Create((ushort)0x1001).AsByte(); + int matchedLane = 0; + + Vector128 values0 = Vector128.Create(value0); + Vector128 values1 = Vector128.Create(value1); + + while (lengthToExamine > offset) + { + Vector128 search = LoadVector128(ref searchSpace, offset); + + // Same method as above + Vector128 compareResult = AdvSimd.Or( + AdvSimd.CompareEqual(values0, search), + AdvSimd.CompareEqual(values1, search)); + + if (!TryFindFirstMatchedLane(mask, compareResult, ref matchedLane)) + { + // Zero flags set so no matches + offset += (nuint)Vector128.Count; + continue; + } + + // Find bitflag offset of first match and add to current offset + return (int)(offset + (uint)matchedLane); + } + + if (offset < (nuint)(uint)length) + { + lengthToExamine = ((nuint)(uint)length - offset); + goto SequentialScan; + } + } + } else if (Vector.IsHardwareAccelerated) { if (offset < (nuint)(uint)length) @@ -842,7 +883,7 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, byt nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations nuint lengthToExamine = (nuint)(uint)length; - if (Sse2.IsSupported) + if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported) { // Avx2 branch also operates on Sse2 sizes, so check is combined. if (length >= Vector128.Count * 2) @@ -1025,6 +1066,50 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, byt } } } + else if (AdvSimd.Arm64.IsSupported) + { + if (offset < (nuint)(uint)length) + { + lengthToExamine = GetByteVector128SpanLength(offset, length); + + // Mask to help find the first lane in compareResult that is set. + // LSB 0x01 corresponds to lane 0, 0x10 - to lane 1, and so on. + Vector128 mask = Vector128.Create((ushort)0x1001).AsByte(); + int matchedLane = 0; + + Vector128 values0 = Vector128.Create(value0); + Vector128 values1 = Vector128.Create(value1); + Vector128 values2 = Vector128.Create(value2); + + while (lengthToExamine > offset) + { + Vector128 search = LoadVector128(ref searchSpace, offset); + + // Same method as above + Vector128 matches0 = AdvSimd.CompareEqual(values0, search); + Vector128 matches1 = AdvSimd.CompareEqual(values1, search); + Vector128 matches2 = AdvSimd.CompareEqual(values2, search); + + Vector128 compareResult = AdvSimd.Or(AdvSimd.Or(matches0, matches1), matches2); + + if (!TryFindFirstMatchedLane(mask, compareResult, ref matchedLane)) + { + // Zero flags set so no matches + offset += (nuint)Vector128.Count; + continue; + } + + // Find bitflag offset of first match and add to current offset + return (int)(offset + (uint)matchedLane); + } + + if (offset < (nuint)(uint)length) + { + lengthToExamine = ((nuint)(uint)length - offset); + goto SequentialScan; + } + } + } else if (Vector.IsHardwareAccelerated) { if (offset < (nuint)(uint)length)