-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Vectorized common String.Split() paths #38001
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
52ad5ac
10417a3
140c2ce
58918a1
be469ff
1969c67
79b32be
44db429
995719b
65cba0c
b437551
0cdcfd7
3d2a645
d9a8640
dc8926e
1b73a5d
2b0b8f8
cc4ae1f
11c968b
707871a
19e57f0
82329c6
7d1fe48
aa7454a
e0f8337
14eaf11
10da409
d55cf92
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -6,6 +6,9 @@ | |||||
| using System.Diagnostics; | ||||||
| using System.Globalization; | ||||||
| using System.Numerics; | ||||||
| using System.Runtime.InteropServices; | ||||||
| using System.Runtime.Intrinsics; | ||||||
| using System.Runtime.Intrinsics.X86; | ||||||
| using System.Text; | ||||||
| using Internal.Runtime.CompilerServices; | ||||||
|
|
||||||
|
|
@@ -1573,6 +1576,13 @@ private void MakeSeparatorList(ReadOnlySpan<char> separators, ref ValueListBuild | |||||
| // Special-case the common cases of 1, 2, and 3 separators, with manual comparisons against each separator. | ||||||
| case 1: | ||||||
| sep0 = separators[0]; | ||||||
|
|
||||||
| if (Avx2.IsSupported && 16 <= Length) | ||||||
| { | ||||||
| MakeSeparatorListVectorized(ref sepListBuilder, sep0); | ||||||
| return; | ||||||
| } | ||||||
|
|
||||||
| for (int i = 0; i < Length; i++) | ||||||
| { | ||||||
| if (this[i] == sep0) | ||||||
|
|
@@ -1584,6 +1594,13 @@ private void MakeSeparatorList(ReadOnlySpan<char> separators, ref ValueListBuild | |||||
| case 2: | ||||||
| sep0 = separators[0]; | ||||||
| sep1 = separators[1]; | ||||||
|
|
||||||
| if (Avx2.IsSupported && 16 <= Length) | ||||||
| { | ||||||
| MakeSeparatorListVectorized(ref sepListBuilder, sep0, sep1); | ||||||
| return; | ||||||
| } | ||||||
|
|
||||||
| for (int i = 0; i < Length; i++) | ||||||
| { | ||||||
| char c = this[i]; | ||||||
|
|
@@ -1597,6 +1614,13 @@ private void MakeSeparatorList(ReadOnlySpan<char> separators, ref ValueListBuild | |||||
| sep0 = separators[0]; | ||||||
| sep1 = separators[1]; | ||||||
| sep2 = separators[2]; | ||||||
|
|
||||||
| if (Avx2.IsSupported && 16 <= Length) | ||||||
| { | ||||||
| MakeSeparatorListVectorized(ref sepListBuilder, sep0, sep1, sep2); | ||||||
| return; | ||||||
| } | ||||||
|
|
||||||
| for (int i = 0; i < Length; i++) | ||||||
| { | ||||||
| char c = this[i]; | ||||||
|
|
@@ -1630,6 +1654,61 @@ private void MakeSeparatorList(ReadOnlySpan<char> separators, ref ValueListBuild | |||||
| } | ||||||
| } | ||||||
|
|
||||||
| private void MakeSeparatorListVectorized(ref ValueListBuilder<int> sepListBuilder, char c, char? c2 = null, char? c3 = null) | ||||||
| { | ||||||
| Vector256<byte> shuffleConstant = Vector256.Create(0x02, 0x06, 0x0A, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, | ||||||
| 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x06, 0x0A, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); | ||||||
bbartels marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
|
|
||||||
| Vector256<ushort> v1 = Vector256.Create(c); | ||||||
| Vector256<ushort>? v2 = c2 is char sep2 ? Vector256.Create(sep2) : (Vector256<ushort>?)null; | ||||||
| Vector256<ushort>? v3 = c3 is char sep3 ? Vector256.Create(sep3) : (Vector256<ushort>?)null; | ||||||
|
|
||||||
| ref char c0 = ref MemoryMarshal.GetReference(this.AsSpan()); | ||||||
| int cond = Length - (Length % Vector256<ushort>.Count); | ||||||
bbartels marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| int i = 0; | ||||||
|
|
||||||
| for (; i < cond; i += Vector256<ushort>.Count) | ||||||
| { | ||||||
| ref char ci = ref Unsafe.Add(ref c0, i); | ||||||
| Vector256<ushort> charVector = Unsafe.As<char, Vector256<ushort>>(ref ci); | ||||||
bbartels marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| Vector256<ushort> cmp = Avx2.CompareEqual(charVector, v1); | ||||||
|
|
||||||
| if (v2 is Vector256<ushort> vecSep2) | ||||||
|
||||||
| case 1: | |
| sep0 = separators[0]; |
as there it's known. You know what I mean?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As in calling it like MakeSeparatorListVectorized(ref sepListBuilder, sep0, sep0, sep0) ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep.
// ...
MakeSeparatorListVectorized(ref sepListBuilder, sep0, sep0, sep0);
// ...
MakeSeparatorListVectorized(ref sepListBuilder, sep0, sep1, sep0);
// ...
MakeSeparatorListVectorized(ref sepListBuilder, sep0, sep1, sep2);Then the checks in MakeSeparatorListVectorized for null (default arg) can be eliminated.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done 2b0b8f8
bbartels marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
bbartels marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This causes the vectors to be spilled to the stack.
asm from loop
G_M15376_IG06:
8BFA mov edi, edx
C4C17E6F247F vmovdqu ymm4, ymmword ptr[r15+2*rdi]
C5FD114D90 vmovupd ymmword ptr[rbp-70H], ymm1
C5DD75E9 vpcmpeqw ymm5, ymm4, ymm1
C5FD119570FFFFFF vmovupd ymmword ptr[rbp-90H], ymm2
C5DD75F2 vpcmpeqw ymm6, ymm4, ymm2
C5CDEBED vpor ymm5, ymm6, ymm5
C5FD119D50FFFFFF vmovupd ymmword ptr[rbp-B0H], ymm3
C5DD75E3 vpcmpeqw ymm4, ymm4, ymm3
C5DDEBED vpor ymm5, ymm4, ymm5
C4E27D17ED vptest ymm5, ymm5
0F84B7010000 je G_M15376_IG22
;; bbWeight=4 PerfScore 57.67
G_M15376_IG07:
C5DD73F504 vpsllq ymm4, ymm5, 4
C5FD1145B0 vmovupd ymmword ptr[rbp-50H], ymm0
C4E25D00E0 vpshufb ymm4, ymm4, ymm0
C4E37D39E501 vextracti128 ymm5, ymm4, 1
C5D9EBE5 vpor xmm4, xmm4, xmm5
C4E1F97EE1 vmovd rcx, xmm4
49B81032547698BADCFE mov r8, 0xFEDCBA9876543210
4933C8 xor rcx, r8
;; bbWeight=2 PerfScore 17.67
G_M15376_IG08:
48898D40FFFFFF mov qword ptr [rbp-C0H], rcx
448BC1 mov r8d, ecx
4183E00F and r8d, 15
899548FFFFFF mov dword ptr [rbp-B8H], edx
4403C2 add r8d, edx
4489853CFFFFFF mov dword ptr [rbp-C4H], r8d
448B4B08 mov r9d, dword ptr [rbx+8]
4C8D5310 lea r10, bword ptr [rbx+16]
4C899528FFFFFF mov bword ptr [rbp-D8H], r10
498BFA mov rdi, r10
44898D38FFFFFF mov dword ptr [rbp-C8H], r9d
443B4F08 cmp r9d, dword ptr [rdi+8]
7C08 jl SHORT G_M15376_IG10
;; bbWeight=16 PerfScore 184.00
G_M15376_IG09:
488BFB mov rdi, rbx
E870F1FFFF call System.Collections.Generic.ValueListBuilder`1[Int32][System.Int32]:Grow():this
;; bbWeight=4 PerfScore 5.00
G_M15376_IG10:
4C8B9528FFFFFF mov r10, bword ptr [rbp-D8H]
448B8D38FFFFFF mov r9d, dword ptr [rbp-C8H]
453B4A08 cmp r9d, dword ptr [r10+8]
0F8334010000 jae G_M15376_IG23
498B3A mov rdi, bword ptr [r10]
4D63D1 movsxd r10, r9d
448B853CFFFFFF mov r8d, dword ptr [rbp-C4H]
46890497 mov dword ptr [rdi+4*r10], r8d
41FFC1 inc r9d
44894B08 mov dword ptr [rbx+8], r9d
488B8D40FFFFFF mov rcx, qword ptr [rbp-C0H]
48C1E904 shr rcx, 4
4885C9 test rcx, rcx
0F85EC000000 jne G_M15376_IG21
;; bbWeight=16 PerfScore 236.00
G_M15376_IG11:
8B9548FFFFFF mov edx, dword ptr [rbp-B8H]
83C210 add edx, 16
8BB54CFFFFFF mov esi, dword ptr [rbp-B4H]
3BD6 cmp edx, esi
89B54CFFFFFF mov dword ptr [rbp-B4H], esi
C5FD1045B0 vmovupd ymm0, ymmword ptr[rbp-50H]
C5FD104D90 vmovupd ymm1, ymmword ptr[rbp-70H]
C5FD109570FFFFFF vmovupd ymm2, ymmword ptr[rbp-90H]
C5FD109D50FFFFFF vmovupd ymm3, ymmword ptr[rbp-B0H]
0F82D9FEFFFF jb G_M15376_IG06It's not exactely the loop-code from this PR, it's with some minor modifications from my attempts to remove the spills. The code from the PR has the same spills, anyway.
I don't understand why the JIT spills the vectors. They could (and should) be kept in the registers, as the ValueListBuilder<int> doesn't need any registers. Or is this from the builder's internal usage of span and zeroing behavior, but even if so, the JIT shouldn't spill the vectors...
Things I tried to tweak the JIT into not spilling:
- Moving this line into a non-inlineable local function didn't change anything.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does it matter though? The vectors don't seem to be used after L1687 anyways?
EDIT: Or is this about shuffleConstant, v1, v2, v3?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Or is this about shuffleConstant, v1, v2, v3?
No.
Does it matter though?
Perf-wise: yes.
- stack-spills are (here) unnecessary memory accesses
- the machine code will be quite a lot smaller without the spills -- especially important as it's within a loop
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I ended up removing the need for PEXT and just iterated over each potential index within the extracted vector. For some reason I've been seeing some decent performance gains, even in scenarios with low separator frequency (which to me would point to perf regressions over PEXT). Could this inadvertently have fixed the stack-spills? Either way I can't quite understand the perf uplift.
Uh oh!
There was an error while loading. Please reload this page.