-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Description
I'm wondering why the hot-loop codegen for the Span() method differs between the generic version, which uses Span<T>(_array), and the non-generic version, which uses Span<string>(_array), even though T ends up being string at runtime.
Example taken from: https://github.com/dotnet/performance/blob/main/src/benchmarks/micro/libraries/System.Collections/Indexer/IndexerSet.cs
C#: With Generics
// Generic benchmark class whose Span() method overwrites every element of a Span<T>
// built over _array. The GenericTypeArguments attribute below names typeof(string) as T.
[BenchmarkCategory(Categories.Runtime, Categories.Libraries, Categories.Collections, Categories.GenericCollections)]
[GenericTypeArguments(typeof(string))]
public class IndexerSet<T>
{
// Element count passed to ValuesGenerator.ArrayOfUniqueValues<T> in SetupArray.
[Params(Utils.DefaultCollectionSize)]
public int Size;
// Backing array wrapped by Span(); assigned in SetupArray.
private T[] _array;
// Setup targeted at the Span benchmark: assigns _array from ValuesGenerator.ArrayOfUniqueValues<T>(Size).
[GlobalSetup(Targets = new[] { nameof(Span) })]
public void SetupArray() => _array = ValuesGenerator.ArrayOfUniqueValues<T>(Size);
// Wraps _array in a Span<T> and stores default(T) into each element through the span indexer.
// Returns `result`, which is never reassigned after initialization, so the return value is always default(T).
[BenchmarkCategory(Categories.Span)]
[Benchmark]
public T Span()
{
T result = default;
var collection = new Span<T>(_array);
// Hot loop under discussion: compares i against collection.Length and writes via collection[i] each iteration.
for (int i = 0; i < collection.Length; i++)
collection[i] = default;
return result;
}
}
ASM for Span(): With Generics
*************** After end code gen, before unwindEmit()
G_M3772_IG01: ; func=00, offs=000000H, size=001AH, bbWeight=1 PerfScore 5.75, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, byref, nogc <-- Prolog IG
IN0019: 000000 push rdi
IN001a: 000001 push rsi
IN001b: 000002 sub rsp, 56
IN001c: 000006 xor eax, eax
IN001d: 000008 mov qword ptr [V02 rsp+20H], rax
IN001e: 00000D mov qword ptr [V02+0x8 rsp+28H], rax
IN001f: 000012 mov qword ptr [rsp+30H], rcx
IN0020: 000017 mov rsi, rcx
G_M3772_IG02: ; offs=00001AH, size=0017H, bbWeight=1 PerfScore 11.25, gcrefRegs=00000040 {rsi}, byrefRegs=00000000 {}, BB01 [0000], byref, isz
IN0001: 00001A mov rdi, gword ptr [rsi+8]
IN0002: 00001E mov rcx, qword ptr [rsi]
IN0003: 000021 mov rdx, qword ptr [rcx+48]
IN0004: 000025 mov rdx, qword ptr [rdx]
IN0005: 000028 mov rdx, qword ptr [rdx+24]
IN0006: 00002C test rdx, rdx
IN0007: 00002F je SHORT G_M3772_IG04
G_M3772_IG03: ; offs=000031H, size=0002H, bbWeight=0.25 PerfScore 0.50, gcrefRegs=000000C0 {rsi rdi}, byrefRegs=00000000 {}, BB02 [0014], byref, isz
IN0008: 000031 jmp SHORT G_M3772_IG05
G_M3772_IG04: ; offs=000033H, size=0012H, bbWeight=0.25 PerfScore 0.38, gcrefRegs=000000C0 {rsi rdi}, byrefRegs=00000000 {}, BB03 [0013], byref
IN0009: 000033 mov rdx, 0x7FFC2FD85238
IN000a: 00003D call CORINFO_HELP_RUNTIMEHANDLE_CLASS
IN000b: 000042 mov rdx, rax
G_M3772_IG05: ; offs=000045H, size=0011H, bbWeight=1 PerfScore 4.00, gcrefRegs=000000C0 {rsi rdi}, byrefRegs=00000000 {}, BB04 [0011], byref, isz
IN000c: 000045 lea rcx, bword ptr [V02 rsp+20H]
IN000d: 00004A mov r8, rdi
IN000e: 00004D call Span`1:.ctor(ref):this
IN000f: 000052 xor eax, eax
IN0010: 000054 jmp SHORT G_M3772_IG07
G_M3772_IG06: ; offs=000056H, size=0010H, bbWeight=2 PerfScore 5.50, gcrefRegs=00000040 {rsi}, byrefRegs=00000000 {}, BB05 [0001], byref
IN0011: 000056 mov rdx, bword ptr [V10 rsp+20H]
IN0012: 00005B mov ecx, eax
IN0013: 00005D xor r8d, r8d
IN0014: 000060 mov qword ptr [rdx+8*rcx], r8
IN0015: 000064 inc eax
G_M3772_IG07: ; offs=000066H, size=0006H, bbWeight=8 PerfScore 24.00, gcrefRegs=00000040 {rsi}, byrefRegs=00000000 {}, BB08 [0017], byref, isz
IN0016: 000066 cmp eax, dword ptr [V11 rsp+28H]
IN0017: 00006A jl SHORT G_M3772_IG06
G_M3772_IG08: ; offs=00006CH, size=0002H, bbWeight=1 PerfScore 0.25, gcrefRegs=00000040 {rsi}, byrefRegs=00000000 {}, BB10 [0003], byref
IN0018: 00006C xor rax, rax
G_M3772_IG09: ; offs=00006EH, size=0007H, bbWeight=1 PerfScore 2.25, epilog, nogc, extend
IN0021: 00006E add rsp, 56
IN0022: 000072 pop rsi
IN0023: 000073 pop rdi
IN0024: 000074 ret
*************** Finishing PHASE Emit code
C#: Without Generics (replace T with string)
// Non-generic counterpart of the benchmark class: every T is replaced with string,
// so Span() builds a Span<string> over a string[] directly.
[BenchmarkCategory(Categories.Runtime, Categories.Libraries, Categories.Collections, Categories.GenericCollections)]
public class IndexerSet
{
// Element count passed to ValuesGenerator.ArrayOfUniqueValues<string> in SetupArray.
[Params(Utils.DefaultCollectionSize)]
public int Size;
// Backing array wrapped by Span(); assigned in SetupArray.
private string[] _array;
// Setup targeted at the Span benchmark: assigns _array from ValuesGenerator.ArrayOfUniqueValues<string>(Size).
[GlobalSetup(Targets = new[] { nameof(Span) })]
public void SetupArray() => _array = ValuesGenerator.ArrayOfUniqueValues<string>(Size);
// Wraps _array in a Span<string> and stores default (null for string) into each element through the span indexer.
// Returns `result`, which is never reassigned after initialization, so the return value is always null.
[BenchmarkCategory(Categories.Span)]
[Benchmark]
public string Span()
{
string result = default;
var collection = new Span<string>(_array);
// Hot loop under discussion: compares i against collection.Length and writes via collection[i] each iteration.
for (int i = 0; i < collection.Length; i++)
collection[i] = default;
return result;
}
}
ASM for Span(): Without Generics (replace T with string)
*************** After end code gen, before unwindEmit()
G_M55285_IG01: ; func=00, offs=000000H, size=0000H, bbWeight=1 PerfScore 0.00, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, byref, nogc <-- Prolog IG
G_M55285_IG02: ; offs=000000H, size=0009H, bbWeight=1 PerfScore 3.25, gcrefRegs=00000002 {rcx}, byrefRegs=00000000 {}, BB01 [0000], byref, isz
IN0001: 000000 mov rax, gword ptr [rcx+8]
IN0002: 000004 test rax, rax
IN0003: 000007 jne SHORT G_M55285_IG04
G_M55285_IG03: ; offs=000009H, size=000AH, bbWeight=0.50 PerfScore 1.25, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, BB02 [0005], byref, isz, align
IN0004: 000009 xor rdx, rdx
IN0005: 00000B xor ecx, ecx
IN0006: 00000D jmp SHORT G_M55285_IG05
IN0007: 00000F align [4 bytes for IG06]
G_M55285_IG04: ; offs=000013H, size=0007H, bbWeight=0.50 PerfScore 1.25, gcrefRegs=00000001 {rax}, byrefRegs=00000000 {}, BB03 [0006], byref
IN0008: 000013 lea rdx, bword ptr [rax+16]
IN0009: 000017 mov ecx, dword ptr [rax+8]
G_M55285_IG05: ; offs=00001AH, size=0006H, bbWeight=1 PerfScore 1.50, gcrefRegs=00000000 {}, byrefRegs=00000004 {rdx}, BB05 [0010], byref, isz
IN000a: 00001A xor eax, eax
IN000b: 00001C test ecx, ecx
IN000c: 00001E jle SHORT G_M55285_IG07
G_M55285_IG06: ; offs=000020H, size=0010H, bbWeight=4 PerfScore 12.00, gcrefRegs=00000000 {}, byrefRegs=00000004 {rdx}, loop=IG06, BB06 [0001], byref, isz
IN000d: 000020 mov r8d, eax
IN000e: 000023 xor r9, r9
IN000f: 000026 mov gword ptr [rdx+8*r8], r9
IN0010: 00002A inc eax
IN0011: 00002C cmp eax, ecx
IN0012: 00002E jl SHORT G_M55285_IG06
G_M55285_IG07: ; offs=000030H, size=0002H, bbWeight=1 PerfScore 0.25, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, BB07 [0003], byref
IN0013: 000030 xor rax, rax
G_M55285_IG08: ; offs=000032H, size=0001H, bbWeight=1 PerfScore 1.00, epilog, nogc, extend
IN0014: 000032 ret
*************** Finishing PHASE Emit code
The hot loop's asm differs slightly, with the non-generic version skipping two loads.
for (int i = 0; i < collection.Length; i++)
collection[i] = default;
With Generics:
G_M3772_IG06:
mov rdx, bword ptr [V10 rsp+20H] <--load span _pointer field into rdx (not enregistered)
mov ecx, eax
xor r8d, r8d
mov qword ptr [rdx+8*rcx], r8 <--store
inc eax
G_M3772_IG07:
cmp eax, dword ptr [V11 rsp+28H] <--compare with span _length field (not enregistered)
jl SHORT G_M3772_IG06
Without Generics:
G_M55285_IG06:
mov r8d, eax
xor r9, r9
mov gword ptr [rdx+8*r8], r9 <--store (rdx already holds span _pointer field)
inc eax
cmp eax, ecx <--ecx already holds span _length field
jl SHORT G_M55285_IG06
Comparing the JIT dumps of both versions, the main difference I see comes from Phase Morph - Structs/AddrExp, which designates the generic version's span fields (_pointer and _length) as address exposed:
Phase Morph: Structs/AddrExp - With Generics
LocalAddressVisitor visiting statement:
STMT00004 ( ??? ... ??? )
[000009] --C-G------- * CALL void Span`1..ctor
[000006] ------------ this in rcx +--* ADDR byref
[000005] -------N---- | \--* LCL_VAR struct<Span`1, 16>(P) V02 loc1
| \--* byref V02._pointer (offs=0x00) -> V10 tmp6
| \--* int V02._length (offs=0x08) -> V11 tmp7
[000043] ------------ arg1 +--* RUNTIMELOOKUP long 0x7ffc2fafb6c0 class
[000042] ------------ | \--* LCL_VAR long V07 tmp3
[000016] ------------ arg2 \--* LCL_VAR ref V06 tmp2
Local V10 should not be enregistered because: it is address exposed
Local V11 should not be enregistered because: it is address exposed
Local V02 should not be enregistered because: it is address exposed
LocalAddressVisitor visiting statement:
STMT00005 ( 0x015[E-] ... ??? )
[000046] -A---------- * ASG int
[000045] D------N---- +--* LCL_VAR int V03 loc2
[000044] ------------ \--* CNS_INT int 0
LocalAddressVisitor visiting statement:
STMT00010 ( 0x019[E-] ... 0x022 )
[000101] IA-X-------- * ASG struct (init)
[000100] ---X---N---- +--* BLK struct<8>
[000098] ---X-------- | \--* COMMA byref
[000092] ---X-------- | +--* BOUNDS_CHECK_Rng void
[000087] ------------ | | +--* LCL_VAR int V03 loc2
[000091] ------------ | | \--* FIELD int _length
[000086] ------------ | | \--* ADDR byref
[000085] -------N---- | | \--* LCL_VAR struct<Span`1, 16>(AX)(P) V02 loc1
| | \--* byref V02._pointer (offs=0x00) -> V10 tmp6
| | \--* int V02._length (offs=0x08) -> V11 tmp7
[000097] ------------ | \--* ADD byref
[000096] ------------ | +--* FIELD byref _pointer
[000090] ------------ | | \--* ADDR byref
[000089] -------N---- | | \--* LCL_VAR struct<Span`1, 16>(AX)(P) V02 loc1
| | \--* byref V02._pointer (offs=0x00) -> V10 tmp6
| | \--* int V02._length (offs=0x08) -> V11 tmp7
[000095] ------------ | \--* MUL long
[000093] ---------U-- | +--* CAST long <- uint
[000088] ------------ | | \--* LCL_VAR int V03 loc2
[000094] ------------ | \--* CNS_INT long 8
[000099] ------------ \--* CNS_INT int 0
Replacing the field in promoted struct with local var V11
Replacing the field in promoted struct with local var V10
LocalAddressVisitor modified statement:
STMT00010 ( 0x019[E-] ... 0x022 )
[000101] IA-X-------- * ASG struct (init)
[000100] ---X---N---- +--* BLK struct<8>
[000098] ---X-------- | \--* COMMA byref
[000092] ---X-------- | +--* BOUNDS_CHECK_Rng void
[000087] ------------ | | +--* LCL_VAR int V03 loc2
[000091] ------------ | | \--* LCL_VAR int (AX) V11 tmp7
[000097] ------------ | \--* ADD byref
[000096] ------------ | +--* LCL_VAR byref (AX) V10 tmp6
[000095] ------------ | \--* MUL long
[000093] ---------U-- | +--* CAST long <- uint
[000088] ------------ | | \--* LCL_VAR int V03 loc2
[000094] ------------ | \--* CNS_INT long 8
[000099] ------------ \--* CNS_INT int 0
Phase Morph: Structs/AddrExp - Without Generics
LocalAddressVisitor visiting statement:
STMT00005 ( 0x013[E-] ... 0x01C )
[000036] -A-XG------- * ASG ref
[000035] *--X---N---- +--* IND ref
[000033] ---X-------- | \--* COMMA byref
[000027] ---X-------- | +--* BOUNDS_CHECK_Rng void
[000022] ------------ | | +--* LCL_VAR int V03 loc2
[000026] ------------ | | \--* FIELD int _length
[000021] ------------ | | \--* ADDR byref
[000020] -------N---- | | \--* LCL_VAR struct<Span`1, 16>(P) V02 loc1
| | \--* byref V02._pointer (offs=0x00) -> V07 tmp3
| | \--* int V02._length (offs=0x08) -> V08 tmp4
[000032] ------------ | \--* ADD byref
[000031] ------------ | +--* FIELD byref _pointer
[000025] ------------ | | \--* ADDR byref
[000024] -------N---- | | \--* LCL_VAR struct<Span`1, 16>(P) V02 loc1
| | \--* byref V02._pointer (offs=0x00) -> V07 tmp3
| | \--* int V02._length (offs=0x08) -> V08 tmp4
[000030] ------------ | \--* MUL long
[000028] ---------U-- | +--* CAST long <- uint
[000023] ------------ | | \--* LCL_VAR int V03 loc2
[000029] ------------ | \--* CNS_INT long 8
[000034] ------------ \--* CNS_INT ref null
Replacing the field in promoted struct with local var V08
Replacing the field in promoted struct with local var V07
LocalAddressVisitor modified statement:
STMT00005 ( 0x013[E-] ... 0x01C )
[000036] -A-XG------- * ASG ref
[000035] *--X---N---- +--* IND ref
[000033] ---X-------- | \--* COMMA byref
[000027] ---X-------- | +--* BOUNDS_CHECK_Rng void
[000022] ------------ | | +--* LCL_VAR int V03 loc2
[000026] ------------ | | \--* LCL_VAR int V08 tmp4
[000032] ------------ | \--* ADD byref
[000031] ------------ | +--* LCL_VAR byref V07 tmp3
[000030] ------------ | \--* MUL long
[000028] ---------U-- | +--* CAST long <- uint
[000023] ------------ | | \--* LCL_VAR int V03 loc2
[000029] ------------ | \--* CNS_INT long 8
[000034] ------------ \--* CNS_INT ref null
It seems to me that the difference arises when the generic version encounters the statement below, which contains the runtime lookup, and marks the fields as address exposed (AX). Every later use of these fields then stays AX, so they are never enregistered:
STMT00004 ( ??? ... ??? )
[000009] --C-G------- * CALL void Span`1..ctor
[000006] ------------ this in rcx +--* ADDR byref
[000005] -------N---- | \--* LCL_VAR struct<Span`1, 16>(P) V02 loc1
| \--* byref V02._pointer (offs=0x00) -> V10 tmp6
| \--* int V02._length (offs=0x08) -> V11 tmp7
[000043] ------------ arg1 +--* RUNTIMELOOKUP long 0x7ffc2fafb6c0 class
[000042] ------------ | \--* LCL_VAR long V07 tmp3
[000016] ------------ arg2 \--* LCL_VAR ref V06 tmp2
Local V10 should not be enregistered because: it is address exposed
Local V11 should not be enregistered because: it is address exposed
Local V02 should not be enregistered because: it is address exposed
Why does address exposure kick in only for the generic version? In other words, why can the non-generic version get away with not loading the span's _pointer and _length in each iteration of the hot loop? (It keeps them enregistered for the duration of the loop, which makes it more performant.)
Thanks!
category:cq
theme:morph