Skip to content

Span<T> vs Span<String> difference in Morph: Structs/AddrExp #65815

@L2

Description

@L2

I am wondering why the hot-loop codegen of the Span() method differs between the generic form, new Span<T>(_array) where T ends up being string at runtime, and the non-generic form, new Span<string>(_array).

Example taken from: https://github.com/dotnet/performance/blob/main/src/benchmarks/micro/libraries/System.Collections/Indexer/IndexerSet.cs

C#: With Generics

// Generic repro: the benchmark class is declared over T and instantiated with
// string through the GenericTypeArguments attribute.
[BenchmarkCategory(Categories.Runtime, Categories.Libraries, Categories.Collections, Categories.GenericCollections)]
[GenericTypeArguments(typeof(string))]
public class IndexerSet<T>
{
    // Passed to ValuesGenerator.ArrayOfUniqueValues in SetupArray
    // (presumably the element count - confirm against ValuesGenerator).
    [Params(Utils.DefaultCollectionSize)]
    public int Size;

    // Backing array that Span() wraps in a span.
    private T[] _array;

    // Global setup targeted at the Span benchmark: assigns _array from the value generator.
    [GlobalSetup(Targets = new[] { nameof(Span) })]
    public void SetupArray() => _array = ValuesGenerator.ArrayOfUniqueValues<T>(Size);

    // Wraps _array in a span and writes default(T) to every element through the
    // span indexer. Returns default(T): result is never reassigned after its
    // initialization.
    [BenchmarkCategory(Categories.Span)]
    [Benchmark]
    public T Span()
    {
        T result = default;
        var collection = new Span<T>(_array);
        // Hot loop: reads collection.Length and stores through collection[i]
        // on every iteration.
        for (int i = 0; i < collection.Length; i++)
            collection[i] = default;
        return result;
    }
}

ASM for Span(): With Generics

*************** After end code gen, before unwindEmit()
G_M3772_IG01:        ; func=00, offs=000000H, size=001AH, bbWeight=1    PerfScore 5.75, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, byref, nogc <-- Prolog IG

IN0019: 000000 push     rdi
IN001a: 000001 push     rsi
IN001b: 000002 sub      rsp, 56
IN001c: 000006 xor      eax, eax
IN001d: 000008 mov      qword ptr [V02 rsp+20H], rax
IN001e: 00000D mov      qword ptr [V02+0x8 rsp+28H], rax
IN001f: 000012 mov      qword ptr [rsp+30H], rcx
IN0020: 000017 mov      rsi, rcx

G_M3772_IG02:        ; offs=00001AH, size=0017H, bbWeight=1    PerfScore 11.25, gcrefRegs=00000040 {rsi}, byrefRegs=00000000 {}, BB01 [0000], byref, isz

IN0001: 00001A mov      rdi, gword ptr [rsi+8]
IN0002: 00001E mov      rcx, qword ptr [rsi]
IN0003: 000021 mov      rdx, qword ptr [rcx+48]
IN0004: 000025 mov      rdx, qword ptr [rdx]
IN0005: 000028 mov      rdx, qword ptr [rdx+24]
IN0006: 00002C test     rdx, rdx
IN0007: 00002F je       SHORT G_M3772_IG04

G_M3772_IG03:        ; offs=000031H, size=0002H, bbWeight=0.25 PerfScore 0.50, gcrefRegs=000000C0 {rsi rdi}, byrefRegs=00000000 {}, BB02 [0014], byref, isz

IN0008: 000031 jmp      SHORT G_M3772_IG05

G_M3772_IG04:        ; offs=000033H, size=0012H, bbWeight=0.25 PerfScore 0.38, gcrefRegs=000000C0 {rsi rdi}, byrefRegs=00000000 {}, BB03 [0013], byref

IN0009: 000033 mov      rdx, 0x7FFC2FD85238
IN000a: 00003D call     CORINFO_HELP_RUNTIMEHANDLE_CLASS
IN000b: 000042 mov      rdx, rax

G_M3772_IG05:        ; offs=000045H, size=0011H, bbWeight=1    PerfScore 4.00, gcrefRegs=000000C0 {rsi rdi}, byrefRegs=00000000 {}, BB04 [0011], byref, isz

IN000c: 000045 lea      rcx, bword ptr [V02 rsp+20H]
IN000d: 00004A mov      r8, rdi
IN000e: 00004D call     Span`1:.ctor(ref):this
IN000f: 000052 xor      eax, eax
IN0010: 000054 jmp      SHORT G_M3772_IG07

G_M3772_IG06:        ; offs=000056H, size=0010H, bbWeight=2    PerfScore 5.50, gcrefRegs=00000040 {rsi}, byrefRegs=00000000 {}, BB05 [0001], byref

IN0011: 000056 mov      rdx, bword ptr [V10 rsp+20H]
IN0012: 00005B mov      ecx, eax
IN0013: 00005D xor      r8d, r8d
IN0014: 000060 mov      qword ptr [rdx+8*rcx], r8
IN0015: 000064 inc      eax

G_M3772_IG07:        ; offs=000066H, size=0006H, bbWeight=8    PerfScore 24.00, gcrefRegs=00000040 {rsi}, byrefRegs=00000000 {}, BB08 [0017], byref, isz

IN0016: 000066 cmp      eax, dword ptr [V11 rsp+28H]
IN0017: 00006A jl       SHORT G_M3772_IG06

G_M3772_IG08:        ; offs=00006CH, size=0002H, bbWeight=1    PerfScore 0.25, gcrefRegs=00000040 {rsi}, byrefRegs=00000000 {}, BB10 [0003], byref

IN0018: 00006C xor      rax, rax

G_M3772_IG09:        ; offs=00006EH, size=0007H, bbWeight=1    PerfScore 2.25, epilog, nogc, extend

IN0021: 00006E add      rsp, 56
IN0022: 000072 pop      rsi
IN0023: 000073 pop      rdi
IN0024: 000074 ret


*************** Finishing PHASE Emit code

C#: Without Generics (replace T with string)

    // Non-generic repro: same shape as the generic benchmark, with every T
    // written out as string.
    [BenchmarkCategory(Categories.Runtime, Categories.Libraries, Categories.Collections, Categories.GenericCollections)]
    public class IndexerSet
    {
        // Passed to ValuesGenerator.ArrayOfUniqueValues in SetupArray
        // (presumably the element count - confirm against ValuesGenerator).
        [Params(Utils.DefaultCollectionSize)]
        public int Size;

        // Backing array that Span() wraps in a span.
        private string[] _array;

        // Global setup targeted at the Span benchmark: assigns _array from the value generator.
        [GlobalSetup(Targets = new[] { nameof(Span) })]
        public void SetupArray() => _array = ValuesGenerator.ArrayOfUniqueValues<string>(Size);

        // Wraps _array in a span and writes null (default for string) to every
        // element through the span indexer. Returns null: result is never
        // reassigned after its initialization.
        [BenchmarkCategory(Categories.Span)]
        [Benchmark]
        public string Span()
        {
            string result = default;
            var collection = new Span<string>(_array);
            // Hot loop: reads collection.Length and stores through collection[i]
            // on every iteration.
            for (int i = 0; i < collection.Length; i++)
                collection[i] = default;
            return result;
        }
    }

ASM for Span(): Without Generics (replace T with string)

*************** After end code gen, before unwindEmit()
G_M55285_IG01:        ; func=00, offs=000000H, size=0000H, bbWeight=1    PerfScore 0.00, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, byref, nogc <-- Prolog IG
G_M55285_IG02:        ; offs=000000H, size=0009H, bbWeight=1    PerfScore 3.25, gcrefRegs=00000002 {rcx}, byrefRegs=00000000 {}, BB01 [0000], byref, isz

IN0001: 000000 mov      rax, gword ptr [rcx+8]
IN0002: 000004 test     rax, rax
IN0003: 000007 jne      SHORT G_M55285_IG04

G_M55285_IG03:        ; offs=000009H, size=000AH, bbWeight=0.50 PerfScore 1.25, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, BB02 [0005], byref, isz, align

IN0004: 000009 xor      rdx, rdx
IN0005: 00000B xor      ecx, ecx
IN0006: 00000D jmp      SHORT G_M55285_IG05
IN0007: 00000F align    [4 bytes for IG06]

G_M55285_IG04:        ; offs=000013H, size=0007H, bbWeight=0.50 PerfScore 1.25, gcrefRegs=00000001 {rax}, byrefRegs=00000000 {}, BB03 [0006], byref

IN0008: 000013 lea      rdx, bword ptr [rax+16]
IN0009: 000017 mov      ecx, dword ptr [rax+8]

G_M55285_IG05:        ; offs=00001AH, size=0006H, bbWeight=1    PerfScore 1.50, gcrefRegs=00000000 {}, byrefRegs=00000004 {rdx}, BB05 [0010], byref, isz

IN000a: 00001A xor      eax, eax
IN000b: 00001C test     ecx, ecx
IN000c: 00001E jle      SHORT G_M55285_IG07

G_M55285_IG06:        ; offs=000020H, size=0010H, bbWeight=4    PerfScore 12.00, gcrefRegs=00000000 {}, byrefRegs=00000004 {rdx}, loop=IG06, BB06 [0001], byref, isz

IN000d: 000020 mov      r8d, eax
IN000e: 000023 xor      r9, r9
IN000f: 000026 mov      gword ptr [rdx+8*r8], r9
IN0010: 00002A inc      eax
IN0011: 00002C cmp      eax, ecx
IN0012: 00002E jl       SHORT G_M55285_IG06

G_M55285_IG07:        ; offs=000030H, size=0002H, bbWeight=1    PerfScore 0.25, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, BB07 [0003], byref

IN0013: 000030 xor      rax, rax

G_M55285_IG08:        ; offs=000032H, size=0001H, bbWeight=1    PerfScore 1.00, epilog, nogc, extend

IN0014: 000032 ret


*************** Finishing PHASE Emit code

The hot loop's asm differs slightly, with the non-generic version skipping two loads.

for (int i = 0; i < collection.Length; i++)
    collection[i] = default;

With Generics:

G_M3772_IG06:
    mov      rdx, bword ptr [V10 rsp+20H]   <--load span _pointer field in rdx (not enregistered)
    mov      ecx, eax
    xor      r8d, r8d
    mov      qword ptr [rdx+8*rcx], r8      <--store
    inc      eax
G_M3772_IG07:
    cmp      eax, dword ptr [V11 rsp+28H]   <--compare with span _length field (not enregistered)
    jl       SHORT G_M3772_IG06

Without Generics:

G_M55285_IG06:
    mov      r8d, eax
    xor      r9, r9
    mov      gword ptr [rdx+8*r8], r9       <--store (rdx already holds span _pointer field)
    inc      eax
    cmp      eax, ecx                       <--ecx already holds span _length field
    jl       SHORT G_M55285_IG06

Comparing the jit dumps of both versions, the main difference I see comes from Phase Morph - Structs/AddrExp which designates the generic version's span fields (_pointer and _length) as address exposed:

Phase Morph: Structs/AddrExp - With Generics

LocalAddressVisitor visiting statement:
STMT00004 ( ??? ... ??? )
               [000009] --C-G-------              *  CALL      void   Span`1..ctor
               [000006] ------------ this in rcx  +--*  ADDR      byref
               [000005] -------N----              |  \--*  LCL_VAR   struct<Span`1, 16>(P) V02 loc1
                                                  |  \--*    byref  V02._pointer (offs=0x00) -> V10 tmp6
                                                  |  \--*    int    V02._length (offs=0x08) -> V11 tmp7
               [000043] ------------ arg1         +--*  RUNTIMELOOKUP long   0x7ffc2fafb6c0 class
               [000042] ------------              |  \--*  LCL_VAR   long   V07 tmp3
               [000016] ------------ arg2         \--*  LCL_VAR   ref    V06 tmp2

Local V10 should not be enregistered because: it is address exposed

Local V11 should not be enregistered because: it is address exposed

Local V02 should not be enregistered because: it is address exposed

LocalAddressVisitor visiting statement:
STMT00005 ( 0x015[E-] ... ??? )
               [000046] -A----------              *  ASG       int
               [000045] D------N----              +--*  LCL_VAR   int    V03 loc2
               [000044] ------------              \--*  CNS_INT   int    0

LocalAddressVisitor visiting statement:
STMT00010 ( 0x019[E-] ... 0x022 )
               [000101] IA-X--------              *  ASG       struct (init)
               [000100] ---X---N----              +--*  BLK       struct<8>
               [000098] ---X--------              |  \--*  COMMA     byref
               [000092] ---X--------              |     +--*  BOUNDS_CHECK_Rng void
               [000087] ------------              |     |  +--*  LCL_VAR   int    V03 loc2
               [000091] ------------              |     |  \--*  FIELD     int    _length
               [000086] ------------              |     |     \--*  ADDR      byref
               [000085] -------N----              |     |        \--*  LCL_VAR   struct<Span`1, 16>(AX)(P) V02 loc1
                                                  |     |        \--*    byref  V02._pointer (offs=0x00) -> V10 tmp6
                                                  |     |        \--*    int    V02._length (offs=0x08) -> V11 tmp7
               [000097] ------------              |     \--*  ADD       byref
               [000096] ------------              |        +--*  FIELD     byref  _pointer
               [000090] ------------              |        |  \--*  ADDR      byref
               [000089] -------N----              |        |     \--*  LCL_VAR   struct<Span`1, 16>(AX)(P) V02 loc1
                                                  |        |     \--*    byref  V02._pointer (offs=0x00) -> V10 tmp6
                                                  |        |     \--*    int    V02._length (offs=0x08) -> V11 tmp7
               [000095] ------------              |        \--*  MUL       long
               [000093] ---------U--              |           +--*  CAST      long <- uint
               [000088] ------------              |           |  \--*  LCL_VAR   int    V03 loc2
               [000094] ------------              |           \--*  CNS_INT   long   8
               [000099] ------------              \--*  CNS_INT   int    0
Replacing the field in promoted struct with local var V11
Replacing the field in promoted struct with local var V10
LocalAddressVisitor modified statement:
STMT00010 ( 0x019[E-] ... 0x022 )
               [000101] IA-X--------              *  ASG       struct (init)
               [000100] ---X---N----              +--*  BLK       struct<8>
               [000098] ---X--------              |  \--*  COMMA     byref
               [000092] ---X--------              |     +--*  BOUNDS_CHECK_Rng void
               [000087] ------------              |     |  +--*  LCL_VAR   int    V03 loc2
               [000091] ------------              |     |  \--*  LCL_VAR   int   (AX) V11 tmp7
               [000097] ------------              |     \--*  ADD       byref
               [000096] ------------              |        +--*  LCL_VAR   byref (AX) V10 tmp6
               [000095] ------------              |        \--*  MUL       long
               [000093] ---------U--              |           +--*  CAST      long <- uint
               [000088] ------------              |           |  \--*  LCL_VAR   int    V03 loc2
               [000094] ------------              |           \--*  CNS_INT   long   8
               [000099] ------------              \--*  CNS_INT   int    0

Phase Morph: Structs/AddrExp - Without Generics

LocalAddressVisitor visiting statement:
STMT00005 ( 0x013[E-] ... 0x01C )
               [000036] -A-XG-------              *  ASG       ref
               [000035] *--X---N----              +--*  IND       ref
               [000033] ---X--------              |  \--*  COMMA     byref
               [000027] ---X--------              |     +--*  BOUNDS_CHECK_Rng void
               [000022] ------------              |     |  +--*  LCL_VAR   int    V03 loc2
               [000026] ------------              |     |  \--*  FIELD     int    _length
               [000021] ------------              |     |     \--*  ADDR      byref
               [000020] -------N----              |     |        \--*  LCL_VAR   struct<Span`1, 16>(P) V02 loc1
                                                  |     |        \--*    byref  V02._pointer (offs=0x00) -> V07 tmp3
                                                  |     |        \--*    int    V02._length (offs=0x08) -> V08 tmp4
               [000032] ------------              |     \--*  ADD       byref
               [000031] ------------              |        +--*  FIELD     byref  _pointer
               [000025] ------------              |        |  \--*  ADDR      byref
               [000024] -------N----              |        |     \--*  LCL_VAR   struct<Span`1, 16>(P) V02 loc1
                                                  |        |     \--*    byref  V02._pointer (offs=0x00) -> V07 tmp3
                                                  |        |     \--*    int    V02._length (offs=0x08) -> V08 tmp4
               [000030] ------------              |        \--*  MUL       long
               [000028] ---------U--              |           +--*  CAST      long <- uint
               [000023] ------------              |           |  \--*  LCL_VAR   int    V03 loc2
               [000029] ------------              |           \--*  CNS_INT   long   8
               [000034] ------------              \--*  CNS_INT   ref    null
Replacing the field in promoted struct with local var V08
Replacing the field in promoted struct with local var V07
LocalAddressVisitor modified statement:
STMT00005 ( 0x013[E-] ... 0x01C )
               [000036] -A-XG-------              *  ASG       ref
               [000035] *--X---N----              +--*  IND       ref
               [000033] ---X--------              |  \--*  COMMA     byref
               [000027] ---X--------              |     +--*  BOUNDS_CHECK_Rng void
               [000022] ------------              |     |  +--*  LCL_VAR   int    V03 loc2
               [000026] ------------              |     |  \--*  LCL_VAR   int    V08 tmp4
               [000032] ------------              |     \--*  ADD       byref
               [000031] ------------              |        +--*  LCL_VAR   byref  V07 tmp3
               [000030] ------------              |        \--*  MUL       long
               [000028] ---------U--              |           +--*  CAST      long <- uint
               [000023] ------------              |           |  \--*  LCL_VAR   int    V03 loc2
               [000029] ------------              |           \--*  CNS_INT   long   8
               [000034] ------------              \--*  CNS_INT   ref    null

It seems to me that the difference arises when the generic version encounters this runtime lookup and marks the fields as address exposed (AX); every later use of these fields then stays AX and is never enregistered:

STMT00004 ( ??? ... ??? )
               [000009] --C-G-------              *  CALL      void   Span`1..ctor
               [000006] ------------ this in rcx  +--*  ADDR      byref
               [000005] -------N----              |  \--*  LCL_VAR   struct<Span`1, 16>(P) V02 loc1
                                                  |  \--*    byref  V02._pointer (offs=0x00) -> V10 tmp6
                                                  |  \--*    int    V02._length (offs=0x08) -> V11 tmp7
               [000043] ------------ arg1         +--*  RUNTIMELOOKUP long   0x7ffc2fafb6c0 class
               [000042] ------------              |  \--*  LCL_VAR   long   V07 tmp3
               [000016] ------------ arg2         \--*  LCL_VAR   ref    V06 tmp2

Local V10 should not be enregistered because: it is address exposed

Local V11 should not be enregistered because: it is address exposed

Local V02 should not be enregistered because: it is address exposed

Why does address exposure kick in only for the generic version? In other words, why can the non-generic version get away with not reloading the span's _pointer and _length in each iteration of the hot loop? (It keeps them enregistered for the duration of the loop, which makes it more performant.)

Thanks!

category:cq
theme:morph

Metadata

Metadata

Assignees

Labels

area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
in-pr: There is an active PR which will close this issue when it is merged

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions