- 
                Notifications
    You must be signed in to change notification settings 
- Fork 5.2k
Ensure that Shuffle is marked as HW_Flag_CanBenefitFromConstantProp #111303
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
          
     Merged
      
      
    Conversation
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
    | Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch | 
              
                    EgorBo
  
              
              approved these changes
              
                  
                    Jan 11, 2025 
                  
              
              
            
            
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, assuming there are no other intrinsics that need this flag too.
| The example diff is as follows, which takes us from 380 to 59 bytes of codegen:

public static Vector4 Cross(Vector4 vector1, Vector4 vector2)
{
    Vector128<float> v1 = vector1.AsVector128();
    Vector128<float> v2 = vector2.AsVector128();
    Vector128<int> shuftleYZXW = Vector128.Create(1, 2, 0, 3);
    Vector128<int> shuftleZXYW = Vector128.Create(2, 0, 1, 3);
    Vector128<float> m1 = Vector128.Shuffle(v1, shuftleYZXW) *
        Vector128.Shuffle(v2, shuftleZXYW);
    Vector128<float> m2 = Vector128.Shuffle(v1, shuftleZXYW) *
        Vector128.Shuffle(v2, shuftleYZXW);
    m2 = m2.WithElement(3, 0);
    return (m1 - m2).AsVector4();
}

Before:

; Method Program:Cross(System.Numerics.Vector4,System.Numerics.Vector4):System.Numerics.Vector4 (FullOpts)
G_M000_IG01:                ;; offset=0x0000
       push     rbx
       sub      rsp, 320
       vmovaps  xmmword ptr [rsp+0x130], xmm6
       vmovaps  xmmword ptr [rsp+0x120], xmm7
       vmovaps  xmmword ptr [rsp+0x110], xmm8
       vmovaps  xmmword ptr [rsp+0x100], xmm9
       mov      rbx, rcx
G_M000_IG02:                ;; offset=0x002F
       vmovups  xmm6, xmmword ptr [r8]
       vmovups  xmm7, xmmword ptr [reloc @RWD00]
       vmovups  xmm8, xmmword ptr [reloc @RWD16]
       vmovups  xmm9, xmmword ptr [rdx]
       vmovaps  xmmword ptr [rsp+0xC0], xmm9
       vmovaps  xmmword ptr [rsp+0xB0], xmm7
       lea      rdx, [rsp+0xC0]
       lea      r8, [rsp+0xB0]
       lea      rcx, [rsp+0xD0]
       call     [System.Runtime.Intrinsics.Vector128:Shuffle(System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[int]):System.Runtime.Intrinsics.Vector128`1[float]]
       vmovaps  xmm0, xmmword ptr [rsp+0xD0]
       vmovups  xmmword ptr [rsp+0xF0], xmm0
       vmovaps  xmmword ptr [rsp+0x90], xmm6
       vmovaps  xmmword ptr [rsp+0x80], xmm8
       lea      rdx, [rsp+0x90]
       lea      r8, [rsp+0x80]
       lea      rcx, [rsp+0xA0]
       call     [System.Runtime.Intrinsics.Vector128:Shuffle(System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[int]):System.Runtime.Intrinsics.Vector128`1[float]]
       vmovups  xmm0, xmmword ptr [rsp+0xF0]
       vmulps   xmm0, xmm0, xmmword ptr [rsp+0xA0]
       vmovups  xmmword ptr [rsp+0xF0], xmm0
       vmovaps  xmmword ptr [rsp+0x60], xmm9
       vmovaps  xmmword ptr [rsp+0x50], xmm8
       lea      rdx, [rsp+0x60]
       lea      r8, [rsp+0x50]
       lea      rcx, [rsp+0x70]
       call     [System.Runtime.Intrinsics.Vector128:Shuffle(System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[int]):System.Runtime.Intrinsics.Vector128`1[float]]
       vmovaps  xmm0, xmmword ptr [rsp+0x70]
       vmovups  xmmword ptr [rsp+0xE0], xmm0
       vmovaps  xmmword ptr [rsp+0x30], xmm6
       vmovaps  xmmword ptr [rsp+0x20], xmm7
       lea      rdx, [rsp+0x30]
       lea      r8, [rsp+0x20]
       lea      rcx, [rsp+0x40]
       call     [System.Runtime.Intrinsics.Vector128:Shuffle(System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[int]):System.Runtime.Intrinsics.Vector128`1[float]]
       vmovups  xmm0, xmmword ptr [rsp+0xE0]
       vmulps   xmm0, xmm0, xmmword ptr [rsp+0x40]
       vinsertps xmm0, xmm0, xmm0, 56
       vmovups  xmm1, xmmword ptr [rsp+0xF0]
       vsubps   xmm0, xmm1, xmm0
       vmovups  xmmword ptr [rbx], xmm0
       mov      rax, rbx
G_M000_IG03:                ;; offset=0x014F
       vmovaps  xmm6, xmmword ptr [rsp+0x130]
       vmovaps  xmm7, xmmword ptr [rsp+0x120]
       vmovaps  xmm8, xmmword ptr [rsp+0x110]
       vmovaps  xmm9, xmmword ptr [rsp+0x100]
       add      rsp, 320
       pop      rbx
       ret      
RWD00  	dq	0000000200000001h, 0000000300000000h
RWD16  	dq	0000000000000002h, 0000000300000001h
; Total bytes of code: 380

After:

; Method Program:Cross(System.Numerics.Vector4,System.Numerics.Vector4):System.Numerics.Vector4 (FullOpts)
G_M35725_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00
G_M35725_IG02:  ;; offset=0x0000
       vmovups  xmm0, xmmword ptr [r8]
       vmovups  xmm1, xmmword ptr [rdx]
       vpermilps xmm2, xmm1, -55
       vpermilps xmm3, xmm0, -46
       vmulps   xmm2, xmm3, xmm2
       vpermilps xmm1, xmm1, -46
       vpermilps xmm0, xmm0, -55
       vmulps   xmm0, xmm0, xmm1
       vinsertps xmm0, xmm0, xmm0, 56
       vsubps   xmm0, xmm2, xmm0
       vmovups  xmmword ptr [rcx], xmm0
       mov      rax, rcx
						;; size=58 bbWeight=1 PerfScore 24.25
G_M35725_IG03:  ;; offset=0x003A
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 59 | 
    
  grendello 
      added a commit
        to grendello/runtime
      that referenced
      this pull request
    
      Jan 13, 2025 
    
    
      
  
    
      
    
  
* main: JIT: Model GT_RETURN kills with contained operand (dotnet#111230) Update dependencies from https://github.com/dotnet/runtime-assets build 20250110.2 (dotnet#111290) [NativeAOT/ARM64] Generate frames compatible with Apple compact unwinding (dotnet#107766) Cleanup unused JIT stubs in vm (dotnet#111237) Ensure that Shuffle is marked as HW_Flag_CanBenefitFromConstantProp (dotnet#111303) Fix CMP0173 policy warning with cmake 3.31 (dotnet#110522) [RISC-V] Fix HostActivation.Tests unknown-rid (dotnet#110687) Fix accidentally duplicated global-build-step.yml in runtime-official.yml (dotnet#111302) JIT: run extra SPMI queries for arrays (dotnet#111293) Split the Runtime Shared Framework project and combine legs in the official build (dotnet#111136) Do not ignore `MemoryMarshal.TryWrite` result (dotnet#108661) Update dependencies from https://github.com/dotnet/emsdk build 20250109.1 (dotnet#111263) Clean up in Number.Formatting.cs (dotnet#110955)
  
      Sign up for free
      to subscribe to this conversation on GitHub.
      Already have an account?
      Sign in.
  
      Labels
      
    area-CodeGen-coreclr
  CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI 
  Add this suggestion to a batch that can be applied as a single commit.
  This suggestion is invalid because no changes were made to the code.
  Suggestions cannot be applied while the pull request is closed.
  Suggestions cannot be applied while viewing a subset of changes.
  Only one suggestion per line can be applied in a batch.
  Add this suggestion to a batch that can be applied as a single commit.
  Applying suggestions on deleted lines is not supported.
  You must change the existing code in this line in order to create a valid suggestion.
  Outdated suggestions cannot be applied.
  This suggestion has been applied or marked resolved.
  Suggestions cannot be applied from pending reviews.
  Suggestions cannot be applied on multi-line comments.
  Suggestions cannot be applied while the pull request is queued to merge.
  Suggestion cannot be applied right now. Please check back later.
  
    
  
    
Without this flag, constants that have two or more uses aren't propagated into Shuffle, which results in significantly worse codegen.