@@ -10452,8 +10452,10 @@ void CodeGen::genFnEpilog(BasicBlock* block)
1045210452 }
1045310453#endif
1045410454
10455+ genClearAvxStateInEpilog ();
10456+
1045510457 // Restore float registers that were saved to stack before SP is modified.
10456- genRestoreCalleeSavedFltRegs (compiler-> compLclFrameSize );
10458+ genRestoreCalleeSavedFltRegs ();
1045710459
1045810460#ifdef JIT32_GCENCODER
1045910461 // When using the JIT32 GC encoder, we do not start the OS-reported portion of the epilog until after
@@ -10913,6 +10915,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
1091310915
1091410916 // This is the end of the OS-reported prolog for purposes of unwinding
1091510917 compiler->unwindEndProlog ();
10918+
10919+ genClearAvxStateInProlog ();
1091610920}
1091710921
1091810922/* ****************************************************************************
@@ -10933,6 +10937,8 @@ void CodeGen::genFuncletEpilog()
1093310937
1093410938 ScopedSetVariable<bool > _setGeneratingEpilog (&compiler->compGeneratingEpilog , true );
1093510939
10940+ genClearAvxStateInEpilog ();
10941+
1093610942 inst_RV_IV (INS_add, REG_SPBASE, genFuncletInfo.fiSpDelta , EA_PTRSIZE);
1093710943 instGen_Return (0 );
1093810944}
@@ -11030,6 +11036,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
1103011036 // Add a padding for 16-byte alignment
1103111037 inst_RV_IV (INS_sub, REG_SPBASE, 12 , EA_PTRSIZE);
1103211038#endif
11039+
11040+ genClearAvxStateInProlog ();
1103311041}
1103411042
1103511043/* ****************************************************************************
@@ -11048,6 +11056,8 @@ void CodeGen::genFuncletEpilog()
1104811056
1104911057 ScopedSetVariable<bool > _setGeneratingEpilog (&compiler->compGeneratingEpilog , true );
1105011058
11059+ genClearAvxStateInEpilog ();
11060+
1105111061#ifdef UNIX_X86_ABI
1105211062 // Revert a padding that was added for 16-byte alignment
1105311063 inst_RV_IV (INS_add, REG_SPBASE, 12 , EA_PTRSIZE);
@@ -11337,40 +11347,21 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
1133711347// Save compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
1133811348// down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE]
1133911349// Here offset = 16-byte aligned offset after pushing integer registers.
11340- //
11341- // Params
11342- // lclFrameSize - Fixed frame size excluding callee pushed int regs.
11343- // non-funclet: this will be compLclFrameSize.
11344- // funclet frames: this will be FuncletInfo.fiSpDelta.
11345- void CodeGen::genPreserveCalleeSavedFltRegs (unsigned lclFrameSize)
11350+ void CodeGen::genPreserveCalleeSavedFltRegs ()
1134611351{
1134711352 regMaskTP regMask = compiler->compCalleeFPRegsSavedMask ;
1134811353
1134911354 // Only callee saved floating point registers should be in regMask
1135011355 assert ((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
1135111356
11352- if (GetEmitter ()->ContainsCallNeedingVzeroupper () && !GetEmitter ()->Contains256bitOrMoreAVX ())
11353- {
11354- // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11355- // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11356- // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11357- // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11358- // register) and before any call to an unknown function.
11359-
11360- // This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
11361- // AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue
11362- // This reduces the overall amount of codegen, particularly for more common paths not using any
11363- // SIMD or floating-point.
11364-
11365- instGen (INS_vzeroupper);
11366- }
11367-
1136811357 // fast path return
1136911358 if (regMask == RBM_NONE)
1137011359 {
1137111360 return ;
1137211361 }
1137311362
11363+ unsigned lclFrameSize = compiler->compLclFrameSize ;
11364+
1137411365#ifdef TARGET_AMD64
1137511366 unsigned firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven () ? REGSIZE_BYTES : 0 ;
1137611367 unsigned offset = lclFrameSize - firstFPRegPadding - XMM_REGSIZE_BYTES;
@@ -11400,35 +11391,21 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
1140011391// Save/Restore compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
1140111392// down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE]
1140211393// Here offset = 16-byte aligned offset after pushing integer registers.
11403- //
11404- // Params
11405- // lclFrameSize - Fixed frame size excluding callee pushed int regs.
11406- // non-funclet: this will be compLclFrameSize.
11407- // funclet frames: this will be FuncletInfo.fiSpDelta.
11408- void CodeGen::genRestoreCalleeSavedFltRegs (unsigned lclFrameSize)
11394+ void CodeGen::genRestoreCalleeSavedFltRegs ()
1140911395{
1141011396 regMaskTP regMask = compiler->compCalleeFPRegsSavedMask ;
1141111397
1141211398 // Only callee saved floating point registers should be in regMask
1141311399 assert ((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
1141411400
11415- if (GetEmitter ()->Contains256bitOrMoreAVX ())
11416- {
11417- // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11418- // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11419- // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11420- // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11421- // register) and before any call to an unknown function.
11422-
11423- instGen (INS_vzeroupper);
11424- }
11425-
1142611401 // fast path return
1142711402 if (regMask == RBM_NONE)
1142811403 {
1142911404 return ;
1143011405 }
1143111406
11407+ unsigned lclFrameSize = compiler->compLclFrameSize ;
11408+
1143211409#ifdef TARGET_AMD64
1143311410 unsigned firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven () ? REGSIZE_BYTES : 0 ;
1143411411 instruction copyIns = ins_Copy (TYP_FLOAT);
@@ -11470,6 +11447,45 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
1147011447 }
1147111448}
1147211449
11450+ // -----------------------------------------------------------------------------------
11451+ // genClearAvxStateInProlog: Emit a vzeroupper in a prolog when the emitter reports a call needing
11452+ // vzeroupper (ContainsCallNeedingVzeroupper) and no 256-bit or wider AVX (Contains256bitOrMoreAVX).
11453+ void CodeGen::genClearAvxStateInProlog ()
11454+ {
11455+ if (GetEmitter ()->ContainsCallNeedingVzeroupper () && !GetEmitter ()->Contains256bitOrMoreAVX ())
11456+ {
11457+ // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11458+ // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11459+ // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11460+ // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11461+ // register) and before any call to an unknown function.
11462+
11463+ // This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
11464+ // AVX itself. Thus we can optimize by emitting only a single vzeroupper in the function prologue.
11465+ // This reduces the overall amount of codegen, particularly for more common paths not using any
11466+ // SIMD or floating-point.
11467+
11468+ instGen (INS_vzeroupper);
11469+ }
11470+ }
11471+
11472+ // -----------------------------------------------------------------------------------
11473+ // genClearAvxStateInEpilog: Emit a vzeroupper in an epilog when the emitter reports that the
11474+ // method contains 256-bit or wider AVX (Contains256bitOrMoreAVX); otherwise emit nothing.
11475+ void CodeGen::genClearAvxStateInEpilog ()
11476+ {
11477+ if (GetEmitter ()->Contains256bitOrMoreAVX ())
11478+ {
11479+ // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11480+ // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11481+ // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11482+ // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11483+ // register) and before any call to an unknown function.
11484+
11485+ instGen (INS_vzeroupper);
11486+ }
11487+ }
11488+
1147311489// -----------------------------------------------------------------------------------
1147411490// instGen_MemoryBarrier: Emit a MemoryBarrier instruction
1147511491//
0 commit comments