Skip to content

Commit 52ac82f

Browse files
committed
Move vzeroupper emit back to JIT
vzeroupper is an AVX instruction, so it cannot be executed unconditionally in static asm helpers. Fixes dotnet#115019
1 parent 29638e8 commit 52ac82f

File tree

7 files changed

+65
-67
lines changed

7 files changed

+65
-67
lines changed

src/coreclr/jit/codegen.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -515,8 +515,12 @@ class CodeGen final : public CodeGenInterface
515515
#if defined(TARGET_XARCH)
516516

517517
// Save/Restore callee saved float regs to stack
518-
void genPreserveCalleeSavedFltRegs(unsigned lclFrameSize);
519-
void genRestoreCalleeSavedFltRegs(unsigned lclFrameSize);
518+
void genPreserveCalleeSavedFltRegs();
519+
void genRestoreCalleeSavedFltRegs();
520+
521+
// Generate vzeroupper instruction to clear AVX state if necessary
522+
void genClearAvxStateInProlog();
523+
void genClearAvxStateInEpilog();
520524

521525
#endif // TARGET_XARCH
522526

src/coreclr/jit/codegencommon.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5285,8 +5285,10 @@ void CodeGen::genFnProlog()
52855285
#endif // TARGET_ARMARCH
52865286

52875287
#if defined(TARGET_XARCH)
5288+
genClearAvxStateInProlog();
5289+
52885290
// Preserve callee saved float regs to stack.
5289-
genPreserveCalleeSavedFltRegs(compiler->compLclFrameSize);
5291+
genPreserveCalleeSavedFltRegs();
52905292
#endif // defined(TARGET_XARCH)
52915293

52925294
#ifdef TARGET_AMD64

src/coreclr/jit/codegenxarch.cpp

Lines changed: 56 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -10452,8 +10452,10 @@ void CodeGen::genFnEpilog(BasicBlock* block)
1045210452
}
1045310453
#endif
1045410454

10455+
genClearAvxStateInEpilog();
10456+
1045510457
// Restore float registers that were saved to stack before SP is modified.
10456-
genRestoreCalleeSavedFltRegs(compiler->compLclFrameSize);
10458+
genRestoreCalleeSavedFltRegs();
1045710459

1045810460
#ifdef JIT32_GCENCODER
1045910461
// When using the JIT32 GC encoder, we do not start the OS-reported portion of the epilog until after
@@ -10913,6 +10915,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
1091310915

1091410916
// This is the end of the OS-reported prolog for purposes of unwinding
1091510917
compiler->unwindEndProlog();
10918+
10919+
genClearAvxStateInProlog();
1091610920
}
1091710921

1091810922
/*****************************************************************************
@@ -10933,6 +10937,8 @@ void CodeGen::genFuncletEpilog()
1093310937

1093410938
ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true);
1093510939

10940+
genClearAvxStateInEpilog();
10941+
1093610942
inst_RV_IV(INS_add, REG_SPBASE, genFuncletInfo.fiSpDelta, EA_PTRSIZE);
1093710943
instGen_Return(0);
1093810944
}
@@ -11030,6 +11036,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
1103011036
// Add a padding for 16-byte alignment
1103111037
inst_RV_IV(INS_sub, REG_SPBASE, 12, EA_PTRSIZE);
1103211038
#endif
11039+
11040+
genClearAvxStateInProlog();
1103311041
}
1103411042

1103511043
/*****************************************************************************
@@ -11048,6 +11056,8 @@ void CodeGen::genFuncletEpilog()
1104811056

1104911057
ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true);
1105011058

11059+
genClearAvxStateInEpilog();
11060+
1105111061
#ifdef UNIX_X86_ABI
1105211062
// Revert a padding that was added for 16-byte alignment
1105311063
inst_RV_IV(INS_add, REG_SPBASE, 12, EA_PTRSIZE);
@@ -11337,40 +11347,21 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
1133711347
// Save compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
1133811348
// down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE]
1133911349
// Here offset = 16-byte aligned offset after pushing integer registers.
11340-
//
11341-
// Params
11342-
// lclFrameSize - Fixed frame size excluding callee pushed int regs.
11343-
// non-funclet: this will be compLclFrameSize.
11344-
// funclet frames: this will be FuncletInfo.fiSpDelta.
11345-
void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
11350+
void CodeGen::genPreserveCalleeSavedFltRegs()
1134611351
{
1134711352
regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
1134811353

1134911354
// Only callee saved floating point registers should be in regMask
1135011355
assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
1135111356

11352-
if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
11353-
{
11354-
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11355-
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11356-
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11357-
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11358-
// register) and before any call to an unknown function.
11359-
11360-
// This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
11361-
// AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue
11362-
// This reduces the overall amount of codegen, particularly for more common paths not using any
11363-
// SIMD or floating-point.
11364-
11365-
instGen(INS_vzeroupper);
11366-
}
11367-
1136811357
// fast path return
1136911358
if (regMask == RBM_NONE)
1137011359
{
1137111360
return;
1137211361
}
1137311362

11363+
unsigned lclFrameSize = compiler->compLclFrameSize;
11364+
1137411365
#ifdef TARGET_AMD64
1137511366
unsigned firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven() ? REGSIZE_BYTES : 0;
1137611367
unsigned offset = lclFrameSize - firstFPRegPadding - XMM_REGSIZE_BYTES;
@@ -11400,35 +11391,21 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
1140011391
// Save/Restore compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
1140111392
// down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE]
1140211393
// Here offset = 16-byte aligned offset after pushing integer registers.
11403-
//
11404-
// Params
11405-
// lclFrameSize - Fixed frame size excluding callee pushed int regs.
11406-
// non-funclet: this will be compLclFrameSize.
11407-
// funclet frames: this will be FuncletInfo.fiSpDelta.
11408-
void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
11394+
void CodeGen::genRestoreCalleeSavedFltRegs()
1140911395
{
1141011396
regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
1141111397

1141211398
// Only callee saved floating point registers should be in regMask
1141311399
assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
1141411400

11415-
if (GetEmitter()->Contains256bitOrMoreAVX())
11416-
{
11417-
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11418-
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11419-
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11420-
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11421-
// register) and before any call to an unknown function.
11422-
11423-
instGen(INS_vzeroupper);
11424-
}
11425-
1142611401
// fast path return
1142711402
if (regMask == RBM_NONE)
1142811403
{
1142911404
return;
1143011405
}
1143111406

11407+
unsigned lclFrameSize = compiler->compLclFrameSize;
11408+
1143211409
#ifdef TARGET_AMD64
1143311410
unsigned firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven() ? REGSIZE_BYTES : 0;
1143411411
instruction copyIns = ins_Copy(TYP_FLOAT);
@@ -11470,6 +11447,45 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
1147011447
}
1147111448
}
1147211449

11450+
//-----------------------------------------------------------------------------------
11451+
// genClearAvxStateInProlog: Generate vzeroupper instruction to clear AVX state if necessary in a prolog
11452+
//
11453+
void CodeGen::genClearAvxStateInProlog()
11454+
{
11455+
if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
11456+
{
11457+
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11458+
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11459+
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11460+
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11461+
// register) and before any call to an unknown function.
11462+
11463+
// This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
11464+
// AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue
11465+
// This reduces the overall amount of codegen, particularly for more common paths not using any
11466+
// SIMD or floating-point.
11467+
11468+
instGen(INS_vzeroupper);
11469+
}
11470+
}
11471+
11472+
//-----------------------------------------------------------------------------------
11473+
// genClearAvxStateInEpilog: Generate vzeroupper instruction to clear AVX state if necessary in an epilog
11474+
//
11475+
void CodeGen::genClearAvxStateInEpilog()
11476+
{
11477+
if (GetEmitter()->Contains256bitOrMoreAVX())
11478+
{
11479+
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11480+
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11481+
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11482+
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11483+
// register) and before any call to an unknown function.
11484+
11485+
instGen(INS_vzeroupper);
11486+
}
11487+
}
11488+
1147311489
//-----------------------------------------------------------------------------------
1147411490
// instGen_MemoryBarrier: Emit a MemoryBarrier instruction
1147511491
//

src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.S

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -245,19 +245,13 @@ NESTED_END RhpRethrow, _TEXT
245245

246246
alloc_stack stack_alloc_size
247247

248-
// Mirror clearing of AVX state done by regular method prologs
249-
vzeroupper
250-
251248
END_PROLOGUE
252249
.endm
253250

254251
//
255252
// Epilogue of all funclet calling helpers (RhpCallXXXXFunclet)
256253
//
257254
.macro FUNCLET_CALL_EPILOGUE
258-
// Mirror clearing of AVX state done by regular method epilogs
259-
vzeroupper
260-
261255
free_stack stack_alloc_size
262256

263257
pop_nonvol_reg rbp

src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.asm

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -308,9 +308,6 @@ FUNCLET_CALL_PROLOGUE macro localsCount, alignStack
308308

309309
alloc_stack stack_alloc_size
310310

311-
;; Mirror clearing of AVX state done by regular method prologs
312-
vzeroupper
313-
314311
save_xmm128_postrsp xmm6, (arguments_scratch_area_size + 0 * 10h)
315312
save_xmm128_postrsp xmm7, (arguments_scratch_area_size + 1 * 10h)
316313
save_xmm128_postrsp xmm8, (arguments_scratch_area_size + 2 * 10h)
@@ -329,9 +326,6 @@ endm
329326
;; Epilogue of all funclet calling helpers (RhpCallXXXXFunclet)
330327
;;
331328
FUNCLET_CALL_EPILOGUE macro
332-
;; Mirror clearing of AVX state done by regular method epilogs
333-
vzeroupper
334-
335329
movdqa xmm6, [rsp + arguments_scratch_area_size + 0 * 10h]
336330
movdqa xmm7, [rsp + arguments_scratch_area_size + 1 * 10h]
337331
movdqa xmm8, [rsp + arguments_scratch_area_size + 2 * 10h]

src/coreclr/vm/amd64/AsmHelpers.asm

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -523,9 +523,6 @@ FUNCLET_CALL_PROLOGUE macro localsCount, alignStack
523523

524524
alloc_stack stack_alloc_size
525525

526-
;; Mirror clearing of AVX state done by regular method prologs
527-
vzeroupper
528-
529526
save_xmm128_postrsp xmm6, (arguments_scratch_area_size + 0 * 10h)
530527
save_xmm128_postrsp xmm7, (arguments_scratch_area_size + 1 * 10h)
531528
save_xmm128_postrsp xmm8, (arguments_scratch_area_size + 2 * 10h)
@@ -544,9 +541,6 @@ endm
544541
;; Epilogue of all funclet calling helpers (CallXXXXFunclet)
545542
;;
546543
FUNCLET_CALL_EPILOGUE macro
547-
;; Mirror clearing of AVX state done by regular method epilogs
548-
vzeroupper
549-
550544
movdqa xmm6, [rsp + arguments_scratch_area_size + 0 * 10h]
551545
movdqa xmm7, [rsp + arguments_scratch_area_size + 1 * 10h]
552546
movdqa xmm8, [rsp + arguments_scratch_area_size + 2 * 10h]

src/coreclr/vm/amd64/asmhelpers.S

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -378,19 +378,13 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT
378378

379379
alloc_stack stack_alloc_size
380380

381-
// Mirror clearing of AVX state done by regular method prologs
382-
vzeroupper
383-
384381
END_PROLOGUE
385382
.endm
386383

387384
//
388385
// Epilogue of all funclet calling helpers (CallXXXXFunclet)
389386
//
390387
.macro FUNCLET_CALL_EPILOGUE
391-
// Mirror clearing of AVX state done by regular method epilogs
392-
vzeroupper
393-
394388
free_stack stack_alloc_size
395389

396390
pop_nonvol_reg rbp

0 commit comments

Comments (0)