diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Runtime/CompilerServices/CastHelpers.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Runtime/CompilerServices/CastHelpers.cs index d4a62bae8ee381..a5db6f21277f4c 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Runtime/CompilerServices/CastHelpers.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Runtime/CompilerServices/CastHelpers.cs @@ -113,7 +113,7 @@ private static CastResult TryGet(nuint source, nuint target) { ref CastCacheEntry pEntry = ref Element(ref tableData, index); - // must read in this order: version -> entry parts -> version + // must read in this order: version -> [entry parts] -> version // if version is odd or changes, the entry is inconsistent and thus ignored int version = Volatile.Read(ref pEntry._version); nuint entrySource = pEntry._source; @@ -124,12 +124,19 @@ private static CastResult TryGet(nuint source, nuint target) if (entrySource == source) { - nuint entryTargetAndResult = Volatile.Read(ref pEntry._targetAndResult); + nuint entryTargetAndResult = pEntry._targetAndResult; // target never has its lower bit set. // a matching entryTargetAndResult would the have same bits, except for the lowest one, which is the result. entryTargetAndResult ^= target; if (entryTargetAndResult <= 1) { + // make sure 'version' is loaded after 'source' and 'targetAndResult' + // + // We can either: + // - use acquires for both _source and _targetAndResult or + // - issue a load barrier before reading _version + // benchmarks on available hardware show that use of a read barrier is cheaper. + Interlocked.ReadMemoryBarrier(); if (version != pEntry._version) { // oh, so close, the entry is in inconsistent state. 
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs index 33437141e76121..c9153f3cda7539 100644 --- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs +++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs @@ -232,6 +232,15 @@ public static long Read(ref long location) => [MethodImpl(MethodImplOptions.InternalCall)] public static extern void MemoryBarrier(); + /// <summary> + /// Synchronizes memory access as follows: + /// The processor that executes the current thread cannot reorder instructions in such a way that memory reads before + /// the call to <see cref="ReadMemoryBarrier"/> execute after memory accesses that follow the call to <see cref="ReadMemoryBarrier"/>. + /// </summary> + [Intrinsic] + [MethodImpl(MethodImplOptions.InternalCall)] + internal static extern void ReadMemoryBarrier(); + [DllImport(RuntimeHelpers.QCall, CharSet = CharSet.Unicode)] private static extern void _MemoryBarrierProcessWide(); diff --git a/src/coreclr/src/inc/corinfo.h b/src/coreclr/src/inc/corinfo.h index fb4489ceb87f88..2c8c1260b73925 100644 --- a/src/coreclr/src/inc/corinfo.h +++ b/src/coreclr/src/inc/corinfo.h @@ -947,6 +947,7 @@ enum CorInfoIntrinsics CORINFO_INTRINSIC_InterlockedCmpXchg32, CORINFO_INTRINSIC_InterlockedCmpXchg64, CORINFO_INTRINSIC_MemoryBarrier, + CORINFO_INTRINSIC_MemoryBarrierLoad, CORINFO_INTRINSIC_GetCurrentManagedThread, CORINFO_INTRINSIC_GetManagedThreadId, CORINFO_INTRINSIC_ByReference_Ctor, diff --git a/src/coreclr/src/inc/volatile.h b/src/coreclr/src/inc/volatile.h index 960e33cc982aa6..6c73dc4c65667b 100644 --- a/src/coreclr/src/inc/volatile.h +++ b/src/coreclr/src/inc/volatile.h @@ -286,6 +286,27 @@ void VolatileStoreWithoutBarrier(T* pt, T val) #endif } +// +// Memory ordering barrier that waits for loads in progress to complete. 
+// Any effects of loads or stores that appear after, in program order, will "happen after" relative to this. +// Other operations such as computation or instruction prefetch are not affected. +// +// Architectural mapping: +// arm64 : dmb ishld +// arm : dmb ish +// x86/64 : compiler fence +inline +void VolatileLoadBarrier() +{ +#if defined(HOST_ARM64) && defined(__GNUC__) + asm volatile ("dmb ishld" : : : "memory"); +#elif defined(HOST_ARM64) && defined(_MSC_VER) + __dmb(_ARM64_BARRIER_ISHLD); +#else + VOLATILE_MEMORY_BARRIER(); +#endif +} + // // Volatile implements accesses with our volatile semantics over a variable of type T. // Wherever you would have used a "volatile Foo" or, equivalently, "Foo volatile", use Volatile<Foo> diff --git a/src/coreclr/src/jit/codegen.h b/src/coreclr/src/jit/codegen.h index 0b76f07284b13e..93227bc7f6c1fe 100644 --- a/src/coreclr/src/jit/codegen.h +++ b/src/coreclr/src/jit/codegen.h @@ -1465,11 +1465,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void instGen_Return(unsigned stkArgSize); -#ifdef TARGET_ARM64 - void instGen_MemoryBarrier(insBarrier barrierType = INS_BARRIER_ISH); -#else - void instGen_MemoryBarrier(); -#endif + enum BarrierKind + { + BARRIER_FULL, // full barrier + BARRIER_LOAD_ONLY, // load barrier + }; + + void instGen_MemoryBarrier(BarrierKind barrierKind = BARRIER_FULL); void instGen_Set_Reg_To_Zero(emitAttr size, regNumber reg, insFlags flags = INS_FLAGS_DONT_CARE); diff --git a/src/coreclr/src/jit/codegenarm64.cpp b/src/coreclr/src/jit/codegenarm64.cpp index e03ffe6f3b0867..431801ec66bd48 100644 --- a/src/coreclr/src/jit/codegenarm64.cpp +++ b/src/coreclr/src/jit/codegenarm64.cpp @@ -2665,8 +2665,8 @@ void CodeGen::genCodeForCpObj(GenTreeObj* cpObjNode) if (cpObjNode->gtFlags & GTF_BLK_VOLATILE) { - // issue a INS_BARRIER_ISHLD after a volatile CpObj operation - instGen_MemoryBarrier(INS_BARRIER_ISHLD); + // issue a load barrier after a volatile CpObj operation + 
instGen_MemoryBarrier(BARRIER_LOAD_ONLY); } // Clear the gcInfo for REG_WRITE_BARRIER_SRC_BYREF and REG_WRITE_BARRIER_DST_BYREF. @@ -2775,7 +2775,7 @@ void CodeGen::genLockedInstructions(GenTreeOp* treeNode) assert(!"Unexpected treeNode->gtOper"); } - instGen_MemoryBarrier(INS_BARRIER_ISH); + instGen_MemoryBarrier(); } else { @@ -2855,7 +2855,7 @@ void CodeGen::genLockedInstructions(GenTreeOp* treeNode) GetEmitter()->emitIns_J_R(INS_cbnz, EA_4BYTE, labelRetry, exResultReg); - instGen_MemoryBarrier(INS_BARRIER_ISH); + instGen_MemoryBarrier(); gcInfo.gcMarkRegSetNpt(addr->gtGetRegMask()); } @@ -2904,7 +2904,7 @@ void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* treeNode) } GetEmitter()->emitIns_R_R_R(INS_casal, dataSize, targetReg, dataReg, addrReg); - instGen_MemoryBarrier(INS_BARRIER_ISH); + instGen_MemoryBarrier(); } else { @@ -2984,7 +2984,7 @@ void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* treeNode) genDefineTempLabel(labelCompareFail); - instGen_MemoryBarrier(INS_BARRIER_ISH); + instGen_MemoryBarrier(); gcInfo.gcMarkRegSetNpt(addr->gtGetRegMask()); } diff --git a/src/coreclr/src/jit/codegenarmarch.cpp b/src/coreclr/src/jit/codegenarmarch.cpp index 3d6508de5fb582..6e55a28da2cf7c 100644 --- a/src/coreclr/src/jit/codegenarmarch.cpp +++ b/src/coreclr/src/jit/codegenarmarch.cpp @@ -398,8 +398,13 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode) break; case GT_MEMORYBARRIER: - instGen_MemoryBarrier(); + { + CodeGen::BarrierKind barrierKind = + treeNode->gtFlags & GTF_MEMORYBARRIER_LOAD ? BARRIER_LOAD_ONLY : BARRIER_FULL; + + instGen_MemoryBarrier(barrierKind); break; + } #ifdef TARGET_ARM64 case GT_XCHG: @@ -1944,11 +1949,9 @@ void CodeGen::genCodeForIndir(GenTreeIndir* tree) if (emitBarrier) { -#ifdef TARGET_ARM64 - instGen_MemoryBarrier(INS_BARRIER_ISHLD); -#else - instGen_MemoryBarrier(); -#endif + // when INS_ldar* could not be used for a volatile load, + // we use an ordinary load followed by a load barrier. 
+ instGen_MemoryBarrier(BARRIER_LOAD_ONLY); } genProduceReg(tree); @@ -1980,13 +1983,8 @@ void CodeGen::genCodeForCpBlkHelper(GenTreeBlk* cpBlkNode) if (cpBlkNode->gtFlags & GTF_BLK_VOLATILE) { -#ifdef TARGET_ARM64 - // issue a INS_BARRIER_ISHLD after a volatile CpBlk operation - instGen_MemoryBarrier(INS_BARRIER_ISHLD); -#else - // issue a full memory barrier after a volatile CpBlk operation - instGen_MemoryBarrier(); -#endif // TARGET_ARM64 + // issue a load barrier after a volatile CpBlk operation + instGen_MemoryBarrier(BARRIER_LOAD_ONLY); } } @@ -2207,6 +2205,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) if (node->IsVolatile()) { + // issue a full memory barrier before a volatile CpBlk operation instGen_MemoryBarrier(); } @@ -2304,11 +2303,8 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) if (node->IsVolatile()) { -#ifdef TARGET_ARM64 - instGen_MemoryBarrier(INS_BARRIER_ISHLD); -#else - instGen_MemoryBarrier(); -#endif + // issue a load barrier after a volatile CpBlk operation + instGen_MemoryBarrier(BARRIER_LOAD_ONLY); } } diff --git a/src/coreclr/src/jit/codegenxarch.cpp b/src/coreclr/src/jit/codegenxarch.cpp index c47c001147c003..2070291a208080 100644 --- a/src/coreclr/src/jit/codegenxarch.cpp +++ b/src/coreclr/src/jit/codegenxarch.cpp @@ -1862,8 +1862,13 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode) break; case GT_MEMORYBARRIER: - instGen_MemoryBarrier(); + { + CodeGen::BarrierKind barrierKind = + treeNode->gtFlags & GTF_MEMORYBARRIER_LOAD ? BARRIER_LOAD_ONLY : BARRIER_FULL; + + instGen_MemoryBarrier(barrierKind); break; + } case GT_CMPXCHG: genCodeForCmpXchg(treeNode->AsCmpXchg()); diff --git a/src/coreclr/src/jit/gentree.h b/src/coreclr/src/jit/gentree.h index 45c80be0647c3a..a3fe7aab7474cc 100644 --- a/src/coreclr/src/jit/gentree.h +++ b/src/coreclr/src/jit/gentree.h @@ -814,6 +814,8 @@ struct GenTree #define GTF_CALL_POP_ARGS 0x04000000 // GT_CALL -- caller pop arguments? 
#define GTF_CALL_HOISTABLE 0x02000000 // GT_CALL -- call is hoistable +#define GTF_MEMORYBARRIER_LOAD 0x40000000 // GT_MEMORYBARRIER -- Load barrier + #define GTF_NOP_DEATH 0x40000000 // GT_NOP -- operand dies here #define GTF_FLD_VOLATILE 0x40000000 // GT_FIELD/GT_CLS_VAR -- same as GTF_IND_VOLATILE diff --git a/src/coreclr/src/jit/importer.cpp b/src/coreclr/src/jit/importer.cpp index 526d63d953cfc5..99d51600991c2e 100644 --- a/src/coreclr/src/jit/importer.cpp +++ b/src/coreclr/src/jit/importer.cpp @@ -3627,11 +3627,20 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis, #endif // defined(TARGET_XARCH) || defined(TARGET_ARM64) case CORINFO_INTRINSIC_MemoryBarrier: + case CORINFO_INTRINSIC_MemoryBarrierLoad: assert(sig->numArgs == 0); op1 = new (this, GT_MEMORYBARRIER) GenTree(GT_MEMORYBARRIER, TYP_VOID); op1->gtFlags |= GTF_GLOB_REF | GTF_ASG; + + // On XARCH `CORINFO_INTRINSIC_MemoryBarrierLoad` fences need not be emitted. + // However, we still need to capture the effect on reordering. 
+ if (intrinsicID == CORINFO_INTRINSIC_MemoryBarrierLoad) + { + op1->gtFlags |= GTF_MEMORYBARRIER_LOAD; + } + retNode = op1; break; diff --git a/src/coreclr/src/jit/instr.cpp b/src/coreclr/src/jit/instr.cpp index 2ee631fceeff82..969efe5a4573d1 100644 --- a/src/coreclr/src/jit/instr.cpp +++ b/src/coreclr/src/jit/instr.cpp @@ -2364,11 +2364,7 @@ void CodeGen::instGen_Return(unsigned stkArgSize) * Note: all MemoryBarriers instructions can be removed by * SET COMPlus_JitNoMemoryBarriers=1 */ -#ifdef TARGET_ARM64 -void CodeGen::instGen_MemoryBarrier(insBarrier barrierType) -#else -void CodeGen::instGen_MemoryBarrier() -#endif +void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind) { #ifdef DEBUG if (JitConfig.JitNoMemoryBarriers() == 1) @@ -2378,12 +2374,19 @@ void CodeGen::instGen_MemoryBarrier() #endif // DEBUG #if defined(TARGET_XARCH) + // only full barrier needs to be emitted on Xarch + if (barrierKind != BARRIER_FULL) + { + return; + } + instGen(INS_lock); GetEmitter()->emitIns_I_AR(INS_or, EA_4BYTE, 0, REG_SPBASE, 0); #elif defined(TARGET_ARM) + // ARM has only full barriers, so all barriers need to be emitted as full. GetEmitter()->emitIns_I(INS_dmb, EA_4BYTE, 0xf); #elif defined(TARGET_ARM64) - GetEmitter()->emitIns_BARR(INS_dmb, barrierType); + GetEmitter()->emitIns_BARR(INS_dmb, barrierKind == BARRIER_LOAD_ONLY ? 
INS_BARRIER_ISHLD : INS_BARRIER_ISH); #else #error "Unknown TARGET" #endif diff --git a/src/coreclr/src/tools/Common/JitInterface/CorInfoImpl.Intrinsics.cs b/src/coreclr/src/tools/Common/JitInterface/CorInfoImpl.Intrinsics.cs index cce06a7b7b83fa..a0538e9063fdf5 100644 --- a/src/coreclr/src/tools/Common/JitInterface/CorInfoImpl.Intrinsics.cs +++ b/src/coreclr/src/tools/Common/JitInterface/CorInfoImpl.Intrinsics.cs @@ -152,6 +152,7 @@ static IntrinsicHashtable InitializeIntrinsicHashtable() table.Add(CorInfoIntrinsics.CORINFO_INTRINSIC_InterlockedCmpXchg32, "CompareExchange", "System.Threading", "Interlocked"); // table.Add(CorInfoIntrinsics.CORINFO_INTRINSIC_InterlockedCmpXchg64, "CompareExchange", "System.Threading", "Interlocked"); // ambiguous match table.Add(CorInfoIntrinsics.CORINFO_INTRINSIC_MemoryBarrier, "MemoryBarrier", "System.Threading", "Interlocked"); + table.Add(CorInfoIntrinsics.CORINFO_INTRINSIC_MemoryBarrierLoad, "ReadMemoryBarrier", "System.Threading", "Interlocked"); // table.Add(CorInfoIntrinsics.CORINFO_INTRINSIC_GetCurrentManagedThread, "GetCurrentThreadNative", "System", "Thread"); // not in .NET Core // table.Add(CorInfoIntrinsics.CORINFO_INTRINSIC_GetManagedThreadId, "get_ManagedThreadId", "System", "Thread"); // not in .NET Core table.Add(CorInfoIntrinsics.CORINFO_INTRINSIC_ByReference_Ctor, ".ctor", "System", "ByReference`1"); diff --git a/src/coreclr/src/tools/Common/JitInterface/CorInfoTypes.cs b/src/coreclr/src/tools/Common/JitInterface/CorInfoTypes.cs index 1c13a74c864525..990c2713de72c9 100644 --- a/src/coreclr/src/tools/Common/JitInterface/CorInfoTypes.cs +++ b/src/coreclr/src/tools/Common/JitInterface/CorInfoTypes.cs @@ -463,6 +463,7 @@ public enum CorInfoIntrinsics CORINFO_INTRINSIC_InterlockedCmpXchg32, CORINFO_INTRINSIC_InterlockedCmpXchg64, CORINFO_INTRINSIC_MemoryBarrier, + CORINFO_INTRINSIC_MemoryBarrierLoad, CORINFO_INTRINSIC_GetCurrentManagedThread, CORINFO_INTRINSIC_GetManagedThreadId, CORINFO_INTRINSIC_ByReference_Ctor, diff 
--git a/src/coreclr/src/vm/castcache.cpp b/src/coreclr/src/vm/castcache.cpp index b4fb3a81832f26..9fb5f7a61c7e76 100644 --- a/src/coreclr/src/vm/castcache.cpp +++ b/src/coreclr/src/vm/castcache.cpp @@ -160,7 +160,7 @@ TypeHandle::CastResult CastCache::TryGet(TADDR source, TADDR target) { CastCacheEntry* pEntry = &Elements(tableData)[index]; - // must read in this order: version -> entry parts -> version + // must read in this order: version -> [entry parts] -> version // if version is odd or changes, the entry is inconsistent and thus ignored DWORD version1 = VolatileLoad(&pEntry->version); TADDR entrySource = pEntry->source; @@ -171,12 +171,14 @@ TypeHandle::CastResult CastCache::TryGet(TADDR source, TADDR target) if (entrySource == source) { - TADDR entryTargetAndResult = VolatileLoad(&pEntry->targetAndResult); + TADDR entryTargetAndResult = pEntry->targetAndResult; // target never has its lower bit set. // a matching entryTargetAndResult would have the same bits, except for the lowest one, which is the result. entryTargetAndResult ^= target; if (entryTargetAndResult <= 1) { + // make sure 'version' is loaded after 'source' and 'targetAndResults' + VolatileLoadBarrier(); if (version1 != pEntry->version) { // oh, so close, the entry is in inconsistent state. 
diff --git a/src/coreclr/src/vm/comutilnative.cpp b/src/coreclr/src/vm/comutilnative.cpp index 1f0e257a548865..0205d706913967 100644 --- a/src/coreclr/src/vm/comutilnative.cpp +++ b/src/coreclr/src/vm/comutilnative.cpp @@ -1799,6 +1799,15 @@ FCIMPL0(void, COMInterlocked::FCMemoryBarrier) } FCIMPLEND +FCIMPL0(void, COMInterlocked::FCMemoryBarrierLoad) +{ + FCALL_CONTRACT; + + VolatileLoadBarrier(); + FC_GC_POLL(); +} +FCIMPLEND + +#include <optdefault.h> void QCALLTYPE COMInterlocked::MemoryBarrierProcessWide() diff --git a/src/coreclr/src/vm/comutilnative.h b/src/coreclr/src/vm/comutilnative.h index 4493ec271326f1..fa3f0fdcb2adaf 100644 --- a/src/coreclr/src/vm/comutilnative.h +++ b/src/coreclr/src/vm/comutilnative.h @@ -184,6 +184,7 @@ class COMInterlocked static FCDECL2_IV(INT64, ExchangeAdd64, INT64 *location, INT64 value); static FCDECL0(void, FCMemoryBarrier); + static FCDECL0(void, FCMemoryBarrierLoad); static void QCALLTYPE MemoryBarrierProcessWide(); }; diff --git a/src/coreclr/src/vm/ecalllist.h b/src/coreclr/src/vm/ecalllist.h index 661cfad2907e4e..89d6bd59b07947 100644 --- a/src/coreclr/src/vm/ecalllist.h +++ b/src/coreclr/src/vm/ecalllist.h @@ -859,6 +859,7 @@ FCFuncStart(gInterlockedFuncs) FCIntrinsicSig("ExchangeAdd", &gsig_SM_RefLong_Long_RetLong, COMInterlocked::ExchangeAdd64, CORINFO_INTRINSIC_InterlockedXAdd64) FCIntrinsic("MemoryBarrier", COMInterlocked::FCMemoryBarrier, CORINFO_INTRINSIC_MemoryBarrier) + FCIntrinsic("ReadMemoryBarrier", COMInterlocked::FCMemoryBarrierLoad, CORINFO_INTRINSIC_MemoryBarrierLoad) QCFuncElement("_MemoryBarrierProcessWide", COMInterlocked::MemoryBarrierProcessWide) FCFuncEnd()