diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Runtime/CompilerServices/CastHelpers.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Runtime/CompilerServices/CastHelpers.cs
index d4a62bae8ee381..a5db6f21277f4c 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Runtime/CompilerServices/CastHelpers.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Runtime/CompilerServices/CastHelpers.cs
@@ -113,7 +113,7 @@ private static CastResult TryGet(nuint source, nuint target)
{
ref CastCacheEntry pEntry = ref Element(ref tableData, index);
- // must read in this order: version -> entry parts -> version
+ // must read in this order: version -> [entry parts] -> version
// if version is odd or changes, the entry is inconsistent and thus ignored
int version = Volatile.Read(ref pEntry._version);
nuint entrySource = pEntry._source;
@@ -124,12 +124,19 @@ private static CastResult TryGet(nuint source, nuint target)
if (entrySource == source)
{
- nuint entryTargetAndResult = Volatile.Read(ref pEntry._targetAndResult);
+ nuint entryTargetAndResult = pEntry._targetAndResult;
// target never has its lower bit set.
// a matching entryTargetAndResult would the have same bits, except for the lowest one, which is the result.
entryTargetAndResult ^= target;
if (entryTargetAndResult <= 1)
{
+ // make sure 'version' is loaded after '_source' and '_targetAndResult'
+ //
+ // We can either:
+ // - use acquires for both _source and _targetAndResult or
+ // - issue a load barrier before reading _version
+ // benchmarks on available hardware show that use of a read barrier is cheaper.
+ Interlocked.ReadMemoryBarrier();
if (version != pEntry._version)
{
// oh, so close, the entry is in inconsistent state.
diff --git a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs
index 33437141e76121..c9153f3cda7539 100644
--- a/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs
+++ b/src/coreclr/src/System.Private.CoreLib/src/System/Threading/Interlocked.CoreCLR.cs
@@ -232,6 +232,15 @@ public static long Read(ref long location) =>
[MethodImpl(MethodImplOptions.InternalCall)]
public static extern void MemoryBarrier();
+ /// <summary>
+ /// Synchronizes memory access as follows:
+ /// The processor that executes the current thread cannot reorder instructions in such a way that memory reads before
+ /// the call to <see cref="ReadMemoryBarrier"/> execute after memory accesses that follow the call to <see cref="ReadMemoryBarrier"/>.
+ /// </summary>
+ [Intrinsic]
+ [MethodImpl(MethodImplOptions.InternalCall)]
+ internal static extern void ReadMemoryBarrier();
+
[DllImport(RuntimeHelpers.QCall, CharSet = CharSet.Unicode)]
private static extern void _MemoryBarrierProcessWide();
diff --git a/src/coreclr/src/inc/corinfo.h b/src/coreclr/src/inc/corinfo.h
index fb4489ceb87f88..2c8c1260b73925 100644
--- a/src/coreclr/src/inc/corinfo.h
+++ b/src/coreclr/src/inc/corinfo.h
@@ -947,6 +947,7 @@ enum CorInfoIntrinsics
CORINFO_INTRINSIC_InterlockedCmpXchg32,
CORINFO_INTRINSIC_InterlockedCmpXchg64,
CORINFO_INTRINSIC_MemoryBarrier,
+ CORINFO_INTRINSIC_MemoryBarrierLoad,
CORINFO_INTRINSIC_GetCurrentManagedThread,
CORINFO_INTRINSIC_GetManagedThreadId,
CORINFO_INTRINSIC_ByReference_Ctor,
diff --git a/src/coreclr/src/inc/volatile.h b/src/coreclr/src/inc/volatile.h
index 960e33cc982aa6..6c73dc4c65667b 100644
--- a/src/coreclr/src/inc/volatile.h
+++ b/src/coreclr/src/inc/volatile.h
@@ -286,6 +286,27 @@ void VolatileStoreWithoutBarrier(T* pt, T val)
#endif
}
+//
+// Memory ordering barrier that waits for loads in progress to complete.
+// Any effects of loads or stores that appear after, in program order, will "happen after" relative to this.
+// Other operations such as computation or instruction prefetch are not affected.
+//
+// Architectural mapping:
+// arm64 : dmb ishld
+// arm : dmb ish
+// x86/64 : compiler fence
+inline
+void VolatileLoadBarrier()
+{
+#if defined(HOST_ARM64) && defined(__GNUC__)
+ asm volatile ("dmb ishld" : : : "memory");
+#elif defined(HOST_ARM64) && defined(_MSC_VER)
+ __dmb(_ARM64_BARRIER_ISHLD);
+#else
+ VOLATILE_MEMORY_BARRIER();
+#endif
+}
+
//
// Volatile implements accesses with our volatile semantics over a variable of type T.
// Wherever you would have used a "volatile Foo" or, equivalently, "Foo volatile", use Volatile
diff --git a/src/coreclr/src/jit/codegen.h b/src/coreclr/src/jit/codegen.h
index 0b76f07284b13e..93227bc7f6c1fe 100644
--- a/src/coreclr/src/jit/codegen.h
+++ b/src/coreclr/src/jit/codegen.h
@@ -1465,11 +1465,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
void instGen_Return(unsigned stkArgSize);
-#ifdef TARGET_ARM64
- void instGen_MemoryBarrier(insBarrier barrierType = INS_BARRIER_ISH);
-#else
- void instGen_MemoryBarrier();
-#endif
+ enum BarrierKind
+ {
+ BARRIER_FULL, // full barrier
+ BARRIER_LOAD_ONLY, // load barrier
+ };
+
+ void instGen_MemoryBarrier(BarrierKind barrierKind = BARRIER_FULL);
void instGen_Set_Reg_To_Zero(emitAttr size, regNumber reg, insFlags flags = INS_FLAGS_DONT_CARE);
diff --git a/src/coreclr/src/jit/codegenarm64.cpp b/src/coreclr/src/jit/codegenarm64.cpp
index e03ffe6f3b0867..431801ec66bd48 100644
--- a/src/coreclr/src/jit/codegenarm64.cpp
+++ b/src/coreclr/src/jit/codegenarm64.cpp
@@ -2665,8 +2665,8 @@ void CodeGen::genCodeForCpObj(GenTreeObj* cpObjNode)
if (cpObjNode->gtFlags & GTF_BLK_VOLATILE)
{
- // issue a INS_BARRIER_ISHLD after a volatile CpObj operation
- instGen_MemoryBarrier(INS_BARRIER_ISHLD);
+ // issue a load barrier after a volatile CpObj operation
+ instGen_MemoryBarrier(BARRIER_LOAD_ONLY);
}
// Clear the gcInfo for REG_WRITE_BARRIER_SRC_BYREF and REG_WRITE_BARRIER_DST_BYREF.
@@ -2775,7 +2775,7 @@ void CodeGen::genLockedInstructions(GenTreeOp* treeNode)
assert(!"Unexpected treeNode->gtOper");
}
- instGen_MemoryBarrier(INS_BARRIER_ISH);
+ instGen_MemoryBarrier();
}
else
{
@@ -2855,7 +2855,7 @@ void CodeGen::genLockedInstructions(GenTreeOp* treeNode)
GetEmitter()->emitIns_J_R(INS_cbnz, EA_4BYTE, labelRetry, exResultReg);
- instGen_MemoryBarrier(INS_BARRIER_ISH);
+ instGen_MemoryBarrier();
gcInfo.gcMarkRegSetNpt(addr->gtGetRegMask());
}
@@ -2904,7 +2904,7 @@ void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* treeNode)
}
GetEmitter()->emitIns_R_R_R(INS_casal, dataSize, targetReg, dataReg, addrReg);
- instGen_MemoryBarrier(INS_BARRIER_ISH);
+ instGen_MemoryBarrier();
}
else
{
@@ -2984,7 +2984,7 @@ void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* treeNode)
genDefineTempLabel(labelCompareFail);
- instGen_MemoryBarrier(INS_BARRIER_ISH);
+ instGen_MemoryBarrier();
gcInfo.gcMarkRegSetNpt(addr->gtGetRegMask());
}
diff --git a/src/coreclr/src/jit/codegenarmarch.cpp b/src/coreclr/src/jit/codegenarmarch.cpp
index 3d6508de5fb582..6e55a28da2cf7c 100644
--- a/src/coreclr/src/jit/codegenarmarch.cpp
+++ b/src/coreclr/src/jit/codegenarmarch.cpp
@@ -398,8 +398,13 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)
break;
case GT_MEMORYBARRIER:
- instGen_MemoryBarrier();
+ {
+ CodeGen::BarrierKind barrierKind =
+ treeNode->gtFlags & GTF_MEMORYBARRIER_LOAD ? BARRIER_LOAD_ONLY : BARRIER_FULL;
+
+ instGen_MemoryBarrier(barrierKind);
break;
+ }
#ifdef TARGET_ARM64
case GT_XCHG:
@@ -1944,11 +1949,9 @@ void CodeGen::genCodeForIndir(GenTreeIndir* tree)
if (emitBarrier)
{
-#ifdef TARGET_ARM64
- instGen_MemoryBarrier(INS_BARRIER_ISHLD);
-#else
- instGen_MemoryBarrier();
-#endif
+ // when INS_ldar* could not be used for a volatile load,
+ // we use an ordinary load followed by a load barrier.
+ instGen_MemoryBarrier(BARRIER_LOAD_ONLY);
}
genProduceReg(tree);
@@ -1980,13 +1983,8 @@ void CodeGen::genCodeForCpBlkHelper(GenTreeBlk* cpBlkNode)
if (cpBlkNode->gtFlags & GTF_BLK_VOLATILE)
{
-#ifdef TARGET_ARM64
- // issue a INS_BARRIER_ISHLD after a volatile CpBlk operation
- instGen_MemoryBarrier(INS_BARRIER_ISHLD);
-#else
- // issue a full memory barrier after a volatile CpBlk operation
- instGen_MemoryBarrier();
-#endif // TARGET_ARM64
+ // issue a load barrier after a volatile CpBlk operation
+ instGen_MemoryBarrier(BARRIER_LOAD_ONLY);
}
}
@@ -2207,6 +2205,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
if (node->IsVolatile())
{
+ // issue a full memory barrier before a volatile CpBlk operation
instGen_MemoryBarrier();
}
@@ -2304,11 +2303,8 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
if (node->IsVolatile())
{
-#ifdef TARGET_ARM64
- instGen_MemoryBarrier(INS_BARRIER_ISHLD);
-#else
- instGen_MemoryBarrier();
-#endif
+ // issue a load barrier after a volatile CpBlk operation
+ instGen_MemoryBarrier(BARRIER_LOAD_ONLY);
}
}
diff --git a/src/coreclr/src/jit/codegenxarch.cpp b/src/coreclr/src/jit/codegenxarch.cpp
index c47c001147c003..2070291a208080 100644
--- a/src/coreclr/src/jit/codegenxarch.cpp
+++ b/src/coreclr/src/jit/codegenxarch.cpp
@@ -1862,8 +1862,13 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode)
break;
case GT_MEMORYBARRIER:
- instGen_MemoryBarrier();
+ {
+ CodeGen::BarrierKind barrierKind =
+ treeNode->gtFlags & GTF_MEMORYBARRIER_LOAD ? BARRIER_LOAD_ONLY : BARRIER_FULL;
+
+ instGen_MemoryBarrier(barrierKind);
break;
+ }
case GT_CMPXCHG:
genCodeForCmpXchg(treeNode->AsCmpXchg());
diff --git a/src/coreclr/src/jit/gentree.h b/src/coreclr/src/jit/gentree.h
index 45c80be0647c3a..a3fe7aab7474cc 100644
--- a/src/coreclr/src/jit/gentree.h
+++ b/src/coreclr/src/jit/gentree.h
@@ -814,6 +814,8 @@ struct GenTree
#define GTF_CALL_POP_ARGS 0x04000000 // GT_CALL -- caller pop arguments?
#define GTF_CALL_HOISTABLE 0x02000000 // GT_CALL -- call is hoistable
+#define GTF_MEMORYBARRIER_LOAD 0x40000000 // GT_MEMORYBARRIER -- Load barrier
+
#define GTF_NOP_DEATH 0x40000000 // GT_NOP -- operand dies here
#define GTF_FLD_VOLATILE 0x40000000 // GT_FIELD/GT_CLS_VAR -- same as GTF_IND_VOLATILE
diff --git a/src/coreclr/src/jit/importer.cpp b/src/coreclr/src/jit/importer.cpp
index 526d63d953cfc5..99d51600991c2e 100644
--- a/src/coreclr/src/jit/importer.cpp
+++ b/src/coreclr/src/jit/importer.cpp
@@ -3627,11 +3627,20 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis,
#endif // defined(TARGET_XARCH) || defined(TARGET_ARM64)
case CORINFO_INTRINSIC_MemoryBarrier:
+ case CORINFO_INTRINSIC_MemoryBarrierLoad:
assert(sig->numArgs == 0);
op1 = new (this, GT_MEMORYBARRIER) GenTree(GT_MEMORYBARRIER, TYP_VOID);
op1->gtFlags |= GTF_GLOB_REF | GTF_ASG;
+
+ // On XARCH `CORINFO_INTRINSIC_MemoryBarrierLoad` fences need not be emitted.
+ // However, we still need to capture the effect on reordering.
+ if (intrinsicID == CORINFO_INTRINSIC_MemoryBarrierLoad)
+ {
+ op1->gtFlags |= GTF_MEMORYBARRIER_LOAD;
+ }
+
retNode = op1;
break;
diff --git a/src/coreclr/src/jit/instr.cpp b/src/coreclr/src/jit/instr.cpp
index 2ee631fceeff82..969efe5a4573d1 100644
--- a/src/coreclr/src/jit/instr.cpp
+++ b/src/coreclr/src/jit/instr.cpp
@@ -2364,11 +2364,7 @@ void CodeGen::instGen_Return(unsigned stkArgSize)
* Note: all MemoryBarriers instructions can be removed by
* SET COMPlus_JitNoMemoryBarriers=1
*/
-#ifdef TARGET_ARM64
-void CodeGen::instGen_MemoryBarrier(insBarrier barrierType)
-#else
-void CodeGen::instGen_MemoryBarrier()
-#endif
+void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind)
{
#ifdef DEBUG
if (JitConfig.JitNoMemoryBarriers() == 1)
@@ -2378,12 +2374,19 @@ void CodeGen::instGen_MemoryBarrier()
#endif // DEBUG
#if defined(TARGET_XARCH)
+ // only full barrier needs to be emitted on Xarch
+ if (barrierKind != BARRIER_FULL)
+ {
+ return;
+ }
+
instGen(INS_lock);
GetEmitter()->emitIns_I_AR(INS_or, EA_4BYTE, 0, REG_SPBASE, 0);
#elif defined(TARGET_ARM)
+ // ARM has only full barriers, so all barriers need to be emitted as full.
GetEmitter()->emitIns_I(INS_dmb, EA_4BYTE, 0xf);
#elif defined(TARGET_ARM64)
- GetEmitter()->emitIns_BARR(INS_dmb, barrierType);
+ GetEmitter()->emitIns_BARR(INS_dmb, barrierKind == BARRIER_LOAD_ONLY ? INS_BARRIER_ISHLD : INS_BARRIER_ISH);
#else
#error "Unknown TARGET"
#endif
diff --git a/src/coreclr/src/tools/Common/JitInterface/CorInfoImpl.Intrinsics.cs b/src/coreclr/src/tools/Common/JitInterface/CorInfoImpl.Intrinsics.cs
index cce06a7b7b83fa..a0538e9063fdf5 100644
--- a/src/coreclr/src/tools/Common/JitInterface/CorInfoImpl.Intrinsics.cs
+++ b/src/coreclr/src/tools/Common/JitInterface/CorInfoImpl.Intrinsics.cs
@@ -152,6 +152,7 @@ static IntrinsicHashtable InitializeIntrinsicHashtable()
table.Add(CorInfoIntrinsics.CORINFO_INTRINSIC_InterlockedCmpXchg32, "CompareExchange", "System.Threading", "Interlocked");
// table.Add(CorInfoIntrinsics.CORINFO_INTRINSIC_InterlockedCmpXchg64, "CompareExchange", "System.Threading", "Interlocked"); // ambiguous match
table.Add(CorInfoIntrinsics.CORINFO_INTRINSIC_MemoryBarrier, "MemoryBarrier", "System.Threading", "Interlocked");
+ table.Add(CorInfoIntrinsics.CORINFO_INTRINSIC_MemoryBarrierLoad, "ReadMemoryBarrier", "System.Threading", "Interlocked");
// table.Add(CorInfoIntrinsics.CORINFO_INTRINSIC_GetCurrentManagedThread, "GetCurrentThreadNative", "System", "Thread"); // not in .NET Core
// table.Add(CorInfoIntrinsics.CORINFO_INTRINSIC_GetManagedThreadId, "get_ManagedThreadId", "System", "Thread"); // not in .NET Core
table.Add(CorInfoIntrinsics.CORINFO_INTRINSIC_ByReference_Ctor, ".ctor", "System", "ByReference`1");
diff --git a/src/coreclr/src/tools/Common/JitInterface/CorInfoTypes.cs b/src/coreclr/src/tools/Common/JitInterface/CorInfoTypes.cs
index 1c13a74c864525..990c2713de72c9 100644
--- a/src/coreclr/src/tools/Common/JitInterface/CorInfoTypes.cs
+++ b/src/coreclr/src/tools/Common/JitInterface/CorInfoTypes.cs
@@ -463,6 +463,7 @@ public enum CorInfoIntrinsics
CORINFO_INTRINSIC_InterlockedCmpXchg32,
CORINFO_INTRINSIC_InterlockedCmpXchg64,
CORINFO_INTRINSIC_MemoryBarrier,
+ CORINFO_INTRINSIC_MemoryBarrierLoad,
CORINFO_INTRINSIC_GetCurrentManagedThread,
CORINFO_INTRINSIC_GetManagedThreadId,
CORINFO_INTRINSIC_ByReference_Ctor,
diff --git a/src/coreclr/src/vm/castcache.cpp b/src/coreclr/src/vm/castcache.cpp
index b4fb3a81832f26..9fb5f7a61c7e76 100644
--- a/src/coreclr/src/vm/castcache.cpp
+++ b/src/coreclr/src/vm/castcache.cpp
@@ -160,7 +160,7 @@ TypeHandle::CastResult CastCache::TryGet(TADDR source, TADDR target)
{
CastCacheEntry* pEntry = &Elements(tableData)[index];
- // must read in this order: version -> entry parts -> version
+ // must read in this order: version -> [entry parts] -> version
// if version is odd or changes, the entry is inconsistent and thus ignored
DWORD version1 = VolatileLoad(&pEntry->version);
TADDR entrySource = pEntry->source;
@@ -171,12 +171,14 @@ TypeHandle::CastResult CastCache::TryGet(TADDR source, TADDR target)
if (entrySource == source)
{
- TADDR entryTargetAndResult = VolatileLoad(&pEntry->targetAndResult);
+ TADDR entryTargetAndResult = pEntry->targetAndResult;
// target never has its lower bit set.
// a matching entryTargetAndResult would have the same bits, except for the lowest one, which is the result.
entryTargetAndResult ^= target;
if (entryTargetAndResult <= 1)
{
+ // make sure 'version' is loaded after 'source' and 'targetAndResult'
+ VolatileLoadBarrier();
if (version1 != pEntry->version)
{
// oh, so close, the entry is in inconsistent state.
diff --git a/src/coreclr/src/vm/comutilnative.cpp b/src/coreclr/src/vm/comutilnative.cpp
index 1f0e257a548865..0205d706913967 100644
--- a/src/coreclr/src/vm/comutilnative.cpp
+++ b/src/coreclr/src/vm/comutilnative.cpp
@@ -1799,6 +1799,15 @@ FCIMPL0(void, COMInterlocked::FCMemoryBarrier)
}
FCIMPLEND
+FCIMPL0(void, COMInterlocked::FCMemoryBarrierLoad)
+{
+ FCALL_CONTRACT;
+
+ VolatileLoadBarrier();
+ FC_GC_POLL();
+}
+FCIMPLEND
+
#include <optdefault.h>
void QCALLTYPE COMInterlocked::MemoryBarrierProcessWide()
diff --git a/src/coreclr/src/vm/comutilnative.h b/src/coreclr/src/vm/comutilnative.h
index 4493ec271326f1..fa3f0fdcb2adaf 100644
--- a/src/coreclr/src/vm/comutilnative.h
+++ b/src/coreclr/src/vm/comutilnative.h
@@ -184,6 +184,7 @@ class COMInterlocked
static FCDECL2_IV(INT64, ExchangeAdd64, INT64 *location, INT64 value);
static FCDECL0(void, FCMemoryBarrier);
+ static FCDECL0(void, FCMemoryBarrierLoad);
static void QCALLTYPE MemoryBarrierProcessWide();
};
diff --git a/src/coreclr/src/vm/ecalllist.h b/src/coreclr/src/vm/ecalllist.h
index 661cfad2907e4e..89d6bd59b07947 100644
--- a/src/coreclr/src/vm/ecalllist.h
+++ b/src/coreclr/src/vm/ecalllist.h
@@ -859,6 +859,7 @@ FCFuncStart(gInterlockedFuncs)
FCIntrinsicSig("ExchangeAdd", &gsig_SM_RefLong_Long_RetLong, COMInterlocked::ExchangeAdd64, CORINFO_INTRINSIC_InterlockedXAdd64)
FCIntrinsic("MemoryBarrier", COMInterlocked::FCMemoryBarrier, CORINFO_INTRINSIC_MemoryBarrier)
+ FCIntrinsic("ReadMemoryBarrier", COMInterlocked::FCMemoryBarrierLoad, CORINFO_INTRINSIC_MemoryBarrierLoad)
QCFuncElement("_MemoryBarrierProcessWide", COMInterlocked::MemoryBarrierProcessWide)
FCFuncEnd()