@@ -7833,26 +7833,34 @@ static bool GetStoreCoalescingData(Compiler* comp, GenTreeStoreInd* ind, StoreCo
     }
 
     // Data has to be INT_CNS, can be also VEC_CNS in future.
-    if (!ind->Data()->IsCnsIntOrI())
+    if (!ind->Data()->IsCnsIntOrI() && !ind->Data()->IsVectorConst())
     {
         return false;
     }
 
+    auto isNodeInvariant = [](Compiler* comp, GenTree* node, bool allowNull) {
+        if (node == nullptr)
+        {
+            return allowNull;
+        }
+        // We can allow bigger trees here, but it's not clear if it's worth it.
+        return node->OperIs(GT_LCL_VAR) && !comp->lvaVarAddrExposed(node->AsLclVar()->GetLclNum());
+    };
+
     data->targetType = ind->TypeGet();
     data->value      = ind->Data();
     if (ind->Addr()->OperIs(GT_LEA))
     {
         GenTree* base  = ind->Addr()->AsAddrMode()->Base();
         GenTree* index = ind->Addr()->AsAddrMode()->Index();
-        if ((base == nullptr) || !base->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(base->AsLclVar()->GetLclNum()))
+        if (!isNodeInvariant(comp, base, false))
         {
             // Base must be a local. It's possible for it to be nullptr when index is not null,
             // but let's ignore such cases.
             return false;
         }
 
-        if ((index != nullptr) &&
-            (!index->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(index->AsLclVar()->GetLclNum())))
+        if (!isNodeInvariant(comp, index, true))
         {
             // Index should be either nullptr or a local.
             return false;
@@ -7863,7 +7871,7 @@ static bool GetStoreCoalescingData(Compiler* comp, GenTreeStoreInd* ind, StoreCo
         data->scale    = ind->Addr()->AsAddrMode()->GetScale();
         data->offset   = ind->Addr()->AsAddrMode()->Offset();
     }
-    else if (ind->Addr()->OperIs(GT_LCL_VAR) && !comp->lvaVarAddrExposed(ind->Addr()->AsLclVar()->GetLclNum()))
+    else if (isNodeInvariant(comp, ind->Addr(), true))
     {
         // Address is just a local, no offset, scale is 1
         data->baseAddr = ind->Addr();
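
Taken together, the two hunks above restrict coalescing to addresses whose value cannot change between the two stores: either a LEA whose base (and optional index) are non-address-exposed locals, or a bare local. Below is a minimal standalone model of that invariance rule; the `Node` struct and `IsNodeInvariant` helper are simplified stand-ins for illustration, not the JIT's real `Compiler`/`GenTree` types.

```cpp
#include <cassert>

// Simplified stand-in for a GenTree operand; only the properties the rule cares about.
struct Node
{
    bool isLclVar;    // node is a GT_LCL_VAR
    bool addrExposed; // the local's address escapes, so memory may alias it
};

// Mirrors the isNodeInvariant lambda: a missing operand is fine only when allowed,
// otherwise the operand must be a local whose address is not exposed.
static bool IsNodeInvariant(const Node* node, bool allowNull)
{
    if (node == nullptr)
    {
        return allowNull;
    }
    return node->isLclVar && !node->addrExposed;
}

int main()
{
    Node plainLocal{true, false};
    Node exposedLocal{true, true};

    assert(IsNodeInvariant(&plainLocal, false));    // usable as a base
    assert(!IsNodeInvariant(&exposedLocal, false)); // could be mutated between the two stores
    assert(IsNodeInvariant(nullptr, true));         // index may be absent
    assert(!IsNodeInvariant(nullptr, false));       // a base may not
    return 0;
}
```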
@@ -7919,6 +7927,15 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
         return;
     }
 
+    // TODO-ARM64-CQ: enable TYP_REF if we find a case where it's beneficial.
+    // The algorithm does support TYP_REF (with null value), but it seems to be not worth
+    // it on ARM64 where it's pretty efficient to do "stp xzr, xzr, [addr]" to clear two
+    // items at once. Although, it may be profitable to do "stp q0, q0, [addr]".
+    if (!varTypeIsIntegral(ind) && !varTypeIsSIMD(ind))
+    {
+        return;
+    }
+
     // We're going to do it in a loop while we see suitable STOREINDs to coalesce.
     // E.g.: we have the following LIR sequence:
     //
@@ -7933,12 +7950,6 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
     // to get a single store of 8 bytes.
     do
     {
-        // This check is not really needed, just for better throughput.
-        if (!ind->TypeIs(TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT))
-        {
-            return;
-        }
-
         StoreCoalescingData currData;
         StoreCoalescingData prevData;
 
@@ -8002,6 +8013,57 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
             return;
         }
 
+        // Now the hardest part: decide whether it's safe to use an unaligned write.
+        //
+        // IND<byte> is always fine (and all IND<X> created here from such)
+        // IND<simd> is not required to be atomic per our Memory Model
+        const bool allowsNonAtomic =
+            ((ind->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0) && ((prevInd->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0);
+
+        if (!allowsNonAtomic && (genTypeSize(ind) > 1) && !varTypeIsSIMD(ind))
+        {
+            // TODO-CQ: if we see that the target is a local memory (non address exposed)
+            // we can use any type (including SIMD) for a new load.
+
+            // Ignore indices for now, they can invalidate our alignment assumptions.
+            // Although, we can take scale into account.
+            if (currData.index != nullptr)
+            {
+                return;
+            }
+
+            // Base address being TYP_REF gives us a hint that data is pointer-aligned.
+            if (!currData.baseAddr->TypeIs(TYP_REF))
+            {
+                return;
+            }
+
+            // Check whether the combined indir is still aligned.
+            bool isCombinedIndirAtomic = (genTypeSize(ind) < TARGET_POINTER_SIZE) &&
+                                         (min(prevData.offset, currData.offset) % (genTypeSize(ind) * 2)) == 0;
+
+            if (genTypeSize(ind) == TARGET_POINTER_SIZE)
+            {
+#ifdef TARGET_ARM64
+                // Per Arm Architecture Reference Manual for A-profile architecture:
+                //
+                // * Writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory
+                //   are treated as a pair of single-copy atomic 64-bit writes.
+                //
+                // Thus, we can allow 2xLONG -> SIMD, same for TYP_REF (for value being null)
+                //
+                // And we assume on ARM64 TYP_LONG/TYP_REF are always 64-bit aligned, otherwise
+                // we're already doing a load that has no atomicity guarantees.
+                isCombinedIndirAtomic = true;
+#endif
+            }
+
+            if (!isCombinedIndirAtomic)
+            {
+                return;
+            }
+        }
+
         // Since we're merging two stores of the same type, the new type is twice wider.
         var_types oldType = ind->TypeGet();
         var_types newType;
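
The alignment check added above reads as: the combined store must still be smaller than a pointer and must start at an address that is a multiple of its own (doubled) size; on ARM64 a pointer-sized pair is additionally accepted because a 64-bit-aligned 128-bit SIMD store is a pair of single-copy atomic 64-bit writes. Here is a standalone sketch of that predicate with made-up offsets, assuming a 64-bit target (so `TARGET_POINTER_SIZE` is 8); it only illustrates the arithmetic, it is not the JIT code itself.

```cpp
#include <algorithm>
#include <cassert>

constexpr unsigned kPointerSize = 8; // stand-in for TARGET_POINTER_SIZE on a 64-bit target

// True when merging two adjacent stores of `storeSize` bytes at `prevOffset` and
// `currOffset` keeps the combined store naturally aligned (and hence atomic):
// it must stay below pointer size and start on a (storeSize * 2) boundary.
static bool CombinedIndirIsAtomic(unsigned storeSize, unsigned prevOffset, unsigned currOffset)
{
    return (storeSize < kPointerSize) &&
           (std::min(prevOffset, currOffset) % (storeSize * 2)) == 0;
}

int main()
{
    // Two INT stores at +8 and +12 off an object: the combined LONG store at +8 is 8-aligned.
    assert(CombinedIndirIsAtomic(4, 8, 12));

    // Two INT stores at +12 and +16: the combined store would start at +12, not 8-aligned.
    assert(!CombinedIndirIsAtomic(4, 12, 16));

    // Two SHORT stores at +10 and +12: the combined INT store at +10 is not 4-aligned.
    assert(!CombinedIndirIsAtomic(2, 10, 12));
    return 0;
}
```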
@@ -8014,32 +8076,80 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
 
             case TYP_SHORT:
             case TYP_USHORT:
-                newType = TYP_INT; // TYP_UINT is not legal in IR
+                newType = TYP_INT;
                 break;
 
 #ifdef TARGET_64BIT
             case TYP_INT:
                 newType = TYP_LONG;
                 break;
+
+#if defined(FEATURE_HW_INTRINSICS)
+            case TYP_LONG:
+            case TYP_REF:
+                if (comp->IsBaselineSimdIsaSupported())
+                {
+                    // TLDR: we should be here only if one of the conditions is true:
+                    // 1) Both GT_INDs have GTF_IND_ALLOW_NON_ATOMIC flag
+                    // 2) ARM64: Data is at least 8-byte aligned
+                    // 3) AMD64: Data is at least 16-byte aligned on AMD/Intel with AVX+
+                    //
+                    newType = TYP_SIMD16;
+                    if ((oldType == TYP_REF) &&
+                        (!currData.value->IsIntegralConst(0) || !prevData.value->IsIntegralConst(0)))
+                    {
+                        // For TYP_REF we only support null values. In theory, we can also support frozen handles, e.g.:
+                        //
+                        //   arr[1] = "hello";
+                        //   arr[0] = "world";
+                        //
+                        // but we don't want to load managed references into SIMD registers (we can only do so
+                        // when we can issue a nongc region for a block)
+                        return;
+                    }
+                    break;
+                }
+                return;
+
+#if defined(TARGET_AMD64)
+            case TYP_SIMD16:
+                if (comp->getPreferredVectorByteLength() >= 32)
+                {
+                    newType = TYP_SIMD32;
+                    break;
+                }
+                return;
+
+            case TYP_SIMD32:
+                if (comp->getPreferredVectorByteLength() >= 64)
+                {
+                    newType = TYP_SIMD64;
+                    break;
+                }
+                return;
+#endif // TARGET_AMD64
+#endif // FEATURE_HW_INTRINSICS
 #endif // TARGET_64BIT
 
             // TYP_FLOAT and TYP_DOUBLE aren't needed here - they're expected to
             // be converted to TYP_INT/TYP_LONG for constant value.
             //
-            // TODO-CQ:
-            //  2 x LONG/REF  -> SIMD16
-            //  2 x SIMD16    -> SIMD32
-            //  2 x SIMD32    -> SIMD64
-            //
-            // where it's legal (e.g. SIMD is not atomic on x64)
+            // TYP_UINT and TYP_ULONG are not legal for GT_IND.
             //
             default:
                 return;
         }
 
+        // We should not be here for stores requiring write barriers.
+        assert(!comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind));
+        assert(!comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(prevInd));
+
         // Delete previous STOREIND entirely
         BlockRange().Remove(std::move(prevIndRange));
 
+        // It's not expected to be contained yet, but just in case...
+        ind->Data()->ClearContained();
+
         // We know it's always LEA for now
         GenTreeAddrMode* addr = ind->Addr()->AsAddrMode();
 
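
The widened switch above forms a ladder: each coalescing round doubles the store width, from byte pairs all the way up to SIMD64 where the target allows it. The sketch below models only the size progression, under the assumption that every ISA, atomicity, and GC-ref gate passes; the real code bails out of any step that fails those checks.

```cpp
#include <cstdio>

// Returns the size in bytes of the coalesced store for a given element size,
// or 0 when no wider legal type exists.
static unsigned WidenedStoreSize(unsigned oldSize)
{
    switch (oldSize)
    {
        case 1:  return 2;  // byte     -> short
        case 2:  return 4;  // short    -> int
        case 4:  return 8;  // int      -> long (64-bit targets)
        case 8:  return 16; // long/ref -> SIMD16
        case 16: return 32; // SIMD16   -> SIMD32 (x64 with suitable vector length)
        case 32: return 64; // SIMD32   -> SIMD64 (x64 with suitable vector length)
        default: return 0;
    }
}

int main()
{
    // Starting from byte-sized stores, repeated rounds double the width:
    // 1 2 4 8 16 32 64
    for (unsigned size = 1; size != 0; size = WidenedStoreSize(size))
    {
        printf("%u ", size);
    }
    printf("\n");
    return 0;
}
```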
@@ -8050,8 +8160,29 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
         ind->gtType         = newType;
         ind->Data()->gtType = newType;
 
-        // We currently only support these constants for val
-        assert(prevData.value->IsCnsIntOrI() && currData.value->IsCnsIntOrI());
+#if defined(TARGET_AMD64) && defined(FEATURE_HW_INTRINSICS)
+        // Upgrading two SIMD stores to a wider SIMD store.
+        // Only on x64 since ARM64 has no options above SIMD16
+        if (varTypeIsSIMD(oldType))
+        {
+            int8_t* lowerCns = prevData.value->AsVecCon()->gtSimdVal.i8;
+            int8_t* upperCns = currData.value->AsVecCon()->gtSimdVal.i8;
+
+            // if the previous store was at a higher address, swap the constants
+            if (prevData.offset > currData.offset)
+            {
+                std::swap(lowerCns, upperCns);
+            }
+
+            simd_t   newCns   = {};
+            uint32_t oldWidth = genTypeSize(oldType);
+            memcpy(newCns.i8, lowerCns, oldWidth);
+            memcpy(newCns.i8 + oldWidth, upperCns, oldWidth);
+
+            ind->Data()->AsVecCon()->gtSimdVal = newCns;
+            continue;
+        }
+#endif
 
         size_t lowerCns = (size_t)prevData.value->AsIntCon()->IconValue();
         size_t upperCns = (size_t)currData.value->AsIntCon()->IconValue();
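
The memcpy-based merge above simply concatenates the two vector constants, with the constant from the lower address filling the low half. Below is a standalone illustration using a local `simd_t` stand-in (not the JIT's type) and made-up byte values.

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// Local stand-in for the JIT's simd value type; large enough for a SIMD32 result.
struct simd_t
{
    int8_t i8[64];
};

int main()
{
    simd_t lowerVal = {}; // constant from the store at the lower offset
    simd_t upperVal = {}; // constant from the store 16 bytes above it
    for (int i = 0; i < 16; i++)
    {
        lowerVal.i8[i] = static_cast<int8_t>(i);        // 0..15
        upperVal.i8[i] = static_cast<int8_t>(0x40 + i); // 0x40..0x4F
    }

    // Merge two SIMD16 constants into one SIMD32 constant, low half first.
    const uint32_t oldWidth = 16;
    simd_t         newCns   = {};
    memcpy(newCns.i8, lowerVal.i8, oldWidth);
    memcpy(newCns.i8 + oldWidth, upperVal.i8, oldWidth);

    assert(newCns.i8[0] == 0 && newCns.i8[15] == 15);
    assert(newCns.i8[16] == 0x40 && newCns.i8[31] == 0x4F);
    return 0;
}
```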
@@ -8062,6 +8193,24 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
             std::swap(lowerCns, upperCns);
         }
 
+#if defined(TARGET_64BIT) && defined(FEATURE_HW_INTRINSICS)
+        // We're promoting two TYP_LONG/TYP_REF into TYP_SIMD16
+        // All legality checks were done above.
+        if (varTypeIsSIMD(newType))
+        {
+            // Replace two 64bit constants with a single 128bit constant
+            int8_t val[16];
+            memcpy(val, &lowerCns, 8);
+            memcpy(val + 8, &upperCns, 8);
+            GenTreeVecCon* vecCns = comp->gtNewVconNode(newType, &val);
+
+            BlockRange().InsertAfter(ind->Data(), vecCns);
+            BlockRange().Remove(ind->Data());
+            ind->gtOp2 = vecCns;
+            continue;
+        }
+#endif // TARGET_64BIT && FEATURE_HW_INTRINSICS
+
         // Trim the constants to the size of the type, e.g. for TYP_SHORT and TYP_USHORT
         // the mask will be 0xFFFF, for TYP_INT - 0xFFFFFFFF.
         size_t mask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(oldType)) * BITS_IN_BYTE;
@@ -8071,10 +8220,12 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
         size_t val = (lowerCns | (upperCns << (genTypeSize(oldType) * BITS_IN_BYTE)));
         JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val);
 
-        // It's not expected to be contained yet, but just in case...
-        ind->Data()->ClearContained();
         ind->Data()->AsIntCon()->gtIconVal = (ssize_t)val;
-        ind->gtFlags |= GTF_IND_UNALIGNED;
+        if (genTypeSize(oldType) == 1)
+        {
+            // A mark for future foldings that this IND doesn't need to be atomic.
+            ind->gtFlags |= GTF_IND_ALLOW_NON_ATOMIC;
+        }
 
     } while (true);
 #endif // TARGET_XARCH || TARGET_ARM64
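
After the SIMD paths are handled, the scalar path packs the two narrow constants into one wider immediate: each constant is masked to the old type's width and the constant from the higher address is shifted into the upper bits (little-endian layout). A worked example of that arithmetic follows, with illustrative values rather than ones taken from the diff.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

int main()
{
    const size_t BITS_IN_BYTE = 8;
    const size_t oldTypeSize  = 2; // merging two TYP_USHORT stores into one TYP_INT store

    size_t lowerCns = 0x1234; // constant stored at the lower offset
    size_t upperCns = 0x5678; // constant stored oldTypeSize bytes above it

    // Same trimming mask as in the lowering code: 0xFFFF for 2-byte types.
    size_t mask = ~size_t(0) >> (sizeof(size_t) - oldTypeSize) * BITS_IN_BYTE;
    lowerCns &= mask;
    upperCns &= mask;

    // The higher-address constant lands in the upper half of the combined value.
    size_t val = lowerCns | (upperCns << (oldTypeSize * BITS_IN_BYTE));
    assert(val == 0x56781234);
    return 0;
}
```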