@@ -7802,6 +7802,285 @@ void Lowering::ContainCheckBitCast(GenTree* node)
     }
 }
 
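+// Data used to describe a single STOREIND candidate for store coalescing
+// (see GetStoreCoalescingData and LowerStoreIndirCoalescing below).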
+struct StoreCoalescingData
+{
+    var_types targetType;
+    GenTree*  baseAddr;
+    GenTree*  index;
+    GenTree*  value;
+    uint32_t  scale;
+    int       offset;
+};
+
+//------------------------------------------------------------------------
+// GetStoreCoalescingData: given a STOREIND node, get the data needed to perform
+//    store coalescing including pointer to the previous node.
+//
+// Arguments:
+//    comp - the compiler instance
+//    ind  - the STOREIND node
+//    data - [OUT] the data needed for store coalescing
+//
+// Return Value:
+//    true if the data was successfully retrieved, false otherwise.
+//    Basically, false means that we definitely can't do store coalescing.
+//
+static bool GetStoreCoalescingData(Compiler* comp, GenTreeStoreInd* ind, StoreCoalescingData* data)
+{
+    // Don't merge volatile stores.
+    if (ind->IsVolatile())
+    {
+        return false;
+    }
+
+    // Data has to be INT_CNS; it could also allow VEC_CNS in the future.
+    if (!ind->Data()->IsCnsIntOrI())
+    {
+        return false;
+    }
+
+    data->targetType = ind->TypeGet();
+    data->value      = ind->Data();
+    if (ind->Addr()->OperIs(GT_LEA))
+    {
+        GenTree* base  = ind->Addr()->AsAddrMode()->Base();
+        GenTree* index = ind->Addr()->AsAddrMode()->Index();
+        if ((base == nullptr) || !base->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(base->AsLclVar()->GetLclNum()))
+        {
+            // Base must be a local. It's possible for it to be nullptr when the index is not null,
+            // but let's ignore such cases.
+            return false;
+        }
+
+        if ((index != nullptr) &&
+            (!index->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(index->AsLclVar()->GetLclNum())))
+        {
+            // Index should be either nullptr or a local.
+            return false;
+        }
+
+        data->baseAddr = base == nullptr ? nullptr : base;
+        data->index    = index == nullptr ? nullptr : index;
+        data->scale    = ind->Addr()->AsAddrMode()->GetScale();
+        data->offset   = ind->Addr()->AsAddrMode()->Offset();
+    }
+    else if (ind->Addr()->OperIs(GT_LCL_VAR) && !comp->lvaVarAddrExposed(ind->Addr()->AsLclVar()->GetLclNum()))
+    {
+        // Address is just a local with no offset, scale is 1.
+        data->baseAddr = ind->Addr();
+        data->index    = nullptr;
+        data->scale    = 1;
+        data->offset   = 0;
+    }
+    else
+    {
+        // Address is neither an LEA nor a local.
+        return false;
+    }
+    return true;
+}
+
+//------------------------------------------------------------------------
+// LowerStoreIndirCoalescing: If the given STOREIND node is followed by a similar
+//    STOREIND node, try to merge them into a single store of a twice wider type. Example:
+//
+//    *  STOREIND  int
+//    +--*  LCL_VAR  byref  V00
+//    \--*  CNS_INT  int    0x1
+//
+//    *  STOREIND  int
+//    +--*  LEA(b+4)  byref
+//    |  \--*  LCL_VAR  byref  V00
+//    \--*  CNS_INT  int    0x2
+//
+//    We can merge these two into a single store of 8 bytes with (0x1 | (0x2 << 32)) as the value:
+//
+//    *  STOREIND  long
+//    +--*  LEA(b+0)  byref
+//    |  \--*  LCL_VAR  byref  V00
+//    \--*  CNS_INT  long  0x200000001
+//
+// Arguments:
+//    ind - the current STOREIND node
+//
+void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
+{
+// LA, RISC-V and ARM32 are more likely to receive a terrible performance hit from
+// unaligned accesses, making this optimization questionable.
+#if defined(TARGET_XARCH) || defined(TARGET_ARM64)
+    if (!comp->opts.OptimizationEnabled())
+    {
+        return;
+    }
+
+    // For now, we require the current STOREIND to have an LEA address (the previous store may not have one)
+    // so we can easily adjust the offset; consider making this more flexible in the future.
+    if (!ind->Addr()->OperIs(GT_LEA))
+    {
+        return;
+    }
+
+    // We're going to do it in a loop while we see suitable STOREINDs to coalesce.
+    // E.g.: we have the following LIR sequence:
+    //
+    //   ...addr nodes...
+    //   STOREIND(int)
+    //   ...addr nodes...
+    //   STOREIND(short)
+    //   ...addr nodes...
+    //   STOREIND(short) <-- we're here
+    //
+    // First we merge two 'short' stores, then we merge the result with the 'int' store
+    // to get a single store of 8 bytes.
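+    //
+    // Each iteration widens the current 'ind' in place, so the loop only terminates
+    // via one of the 'return's below once no further coalescing is possible.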
+    do
+    {
+        // This check is not really needed, it's just for better throughput.
+        if (!ind->TypeIs(TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT))
+        {
+            return;
+        }
+
+        StoreCoalescingData currData;
+        StoreCoalescingData prevData;
+
+        // Get coalescing data for the current STOREIND.
+        if (!GetStoreCoalescingData(comp, ind, &currData))
+        {
+            return;
+        }
+
+        bool isClosedRange = false;
+        // Now we need to find the very first LIR node representing the current STOREIND
+        // and make sure that there are no other unexpected nodes in-between.
+        LIR::ReadOnlyRange currIndRange = BlockRange().GetTreeRange(ind, &isClosedRange);
+        if (!isClosedRange)
+        {
+            return;
+        }
+        GenTree* prevTree = currIndRange.FirstNode()->gtPrev;
+        // Now we need to find the previous STOREIND; we can ignore any NOPs or IL_OFFSETs in-between.
+        while ((prevTree != nullptr) && prevTree->OperIs(GT_NOP, GT_IL_OFFSET))
+        {
+            prevTree = prevTree->gtPrev;
+        }
+
+        // If it's not a STOREIND - bail out.
+        if ((prevTree == nullptr) || !prevTree->OperIs(GT_STOREIND))
+        {
+            return;
+        }
+
+        // Get coalescing data for the previous STOREIND.
+        GenTreeStoreInd* prevInd = prevTree->AsStoreInd();
+        if (!GetStoreCoalescingData(comp, prevInd->AsStoreInd(), &prevData))
+        {
+            return;
+        }
+
+        // Same for the previous STOREIND: make sure there are no unexpected nodes around it.
+        LIR::ReadOnlyRange prevIndRange = BlockRange().GetTreeRange(prevInd, &isClosedRange);
+        if (!isClosedRange)
+        {
+            return;
+        }
+
+        // STOREINDs aren't value nodes.
+        LIR::Use use;
+        assert(!BlockRange().TryGetUse(prevInd, &use) && !BlockRange().TryGetUse(ind, &use));
+
+        // BaseAddr, Index, Scale and Type all have to match.
+        if ((prevData.scale != currData.scale) || (prevData.targetType != currData.targetType) ||
+            !GenTree::Compare(prevData.baseAddr, currData.baseAddr) ||
+            !GenTree::Compare(prevData.index, currData.index))
+        {
+            return;
+        }
+
+        // The distance between the two offsets has to match the size of the type,
+        // i.e. the stores must be adjacent; we don't support the same or overlapping offsets.
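+        // E.g. two TYP_INT stores are expected to be exactly 4 bytes apart.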
+        if (abs(prevData.offset - currData.offset) != (int)genTypeSize(prevData.targetType))
+        {
+            return;
+        }
+
+        // Since we're merging two stores of the same type, the new type is twice as wide.
+        var_types oldType = ind->TypeGet();
+        var_types newType;
+        switch (oldType)
+        {
+            case TYP_BYTE:
+            case TYP_UBYTE:
+                newType = TYP_USHORT;
+                break;
+
+            case TYP_SHORT:
+            case TYP_USHORT:
+                newType = TYP_INT; // TYP_UINT is not legal in IR
+                break;
+
+#ifdef TARGET_64BIT
+            case TYP_INT:
+                newType = TYP_LONG;
+                break;
+#endif // TARGET_64BIT
+
+            // TYP_FLOAT and TYP_DOUBLE aren't needed here - their constant values are expected
+            // to be already converted to TYP_INT/TYP_LONG.
+            //
+            // TODO-CQ:
+            //   2 x LONG/REF -> SIMD16
+            //   2 x SIMD16   -> SIMD32
+            //   2 x SIMD32   -> SIMD64
+            //
+            // where it's legal (e.g. SIMD is not atomic on x64)
+            //
+            default:
+                return;
+        }
+
+        // Delete the previous STOREIND entirely.
+        BlockRange().Remove(std::move(prevIndRange));
+
+        // We know it's always LEA for now.
+        GenTreeAddrMode* addr = ind->Addr()->AsAddrMode();
+
+        // Update the offset to be the minimum of the two.
+        addr->SetOffset(min(prevData.offset, currData.offset));
+
+        // Update the type for both the STOREIND and its value.
+        ind->gtType         = newType;
+        ind->Data()->gtType = newType;
+
+        // We currently only support integer constants as values.
+        assert(prevData.value->IsCnsIntOrI() && currData.value->IsCnsIntOrI());
+
+        size_t lowerCns = (size_t)prevData.value->AsIntCon()->IconValue();
+        size_t upperCns = (size_t)currData.value->AsIntCon()->IconValue();
+
+        // If the previous store was at the higher address, swap the constants.
+        if (prevData.offset > currData.offset)
+        {
+            std::swap(lowerCns, upperCns);
+        }
+
+        // Trim the constants to the size of the type, e.g. for TYP_SHORT and TYP_USHORT
+        // the mask will be 0xFFFF, for TYP_INT - 0xFFFFFFFF.
+        size_t mask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(oldType)) * BITS_IN_BYTE;
+        lowerCns &= mask;
+        upperCns &= mask;
+
+        size_t val = (lowerCns | (upperCns << (genTypeSize(oldType) * BITS_IN_BYTE)));
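+        // For instance, for the TYP_INT example in the header comment:
+        // lowerCns = 0x1, upperCns = 0x2 -> val = 0x1 | (0x2 << 32) = 0x200000001.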
+        JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val);
+
+        // The value is not expected to be contained yet, but just in case...
+        ind->Data()->ClearContained();
+        ind->Data()->AsIntCon()->gtIconVal = (ssize_t)val;
+        ind->gtFlags |= GTF_IND_UNALIGNED;
+
+    } while (true);
+#endif // TARGET_XARCH || TARGET_ARM64
+}
+
 //------------------------------------------------------------------------
 // LowerStoreIndirCommon: a common logic to lower StoreIndir.
 //
@@ -7842,6 +8121,7 @@ void Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind)
         }
 #endif
 
+        LowerStoreIndirCoalescing(ind);
         LowerStoreIndir(ind);
     }
 }