
Commit 6b25c3b

EgorBo, Egor and jakobbotsch authored
JIT: Merge consecutive stores (#92852)
Co-authored-by: Egor <[email protected]>
Co-authored-by: Jakob Botsch Nielsen <[email protected]>
1 parent 7abea9e commit 6b25c3b
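
For orientation (not part of the commit): the change merges two adjacent narrow stores of constants into a single store of twice the width. Below is a minimal standalone C++ sketch of why that rewrite is sound, assuming a little-endian target; the function names are illustrative, not from the JIT.

    #include <cstdint>
    #include <cstring>
    #include <cassert>

    // Two adjacent 4-byte stores of constants...
    void storeTwice(uint8_t* p)
    {
        uint32_t a = 0x1, b = 0x2;
        std::memcpy(p, &a, 4);     // [p+0 .. p+3] = 0x1
        std::memcpy(p + 4, &b, 4); // [p+4 .. p+7] = 0x2
    }

    // ...write the same bytes (on little-endian) as one 8-byte store
    // of the merged constant.
    void storeOnce(uint8_t* p)
    {
        uint64_t merged = 0x1ull | (0x2ull << 32); // 0x200000001
        std::memcpy(p, &merged, 8);
    }

    int main()
    {
        uint8_t buf1[8], buf2[8];
        storeTwice(buf1);
        storeOnce(buf2);
        assert(std::memcmp(buf1, buf2, 8) == 0); // identical memory contents
        return 0;
    }

The diff below performs this rewrite on LIR STOREIND nodes during lowering.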

File tree

2 files changed: +281, -0 lines


src/coreclr/jit/lower.cpp

Lines changed: 280 additions & 0 deletions
@@ -7802,6 +7802,285 @@ void Lowering::ContainCheckBitCast(GenTree* node)
     }
 }
 
+struct StoreCoalescingData
+{
+    var_types targetType;
+    GenTree*  baseAddr;
+    GenTree*  index;
+    GenTree*  value;
+    uint32_t  scale;
+    int       offset;
+};
+
+//------------------------------------------------------------------------
+// GetStoreCoalescingData: given a STOREIND node, get the data needed to perform
+//    store coalescing.
+//
+// Arguments:
+//    comp - the compiler instance
+//    ind  - the STOREIND node
+//    data - [OUT] the data needed for store coalescing
+//
+// Return Value:
+//    true if the data was successfully retrieved, false otherwise.
+//    Basically, false means that we definitely can't do store coalescing.
+//
+static bool GetStoreCoalescingData(Compiler* comp, GenTreeStoreInd* ind, StoreCoalescingData* data)
+{
+    // Don't merge volatile stores.
+    if (ind->IsVolatile())
+    {
+        return false;
+    }
+
+    // Data has to be a CNS_INT; we could also allow CNS_VEC in the future.
+    if (!ind->Data()->IsCnsIntOrI())
+    {
+        return false;
+    }
+
+    data->targetType = ind->TypeGet();
+    data->value      = ind->Data();
+    if (ind->Addr()->OperIs(GT_LEA))
+    {
+        GenTree* base  = ind->Addr()->AsAddrMode()->Base();
+        GenTree* index = ind->Addr()->AsAddrMode()->Index();
+        if ((base == nullptr) || !base->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(base->AsLclVar()->GetLclNum()))
+        {
+            // Base must be a local. It's possible for it to be nullptr when index is not null,
+            // but let's ignore such cases.
+            return false;
+        }
+
+        if ((index != nullptr) &&
+            (!index->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(index->AsLclVar()->GetLclNum())))
+        {
+            // Index should be either nullptr or a local.
+            return false;
+        }
+
+        data->baseAddr = base == nullptr ? nullptr : base;
+        data->index    = index == nullptr ? nullptr : index;
+        data->scale    = ind->Addr()->AsAddrMode()->GetScale();
+        data->offset   = ind->Addr()->AsAddrMode()->Offset();
+    }
+    else if (ind->Addr()->OperIs(GT_LCL_VAR) && !comp->lvaVarAddrExposed(ind->Addr()->AsLclVar()->GetLclNum()))
+    {
+        // Address is just a local: no offset, scale is 1.
+        data->baseAddr = ind->Addr();
+        data->index    = nullptr;
+        data->scale    = 1;
+        data->offset   = 0;
+    }
+    else
+    {
+        // Address is neither a LEA nor a local.
+        return false;
+    }
+    return true;
+}
+
+//------------------------------------------------------------------------
+// LowerStoreIndirCoalescing: If the given STOREIND node is followed by a similar
+//    STOREIND node, try to merge them into a single store of a twice wider type. Example:
+//
+//    *  STOREIND  int
+//    +--*  LCL_VAR  byref  V00
+//    \--*  CNS_INT  int    0x1
+//
+//    *  STOREIND  int
+//    +--*  LEA(b+4)  byref
+//    |  \--*  LCL_VAR  byref  V00
+//    \--*  CNS_INT  int    0x2
+//
+//    We can merge these two into a single store of 8 bytes with (0x1 | (0x2 << 32)) as the value:
+//
+//    *  STOREIND  long
+//    +--*  LEA(b+0)  byref
+//    |  \--*  LCL_VAR  byref  V00
+//    \--*  CNS_INT  long  0x200000001
+//
+// Arguments:
+//    ind - the current STOREIND node
+//
+void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
+{
+// LA, RISC-V and ARM32 are more likely to receive a terrible performance hit from
+// unaligned accesses, making this optimization questionable.
+#if defined(TARGET_XARCH) || defined(TARGET_ARM64)
+    if (!comp->opts.OptimizationEnabled())
+    {
+        return;
+    }
+
+    // For now, we require the current STOREIND to have a LEA (the previous store may not have it)
+    // so we can easily adjust the offset; consider making this more flexible in the future.
+    if (!ind->Addr()->OperIs(GT_LEA))
+    {
+        return;
+    }
+
+    // We're going to do it in a loop while we see suitable STOREINDs to coalesce.
+    // E.g.: we have the following LIR sequence:
+    //
+    //   ...addr nodes...
+    //   STOREIND(int)
+    //   ...addr nodes...
+    //   STOREIND(short)
+    //   ...addr nodes...
+    //   STOREIND(short) <-- we're here
+    //
+    // First we merge the two 'short' stores, then we merge the result with the 'int' store
+    // to get a single store of 8 bytes.
+    do
+    {
+        // This check is not really needed; it's just for better throughput.
+        if (!ind->TypeIs(TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT))
+        {
+            return;
+        }
+
+        StoreCoalescingData currData;
+        StoreCoalescingData prevData;
+
+        // Get coalescing data for the current STOREIND.
+        if (!GetStoreCoalescingData(comp, ind, &currData))
+        {
+            return;
+        }
+
+        bool isClosedRange = false;
+        // Now we need to find the very first LIR node representing the current STOREIND
+        // and make sure that there are no other unexpected nodes in-between.
+        LIR::ReadOnlyRange currIndRange = BlockRange().GetTreeRange(ind, &isClosedRange);
+        if (!isClosedRange)
+        {
+            return;
+        }
+        GenTree* prevTree = currIndRange.FirstNode()->gtPrev;
+        // Now we need to find the previous STOREIND;
+        // we can ignore any NOPs or IL_OFFSETs in-between.
+        while ((prevTree != nullptr) && prevTree->OperIs(GT_NOP, GT_IL_OFFSET))
+        {
+            prevTree = prevTree->gtPrev;
+        }
+
+        // It's not a STOREIND - bail out.
+        if ((prevTree == nullptr) || !prevTree->OperIs(GT_STOREIND))
+        {
+            return;
+        }
+
+        // Get coalescing data for the previous STOREIND.
+        GenTreeStoreInd* prevInd = prevTree->AsStoreInd();
+        if (!GetStoreCoalescingData(comp, prevInd->AsStoreInd(), &prevData))
+        {
+            return;
+        }
+
+        // Same for the previous STOREIND: make sure there are no unexpected nodes around it.
+        LIR::ReadOnlyRange prevIndRange = BlockRange().GetTreeRange(prevInd, &isClosedRange);
+        if (!isClosedRange)
+        {
+            return;
+        }
+
+        // STOREINDs aren't value nodes.
+        LIR::Use use;
+        assert(!BlockRange().TryGetUse(prevInd, &use) && !BlockRange().TryGetUse(ind, &use));
+
+        // BaseAddr, Index, Scale and Type all have to match.
+        if ((prevData.scale != currData.scale) || (prevData.targetType != currData.targetType) ||
+            !GenTree::Compare(prevData.baseAddr, currData.baseAddr) ||
+            !GenTree::Compare(prevData.index, currData.index))
+        {
+            return;
+        }
+
+        // The offsets have to differ by exactly the size of the type. We don't support
+        // the same or overlapping offsets.
+        if (abs(prevData.offset - currData.offset) != (int)genTypeSize(prevData.targetType))
+        {
+            return;
+        }
+
+        // Since we're merging two stores of the same type, the new type is twice wider.
+        var_types oldType = ind->TypeGet();
+        var_types newType;
+        switch (oldType)
+        {
+            case TYP_BYTE:
+            case TYP_UBYTE:
+                newType = TYP_USHORT;
+                break;
+
+            case TYP_SHORT:
+            case TYP_USHORT:
+                newType = TYP_INT; // TYP_UINT is not legal in IR
+                break;
+
+#ifdef TARGET_64BIT
+            case TYP_INT:
+                newType = TYP_LONG;
+                break;
+#endif // TARGET_64BIT
+
+            // TYP_FLOAT and TYP_DOUBLE aren't needed here - they're expected to
+            // be converted to TYP_INT/TYP_LONG for the constant value.
+            //
+            // TODO-CQ:
+            //   2 x LONG/REF -> SIMD16
+            //   2 x SIMD16   -> SIMD32
+            //   2 x SIMD32   -> SIMD64
+            //
+            // where it's legal (e.g. SIMD is not atomic on x64)
+            //
+            default:
+                return;
+        }
+
+        // Delete the previous STOREIND entirely.
+        BlockRange().Remove(std::move(prevIndRange));
+
+        // We know it's always a LEA for now.
+        GenTreeAddrMode* addr = ind->Addr()->AsAddrMode();
+
+        // Update the offset to be the minimum of the two.
+        addr->SetOffset(min(prevData.offset, currData.offset));
+
+        // Update the type for both the STOREIND and its value.
+        ind->gtType         = newType;
+        ind->Data()->gtType = newType;
+
+        // We currently only support these constants for the value.
+        assert(prevData.value->IsCnsIntOrI() && currData.value->IsCnsIntOrI());
+
+        size_t lowerCns = (size_t)prevData.value->AsIntCon()->IconValue();
+        size_t upperCns = (size_t)currData.value->AsIntCon()->IconValue();
+
+        // If the previous store was at a higher address, swap the constants.
+        if (prevData.offset > currData.offset)
+        {
+            std::swap(lowerCns, upperCns);
+        }
+
+        // Trim the constants to the size of the type, e.g. for TYP_SHORT and TYP_USHORT
+        // the mask will be 0xFFFF, for TYP_INT - 0xFFFFFFFF.
+        size_t mask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(oldType)) * BITS_IN_BYTE;
+        lowerCns &= mask;
+        upperCns &= mask;
+
+        size_t val = (lowerCns | (upperCns << (genTypeSize(oldType) * BITS_IN_BYTE)));
+        JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val);
+
+        // It's not expected to be contained yet, but just in case...
+        ind->Data()->ClearContained();
+        ind->Data()->AsIntCon()->gtIconVal = (ssize_t)val;
+        ind->gtFlags |= GTF_IND_UNALIGNED;
+
+    } while (true);
+#endif // TARGET_XARCH || TARGET_ARM64
+}
+
 //------------------------------------------------------------------------
 // LowerStoreIndirCommon: a common logic to lower StoreIndir.
 //
@@ -7842,6 +8121,7 @@ void Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind)
         }
 #endif
 
+        LowerStoreIndirCoalescing(ind);
         LowerStoreIndir(ind);
     }
 }
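
As a side note (not part of the commit), the constant-combining arithmetic near the end of LowerStoreIndirCoalescing can be exercised in isolation. A minimal sketch, with the hypothetical helper name CombineStoreConstants standing in for the inlined logic: trim each constant to the old type's width, order them by store offset, and OR the higher-addressed one shifted left by that width.

    #include <cstdint>
    #include <cstdio>

    // Mirrors the mask/shift logic in LowerStoreIndirCoalescing: combine two
    // constants from adjacent stores of `oldSize` bytes (oldSize < 8) into one
    // value of twice the width. `lowerCns`/`upperCns` belong to the lower-/
    // higher-addressed store respectively (swap beforehand if needed).
    static uint64_t CombineStoreConstants(uint64_t lowerCns, uint64_t upperCns, unsigned oldSize)
    {
        const unsigned bitsInByte = 8;
        // e.g. for 2-byte stores the mask is 0xFFFF, for 4-byte stores 0xFFFFFFFF.
        uint64_t mask = ~uint64_t(0) >> ((sizeof(uint64_t) - oldSize) * bitsInByte);
        lowerCns &= mask;
        upperCns &= mask;
        return lowerCns | (upperCns << (oldSize * bitsInByte));
    }

    int main()
    {
        // The example from the function's doc comment: two int stores of
        // 0x1 and 0x2 coalesce into a single long store of 0x200000001.
        printf("0x%llx\n", (unsigned long long)CombineStoreConstants(0x1, 0x2, 4));
        return 0;
    }

The masking step matters because IconValue() is sign-extended; without trimming to the old width, a negative lower constant would bleed into the upper half of the merged value.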

src/coreclr/jit/lower.h

Lines changed: 1 addition & 0 deletions
@@ -312,6 +312,7 @@ class Lowering final : public Phase
     void LowerStoreIndirCommon(GenTreeStoreInd* ind);
     void LowerIndir(GenTreeIndir* ind);
     void LowerStoreIndir(GenTreeStoreInd* node);
+    void LowerStoreIndirCoalescing(GenTreeStoreInd* node);
     GenTree* LowerAdd(GenTreeOp* node);
     GenTree* LowerMul(GenTreeOp* mul);
     bool TryLowerAndNegativeOne(GenTreeOp* node, GenTree** nextNode);
