src/coreclr/jit/lower.cpp (280 additions, 0 deletions)
@@ -7802,6 +7802,285 @@ void Lowering::ContainCheckBitCast(GenTree* node)
}
}

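// StoreCoalescingData describes the pieces of a STOREIND that matter for coalescing.
// For example, for a STOREIND(int) whose address is LEA(V00 + V01*4 + 8) and whose data is
// CNS_INT 5 (local numbers chosen purely for illustration), GetStoreCoalescingData below would
// fill in: targetType = TYP_INT, baseAddr = V00, index = V01, scale = 4, offset = 8,
// value = the CNS_INT node.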
struct StoreCoalescingData
{
var_types targetType;
GenTree* baseAddr;
GenTree* index;
GenTree* value;
uint32_t scale;
int offset;
};

//------------------------------------------------------------------------
// GetStoreCoalescingData: given a STOREIND node, extract the data needed to perform
// store coalescing: base address, index, scale, offset, value and type.
//
// Arguments:
// comp - the compiler instance
// ind - the STOREIND node
// data - [OUT] the data needed for store coalescing
//
// Return Value:
// true if the data was successfully retrieved, false otherwise.
// Basically, false means that we definitely can't do store coalescing.
//
static bool GetStoreCoalescingData(Compiler* comp, GenTreeStoreInd* ind, StoreCoalescingData* data)
{
// Don't merge volatile stores.
if (ind->IsVolatile())
{
return false;
}

// Data has to be a CNS_INT; CNS_VEC could also be supported in the future.
if (!ind->Data()->IsCnsIntOrI())
{
return false;
}

data->targetType = ind->TypeGet();
data->value = ind->Data();
if (ind->Addr()->OperIs(GT_LEA))
{
GenTree* base = ind->Addr()->AsAddrMode()->Base();
GenTree* index = ind->Addr()->AsAddrMode()->Index();
if ((base == nullptr) || !base->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(base->AsLclVar()->GetLclNum()))
{
// Base must be a local. It's possible for it to be nullptr when index is not null,
// but let's ignore such cases.
return false;
}

if ((index != nullptr) &&
(!index->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(index->AsLclVar()->GetLclNum())))
{
// Index should be either nullptr or a local.
return false;
}

data->baseAddr = base;
data->index = index;
data->scale = ind->Addr()->AsAddrMode()->GetScale();
data->offset = ind->Addr()->AsAddrMode()->Offset();
}
else if (ind->Addr()->OperIs(GT_LCL_VAR) && !comp->lvaVarAddrExposed(ind->Addr()->AsLclVar()->GetLclNum()))
{
// Address is just a local, no offset, scale is 1
data->baseAddr = ind->Addr();
data->index = nullptr;
data->scale = 1;
data->offset = 0;
}
else
{
// Address is not LEA or local.
return false;
}
return true;
}

//------------------------------------------------------------------------
// LowerStoreIndirCoalescing: If the given STOREIND node is preceded by a similar
// STOREIND node, try to merge the two into a single store of a type twice as wide. Example:
//
// * STOREIND int
// +--* LCL_VAR byref V00
// \--* CNS_INT int 0x1
//
// * STOREIND int
// +--* LEA(b+4) byref
// | \--* LCL_VAR byref V00
// \--* CNS_INT int 0x2
//
// We can merge these two into a single store of 8 bytes with (0x1 | (0x2 << 32)) as the value
//
// * STOREIND long
// +--* LEA(b+0) byref
// | \--* LCL_VAR byref V00
// \--* CNS_INT long 0x200000001
//
// Arguments:
// ind - the current STOREIND node
//
void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
{
// LoongArch, RISC-V and ARM32 are more likely to take a severe performance hit from
// unaligned accesses, making this optimization questionable there.
#if defined(TARGET_XARCH) || defined(TARGET_ARM64)
if (!comp->opts.OptimizationEnabled())
{
return;
}

// For now, we require the current STOREIND to have a LEA address (the previous store may not have one)
// so we can easily adjust the offset; consider making this more flexible in the future.
if (!ind->Addr()->OperIs(GT_LEA))
{
return;
}

// We're going to do it in a loop while we see suitable STOREINDs to coalesce.
// E.g.: we have the following LIR sequence:
//
// ...addr nodes...
// STOREIND(int)
// ...addr nodes...
// STOREIND(short)
// ...addr nodes...
// STOREIND(short) <-- we're here
//
// First we merge two 'short' stores, then we merge the result with the 'int' store
// to get a single store of 8 bytes.
do
{
// This check is not really needed, just for better throughput.
if (!ind->TypeIs(TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT))
{
return;
}

StoreCoalescingData currData;
StoreCoalescingData prevData;

// Get coalescing data for the current STOREIND
if (!GetStoreCoalescingData(comp, ind, &currData))
{
return;
}

bool isClosedRange = false;
// Now we need to find the very first LIR node representing the current STOREIND
// and make sure that there are no other unexpected nodes in-between.
LIR::ReadOnlyRange currIndRange = BlockRange().GetTreeRange(ind, &isClosedRange);
if (!isClosedRange)
{
return;
}
GenTree* prevTree = currIndRange.FirstNode()->gtPrev;
// Now we need to find the previous STOREIND;
// we can ignore any NOPs or IL_OFFSETs in between.
while ((prevTree != nullptr) && prevTree->OperIs(GT_NOP, GT_IL_OFFSET))
{
prevTree = prevTree->gtPrev;
}

// It's not a STOREIND - bail out.
if ((prevTree == nullptr) || !prevTree->OperIs(GT_STOREIND))
{
return;
}

// Get coalescing data for the previous STOREIND
GenTreeStoreInd* prevInd = prevTree->AsStoreInd();
if (!GetStoreCoalescingData(comp, prevInd, &prevData))
{
return;
}

// Same for the previous STOREIND: make sure there are no unexpected nodes around it.
LIR::ReadOnlyRange prevIndRange = BlockRange().GetTreeRange(prevInd, &isClosedRange);
if (!isClosedRange)
{
return;
}

// STOREINDs aren't value nodes.
LIR::Use use;
assert(!BlockRange().TryGetUse(prevInd, &use) && !BlockRange().TryGetUse(ind, &use));

// BaseAddr, Index, Scale and Type all have to match.
if ((prevData.scale != currData.scale) || (prevData.targetType != currData.targetType) ||
!GenTree::Compare(prevData.baseAddr, currData.baseAddr) ||
!GenTree::Compare(prevData.index, currData.index))
{
return;
}

// The offsets have to differ by exactly the size of the type, i.e. the stores must be adjacent.
// We don't support identical or overlapping offsets.
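// For example, two TYP_INT stores at offsets 0 and 4 are adjacent and can be merged, while
// offsets 0 and 8 (a gap) or 4 and 4 (overlap) are rejected here.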
if (abs(prevData.offset - currData.offset) != (int)genTypeSize(prevData.targetType))
{
return;
}

// Since we're merging two stores of the same type, the new type is twice as wide.
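// For example, two adjacent TYP_USHORT stores become one TYP_INT store, and on 64-bit targets
// two adjacent TYP_INT stores become one TYP_LONG store.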
var_types oldType = ind->TypeGet();
var_types newType;
switch (oldType)
{
case TYP_BYTE:
case TYP_UBYTE:
newType = TYP_USHORT;
break;

case TYP_SHORT:
case TYP_USHORT:
newType = TYP_INT; // TYP_UINT is not legal in IR
break;

#ifdef TARGET_64BIT
case TYP_INT:
newType = TYP_LONG;
break;
#endif // TARGET_64BIT

// TYP_FLOAT and TYP_DOUBLE aren't needed here - they're expected to
// be converted to TYP_INT/TYP_LONG when the value is a constant.
//
// TODO-CQ:
// 2 x LONG/REF -> SIMD16
// 2 x SIMD16 -> SIMD32
// 2 x SIMD32 -> SIMD64
//
// where it's legal (e.g. SIMD is not atomic on x64)
//
default:
return;
}

// Delete previous STOREIND entirely
BlockRange().Remove(std::move(prevIndRange));

// We know it's always LEA for now
GenTreeAddrMode* addr = ind->Addr()->AsAddrMode();

// Update offset to be the minimum of the two
addr->SetOffset(min(prevData.offset, currData.offset));

// Update type for both STOREIND and val
ind->gtType = newType;
ind->Data()->gtType = newType;

// We currently only support integer constants as the value.
assert(prevData.value->IsCnsIntOrI() && currData.value->IsCnsIntOrI());

size_t lowerCns = (size_t)prevData.value->AsIntCon()->IconValue();
size_t upperCns = (size_t)currData.value->AsIntCon()->IconValue();

// If the previous store was at a higher address, swap the constants.
if (prevData.offset > currData.offset)
{
std::swap(lowerCns, upperCns);
}

// Trim the constants to the size of the type, e.g. for TYP_SHORT and TYP_USHORT
// the mask will be 0xFFFF, and for TYP_INT it will be 0xFFFFFFFF.
size_t mask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(oldType)) * BITS_IN_BYTE;
lowerCns &= mask;
upperCns &= mask;

size_t val = (lowerCns | (upperCns << (genTypeSize(oldType) * BITS_IN_BYTE)));
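// For example, merging two TYP_SHORT stores of 0x1234 (lower offset) and 0xABCD (higher offset):
// mask = 0xFFFF, so val = 0x1234 | (0xABCD << 16) = 0xABCD1234, stored as a single TYP_INT.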
JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val);

// It's not expected to be contained yet, but just in case...
ind->Data()->ClearContained();
ind->Data()->AsIntCon()->gtIconVal = (ssize_t)val;
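// The coalesced store is twice as wide, so it may no longer be naturally aligned
// (e.g. two 4-byte stores at offsets 4 and 8 become an 8-byte store at offset 4);
// mark it as unaligned to be safe.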
ind->gtFlags |= GTF_IND_UNALIGNED;

} while (true);
#endif // TARGET_XARCH || TARGET_ARM64
}

//------------------------------------------------------------------------
// LowerStoreIndirCommon: common logic to lower StoreIndir.
//
@@ -7842,6 +8121,7 @@ void Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind)
}
#endif

LowerStoreIndirCoalescing(ind);
LowerStoreIndir(ind);
}
}
src/coreclr/jit/lower.h (1 addition, 0 deletions)
@@ -312,6 +312,7 @@ class Lowering final : public Phase
void LowerStoreIndirCommon(GenTreeStoreInd* ind);
void LowerIndir(GenTreeIndir* ind);
void LowerStoreIndir(GenTreeStoreInd* node);
void LowerStoreIndirCoalescing(GenTreeStoreInd* node);
GenTree* LowerAdd(GenTreeOp* node);
GenTree* LowerMul(GenTreeOp* mul);
bool TryLowerAndNegativeOne(GenTreeOp* node, GenTree** nextNode);