@@ -3815,6 +3815,27 @@ InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
38153815 return LegalizationCost * LT.first ;
38163816}
38173817
3818+ // / Check if the mask is a DE-interleave mask of the given factor
3819+ // / \p Factor like:
3820+ // / <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
3821+ static bool isDeInterleaveMaskOfFactor (ArrayRef<int > Mask, unsigned Factor) {
3822+ // Check all potential start indices from 0 to (Factor - 1).
3823+ for (unsigned Index = 0 ; Index < Factor; Index++) {
3824+ unsigned i = 0 ;
3825+
3826+ // Check that elements are in ascending order by Factor. Ignore undef
3827+ // elements.
3828+ for (; i < Mask.size (); i++)
3829+ if (Mask[i] >= 0 && static_cast <unsigned >(Mask[i]) != Index + i * Factor)
3830+ break ;
3831+
3832+ if (i == Mask.size ())
3833+ return true ;
3834+ }
3835+
3836+ return false ;
3837+ }
3838+
38183839InstructionCost AArch64TTIImpl::getShuffleCost (
38193840 TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int > Mask,
38203841 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
@@ -3827,9 +3848,18 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
38273848 Tp->getScalarSizeInBits () == LT.second .getScalarSizeInBits () &&
38283849 Mask.size () > LT.second .getVectorNumElements () && !Index && !SubTp) {
38293850
3851+ // Check for LD3/LD4 instructions, which are represented in llvm IR as
3852+ // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
3853+ // but we model it with a cost of max(1, LT.first / 4) so that LD3/LD4 have
3854+ // a higher cost than just the load.
3855+ if (Args.size () >= 1 && isa<LoadInst>(Args[0 ]) &&
3856+ (isDeInterleaveMaskOfFactor (Mask, 3 ) ||
3857+ isDeInterleaveMaskOfFactor (Mask, 4 )))
3858+ return std::max<InstructionCost>(1 , LT.first / 4 );
3859+
38303860 // Check for ST3/ST4 instructions, which are represented in llvm IR as
38313861 // store(interleaving-shuffle). The shuffle cost could potentially be free,
3832- // but we model it with a cost of LT.first so that LD3/LD3 have a higher
3862+ // but we model it with a cost of LT.first so that ST3/ST4 have a higher
38333863 // cost than just the store.
38343864 if (CxtI && CxtI->hasOneUse () && isa<StoreInst>(*CxtI->user_begin ()) &&
38353865 (ShuffleVectorInst::isInterleaveMask (
0 commit comments