Skip to content
7 changes: 7 additions & 0 deletions src/coreclr/jit/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -1615,6 +1615,13 @@ class CodeGen final : public CodeGenInterface

void instGen_Set_Reg_To_Zero(emitAttr size, regNumber reg, insFlags flags = INS_FLAGS_DONT_CARE);

void instGen_Set_Reg_To_Base_Plus_Imm(emitAttr size,
regNumber dstReg,
regNumber baseReg,
ssize_t imm,
insFlags flags = INS_FLAGS_DONT_CARE DEBUGARG(size_t targetHandle = 0)
DEBUGARG(GenTreeFlags gtFlags = GTF_EMPTY));

void instGen_Set_Reg_To_Imm(emitAttr size,
regNumber reg,
ssize_t imm,
Expand Down
13 changes: 12 additions & 1 deletion src/coreclr/jit/codegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2214,8 +2214,19 @@ void CodeGen::genEHCatchRet(BasicBlock* block)
GetEmitter()->emitIns_R_L(INS_adr, EA_PTRSIZE, block->GetTarget(), REG_INTRET);
}

// move an immediate value into an integer register
// move an immediate value + base address into an integer register
void CodeGen::instGen_Set_Reg_To_Base_Plus_Imm(emitAttr size,
regNumber dstReg,
regNumber baseReg,
ssize_t imm,
insFlags flags DEBUGARG(size_t targetHandle)
DEBUGARG(GenTreeFlags gtFlags))
{
instGen_Set_Reg_To_Imm(size, dstReg, imm);
GetEmitter()->emitIns_R_R_R(INS_add, size, dstReg, dstReg, baseReg);
}

// move an immediate value into an integer register
void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size,
regNumber reg,
ssize_t imm,
Expand Down
16 changes: 12 additions & 4 deletions src/coreclr/jit/emitarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7902,7 +7902,9 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va
{
useRegForImm = true;
regNumber rsvdReg = codeGen->rsGetRsvdReg();
codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, rsvdReg, imm);
codeGen->instGen_Set_Reg_To_Base_Plus_Imm(EA_PTRSIZE, rsvdReg, reg2, imm);
reg2 = rsvdReg;
imm = 0;
}
}
break;
Expand Down Expand Up @@ -7930,7 +7932,9 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va
{
useRegForImm = true;
regNumber rsvdReg = codeGen->rsGetRsvdReg();
codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, rsvdReg, imm);
codeGen->instGen_Set_Reg_To_Base_Plus_Imm(EA_PTRSIZE, rsvdReg, reg2, imm);
reg2 = rsvdReg;
imm = 0;
}
}
break;
Expand Down Expand Up @@ -8181,7 +8185,9 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va
{
useRegForImm = true;
regNumber rsvdReg = codeGen->rsGetRsvdReg();
codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, rsvdReg, imm);
codeGen->instGen_Set_Reg_To_Base_Plus_Imm(EA_PTRSIZE, rsvdReg, reg2, imm);
reg2 = rsvdReg;
imm = 0;
}
}
break;
Expand Down Expand Up @@ -8209,7 +8215,9 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va
{
useRegForImm = true;
regNumber rsvdReg = codeGen->rsGetRsvdReg();
codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, rsvdReg, imm);
codeGen->instGen_Set_Reg_To_Base_Plus_Imm(EA_PTRSIZE, rsvdReg, reg2, imm);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this change related to the DUP? Don't think so.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might actually need it for https://github.com/dotnet/runtime/pull/104065/files#diff-2b2c8b9011607926410624d6f81613fad7b74c6e0516d578675a8b792998fe4fR7893-R7896, but I am curious if you found a repro, for which you had to add change here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not related to DUP directly. However, tests trigger a scenario where we end up in having a store with a larger immediate.

reg2 = rsvdReg;
imm = 0;
}
}
break;
Expand Down
4 changes: 4 additions & 0 deletions src/coreclr/jit/hwintrinsicarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,10 @@ void HWIntrinsicInfo::lookupImmBounds(
immUpperBound = (int)SVE_PATTERN_ALL;
break;

case NI_Sve_DuplicateSelectedScalarToVector:
immUpperBound = (512 / (genTypeSize(baseType) * BITS_PER_BYTE)) - 1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
immUpperBound = (512 / (genTypeSize(baseType) * BITS_PER_BYTE)) - 1;
immUpperBound = Compiler::getSIMDVectorLength(simdSize, baseType) - 1;

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We may not able able to use the getSIMDVectorLength() here as the imm for DUP seems special [1].

Is the immediate index, in the range 0 to one less than the number of elements in 512 bits, encoded in "imm2:tsz".

I didn't find a better helper method for this so did it explicitly.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was actually little confused with that description and then this one:

The immediate element index is in the range of 0 to 63 (bytes), 31 (halfwords), 15 (words), 7 (doublewords) or 3 (quadwords).

With ^ description, it sounded me like getSIMDVectorLength(), no?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a bit confusing. Unlike other cases, the index here does not depend on the vector length. A valid index range is fixed based on the element type, e.g., 0 to 63 for vectors of type byte.

break;

case NI_Sve_SaturatingDecrementBy16BitElementCount:
case NI_Sve_SaturatingDecrementBy32BitElementCount:
case NI_Sve_SaturatingDecrementBy64BitElementCount:
Expand Down
11 changes: 11 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,17 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
unreached();
}
}
else if (intrin.id == NI_Sve_DuplicateSelectedScalarToVector)
{
HWIntrinsicImmOpHelper helper(this, intrin.op2, node);

for (helper.EmitBegin(); !helper.Done(); helper.EmitCaseEnd())
{
const int elementIndex = helper.ImmValue();

GetEmitter()->emitIns_R_R_I(ins, emitSize, targetReg, op1Reg, elementIndex, opt);
}
}
else
{
assert(!hasImmediateOperand);
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsiclistarm64sve.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ HARDWARE_INTRINSIC(Sve, CreateWhileLessThanOrEqualMask8Bit,
HARDWARE_INTRINSIC(Sve, Divide, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_sdiv, INS_sve_udiv, INS_sve_sdiv, INS_sve_udiv, INS_sve_fdiv, INS_sve_fdiv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, DotProduct, -1, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_sdot, INS_sve_udot, INS_sve_sdot, INS_sve_udot, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasRMWSemantics)
HARDWARE_INTRINSIC(Sve, DotProductBySelectedScalar, -1, 4, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_sdot, INS_sve_udot, INS_sve_sdot, INS_sve_udot, INS_invalid, INS_invalid}, HW_Category_SIMDByIndexedElement, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_HasRMWSemantics|HW_Flag_LowVectorOperation)
HARDWARE_INTRINSIC(Sve, DuplicateSelectedScalarToVector, -1, 2, true, {INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand)
HARDWARE_INTRINSIC(Sve, FusedMultiplyAdd, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmla, INS_sve_fmla}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_FmaIntrinsic|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(Sve, FusedMultiplyAddBySelectedScalar, -1, 4, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmla, INS_sve_fmla}, HW_Category_SIMDByIndexedElement, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_HasRMWSemantics|HW_Flag_FmaIntrinsic|HW_Flag_LowVectorOperation)
HARDWARE_INTRINSIC(Sve, FusedMultiplyAddNegated, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fnmla, INS_sve_fnmla}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_FmaIntrinsic|HW_Flag_SpecialCodeGen)
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/lowerarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3176,6 +3176,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3:
case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4:
case NI_AdvSimd_Arm64_DuplicateSelectedScalarToVector128:
case NI_Sve_DuplicateSelectedScalarToVector:
assert(hasImmediateOperand);
assert(varTypeIsIntegral(intrin.op2));
if (intrin.op2->IsCnsIntOrI())
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/lsraarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1434,6 +1434,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3:
case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4:
case NI_AdvSimd_Arm64_DuplicateSelectedScalarToVector128:
case NI_Sve_DuplicateSelectedScalarToVector:
needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed();
break;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1360,6 +1360,70 @@ internal Arm64() { }
/// </summary>
public static unsafe Vector<ulong> DotProductBySelectedScalar(Vector<ulong> addend, Vector<ushort> left, Vector<ushort> right, [ConstantExpected] byte rightIndex) { throw new PlatformNotSupportedException(); }


/// Broadcast a scalar value

/// <summary>
/// svuint8_t svdup_lane[_u8](svuint8_t data, uint8_t index)
/// DUP Zresult.B, Zdata.B[index]
/// </summary>
public static unsafe Vector<byte> DuplicateSelectedScalarToVector(Vector<byte> data, [ConstantExpected(Min = 0, Max = (byte)(63))] byte index) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svfloat64_t svdup_lane[_f64](svfloat64_t data, uint64_t index)
/// DUP Zresult.D, Zdata.D[index]
/// </summary>
public static unsafe Vector<double> DuplicateSelectedScalarToVector(Vector<double> data, [ConstantExpected(Min = 0, Max = (byte)(7))] byte index) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svint16_t svdup_lane[_s16](svint16_t data, uint16_t index)
/// DUP Zresult.H, Zdata.H[index]
/// </summary>
public static unsafe Vector<short> DuplicateSelectedScalarToVector(Vector<short> data, [ConstantExpected(Min = 0, Max = (byte)(31))] byte index) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svint32_t svdup_lane[_s32](svint32_t data, uint32_t index)
/// DUP Zresult.S, Zdata.S[index]
/// </summary>
public static unsafe Vector<int> DuplicateSelectedScalarToVector(Vector<int> data, [ConstantExpected(Min = 0, Max = (byte)(15))] byte index) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svint64_t svdup_lane[_s64](svint64_t data, uint64_t index)
/// DUP Zresult.D, Zdata.D[index]
/// </summary>
public static unsafe Vector<long> DuplicateSelectedScalarToVector(Vector<long> data, [ConstantExpected(Min = 0, Max = (byte)(7))] byte index) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svint8_t svdup_lane[_s8](svint8_t data, uint8_t index)
/// DUP Zresult.B, Zdata.B[index]
/// </summary>
public static unsafe Vector<sbyte> DuplicateSelectedScalarToVector(Vector<sbyte> data, [ConstantExpected(Min = 0, Max = (byte)(63))] byte index) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svfloat32_t svdup_lane[_f32](svfloat32_t data, uint32_t index)
/// DUP Zresult.S, Zdata.S[index]
/// </summary>
public static unsafe Vector<float> DuplicateSelectedScalarToVector(Vector<float> data, [ConstantExpected(Min = 0, Max = (byte)(15))] byte index) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svuint16_t svdup_lane[_u16](svuint16_t data, uint16_t index)
/// DUP Zresult.H, Zdata.H[index]
/// </summary>
public static unsafe Vector<ushort> DuplicateSelectedScalarToVector(Vector<ushort> data, [ConstantExpected(Min = 0, Max = (byte)(31))] byte index) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svuint32_t svdup_lane[_u32](svuint32_t data, uint32_t index)
/// DUP Zresult.S, Zdata.S[index]
/// </summary>
public static unsafe Vector<uint> DuplicateSelectedScalarToVector(Vector<uint> data, [ConstantExpected(Min = 0, Max = (byte)(15))] byte index) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svuint64_t svdup_lane[_u64](svuint64_t data, uint64_t index)
/// DUP Zresult.D, Zdata.D[index]
/// </summary>
public static unsafe Vector<ulong> DuplicateSelectedScalarToVector(Vector<ulong> data, [ConstantExpected(Min = 0, Max = (byte)(7))] byte index) { throw new PlatformNotSupportedException(); }


/// FusedMultiplyAdd : Multiply-add, addend first

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1416,6 +1416,70 @@ internal Arm64() { }
/// </summary>
public static unsafe Vector<ulong> DotProductBySelectedScalar(Vector<ulong> addend, Vector<ushort> left, Vector<ushort> right, [ConstantExpected] byte rightIndex) => DotProductBySelectedScalar(addend, left, right, rightIndex);


/// Broadcast a scalar value

/// <summary>
/// svuint8_t svdup_lane[_u8](svuint8_t data, uint8_t index)
/// DUP Zresult.B, Zdata.B[index]
/// </summary>
public static unsafe Vector<byte> DuplicateSelectedScalarToVector(Vector<byte> data, [ConstantExpected(Min = 0, Max = (byte)(63))] byte index) => DuplicateSelectedScalarToVector(data, index);

/// <summary>
/// svfloat64_t svdup_lane[_f64](svfloat64_t data, uint64_t index)
/// DUP Zresult.D, Zdata.D[index]
/// </summary>
public static unsafe Vector<double> DuplicateSelectedScalarToVector(Vector<double> data, [ConstantExpected(Min = 0, Max = (byte)(7))] byte index) => DuplicateSelectedScalarToVector(data, index);

/// <summary>
/// svint16_t svdup_lane[_s16](svint16_t data, uint16_t index)
/// DUP Zresult.H, Zdata.H[index]
/// </summary>
public static unsafe Vector<short> DuplicateSelectedScalarToVector(Vector<short> data, [ConstantExpected(Min = 0, Max = (byte)(31))] byte index) => DuplicateSelectedScalarToVector(data, index);

/// <summary>
/// svint32_t svdup_lane[_s32](svint32_t data, uint32_t index)
/// DUP Zresult.S, Zdata.S[index]
/// </summary>
public static unsafe Vector<int> DuplicateSelectedScalarToVector(Vector<int> data, [ConstantExpected(Min = 0, Max = (byte)(15))] byte index) => DuplicateSelectedScalarToVector(data, index);

/// <summary>
/// svint64_t svdup_lane[_s64](svint64_t data, uint64_t index)
/// DUP Zresult.D, Zdata.D[index]
/// </summary>
public static unsafe Vector<long> DuplicateSelectedScalarToVector(Vector<long> data, [ConstantExpected(Min = 0, Max = (byte)(7))] byte index) => DuplicateSelectedScalarToVector(data, index);

/// <summary>
/// svint8_t svdup_lane[_s8](svint8_t data, uint8_t index)
/// DUP Zresult.B, Zdata.B[index]
/// </summary>
public static unsafe Vector<sbyte> DuplicateSelectedScalarToVector(Vector<sbyte> data, [ConstantExpected(Min = 0, Max = (byte)(63))] byte index) => DuplicateSelectedScalarToVector(data, index);

/// <summary>
/// svfloat32_t svdup_lane[_f32](svfloat32_t data, uint32_t index)
/// DUP Zresult.S, Zdata.S[index]
/// </summary>
public static unsafe Vector<float> DuplicateSelectedScalarToVector(Vector<float> data, [ConstantExpected(Min = 0, Max = (byte)(15))] byte index) => DuplicateSelectedScalarToVector(data, index);

/// <summary>
/// svuint16_t svdup_lane[_u16](svuint16_t data, uint16_t index)
/// DUP Zresult.H, Zdata.H[index]
/// </summary>
public static unsafe Vector<ushort> DuplicateSelectedScalarToVector(Vector<ushort> data, [ConstantExpected(Min = 0, Max = (byte)(31))] byte index) => DuplicateSelectedScalarToVector(data, index);

/// <summary>
/// svuint32_t svdup_lane[_u32](svuint32_t data, uint32_t index)
/// DUP Zresult.S, Zdata.S[index]
/// </summary>
public static unsafe Vector<uint> DuplicateSelectedScalarToVector(Vector<uint> data, [ConstantExpected(Min = 0, Max = (byte)(15))] byte index) => DuplicateSelectedScalarToVector(data, index);

/// <summary>
/// svuint64_t svdup_lane[_u64](svuint64_t data, uint64_t index)
/// DUP Zresult.D, Zdata.D[index]
/// </summary>
public static unsafe Vector<ulong> DuplicateSelectedScalarToVector(Vector<ulong> data, [ConstantExpected(Min = 0, Max = (byte)(7))] byte index) => DuplicateSelectedScalarToVector(data, index);


/// FusedMultiplyAdd : Multiply-add, addend first

/// <summary>
Expand Down
Loading