diff --git a/src/coreclr/jit/CMakeLists.txt b/src/coreclr/jit/CMakeLists.txt
index a3bc9aa356ba0b..748b44872b6c78 100644
--- a/src/coreclr/jit/CMakeLists.txt
+++ b/src/coreclr/jit/CMakeLists.txt
@@ -75,12 +75,14 @@ function(create_standalone_jit)
   if ((TARGETDETAILS_ARCH STREQUAL "x64") OR (TARGETDETAILS_ARCH STREQUAL "arm64") OR ((TARGETDETAILS_ARCH STREQUAL "x86") AND NOT (TARGETDETAILS_OS STREQUAL "unix")))
     target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE FEATURE_SIMD)
     target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE FEATURE_HW_INTRINSICS)
+    target_compile_definitions(${TARGETDETAILS_TARGET} PRIVATE FEATURE_MASKED_HW_INTRINSICS)
   endif ()
 endfunction()

 if (CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_ARM64 OR (CLR_CMAKE_TARGET_ARCH_I386 AND NOT CLR_CMAKE_HOST_UNIX))
   add_compile_definitions($<$>>:FEATURE_SIMD>)
   add_compile_definitions($<$>>:FEATURE_HW_INTRINSICS>)
+  add_compile_definitions($<$>>:FEATURE_MASKED_HW_INTRINSICS>)
 endif ()

 # JIT_BUILD disables certain PAL_TRY debugging features
diff --git a/src/coreclr/jit/codegenarm64test.cpp b/src/coreclr/jit/codegenarm64test.cpp
index 3c02c968fe919d..66e99940b6bd67 100644
--- a/src/coreclr/jit/codegenarm64test.cpp
+++ b/src/coreclr/jit/codegenarm64test.cpp
@@ -8551,6 +8551,8 @@ void CodeGen::genArm64EmitterUnitTestsSve()
     theEmitter->emitIns_R_R_I(INS_sve_ldr, EA_SCALABLE, REG_P1, REG_R5, -25);
     theEmitter->emitIns_R_R_I(INS_sve_ldr, EA_SCALABLE, REG_P1, REG_R5, -256);
     theEmitter->emitIns_R_R_I(INS_sve_ldr, EA_SCALABLE, REG_P1, REG_R5, 255);
+    theEmitter->emitIns_R_S(INS_sve_ldr_mask, EA_8BYTE, REG_P0, 1, 0);
+    theEmitter->emitIns_R_S(INS_sve_ldr_mask, EA_8BYTE, REG_P15, 1, 6);

     // IF_SVE_JG_2A
     // STR <Pt>, [<Xn|SP>{, #<imm>, MUL VL}]
@@ -8559,6 +8561,8 @@ void CodeGen::genArm64EmitterUnitTestsSve()
     theEmitter->emitIns_R_R_I(INS_sve_str, EA_SCALABLE, REG_P3, REG_R1, -117);
     theEmitter->emitIns_R_R_I(INS_sve_str, EA_SCALABLE, REG_P3, REG_R1, -256);
     theEmitter->emitIns_R_R_I(INS_sve_str, EA_SCALABLE, REG_P3, REG_R1, 255);
+    theEmitter->emitIns_S_R(INS_sve_str_mask, EA_8BYTE, REG_P5, 1, 0);
+    theEmitter->emitIns_S_R(INS_sve_str_mask, EA_8BYTE, REG_P7, 1, 4);

     // IF_SVE_IE_2A
     // LDR <Zt>, [<Xn|SP>{, #<imm>, MUL VL}]
@@ -8572,6 +8576,8 @@ void CodeGen::genArm64EmitterUnitTestsSve()
                               INS_SCALABLE_OPTS_UNPREDICATED);
     theEmitter->emitIns_R_R_I(INS_sve_ldr, EA_SCALABLE, REG_V3, REG_R4, 255, INS_OPTS_NONE,
                               INS_SCALABLE_OPTS_UNPREDICATED);
+    theEmitter->emitIns_R_S(INS_sve_ldr, EA_8BYTE, REG_V17, 1, 0);
+    theEmitter->emitIns_R_S(INS_sve_ldr, EA_8BYTE, REG_V9, 1, 24);

     // IF_SVE_JH_2A
     // STR <Zt>, [<Xn|SP>{, #<imm>, MUL VL}]
@@ -8585,6 +8591,8 @@ void CodeGen::genArm64EmitterUnitTestsSve()
                               INS_SCALABLE_OPTS_UNPREDICATED);
     theEmitter->emitIns_R_R_I(INS_sve_str, EA_SCALABLE, REG_V2, REG_R3, 255, INS_OPTS_NONE,
                               INS_SCALABLE_OPTS_UNPREDICATED);
+    theEmitter->emitIns_S_R(INS_sve_str, EA_8BYTE, REG_V3, 1, 0);
+    theEmitter->emitIns_S_R(INS_sve_str, EA_8BYTE, REG_V0, 1, 28);

 #ifdef ALL_ARM64_EMITTER_UNIT_TESTS_SVE_UNSUPPORTED
     // IF_SVE_GG_3A
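Note: unlike the existing emitIns_R_R_I tests, the new emitIns_R_S/emitIns_S_R entries take a local variable number and offset instead of a base register and immediate, exercising the frame-based forms added to emitarm64.cpp below for both predicate (P) and vector (Z) registers. Only as an illustration (the real encoding depends on the frame layout and vector length; a 128-bit VL and a local that lands at frame offset 0 are assumed here), the expected disassembly is roughly:

    ldr p0, [fp, #0, mul vl]    // emitIns_R_S(INS_sve_ldr_mask, EA_8BYTE, REG_P0, 1, 0)
    str z3, [fp, #0, mul vl]    // emitIns_S_R(INS_sve_str, EA_8BYTE, REG_V3, 1, 0)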
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 2760347c612569..2937d477a31b13 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -3464,6 +3464,11 @@ class Compiler

     GenTreeIndir* gtNewMethodTableLookup(GenTree* obj);

+#if defined(TARGET_ARM64)
+    GenTree* gtNewSimdConvertVectorToMaskNode(var_types type, GenTree* node, CorInfoType simdBaseJitType, unsigned simdSize);
+    GenTree* gtNewSimdConvertMaskToVectorNode(GenTreeHWIntrinsic* node, var_types type);
+#endif
+
     //------------------------------------------------------------------------
     // Other GenTree functions

@@ -4574,11 +4579,6 @@ class Compiler
                                            NamedIntrinsic        intrinsic,
                                            GenTree*              immOp,
                                            bool                  mustExpand,
                                            int                   immLowerBound,
                                            int                   immUpperBound);
     GenTree* addRangeCheckForHWIntrinsic(GenTree* immOp, int immLowerBound, int immUpperBound);

-#if defined(TARGET_ARM64)
-    GenTree* convertHWIntrinsicToMask(var_types type, GenTree* node, CorInfoType simdBaseJitType, unsigned simdSize);
-    GenTree* convertHWIntrinsicFromMask(GenTreeHWIntrinsic* node, var_types type);
-#endif
-
 #endif // FEATURE_HW_INTRINSICS

     GenTree* impArrayAccessIntrinsic(CORINFO_CLASS_HANDLE clsHnd, CORINFO_SIG_INFO* sig,
diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp
index 2ba16a9b39c694..28faef7419e9db 100644
--- a/src/coreclr/jit/emitarm64.cpp
+++ b/src/coreclr/jit/emitarm64.cpp
@@ -2660,10 +2660,9 @@ void emitter::emitInsSanityCheck(instrDesc* id)
             elemsize = id->idOpSize();
             assert(insOptsNone(id->idInsOpt()));
             assert(isScalableVectorSize(elemsize));
-            assert(isPredicateRegister(id->idReg1())); // TTTT
-            assert(isGeneralRegister(id->idReg2()));   // nnnnn
-            assert(isValidSimm<9>(emitGetInsSC(id)));  // iii
-                                                       // iiiiii
+            assert(isPredicateRegister(id->idReg1()));   // TTTT
+            assert(isGeneralRegisterOrZR(id->idReg2())); // nnnnn
+            assert(isValidSimm<9>(emitGetInsSC(id)));    // iii
             break;

         case IF_SVE_IE_2A: // ..........iiiiii ...iiinnnnnttttt -- SVE load vector register
@@ -2671,10 +2670,9 @@
             elemsize = id->idOpSize();
             assert(insOptsNone(id->idInsOpt()));
             assert(isScalableVectorSize(elemsize));
-            assert(isVectorRegister(id->idReg1()));   // ttttt
-            assert(isGeneralRegister(id->idReg2()));  // nnnnn
-            assert(isValidSimm<9>(emitGetInsSC(id))); // iii
-                                                      // iiiiii
+            assert(isVectorRegister(id->idReg1()));      // ttttt
+            assert(isGeneralRegisterOrZR(id->idReg2())); // nnnnn
+            assert(isValidSimm<9>(emitGetInsSC(id)));    // iii
             break;

         case IF_SVE_GG_3A: // ........ii.mmmmm ......nnnnnddddd -- SVE2 lookup table with 2-bit indices and 16-bit
@@ -4423,7 +4421,7 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt)
     if (imm == 0)
         return true; // Encodable using IF_LS_2A

-    if ((imm >= -256) && (imm <= 255))
+    if (isValidSimm<9>(imm))
         return true; // Encodable using IF_LS_2C (or possibly IF_LS_2B)

     if (imm < 0)
@@ -7661,7 +7659,7 @@ void emitter::emitIns_R_R_I(instruction ins,
     }
     else if (insOptsIndexed(opt) || unscaledOp || (imm < 0) || ((imm & mask) != 0))
     {
-        if ((imm >= -256) && (imm <= 255))
+        if (isValidSimm<9>(imm))
        {
             fmt = IF_LS_2C;
         }
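Note: the open-coded (imm >= -256) && (imm <= 255) range checks are replaced by isValidSimm<9>(imm), the emitter's existing signed-immediate helper (also used in the new sanity checks above). The check it performs is essentially the sketch below; the helper itself lives in the emitter headers, and the sketch's name is illustrative only:

    // Sketch: true when 'value' fits in an N-bit signed immediate; N = 9 gives [-256, 255].
    template <unsigned N>
    bool fitsInSignedBits(ssize_t value)
    {
        return (value >= -(ssize_t(1) << (N - 1))) && (value <= (ssize_t(1) << (N - 1)) - 1);
    }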
@@ -9758,14 +9756,23 @@ void emitter::emitIns_S(instruction ins, emitAttr attr, int varx, int offs)
  */
 void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int varx, int offs)
 {
-    emitAttr  size  = EA_SIZE(attr);
-    insFormat fmt   = IF_NONE;
-    int       disp  = 0;
-    unsigned  scale = 0;
-    bool      isLdrStr = false;
+    emitAttr  size         = EA_SIZE(attr);
+    insFormat fmt          = IF_NONE;
+    unsigned  scale        = 0;
+    bool      isLdrStr     = false;
+    bool      isSimple     = true;
+    bool      useRegForImm = false;

     assert(offs >= 0);

+    /* Figure out the variable's frame position */
+    bool FPbased;
+    int  base = emitComp->lvaFrameAddress(varx, &FPbased);
+    int  disp = base + offs;
+    ssize_t imm = disp;
+
+    regNumber reg2 = encodingSPtoZR(FPbased ? REG_FPBASE : REG_SPBASE);
+
+    // TODO-ARM64-CQ: use unscaled loads?
     /* Figure out the encoding format of the instruction */
     switch (ins)
@@ -9795,63 +9802,103 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va
         case INS_lea:
             assert(size == EA_8BYTE);
-            scale = 0;
+            isSimple = false;
+            scale    = 0;
+
+            if (disp >= 0)
+            {
+                ins = INS_add;
+            }
+            else
+            {
+                ins = INS_sub;
+                imm = -disp;
+            }
+
+            if (imm <= 0x0fff)
+            {
+                fmt = IF_DI_2A; // add reg1,reg2,#disp
+            }
+            else
+            {
+                regNumber rsvdReg = codeGen->rsGetRsvdReg();
+                codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, rsvdReg, imm);
+                fmt = IF_DR_3A; // add reg1,reg2,rsvdReg
+            }
             break;

+        case INS_sve_ldr:
+        {
+            assert(isVectorRegister(reg1));
+            isSimple = false;
+            size     = EA_SCALABLE;
+            attr     = size;
+            fmt      = IF_SVE_IE_2A;
+
+            // TODO-SVE: Don't assume 128bit vectors
+            scale = NaturalScale_helper(EA_16BYTE);
+            ssize_t mask = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate
+
+            if (((imm & mask) == 0) && (isValidSimm<9>(imm >> scale)))
+            {
+                imm >>= scale; // The immediate is scaled by the size of the ld/st
+            }
+            else
+            {
+                useRegForImm = true;
+                regNumber rsvdReg = codeGen->rsGetRsvdReg();
+                codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, rsvdReg, imm);
+            }
+        }
+        break;
+
+        // TODO-SVE: Fold into INS_sve_ldr once REG_V0 and REG_P0 are distinct
+        case INS_sve_ldr_mask:
+        {
+            assert(isPredicateRegister(reg1));
+            isSimple = false;
+            size     = EA_SCALABLE;
+            attr     = size;
+            fmt      = IF_SVE_ID_2A;
+            ins      = INS_sve_ldr;
+
+            // TODO-SVE: Don't assume 128bit vectors
+            // Predicate size is vector length / 8
+            scale = NaturalScale_helper(EA_2BYTE);
+            ssize_t mask = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate
+
+            if (((imm & mask) == 0) && (isValidSimm<9>(imm >> scale)))
+            {
+                imm >>= scale; // The immediate is scaled by the size of the ld/st
+            }
+            else
+            {
+                useRegForImm = true;
+                regNumber rsvdReg = codeGen->rsGetRsvdReg();
+                codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, rsvdReg, imm);
+            }
+        }
+        break;
+
         default:
             NYI("emitIns_R_S"); // FP locals?
             return;

     } // end switch (ins)

-    /* Figure out the variable's frame position */
-    ssize_t imm;
-    int     base;
-    bool    FPbased;
-
-    base = emitComp->lvaFrameAddress(varx, &FPbased);
-    disp = base + offs;
     assert((scale >= 0) && (scale <= 4));

-    bool      useRegForImm = false;
-    regNumber reg2         = FPbased ? REG_FPBASE : REG_SPBASE;
-    reg2                   = encodingSPtoZR(reg2);
-
-    if (ins == INS_lea)
-    {
-        if (disp >= 0)
-        {
-            ins = INS_add;
-            imm = disp;
-        }
-        else
-        {
-            ins = INS_sub;
-            imm = -disp;
-        }
-
-        if (imm <= 0x0fff)
-        {
-            fmt = IF_DI_2A; // add reg1,reg2,#disp
-        }
-        else
-        {
-            regNumber rsvdReg = codeGen->rsGetRsvdReg();
-            codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, rsvdReg, imm);
-            fmt = IF_DR_3A; // add reg1,reg2,rsvdReg
-        }
-    }
-    else
+    if (isSimple)
     {
         ssize_t mask = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate
-        imm          = disp;
+
         if (imm == 0)
         {
             fmt = IF_LS_2A;
         }
         else if ((imm < 0) || ((imm & mask) != 0))
         {
-            if ((imm >= -256) && (imm <= 255))
+            if (isValidSimm<9>(imm))
             {
                 fmt = IF_LS_2C;
             }
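Note: the INS_sve_ldr/INS_sve_ldr_mask cases above (and the matching store cases below) all gate the frame offset the same way: the byte offset must be a multiple of the transfer size (16 bytes for a Z register, 2 bytes for a P register, i.e. VL/8, with 128-bit vectors assumed for now), and the scaled value must fit the signed 9-bit "MUL VL" immediate; otherwise the raw byte offset is materialized into the reserved register. Distilled, with an illustrative transferSize variable standing in for the per-case EA_16BYTE/EA_2BYTE:

    unsigned scale = NaturalScale_helper(transferSize); // log2 of the transfer size
    ssize_t  mask  = (1 << scale) - 1;                  // low bits that must be zero

    if (((imm & mask) == 0) && isValidSimm<9>(imm >> scale))
    {
        imm >>= scale; // encoded as ldr/str ..., [reg2, #imm, mul vl]
    }
    else
    {
        useRegForImm = true; // the offset goes through the reserved register instead
        codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, codeGen->rsGetRsvdReg(), imm);
    }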
@@ -10014,10 +10061,20 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va
     assert(offs >= 0);

     emitAttr  size          = EA_SIZE(attr);
     insFormat fmt           = IF_NONE;
-    int       disp          = 0;
     unsigned  scale         = 0;
     bool      isVectorStore = false;
     bool      isStr         = false;
+    bool      isSimple      = true;
+    bool      useRegForImm  = false;
+
+    /* Figure out the variable's frame position */
+    bool FPbased;
+    int  base = emitComp->lvaFrameAddress(varx, &FPbased);
+    int  disp = base + offs;
+    ssize_t imm = disp;
+
+    // TODO-ARM64-CQ: with compLocallocUsed, should we use REG_SAVED_LOCALLOC_SP instead?
+    regNumber reg2 = encodingSPtoZR(FPbased ? REG_FPBASE : REG_SPBASE);

     // TODO-ARM64-CQ: use unscaled loads?
     /* Figure out the encoding format of the instruction */
@@ -10049,20 +10106,66 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va
         isStr = true;
         break;

+        case INS_sve_str:
+        {
+            assert(isVectorRegister(reg1));
+            isSimple = false;
+            size     = EA_SCALABLE;
+            attr     = size;
+            fmt      = IF_SVE_JH_2A;
+
+            // TODO-SVE: Don't assume 128bit vectors
+            scale = NaturalScale_helper(EA_16BYTE);
+            ssize_t mask = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate
+
+            if (((imm & mask) == 0) && (isValidSimm<9>(imm >> scale)))
+            {
+                imm >>= scale; // The immediate is scaled by the size of the ld/st
+            }
+            else
+            {
+                useRegForImm = true;
+                regNumber rsvdReg = codeGen->rsGetRsvdReg();
+                codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, rsvdReg, imm);
+            }
+        }
+        break;
+
+        // TODO-SVE: Fold into INS_sve_str once REG_V0 and REG_P0 are distinct
+        case INS_sve_str_mask:
+        {
+            assert(isPredicateRegister(reg1));
+            isSimple = false;
+            size     = EA_SCALABLE;
+            attr     = size;
+            fmt      = IF_SVE_JG_2A;
+            ins      = INS_sve_str;
+
+            // TODO-SVE: Don't assume 128bit vectors
+            // Predicate size is vector length / 8
+            scale = NaturalScale_helper(EA_2BYTE);
+            ssize_t mask = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate
+
+            if (((imm & mask) == 0) && (isValidSimm<9>(imm >> scale)))
+            {
+                imm >>= scale; // The immediate is scaled by the size of the ld/st
+            }
+            else
+            {
+                useRegForImm = true;
+                regNumber rsvdReg = codeGen->rsGetRsvdReg();
+                codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, rsvdReg, imm);
+            }
+        }
+        break;
+
         default:
             NYI("emitIns_S_R"); // FP locals?
             return;

     } // end switch (ins)

-    /* Figure out the variable's frame position */
-    int  base;
-    bool FPbased;
-
-    base = emitComp->lvaFrameAddress(varx, &FPbased);
-    disp = base + offs;
-
     assert(scale >= 0);
-    if (isVectorStore)
+    if (isVectorStore || !isSimple)
     {
         assert(scale <= 4);
     }
@@ -10071,51 +10174,48 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va
         assert(scale <= 3);
     }

-    // TODO-ARM64-CQ: with compLocallocUsed, should we use REG_SAVED_LOCALLOC_SP instead?
-    regNumber reg2 = FPbased ? REG_FPBASE : REG_SPBASE;
-    reg2           = encodingSPtoZR(reg2);
-
-    bool    useRegForImm = false;
-    ssize_t imm          = disp;
-    ssize_t mask         = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate
-    if (imm == 0)
+    if (isSimple)
     {
-        fmt = IF_LS_2A;
-    }
-    else if ((imm < 0) || ((imm & mask) != 0))
-    {
-        if ((imm >= -256) && (imm <= 255))
+        ssize_t mask = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate
+
+        if (imm == 0)
         {
-            fmt = IF_LS_2C;
+            fmt = IF_LS_2A;
         }
-        else
+        else if ((imm < 0) || ((imm & mask) != 0))
         {
-            useRegForImm = true;
+            if (isValidSimm<9>(imm))
+            {
+                fmt = IF_LS_2C;
+            }
+            else
+            {
+                useRegForImm = true;
+            }
         }
-    }
-    else if (imm > 0)
-    {
-        if (((imm & mask) == 0) && ((imm >> scale) < 0x1000))
+        else if (imm > 0)
         {
-            imm >>= scale; // The immediate is scaled by the size of the ld/st
-
-            fmt = IF_LS_2B;
+            if (((imm & mask) == 0) && ((imm >> scale) < 0x1000))
+            {
+                imm >>= scale; // The immediate is scaled by the size of the ld/st
+                fmt = IF_LS_2B;
+            }
+            else
+            {
+                useRegForImm = true;
+            }
         }
-        else
+
+        if (useRegForImm)
         {
-            useRegForImm = true;
+            // The reserved register is not stored in idReg3() since that field overlaps with iiaLclVar.
+            // It is instead implicit when idSetIsLclVar() is set, with this encoding format.
+            regNumber rsvdReg = codeGen->rsGetRsvdReg();
+            codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, rsvdReg, imm);
+            fmt = IF_LS_3A;
         }
     }

-    if (useRegForImm)
-    {
-        // The reserved register is not stored in idReg3() since that field overlaps with iiaLclVar.
-        // It is instead implicit when idSetIsLclVar() is set, with this encoding format.
-        regNumber rsvdReg = codeGen->rsGetRsvdReg();
-        codeGen->instGen_Set_Reg_To_Imm(EA_PTRSIZE, rsvdReg, imm);
-        fmt = IF_LS_3A;
-    }
-
     assert(fmt != IF_NONE);

     // Try to optimize a store with an alternative instruction.
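Note: for the pre-existing "simple" instructions the format selection is unchanged, only moved under the new isSimple flag. Summarised as comments:

    // imm == 0                                         -> IF_LS_2A (no offset)
    // imm negative, or not a multiple of the size      -> IF_LS_2C if it fits simm9, else reserved register
    // imm positive, scaled, and (imm >> scale) < 0x1000 -> IF_LS_2B (scaled unsigned 12-bit offset)
    // anything else                                     -> IF_LS_3A (offset materialized in the reserved register)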
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 0d374e712d1b12..e6833a348b521a 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -6395,6 +6395,28 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic
     bool OperIsBitwiseHWIntrinsic() const;
     bool OperIsEmbRoundingEnabled() const;

+    bool OperIsConvertMaskToVector() const
+    {
+#if defined(TARGET_XARCH)
+        return GetHWIntrinsicId() == NI_AVX512F_ConvertMaskToVector;
+#elif defined(TARGET_ARM64)
+        return GetHWIntrinsicId() == NI_Sve_ConvertMaskToVector;
+#else
+        return false;
+#endif
+    }
+
+    bool OperIsConvertVectorToMask() const
+    {
+#if defined(TARGET_XARCH)
+        return GetHWIntrinsicId() == NI_AVX512F_ConvertVectorToMask;
+#elif defined(TARGET_ARM64)
+        return GetHWIntrinsicId() == NI_Sve_ConvertVectorToMask;
+#else
+        return false;
+#endif
+    }
+
     bool OperRequiresAsgFlag() const;
     bool OperRequiresCallFlag() const;
     bool OperRequiresGlobRefFlag() const;
diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp
index 14c262524da2d8..e8b60b07909d95 100644
--- a/src/coreclr/jit/hwintrinsic.cpp
+++ b/src/coreclr/jit/hwintrinsic.cpp
@@ -778,7 +778,7 @@ GenTree* Compiler::getArgForHWIntrinsic(var_types argType,
         {
             arg = impSIMDPopStack();
         }
-        assert(varTypeIsSIMD(arg));
+        assert(varTypeIsSIMDOrMask(arg));
     }
     else
     {
@@ -1593,18 +1593,21 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
 #if defined(TARGET_ARM64)
         if (HWIntrinsicInfo::IsMaskedOperation(intrinsic))
         {
-            // Op1 input is a vector. HWInstrinsic requires a mask, so convert to a mask.
             assert(numArgs > 0);
-            GenTree* op1 = retNode->AsHWIntrinsic()->Op(1);
-            op1          = convertHWIntrinsicToMask(retType, op1, simdBaseJitType, simdSize);
-            retNode->AsHWIntrinsic()->Op(1) = op1;
+            GenTree* op1 = retNode->AsHWIntrinsic()->Op(1);
+            if (!varTypeIsMask(op1))
+            {
+                // Op1 input is a vector. HWIntrinsic requires a mask.
+                retNode->AsHWIntrinsic()->Op(1) = gtNewSimdConvertVectorToMaskNode(retType, op1, simdBaseJitType, simdSize);
+            }
         }

         if (retType != nodeRetType)
         {
             // HWInstrinsic returns a mask, but all returns must be vectors, so convert mask to vector.
             assert(HWIntrinsicInfo::ReturnsPerElementMask(intrinsic));
-            retNode = convertHWIntrinsicFromMask(retNode->AsHWIntrinsic(), retType);
+            assert(nodeRetType == TYP_MASK);
+            retNode = gtNewSimdConvertMaskToVectorNode(retNode->AsHWIntrinsic(), retType);
         }
 #endif // defined(TARGET_ARM64)
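Note: with this change the importer only inserts a conversion when the incoming operand is not already a TYP_MASK, and it asserts that a mask-producing intrinsic really was typed TYP_MASK before converting the return value back. The resulting tree shapes are roughly as follows (pseudo-IR, names illustrative):

    //  first operand arrived as a vector:
    //      Sve.<maskedOp>(ConvertVectorToMask(CreateTrueMaskAll, op1), ...)
    //  result is consumed as a vector:
    //      ConvertMaskToVector(Sve.<maskedOp>(...))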
diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp
index 5c7f796c61c909..385dfe4bc82bf7 100644
--- a/src/coreclr/jit/hwintrinsicarm64.cpp
+++ b/src/coreclr/jit/hwintrinsicarm64.cpp
@@ -2204,7 +2204,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 }

 //------------------------------------------------------------------------
-// convertHWIntrinsicFromMask: Convert a HW instrinsic vector node to a mask
+// gtNewSimdConvertVectorToMaskNode: Convert a HW intrinsic vector node to a mask
 //
 // Arguments:
 //    node            -- The node to convert
@@ -2214,11 +2214,13 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 // Return Value:
 //    The node converted to the a mask type
 //
-GenTree* Compiler::convertHWIntrinsicToMask(var_types   type,
-                                            GenTree*    node,
-                                            CorInfoType simdBaseJitType,
-                                            unsigned    simdSize)
+GenTree* Compiler::gtNewSimdConvertVectorToMaskNode(var_types   type,
+                                                    GenTree*    node,
+                                                    CorInfoType simdBaseJitType,
+                                                    unsigned    simdSize)
 {
+    assert(varTypeIsSIMD(node));
+
+    // ConvertVectorToMask uses cmpne which requires an embedded mask.
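Note: as the new comment says, the vector-to-mask direction is built on a compare-not-equal-with-zero, which needs a governing predicate; that is why a CreateTrueMaskAll node is created alongside the conversion. At the instruction level the pair corresponds to something like the following (illustrative SVE assembly, byte elements assumed; the actual lowering is outside this diff):

    ptrue  p7.b                   // NI_Sve_CreateTrueMaskAll
    cmpne  p0.b, p7/z, z0.b, #0   // NI_Sve_ConvertVectorToMask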
     GenTree* embeddedMask = gtNewSimdHWIntrinsicNode(TYP_MASK, NI_Sve_CreateTrueMaskAll, simdBaseJitType, simdSize);

     return gtNewSimdHWIntrinsicNode(TYP_MASK, embeddedMask, node, NI_Sve_ConvertVectorToMask, simdBaseJitType,
@@ -2226,7 +2228,7 @@
 }

 //------------------------------------------------------------------------
-// convertHWIntrinsicFromMask: Convert a HW instrinsic mask node to a vector
+// gtNewSimdConvertMaskToVectorNode: Convert a HW intrinsic mask node to a vector
 //
 // Arguments:
 //    node -- The node to convert
@@ -2235,9 +2237,11 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 // Return Value:
 //    The node converted to the given type
 //
-GenTree* Compiler::convertHWIntrinsicFromMask(GenTreeHWIntrinsic* node, var_types type)
+GenTree* Compiler::gtNewSimdConvertMaskToVectorNode(GenTreeHWIntrinsic* node, var_types type)
 {
-    assert(node->TypeGet() == TYP_MASK);
+    assert(varTypeIsMask(node));
+    assert(varTypeIsSIMD(type));
+
     return gtNewSimdHWIntrinsicNode(type, node, NI_Sve_ConvertMaskToVector, node->GetSimdBaseJitType(),
                                     node->GetSimdSize());
 }
diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp
index 75e712504a19f2..dd82e7c08f92ba 100644
--- a/src/coreclr/jit/instr.cpp
+++ b/src/coreclr/jit/instr.cpp
@@ -1687,12 +1687,17 @@ instruction CodeGen::ins_Move_Extend(var_types srcType, bool srcInReg)
         return ins;
     }

-#if defined(TARGET_XARCH) && defined(FEATURE_SIMD)
+#if defined(FEATURE_MASKED_HW_INTRINSICS)
     if (varTypeUsesMaskReg(srcType))
     {
+#if defined(TARGET_XARCH)
         return INS_kmovq_msk;
+#elif defined(TARGET_ARM64)
+        unreached(); // TODO-SVE: This needs testing
+        return INS_sve_mov;
+#endif
     }
-#endif // TARGET_XARCH && FEATURE_SIMD
+#endif // FEATURE_MASKED_HW_INTRINSICS

     assert(varTypeUsesFloatReg(srcType));

@@ -1837,12 +1842,16 @@ instruction CodeGenInterface::ins_Load(var_types srcType, bool aligned /*=false*
         return ins;
     }

-#if defined(TARGET_XARCH) && defined(FEATURE_SIMD)
+#if defined(FEATURE_MASKED_HW_INTRINSICS)
     if (varTypeUsesMaskReg(srcType))
     {
+#if defined(TARGET_XARCH)
         return INS_kmovq_msk;
+#elif defined(TARGET_ARM64)
+        return INS_sve_ldr_mask;
+#endif
     }
-#endif // TARGET_XARCH && FEATURE_SIMD
+#endif // FEATURE_MASKED_HW_INTRINSICS

     assert(varTypeUsesFloatReg(srcType));

@@ -1921,12 +1930,17 @@ instruction CodeGen::ins_Copy(var_types dstType)
 #endif
     }

-#if defined(TARGET_XARCH) && defined(FEATURE_SIMD)
+#if defined(FEATURE_MASKED_HW_INTRINSICS)
     if (varTypeUsesMaskReg(dstType))
     {
+#if defined(TARGET_XARCH)
         return INS_kmovq_msk;
+#elif defined(TARGET_ARM64)
+        unreached(); // TODO-SVE: This needs testing
+        return INS_sve_mov;
+#endif
     }
-#endif // TARGET_XARCH && FEATURE_SIMD
+#endif // FEATURE_MASKED_HW_INTRINSICS

     assert(varTypeUsesFloatReg(dstType));

@@ -2030,7 +2044,7 @@ instruction CodeGen::ins_Copy(regNumber srcReg, var_types dstType)
 #endif
     }

-#if defined(TARGET_XARCH) && defined(FEATURE_SIMD)
+#if defined(FEATURE_MASKED_HW_INTRINSICS)
     if (varTypeUsesMaskReg(dstType))
     {
         if (genIsValidMaskReg(srcReg))
@@ -2041,9 +2055,14 @@ instruction CodeGen::ins_Copy(regNumber srcReg, var_types dstType)
         // mask to int
         assert(genIsValidIntOrFakeReg(srcReg));
+#if defined(TARGET_XARCH)
         return INS_kmovq_gpr;
+#elif defined(TARGET_ARM64)
+        unreached(); // TODO-SVE: This needs testing
+        return INS_sve_mov;
+#endif
     }
-#endif // TARGET_XARCH && FEATURE_SIMD
+#endif // FEATURE_MASKED_HW_INTRINSICS

     assert(varTypeUsesFloatReg(dstType));
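Note: ins_Load and ins_Store (below) can now answer for TYP_MASK on arm64 via INS_sve_ldr_mask / INS_sve_str_mask, which is what allows TYP_MASK values to be spilled to and reloaded from the stack like other register types; the register-to-register copy and mask-to-integer paths are stubbed with unreached() until they are exercised. A small usage sketch, mirroring how codegen queries these helpers (hypothetical, not code from this change):

    instruction storeIns = ins_Store(TYP_MASK); // INS_sve_str_mask on arm64, INS_kmovq_msk on xarch
    instruction loadIns  = ins_Load(TYP_MASK);  // INS_sve_ldr_mask on arm64, INS_kmovq_msk on xarch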
@@ -2145,12 +2164,16 @@ instruction CodeGenInterface::ins_Store(var_types dstType, bool aligned /*=false
         return ins;
     }

-#if defined(TARGET_XARCH) && defined(FEATURE_SIMD)
+#if defined(FEATURE_MASKED_HW_INTRINSICS)
     if (varTypeUsesMaskReg(dstType))
     {
+#if defined(TARGET_XARCH)
         return INS_kmovq_msk;
+#elif defined(TARGET_ARM64)
+        return INS_sve_str_mask;
+#endif
     }
-#endif // TARGET_XARCH && FEATURE_SIMD
+#endif // FEATURE_MASKED_HW_INTRINSICS

     assert(varTypeUsesFloatReg(dstType));

@@ -2262,7 +2285,7 @@ instruction CodeGenInterface::ins_StoreFromSrc(regNumber srcReg, var_types dstTy
         return ins_Store(dstType, aligned);
     }

-#if defined(TARGET_XARCH) && defined(FEATURE_SIMD)
+#if defined(FEATURE_MASKED_HW_INTRINSICS)
     if (varTypeUsesMaskReg(dstType))
     {
         if (genIsValidMaskReg(srcReg))
@@ -2275,7 +2298,7 @@ instruction CodeGenInterface::ins_StoreFromSrc(regNumber srcReg, var_types dstTy
         assert(genIsValidIntOrFakeReg(srcReg));
         return ins_Store(dstType, aligned);
     }
-#endif // TARGET_XARCH && FEATURE_SIMD
+#endif // FEATURE_MASKED_HW_INTRINSICS

     assert(varTypeUsesFloatReg(dstType));
diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h
index c23cb258d46315..a94998f596e93a 100644
--- a/src/coreclr/jit/instr.h
+++ b/src/coreclr/jit/instr.h
@@ -385,7 +385,7 @@ enum insScalableOpts : unsigned
     INS_SCALABLE_OPTS_UNPREDICATED_WIDE, // Variants without a predicate and wide elements (eg asr)
     INS_SCALABLE_OPTS_TO_PREDICATE,      // Variants moving to a predicate from a vector (e.g. pmov)
     INS_SCALABLE_OPTS_TO_VECTOR,         // Variants moving to a vector from a predicate (e.g. pmov)
-    INS_SCALABLE_OPTS_BROADCAST          // Used to distinguish mov from cpy, where mov is an alias for both
+    INS_SCALABLE_OPTS_BROADCAST,         // Used to distinguish mov from cpy, where mov is an alias for both
 };

 // Maps directly to the pattern used in SVE instructions such as cntb.
diff --git a/src/coreclr/jit/instrsarm64sve.h b/src/coreclr/jit/instrsarm64sve.h
index f7209c6df98a58..fb469c0bfdc101 100644
--- a/src/coreclr/jit/instrsarm64sve.h
+++ b/src/coreclr/jit/instrsarm64sve.h
@@ -2840,6 +2840,11 @@ INST1(ldnt1sw, "ldnt1sw", 0, IF_SV
 INST1(st1q, "st1q", 0, IF_SVE_IY_4A, 0xE4202000 ) // ST1Q {<Zt>.Q }, <Pg>, [<Zn>.D{, <Xm>}] SVE_IY_4A 11100100001mmmmm 001gggnnnnnttttt E420 2000
+
+// TODO-SVE: Removable once REG_V0 and REG_P0 are distinct
+INST1(str_mask, "str_mask", 0, IF_SN_0A, BAD_CODE)
+INST1(ldr_mask, "ldr_mask", 0, IF_SN_0A, BAD_CODE)
+
 // clang-format on

 /*****************************************************************************/
diff --git a/src/coreclr/jit/scopeinfo.cpp b/src/coreclr/jit/scopeinfo.cpp
index 01238efcfcbd0c..06c4dd244a4be5 100644
--- a/src/coreclr/jit/scopeinfo.cpp
+++ b/src/coreclr/jit/scopeinfo.cpp
@@ -301,6 +301,9 @@ void CodeGenInterface::siVarLoc::siFillStackVarLoc(
         case TYP_LONG:
         case TYP_DOUBLE:
 #endif // TARGET_64BIT
+#if defined(FEATURE_MASKED_HW_INTRINSICS)
+        case TYP_MASK:
+#endif // FEATURE_MASKED_HW_INTRINSICS
 #if FEATURE_IMPLICIT_BYREFS
             // In the AMD64 ABI we are supposed to pass a struct by reference when its
            // size is not 1, 2, 4 or 8 bytes in size. During fgMorph, the compiler modifies
@@ -433,6 +436,9 @@ void CodeGenInterface::siVarLoc::siFillRegisterVarLoc(
         case TYP_SIMD32:
         case TYP_SIMD64:
 #endif // TARGET_XARCH
+#if defined(FEATURE_MASKED_HW_INTRINSICS)
+        case TYP_MASK:
+#endif // FEATURE_MASKED_HW_INTRINSICS
         {
             this->vlType = VLT_REG_FP;
diff --git a/src/coreclr/jit/simd.cpp b/src/coreclr/jit/simd.cpp
index 48c23eb646411f..b6879dafc498d9 100644
--- a/src/coreclr/jit/simd.cpp
+++ b/src/coreclr/jit/simd.cpp
@@ -456,7 +456,8 @@ GenTree* Compiler::impSIMDPopStack()
 {
     StackEntry se   = impPopStack();
     GenTree*   tree = se.val;
-    assert(varTypeIsSIMD(tree));
+
+    assert(varTypeIsSIMDOrMask(tree));

     // Handle calls that may return the struct via a return buffer.
     if (tree->OperIs(GT_CALL, GT_RET_EXPR))
diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h
index 1e32a1d88946cc..7897e0c7fc9323 100644
--- a/src/coreclr/jit/target.h
+++ b/src/coreclr/jit/target.h
@@ -393,7 +393,7 @@ inline bool genIsValidFloatReg(regNumber reg)
     return reg >= REG_FP_FIRST && reg <= REG_FP_LAST;
 }

-#if defined(TARGET_XARCH)
+#if defined(FEATURE_MASKED_HW_INTRINSICS)
 /*****************************************************************************
  * Return true if the register is a valid mask register
  */
@@ -401,7 +401,7 @@ inline bool genIsValidMaskReg(regNumber reg)
 {
     return reg >= REG_MASK_FIRST && reg <= REG_MASK_LAST;
 }
-#endif // TARGET_XARCH
+#endif // FEATURE_MASKED_HW_INTRINSICS

 #ifdef TARGET_ARM
diff --git a/src/coreclr/jit/targetarm64.h b/src/coreclr/jit/targetarm64.h
index 4a99ca2a79f989..6a89d378132a66 100644
--- a/src/coreclr/jit/targetarm64.h
+++ b/src/coreclr/jit/targetarm64.h
@@ -56,6 +56,9 @@
   #define REG_PREDICATE_HIGH_FIRST  REG_P8 // Similarly, some instructions can only use the second half of the predicate registers.
   #define REG_PREDICATE_HIGH_LAST   REG_P15

+  #define REG_MASK_FIRST            REG_PREDICATE_FIRST
+  #define REG_MASK_LAST             REG_PREDICATE_LAST
+
   static_assert_no_msg(REG_PREDICATE_HIGH_LAST == REG_PREDICATE_LAST);

   #define REGNUM_BITS               6 // number of bits in a REG_*
diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp
index 4922e1f3da0a6c..05e042e8926b56 100644
--- a/src/coreclr/jit/valuenum.cpp
+++ b/src/coreclr/jit/valuenum.cpp
@@ -12092,9 +12092,6 @@ void Compiler::fgValueNumberHWIntrinsic(GenTreeHWIntrinsic* tree)
     // There are some HWINTRINSICS operations that have zero args, i.e. NI_Vector128_Zero
     if (opCount == 0)
     {
-        // Currently we don't have intrinsics with variable number of args with a parameter-less option.
-        assert(!isVariableNumArgs);
-
         if (encodeResultType)
         {
             // There are zero arg HWINTRINSICS operations that encode the result type, i.e. Vector128_AllBitSet
diff --git a/src/coreclr/jit/vartype.h b/src/coreclr/jit/vartype.h
index ed57a76b6e7ad8..1623addb69b079 100644
--- a/src/coreclr/jit/vartype.h
+++ b/src/coreclr/jit/vartype.h
@@ -85,13 +85,19 @@ inline bool varTypeIsSIMD(T vt)
 template <class T>
 inline bool varTypeIsMask(T vt)
 {
-#if defined(TARGET_XARCH) && defined(FEATURE_SIMD)
+#if defined(FEATURE_MASKED_HW_INTRINSICS)
     return (TypeGet(vt) == TYP_MASK);
-#else // FEATURE_SIMD
+#else // FEATURE_MASKED_HW_INTRINSICS
     return false;
 #endif
 }

+template <class T>
+inline bool varTypeIsSIMDOrMask(T vt)
+{
+    return varTypeIsSIMD(vt) || varTypeIsMask(vt);
+}
+
 template <class T>
 inline bool varTypeIsIntegral(T vt)
 {
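Note: REG_MASK_FIRST/REG_MASK_LAST now alias the predicate registers on arm64, and varTypeIsMask/varTypeIsSIMDOrMask are keyed off the new FEATURE_MASKED_HW_INTRINSICS define, so the pre-existing mask-register helpers become meaningful outside xarch. For instance (illustrative):

    assert(varTypeIsSIMDOrMask(TYP_MASK)); // TYP_MASK is now accepted where SIMD values are popped from the stack
    assert(genIsValidMaskReg(REG_P4));     // on arm64 the mask registers are the predicate registers P0-P15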