diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 75d8e5432c4ae8..7789c250748637 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -7336,7 +7336,19 @@ void CodeGen::genIntToFloatCast(GenTree* treeNode) // Also we don't expect to see uint32 -> float/double and uint64 -> float conversions // here since they should have been lowered appropriately. noway_assert(srcType != TYP_UINT); - noway_assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT)); + assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT) || + compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); + + if ((srcType == TYP_ULONG) && varTypeIsFloating(dstType) && + compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + { + assert(compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); + genConsumeOperands(treeNode->AsOp()); + instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(srcType)); + GetEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1); + genProduceReg(treeNode); + return; + } // To convert int to a float/double, cvtsi2ss/sd SSE2 instruction is used // which does a partial write to lower 4/8 bytes of xmm register keeping the other @@ -7449,8 +7461,10 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode) noway_assert((dstSize == EA_ATTR(genTypeSize(TYP_INT))) || (dstSize == EA_ATTR(genTypeSize(TYP_LONG)))); // We shouldn't be seeing uint64 here as it should have been converted - // into a helper call by either front-end or lowering phase. - noway_assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG)))); + // into a helper call by either front-end or lowering phase, unless we have AVX512F + // accelerated conversions. + assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG))) || + compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); // If the dstType is TYP_UINT, we have 32-bits to encode the // float number. Any of 33rd or above bits can be the sign bit. @@ -7463,7 +7477,7 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode) // Note that we need to specify dstType here so that it will determine // the size of destination integer register and also the rex.w prefix. genConsumeOperands(treeNode->AsOp()); - instruction ins = ins_FloatConv(TYP_INT, srcType, emitTypeSize(srcType)); + instruction ins = ins_FloatConv(dstType, srcType, emitTypeSize(srcType)); GetEmitter()->emitInsBinary(ins, emitTypeSize(dstType), treeNode, op1); genProduceReg(treeNode); } diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 9cab8e6fcea2ff..e2b3b350963b53 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -3891,6 +3891,11 @@ emitAttr emitter::emitGetMemOpSize(instrDesc* id) const return EA_32BYTE; } + case INS_vcvttss2usi64: + { + return EA_4BYTE; + } + case INS_movddup: { if (defaultSize == 64) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 13307006a6db31..9f9ffdc5a614e9 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -1399,7 +1399,6 @@ bool emitter::TakesRexWPrefix(const instrDesc* id) const case INS_vcvtsd2usi: case INS_vcvtss2usi: case INS_vcvttsd2usi: - case INS_vcvttss2usi: { if (attr == EA_8BYTE) { @@ -2623,7 +2622,8 @@ bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id) case INS_vcvtsd2usi: case INS_vcvtss2usi: case INS_vcvttsd2usi: - case INS_vcvttss2usi: + case INS_vcvttss2usi32: + case INS_vcvttss2usi64: { // These SSE instructions write to a general purpose integer register. return false; @@ -11435,12 +11435,18 @@ void emitter::emitDispIns( case INS_vcvtsd2usi: case INS_vcvtss2usi: case INS_vcvttsd2usi: - case INS_vcvttss2usi: { printf(" %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), EA_16BYTE)); break; } + case INS_vcvttss2usi32: + case INS_vcvttss2usi64: + { + printf(" %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), EA_4BYTE)); + break; + } + #ifdef TARGET_AMD64 case INS_movsxd: { @@ -18595,23 +18601,32 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_cvtsi2sd64: case INS_cvtsi2ss64: case INS_vcvtsd2usi: - case INS_vcvttsd2usi: - case INS_vcvtusi2sd32: - case INS_vcvtusi2sd64: case INS_vcvtusi2ss32: case INS_vcvtusi2ss64: + case INS_vcvttsd2usi: + case INS_vcvttss2usi32: result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += PERFSCORE_LATENCY_7C; break; + case INS_vcvtusi2sd64: + case INS_vcvtusi2sd32: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency += PERFSCORE_LATENCY_5C; + break; + case INS_cvttss2si: case INS_cvtss2si: case INS_vcvtss2usi: - case INS_vcvttss2usi: result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_8C : PERFSCORE_LATENCY_7C; break; + case INS_vcvttss2usi64: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency += PERFSCORE_LATENCY_8C; + break; + case INS_cvtss2sd: result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += PERFSCORE_LATENCY_5C; diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index e1649b2159c55f..0d04bbbedf1fa5 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -268,8 +268,12 @@ HARDWARE_INTRINSIC(Vector512, Ceiling, HARDWARE_INTRINSIC(Vector512, Create, 64, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, CreateScalar, 64, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, CreateScalarUnsafe, 64, 1, true, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Vector512, ConvertToDouble, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector512, ConvertToSingle, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector512, ConvertToInt32, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector512, ConvertToInt64, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector512, ConvertToUInt32, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector512, ConvertToUInt64, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector512, Divide, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, Equals, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, EqualsAll, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) @@ -845,7 +849,7 @@ HARDWARE_INTRINSIC(AVX512F, CompareNotEqual, HARDWARE_INTRINSIC(AVX512F, ConvertScalarToVector128Double, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2sd32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX512F, ConvertScalarToVector128Single, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(AVX512F, ConvertToUInt32, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtss2usi, INS_vcvtsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(AVX512F, ConvertToUInt32WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi, INS_vcvttsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(AVX512F, ConvertToUInt32WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi32, INS_vcvttsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F, ConvertToVector128Byte, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovdb, INS_vpmovdb, INS_vpmovqb, INS_vpmovqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F, ConvertToVector128ByteWithSaturation, 64, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovusdb, INS_invalid, INS_vpmovusqb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F, ConvertToVector128Int16, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmovqw, INS_vpmovqw, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) @@ -1002,7 +1006,7 @@ HARDWARE_INTRINSIC(AVX512F_VL, TernaryLogic, HARDWARE_INTRINSIC(AVX512F_X64, ConvertScalarToVector128Double, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2sd64, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F_X64, ConvertScalarToVector128Single, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtusi2ss64, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX512F_X64, ConvertToUInt64, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvtss2usi, INS_vcvtsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(AVX512F_X64, ConvertToUInt64WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi, INS_vcvttsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(AVX512F_X64, ConvertToUInt64WithTruncation, 16, 1, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vcvttss2usi64, INS_vcvttsd2usi}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg EncodesExtraTypeArg Instructions Category Flags diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 162bb4942b54ce..11f95af2be0687 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1364,12 +1364,31 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_ConvertToDouble: case NI_Vector256_ConvertToDouble: + case NI_Vector512_ConvertToDouble: + { + assert(sig->numArgs == 1); + assert(varTypeIsLong(simdBaseType)); + if (IsBaselineVector512IsaSupportedOpportunistically()) + { + intrinsic = (simdSize == 16) ? NI_AVX512DQ_VL_ConvertToVector128Double + : (simdSize == 32) ? NI_AVX512DQ_VL_ConvertToVector256Double + : NI_AVX512DQ_ConvertToVector512Double; + + op1 = impSIMDPopStack(); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); + } + break; + } + case NI_Vector128_ConvertToInt64: case NI_Vector256_ConvertToInt64: + case NI_Vector512_ConvertToInt64: case NI_Vector128_ConvertToUInt32: case NI_Vector256_ConvertToUInt32: + case NI_Vector512_ConvertToUInt32: case NI_Vector128_ConvertToUInt64: case NI_Vector256_ConvertToUInt64: + case NI_Vector512_ConvertToUInt64: { assert(sig->numArgs == 1); // TODO-XARCH-CQ: These intrinsics should be accelerated @@ -1431,7 +1450,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } else { - // TODO-XARCH-CQ: These intrinsics should be accelerated + // TODO-XARCH-CQ: These intrinsics should be accelerated. assert(simdBaseType == TYP_UINT); } break; diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp index de914ea0bdfdc6..9c38a69d6854db 100644 --- a/src/coreclr/jit/importer.cpp +++ b/src/coreclr/jit/importer.cpp @@ -7883,6 +7883,18 @@ void Compiler::impImportBlockCode(BasicBlock* block) || (impStackTop().val->TypeGet() == TYP_BYREF) #endif ; +#ifdef TARGET_AMD64 + // If AVX512 is present and we are not checking for overflow, we do not need + // a large node. In this case, we will not fallback to a helper function but + // will use the intrinsic instead. This is done for all long/ulong to floating + // point conversions. Hence setting the callNode to false to + // avoid generating a large node. + if (callNode && !ovfl && varTypeIsLong(impStackTop().val) && + compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + { + callNode = false; + } +#endif // TARGET_AMD64 } else { diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 93c4e601bb7811..43cd4ce2ddc173 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -2281,6 +2281,9 @@ instruction CodeGen::ins_MathOp(genTreeOps oper, var_types type) instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr) { // AVX: For now we support only conversion from Int/Long -> float + // AVX512: Supports following conversions + // srcType = float/double castToType = ulong + // srcType = ulong castToType = double switch (from) { @@ -2329,6 +2332,8 @@ instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr) return ins_Move_Extend(TYP_FLOAT, false); case TYP_DOUBLE: return INS_cvtss2sd; + case TYP_ULONG: + return INS_vcvttss2usi64; default: unreached(); } @@ -2341,6 +2346,8 @@ instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr) return INS_cvttsd2si; case TYP_LONG: return INS_cvttsd2si; + case TYP_ULONG: + return INS_vcvttsd2usi; case TYP_FLOAT: return INS_cvtsd2ss; case TYP_DOUBLE: @@ -2350,6 +2357,17 @@ instruction CodeGen::ins_FloatConv(var_types to, var_types from, emitAttr attr) } break; + case TYP_ULONG: + switch (to) + { + case TYP_DOUBLE: + return INS_vcvtusi2sd64; + case TYP_FLOAT: + return INS_vcvtusi2ss64; + default: + unreached(); + } + default: unreached(); } diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 161df4485e0d98..add7a79abbde6c 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -637,7 +637,8 @@ INST3(vcvtss2usi, "cvtss2usi", IUM_WR, BAD_CODE, BAD_ INST3(vcvttpd2udq, "cvttpd2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x78), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX) // cvt w/ truncation packed doubles to unsigned DWORDs INST3(vcvttps2udq, "cvttps2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x78), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt w/ truncation packed singles to unsigned DWORDs INST3(vcvttsd2usi, "cvttsd2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x78), INS_TT_TUPLE1_FIXED, Input_64Bit | REX_WX | Encoding_EVEX) // cvt w/ truncation scalar double to unsigned DWORD/QWORD -INST3(vcvttss2usi, "cvttss2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x78), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_WX | Encoding_EVEX) // cvt w/ truncation scalar single to unsigned DWORD/QWORD +INST3(vcvttss2usi32, "cvttss2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x78), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt w/ truncation scalar single to unsigned DWORD/QWORD +INST3(vcvttss2usi64, "cvttss2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x78), INS_TT_TUPLE1_FIXED, Input_32Bit | REX_W1 | Encoding_EVEX) // cvt w/ truncation scalar single to unsigned DWORD/QWORD INST3(vcvtudq2pd, "cvtudq2pd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x7A), INS_TT_HALF, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt packed unsigned DWORDs to doubles INST3(vcvtudq2ps, "cvtudq2ps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7A), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt packed unsigned DWORDs to singles INST3(vcvtusi2sd32, "cvtusi2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7B), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // cvt scalar unsigned DWORD to double diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index aa4258d71ba776..94632d1411e794 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -795,15 +795,15 @@ void Lowering::LowerCast(GenTree* tree) // srcType = float/double castToType = * and overflow detecting cast // Reason: must be converted to a helper call // srcType = float/double, castToType = ulong - // Reason: must be converted to a helper call + // Reason: must be converted to a helper call unless we have AVX512F // srcType = uint castToType = float/double // Reason: uint -> float/double = uint -> long -> float/double // srcType = ulong castToType = float // Reason: ulong -> float = ulong -> double -> float if (varTypeIsFloating(srcType)) { - noway_assert(!tree->gtOverflow()); - noway_assert(castToType != TYP_ULONG); + assert(!tree->gtOverflow() || comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); + assert(castToType != TYP_ULONG || comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); } else if (srcType == TYP_UINT) { @@ -811,7 +811,7 @@ void Lowering::LowerCast(GenTree* tree) } else if (srcType == TYP_ULONG) { - noway_assert(castToType != TYP_FLOAT); + assert(castToType != TYP_FLOAT || comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); } // Case of src is a small type and dst is a floating point type. diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 8e4c6612b41a10..dd63ecca7494d8 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -293,6 +293,39 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) var_types dstType = tree->CastToType(); unsigned dstSize = genTypeSize(dstType); +#if defined(TARGET_AMD64) + // If AVX512 is present, we have intrinsic available to convert + // ulong directly to float. Hence, we need to combine the 2 nodes + // GT_CAST(GT_CAST(TYP_ULONG, TYP_DOUBLE), TYP_FLOAT) into a single + // node i.e. GT_CAST(TYP_ULONG, TYP_FLOAT). At this point, we already + // have the 2 GT_CAST nodes in the tree and we are combining them below. + if (oper->OperIs(GT_CAST)) + { + GenTreeCast* innerCast = static_cast(oper); + + if (innerCast->IsUnsigned()) + { + GenTree* innerOper = innerCast->CastOp(); + var_types innerSrcType = genActualType(innerOper); + var_types innerDstType = innerCast->CastToType(); + unsigned innerDstSize = genTypeSize(innerDstType); + innerSrcType = varTypeToUnsigned(innerSrcType); + + // Check if we are going from ulong->double->float + if (innerSrcType == TYP_ULONG && innerDstType == TYP_DOUBLE && dstType == TYP_FLOAT) + { + if (compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + { + // One optimized (combined) cast here + tree = gtNewCastNode(TYP_ULONG, innerOper, true, TYP_FLOAT); + tree->gtType = TYP_FLOAT; + return fgMorphTree(tree); + } + } + } + } +#endif // TARGET_AMD64 + // See if the cast has to be done in two steps. R -> I if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType)) { @@ -357,6 +390,10 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) #endif // !TARGET_AMD64 case TYP_ULONG: +#ifdef TARGET_AMD64 + if (compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + return nullptr; +#endif return fgMorphCastIntoHelper(tree, CORINFO_HELP_DBL2ULNG, oper); default: unreached(); @@ -449,7 +486,7 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) { srcType = varTypeToUnsigned(srcType); - if (srcType == TYP_ULONG) + if (srcType == TYP_ULONG && !compOpportunisticallyDependsOn(InstructionSet_AVX512F)) { if (dstType == TYP_FLOAT) { diff --git a/src/coreclr/jit/simdashwintrinsic.cpp b/src/coreclr/jit/simdashwintrinsic.cpp index ce1c1068ffb45c..744828da60d81f 100644 --- a/src/coreclr/jit/simdashwintrinsic.cpp +++ b/src/coreclr/jit/simdashwintrinsic.cpp @@ -531,8 +531,6 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, switch (intrinsic) { #if defined(TARGET_XARCH) - case NI_VectorT128_ConvertToDouble: - case NI_VectorT256_ConvertToDouble: case NI_VectorT128_ConvertToInt64: case NI_VectorT256_ConvertToInt64: case NI_VectorT128_ConvertToUInt32: @@ -541,6 +539,10 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, case NI_VectorT256_ConvertToUInt64: { // TODO-XARCH-CQ: These intrinsics should be accelerated + // This is not accelerated because scalar float/double->uint + // is not yet accelerated. Upon updating them for avx512, there + // will be a difference in values between non AVX512 and + // AVX512 machine. return nullptr; } @@ -1256,6 +1258,23 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, return gtNewSimdHWIntrinsicNode(retType, op1, convert, simdBaseJitType, simdSize); } + case NI_VectorT128_ConvertToDouble: + case NI_VectorT256_ConvertToDouble: + case NI_VectorT512_ConvertToDouble: + { + if (IsBaselineVector512IsaSupportedOpportunistically()) + { + assert(sig->numArgs == 1); + assert(simdBaseType == TYP_LONG || simdBaseType == TYP_ULONG); + NamedIntrinsic convert = (simdSize == 16) + ? NI_AVX512DQ_VL_ConvertToVector128Double + : (simdSize == 32) ? NI_AVX512DQ_VL_ConvertToVector256Double + : NI_AVX512DQ_ConvertToVector512Double; + return gtNewSimdHWIntrinsicNode(retType, op1, convert, simdBaseJitType, simdSize); + } + return nullptr; + } + case NI_VectorT128_ConvertToSingle: case NI_VectorT256_ConvertToSingle: { diff --git a/src/coreclr/jit/simdashwintrinsiclistxarch.h b/src/coreclr/jit/simdashwintrinsiclistxarch.h index 598712e86254d4..269a3067a3b30a 100644 --- a/src/coreclr/jit/simdashwintrinsiclistxarch.h +++ b/src/coreclr/jit/simdashwintrinsiclistxarch.h @@ -401,6 +401,7 @@ SIMD_AS_HWINTRINSIC_ID(VectorT256, Xor, // ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************* // Vector Intrinsics SIMD_AS_HWINTRINSIC_ID(VectorT512, get_Zero, 0, {NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT512, ConvertToDouble, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_VectorT512_ConvertToDouble, NI_VectorT512_ConvertToDouble, NI_Illegal, NI_Illegal}, SimdAsHWIntrinsicFlag::None) #undef SIMD_AS_HWINTRINSIC_NM #undef SIMD_AS_HWINTRINSIC_ID diff --git a/src/coreclr/vm/jithelpers.cpp b/src/coreclr/vm/jithelpers.cpp index d4ce2c9aa69ac6..476ca84d90ee0a 100644 --- a/src/coreclr/vm/jithelpers.cpp +++ b/src/coreclr/vm/jithelpers.cpp @@ -572,6 +572,38 @@ FORCEINLINE INT64 FastDbl2Lng(double val) #endif } +//------------------------------------------------------------------------ +// TruncateDouble: helper function to truncate double +// numbers to nearest integer (round towards zero). +// +// Arguments: +// val - double number to be truncated. +// +// Return Value: +// double: truncated number (rounded towards zero) +// +double TruncateDouble(double val) +{ + FCALL_CONTRACT; + int64_t *dintVal = (int64_t *)&val; + + uint64_t uintVal = (uint64_t)*dintVal; + int exponent = (int)((uintVal >> 52) & 0x7FF); + if (exponent < 1023) + { + uintVal = uintVal & 0x8000000000000000ull; + } + else if (exponent < 1075) + { + uintVal = uintVal & (unsigned long long)(~(0xFFFFFFFFFFFFF >> (exponent - 1023))); + } + int64_t intVal = (int64_t)uintVal; + double *doubleVal = (double *)&intVal; + double retVal = *doubleVal; + + return retVal; +} + /*********************************************************************/ HCIMPL1_V(UINT32, JIT_Dbl2UIntOvf, double val) { @@ -589,7 +621,13 @@ HCIMPLEND HCIMPL1_V(UINT64, JIT_Dbl2ULng, double val) { FCALL_CONTRACT; +#if defined(TARGET_X86) || defined(TARGET_AMD64) + const double uint64_max_plus_1 = -2.0 * (double)INT64_MIN; + val = TruncateDouble(val); + return ((val != val) || (val < 0) || (val >= uint64_max_plus_1)) ? UINT64_MAX : (UINT64)val; + +#else const double two63 = 2147483648.0 * 4294967296.0; UINT64 ret; if (val < two63) { @@ -600,6 +638,7 @@ HCIMPL1_V(UINT64, JIT_Dbl2ULng, double val) ret = FastDbl2Lng(val - two63) + I64(0x8000000000000000); } return ret; +#endif // TARGET_X86 || TARGET_AMD64 } HCIMPLEND diff --git a/src/libraries/System.Private.CoreLib/src/System/Double.cs b/src/libraries/System.Private.CoreLib/src/System/Double.cs index aaa637ae02a6f3..c459a648b44a64 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Double.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Double.cs @@ -1400,7 +1400,7 @@ private static bool TryConvertTo(double value, [MaybeNullWhen(false)] ou { #if TARGET_64BIT nuint actualResult = (value >= ulong.MaxValue) ? unchecked((nuint)ulong.MaxValue) : - (value <= ulong.MinValue) ? unchecked((nuint)ulong.MinValue) : (nuint)value; + (value <= ulong.MinValue || IsNaN(value)) ? unchecked((nuint)ulong.MinValue) : (nuint)value; result = (TOther)(object)actualResult; return true; #else diff --git a/src/libraries/System.Private.CoreLib/src/System/Half.cs b/src/libraries/System.Private.CoreLib/src/System/Half.cs index 6415acc9c798e3..07a70273594878 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Half.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Half.cs @@ -1883,7 +1883,7 @@ private static bool TryConvertTo(Half value, [MaybeNullWhen(false)] out else if (typeof(TOther) == typeof(nuint)) { nuint actualResult = (value == PositiveInfinity) ? nuint.MaxValue : - (value <= Zero) ? nuint.MinValue : (nuint)value; + (value <= Zero || IsNaN(value)) ? nuint.MinValue : (nuint)value; result = (TOther)(object)actualResult; return true; } diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/InteropServices/NFloat.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/InteropServices/NFloat.cs index e5645feb21ffa6..d8f35715ff0bf6 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/InteropServices/NFloat.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/InteropServices/NFloat.cs @@ -1754,7 +1754,7 @@ private static bool TryConvertTo(NFloat value, [MaybeNullWhen(false)] ou return true; #else nuint actualResult = (value >= ulong.MaxValue) ? unchecked((nuint)ulong.MaxValue) : - (value <= ulong.MinValue) ? unchecked((nuint)ulong.MinValue) : (nuint)value; + (value <= ulong.MinValue || IsNaN(value)) ? unchecked((nuint)ulong.MinValue) : (nuint)value; result = (TOther)(object)actualResult; return true; #endif diff --git a/src/libraries/System.Private.CoreLib/src/System/Single.cs b/src/libraries/System.Private.CoreLib/src/System/Single.cs index 42d63de43279b7..3219e9b27d5857 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Single.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Single.cs @@ -1380,7 +1380,7 @@ private static bool TryConvertTo(float value, [MaybeNullWhen(false)] out { #if TARGET_64BIT nuint actualResult = (value >= ulong.MaxValue) ? unchecked((nuint)ulong.MaxValue) : - (value <= ulong.MinValue) ? unchecked((nuint)ulong.MinValue) : (nuint)value; + (value <= ulong.MinValue || IsNaN(value)) ? unchecked((nuint)ulong.MinValue) : (nuint)value; result = (TOther)(object)actualResult; return true; #else diff --git a/src/tests/JIT/Directed/Convert/out_of_range_fp_to_int_conversions.cpp b/src/tests/JIT/Directed/Convert/out_of_range_fp_to_int_conversions.cpp index eaf7f2fa1a9daa..3890fcac11a3dd 100644 --- a/src/tests/JIT/Directed/Convert/out_of_range_fp_to_int_conversions.cpp +++ b/src/tests/JIT/Directed/Convert/out_of_range_fp_to_int_conversions.cpp @@ -137,6 +137,7 @@ extern "C" DLLEXPORT uint64_t ConvertDoubleToUInt64(double x, FPtoIntegerConver return ((x != x) || (x < INT64_MIN) || (x >= uint64_max_plus_1)) ? (uint64_t)INT64_MIN : (x < 0) ? (uint64_t)(int64_t)x : (uint64_t)x; case CONVERT_SENTINEL: + case CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64: return ((x != x) || (x < 0) || (x >= uint64_max_plus_1)) ? UINT64_MAX : (uint64_t)x; case CONVERT_SATURATING: @@ -153,18 +154,8 @@ extern "C" DLLEXPORT uint64_t ConvertDoubleToUInt64(double x, FPtoIntegerConver return (uint64_t)ConvertDoubleToInt64(x - int64_max_plus_1, CONVERT_MANAGED_BACKWARD_COMPATIBLE_ARM32) + (0x8000000000000000); } } - - case CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64: - if (x < int64_max_plus_1) - { - return (x < INT64_MIN) ? (uint64_t)INT64_MIN : (uint64_t)(int64_t)x; - } - else - { - x -= int64_max_plus_1; - x = trunc(x); - return (uint64_t)(((x != x) || (x >= int64_max_plus_1)) ? INT64_MIN : (int64_t)x) + (0x8000000000000000); - } + + case CONVERT_NATIVECOMPILERBEHAVIOR: // handled above, but add case to silence warning return 0; } diff --git a/src/tests/JIT/Directed/Convert/out_of_range_fp_to_int_conversions.cs b/src/tests/JIT/Directed/Convert/out_of_range_fp_to_int_conversions.cs index 5b78783c09e4ca..e2be91c974fec3 100644 --- a/src/tests/JIT/Directed/Convert/out_of_range_fp_to_int_conversions.cs +++ b/src/tests/JIT/Directed/Convert/out_of_range_fp_to_int_conversions.cs @@ -183,6 +183,7 @@ public static ulong ConvertDoubleToUInt64(double x, FPtoIntegerConversionType t) return (Double.IsNaN(x) || (x < long.MinValue) || (x >= ullong_max_plus_1)) ? unchecked((ulong)long.MinValue): (x < 0) ? (ulong)(long)x: (ulong)x; case FPtoIntegerConversionType.CONVERT_SENTINEL: + case FPtoIntegerConversionType.CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64: return (Double.IsNaN(x) || (x < 0) || (x >= ullong_max_plus_1)) ? ulong.MaxValue : (ulong)x; case FPtoIntegerConversionType.CONVERT_SATURATING: @@ -199,21 +200,7 @@ public static ulong ConvertDoubleToUInt64(double x, FPtoIntegerConversionType t) return (ulong)ConvertDoubleToInt64(x - two63, FPtoIntegerConversionType.CONVERT_MANAGED_BACKWARD_COMPATIBLE_ARM32) + (0x8000000000000000); } } - - case FPtoIntegerConversionType.CONVERT_MANAGED_BACKWARD_COMPATIBLE_X86_X64: - - if (x < two63) - { - return (x < long.MinValue) ? unchecked((ulong)long.MinValue) : (ulong)(long)x; - } - else - { - // (double)LLONG_MAX cannot be represented exactly as double - const double llong_max_plus_1 = (double)((ulong)long.MaxValue + 1); - x -= two63; - x = Math.Truncate(x); - return (ulong)((Double.IsNaN(x) || (x >= llong_max_plus_1)) ? long.MinValue : (long)x) + (0x8000000000000000); - } + } return 0;