-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Optimize FMA codegen based on the overwritten operand #58196
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
ee2c0b6
46d0011
cce4bda
b825291
f615e39
b698036
7d9c0d6
1344d92
029a9b5
f2a371f
9955389
7c56653
1d51caa
091133e
9a6ae44
ffcff76
5641f8f
b7312ac
a325fe3
0f950dd
33a596d
5da9368
c3a9f07
9e356aa
f8159bc
18bbe4d
2ca2524
17bd967
eed5912
43c5034
5ef70a5
bfa6924
12f260b
5ca658e
ec4ef66
aa93a85
c66a018
ff5a433
a4657c7
75d7a37
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2133,47 +2133,74 @@ void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node) | |
| // Intrinsics with CopyUpperBits semantics cannot have op1 be contained | ||
| assert(!copiesUpperBits || !op1->isContained()); | ||
|
|
||
| if (op2->isContained() || op2->isUsedFromSpillTemp()) | ||
| unsigned overwrittenOpNum = 0; | ||
| LIR::Use use; | ||
| if (LIR::AsRange(compiler->compCurBB).TryGetUse(node, &use)) | ||
| { | ||
| // 132 form: op1 = (op1 * op3) + [op2] | ||
| overwrittenOpNum = node->GetOverwrittenOpNumForFMA(use.User(), op1, op2, op3); | ||
| } | ||
|
|
||
| ins = (instruction)(ins - 1); | ||
| op1Reg = op1->GetRegNum(); | ||
| op2Reg = op3->GetRegNum(); | ||
| op3 = op2; | ||
| if (overwrittenOpNum == 1) | ||
| { | ||
| if (op2->isContained()) | ||
| { | ||
| // op1 = (op1 * [op2]) + op3 | ||
| // 132 form: XMM1 = (XMM1 * [XMM3]) + XMM2 | ||
| ins = (instruction)(ins - 1); | ||
| op1Reg = op1->GetRegNum(); | ||
| op2Reg = op3->GetRegNum(); | ||
| op3 = op2; | ||
| } | ||
| else | ||
| { | ||
| // op1 = (op1 * op2) + [op3] | ||
| // 213 form: XMM1 = (XMM2 * XMM1) + [XMM3] | ||
| op1Reg = op1->GetRegNum(); | ||
| op2Reg = op2->GetRegNum(); | ||
|
||
| } | ||
| } | ||
| else if (op1->isContained() || op1->isUsedFromSpillTemp()) | ||
| else if (overwrittenOpNum == 3) | ||
| { | ||
| // 231 form: op3 = (op2 * op3) + [op1] | ||
|
|
||
| // 231 form: XMM1 = (XMM2 * [XMM3]) + XMM1 | ||
| // One of the following: | ||
| // op3 = ([op1] * op2) + op3 | ||
| // op3 = (op1 * [op2]) + op3 | ||
| ins = (instruction)(ins + 1); | ||
| op1Reg = op3->GetRegNum(); | ||
| op2Reg = op2->GetRegNum(); | ||
| op3 = op1; | ||
| if (op1->isContained()) | ||
tannergooding marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| { | ||
| // op3 = ([op1] * op2) + op3 | ||
| op2Reg = op2->GetRegNum(); | ||
| op3 = op1; | ||
| } | ||
| else | ||
| { | ||
| // op3 = (op1 * [op2]) + op3 | ||
| op2Reg = op1->GetRegNum(); | ||
| op3 = op2; | ||
| } | ||
| } | ||
| else | ||
| { | ||
| // 213 form: op1 = (op2 * op1) + [op3] | ||
|
|
||
| op1Reg = op1->GetRegNum(); | ||
| op2Reg = op2->GetRegNum(); | ||
|
|
||
| isCommutative = !copiesUpperBits; | ||
| } | ||
|
|
||
| if (isCommutative && (op1Reg != targetReg) && (op2Reg == targetReg)) | ||
| { | ||
| assert(node->isRMWHWIntrinsic(compiler)); | ||
| assert(overwrittenOpNum == 2 || overwrittenOpNum == 0); | ||
tannergooding marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| if (op1->isContained()) | ||
tannergooding marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| { | ||
| // op2 = ([op1] * op2) + op3 | ||
| // 132 form: XMM1 = (XMM1 * [XMM3]) + XMM2 | ||
| ins = (instruction)(ins - 1); | ||
| op1Reg = op2->GetRegNum(); | ||
| op2Reg = op3->GetRegNum(); | ||
| op3 = op1; | ||
|
|
||
| // We have "reg2 = (reg1 * reg2) +/- op3" where "reg1 != reg2" on a RMW intrinsic. | ||
| // | ||
| // For non-commutative intrinsics, we should have ensured that op2 was marked | ||
| // delay free in order to prevent it from getting assigned the same register | ||
| // as target. However, for commutative intrinsics, we can just swap the operands | ||
| // in order to have "reg2 = reg2 op reg1" which will end up producing the right code. | ||
| } | ||
| else | ||
| { | ||
| // op2 = (op1 * op2) + [op3] | ||
| // 213 form: XMM1 = (XMM2 * XMM1) + [XMM3] | ||
| op1Reg = op2->GetRegNum(); | ||
| op2Reg = op1->GetRegNum(); | ||
|
||
| } | ||
|
|
||
| op2Reg = op1Reg; | ||
| op1Reg = targetReg; | ||
| } | ||
|
|
||
| genHWIntrinsic_R_R_R_RM(ins, attr, targetReg, op1Reg, op2Reg, op3); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2334,47 +2334,85 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) | |
|
|
||
| const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId); | ||
|
|
||
| unsigned overwrittenOpNum = 0; | ||
| LIR::Use use; | ||
| if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(intrinsicTree, &use)) | ||
| { | ||
| overwrittenOpNum = intrinsicTree->GetOverwrittenOpNumForFMA(use.User(), op1, op2, op3); | ||
| } | ||
|
|
||
| // Intrinsics with CopyUpperBits semantics cannot have op1 be contained | ||
| assert(!copiesUpperBits || !op1->isContained()); | ||
|
|
||
| if (op2->isContained()) | ||
| if (overwrittenOpNum == 1) | ||
| { | ||
| // 132 form: op1 = (op1 * op3) + [op2] | ||
| if (op2->isContained()) | ||
| { | ||
| // op1 = (op1 * [op2]) + op3 | ||
| // 132 form: XMM1 = (XMM1 * [XMM3]) + XMM2 | ||
| tgtPrefUse = BuildUse(op1); | ||
|
|
||
| tgtPrefUse = BuildUse(op1); | ||
| srcCount += 1; | ||
| srcCount += BuildOperandUses(op2); | ||
| srcCount += BuildDelayFreeUses(op3, op1); | ||
| } | ||
| else | ||
| { | ||
| //assert(op3->isContained()); | ||
| // op1 = (op1 * op2) + [op3] | ||
| // 213 form: XMM1 = (XMM2 * XMM1) + [XMM3] | ||
| tgtPrefUse = BuildUse(op1); | ||
|
|
||
| srcCount += 1; | ||
| srcCount += BuildOperandUses(op2); | ||
| srcCount += BuildDelayFreeUses(op3, op1); | ||
| srcCount += 1; | ||
| srcCount += op3->isContained() ? BuildOperandUses(op3) : BuildDelayFreeUses(op3, op1); | ||
| srcCount += BuildDelayFreeUses(op2, op1); | ||
|
|
||
| } | ||
| } | ||
| else if (op1->isContained()) | ||
| else if (overwrittenOpNum == 3) | ||
| { | ||
| // 231 form: op3 = (op2 * op3) + [op1] | ||
|
|
||
| // 231 form: XMM1 = (XMM2 * [XMM3]) + XMM1 | ||
| // One of the following: | ||
| // op3 = ([op1] * op2) + op3 | ||
| // op3 = (op1 * [op2]) + op3 | ||
| tgtPrefUse = BuildUse(op3); | ||
|
|
||
| srcCount += BuildOperandUses(op1); | ||
| srcCount += BuildDelayFreeUses(op2, op1); | ||
| srcCount += 1; | ||
| if (op1->isContained()) | ||
| { | ||
| srcCount += BuildOperandUses(op1); | ||
| srcCount += BuildDelayFreeUses(op2, op3); | ||
| } | ||
| else | ||
| { | ||
| //assert(op2->isContained()); | ||
| srcCount += op2->isContained() ? BuildOperandUses(op2) : BuildDelayFreeUses(op2, op3); | ||
| srcCount += BuildDelayFreeUses(op1, op3); | ||
| } | ||
|
|
||
| } | ||
| else | ||
| { | ||
| // 213 form: op1 = (op2 * op1) + [op3] | ||
| assert(overwrittenOpNum == 2 || overwrittenOpNum == 0); | ||
|
|
||
| tgtPrefUse = BuildUse(op1); | ||
| srcCount += 1; | ||
|
|
||
| if (copiesUpperBits) | ||
| if (op1->isContained()) | ||
|
||
| { | ||
| srcCount += BuildDelayFreeUses(op2, op1); | ||
| // op2 = ([op1] * op2) + op3 | ||
| // 132 form: XMM1 = (XMM1 * [XMM3]) + XMM2 | ||
| tgtPrefUse = BuildUse(op2); | ||
| srcCount += 1; | ||
| srcCount += BuildOperandUses(op1); | ||
| srcCount += BuildDelayFreeUses(op3, op2); | ||
| } | ||
| else | ||
| { | ||
| tgtPrefUse2 = BuildUse(op2); | ||
| // op2 = (op1 * op2) + [op3] | ||
| // 213 form: XMM1 = (XMM2 * XMM1) + [XMM3] | ||
|
|
||
| tgtPrefUse = BuildUse(op2); | ||
| srcCount += 1; | ||
| srcCount += op3->isContained() ? BuildOperandUses(op3) : BuildDelayFreeUses(op3, op1); | ||
| srcCount += BuildDelayFreeUses(op1, op2); | ||
| } | ||
|
|
||
| srcCount += op3->isContained() ? BuildOperandUses(op3) : BuildDelayFreeUses(op3, op1); | ||
| } | ||
|
|
||
| buildUses = false; | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.