AMDGPU: Correctly handle folding immediates into subregister use operands #129664
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

This fixes a miscompile where a 64-bit materialize incorrectly folds into a sub1 use operand.

We currently do not see many subregister use operands. Incidentally, there are also SIFoldOperands bugs that prevent this fold from appearing here. Pre-fix folding of 32-bit subregister uses from 64-bit materializes, in preparation for future patches.

The existing APIs are awkward since they expect to have a fully formed instruction with operands to use, and not something new which needs to be created.

Full diff: https://github.com/llvm/llvm-project/pull/129664.diff

2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
- (modified) llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
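To make the miscompile concrete, here is a minimal standalone sketch (illustrative only, not the pass's actual API) of the split that tryToFoldACImm now performs via SIInstrInfo::extractSubregFromImm: a sub0 use of a 64-bit materialized immediate should see the low 32 bits and a sub1 use the high 32 bits, never the full 64-bit value.

```cpp
// Illustrative sketch, assuming sub0/sub1 map to bits [31:0] and [63:32] of a
// 64-bit immediate; the real SIInstrInfo::extractSubregFromImm handles the
// general subregister indices.
#include <cassert>
#include <cstdint>
#include <optional>

enum class SubRegIdx { Sub0, Sub1 };

std::optional<int64_t> extractSubregFromImm(int64_t Imm, SubRegIdx Idx) {
  switch (Idx) {
  case SubRegIdx::Sub0:
    return static_cast<int64_t>(static_cast<uint32_t>(Imm));       // bits [31:0]
  case SubRegIdx::Sub1:
    return static_cast<int64_t>(static_cast<uint32_t>(Imm >> 32)); // bits [63:32]
  }
  return std::nullopt;
}

int main() {
  // For S_MOV_B64 8 in the first test: a sub0 use folds 8, but a sub1 use must
  // fold 0. The old code folded 8 into the sub1 use, which is the miscompile.
  assert(*extractSubregFromImm(8, SubRegIdx::Sub0) == 8);
  assert(*extractSubregFromImm(8, SubRegIdx::Sub1) == 0);
  return 0;
}
```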
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index eb9aabf8b6317..ee9a8ec3bb245 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -50,6 +50,11 @@ struct FoldCandidate {
}
}
+ FoldCandidate(MachineInstr *MI, unsigned OpNo, int64_t FoldImm,
+ bool Commuted_ = false, int ShrinkOp = -1)
+ : UseMI(MI), ImmToFold(FoldImm), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
+ Kind(MachineOperand::MO_Immediate), Commuted(Commuted_) {}
+
bool isFI() const {
return Kind == MachineOperand::MO_FrameIndex;
}
@@ -578,16 +583,29 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
}
static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
- MachineInstr *MI, unsigned OpNo,
- MachineOperand *FoldOp, bool Commuted = false,
- int ShrinkOp = -1) {
+ FoldCandidate &&Entry) {
// Skip additional folding on the same operand.
for (FoldCandidate &Fold : FoldList)
- if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
+ if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
return;
- LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
- << " operand " << OpNo << "\n " << *MI);
- FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
+ LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
+ << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI);
+ FoldList.push_back(Entry);
+}
+
+static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
+ MachineInstr *MI, unsigned OpNo,
+ MachineOperand *FoldOp, bool Commuted = false,
+ int ShrinkOp = -1) {
+ appendFoldCandidate(FoldList,
+ FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
+}
+
+static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
+ MachineInstr *MI, unsigned OpNo, int64_t ImmVal,
+ bool Commuted = false, int ShrinkOp = -1) {
+ appendFoldCandidate(FoldList,
+ FoldCandidate(MI, OpNo, ImmVal, Commuted, ShrinkOp));
}
bool SIFoldOperandsImpl::tryAddToFoldList(
@@ -847,11 +865,35 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
return false;
uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
- if (OpToFold.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
- appendFoldCandidate(FoldList, UseMI, UseOpIdx, &OpToFold);
- return true;
+ MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
+ if (OpToFold.isImm()) {
+ if (unsigned UseSubReg = UseOp.getSubReg()) {
+ std::optional<int64_t> SubImm =
+ SIInstrInfo::extractSubregFromImm(OpToFold.getImm(), UseSubReg);
+ if (!SubImm)
+ return false;
+
+ // TODO: Avoid the temporary MachineOperand
+ MachineOperand TmpOp = MachineOperand::CreateImm(*SubImm);
+ if (TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp)) {
+ appendFoldCandidate(FoldList, UseMI, UseOpIdx, *SubImm);
+ return true;
+ }
+
+ return false;
+ }
+
+ if (TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
+ appendFoldCandidate(FoldList, UseMI, UseOpIdx, &OpToFold);
+ return true;
+ }
}
+ // TODO: Verify the following code handles subregisters correctly.
+ // TODO: Handle extract of global reference
+ if (UseOp.getSubReg())
+ return false;
+
if (!OpToFold.isReg())
return false;
@@ -861,7 +903,6 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
// Maybe it is just a COPY of an immediate itself.
MachineInstr *Def = MRI->getVRegDef(UseReg);
- MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
MachineOperand &DefOp = Def->getOperand(1);
if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
index 591bda2b22f12..2389477bb72a8 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
@@ -17,7 +17,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[COPY]].sub1, %subreg.sub1
; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[REG_SEQUENCE]].sub0, 8, implicit-def $scc
- ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[REG_SEQUENCE]].sub1, 8, implicit-def $scc, implicit $scc
+ ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[REG_SEQUENCE]].sub1, 0, implicit-def $scc, implicit $scc
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
; CHECK-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE1]]
%0:sgpr_64 = COPY $sgpr8_sgpr9
@@ -42,8 +42,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr8_vgpr9
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[COPY]].sub1, %subreg.sub1
- ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[REG_SEQUENCE]].sub0, 30064771075, 0, implicit $exec
- ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[REG_SEQUENCE]].sub1, 30064771075, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[REG_SEQUENCE]].sub0, 3, 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[REG_SEQUENCE]].sub1, 7, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
; CHECK-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE1]]
%0:vreg_64 = COPY $vgpr8_vgpr9
@@ -116,7 +116,7 @@ body: |
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 8
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B64_]].sub1, %subreg.sub0, [[S_MOV_B64_]].sub0, %subreg.sub1
; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]].sub0, 8, implicit-def $scc
- ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY]].sub1, 8, implicit-def $scc, implicit $scc
+ ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY]].sub1, 0, implicit-def $scc, implicit $scc
; CHECK-NEXT: S_ENDPGM 0, implicit [[S_ADD_U32_]], implicit [[S_ADDC_U32_]]
%0:sgpr_64 = COPY $sgpr8_sgpr9
%1:sreg_64 = S_MOV_B64 8
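As a quick standalone check (not part of the patch) of the constants now expected by the updated VALU test: the 64-bit materialized value splits into the two 32-bit halves that the V_ADD_CO_U32/V_ADDC_U32 CHECK lines fold.

```cpp
// Sanity check of the constant split shown in the MIR test:
// 30064771075 == 0x7'0000'0003, so the sub0 use folds 3 and the sub1 use folds 7.
#include <cstdint>
static_assert(UINT64_C(30064771075) == ((UINT64_C(7) << 32) | 3), "");
static_assert(static_cast<uint32_t>(UINT64_C(30064771075)) == 3, "");        // sub0
static_assert(static_cast<uint32_t>(UINT64_C(30064771075) >> 32) == 7, "");  // sub1
int main() { return 0; }
```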