AMDGPU: More consistently use the fold list instead of direct mutation #127612
Conversation
There were two parallel fold-check mechanisms, so consistently use the fold list. The worklist management here is still not good: other types of folds are not using it, and we should probably rewrite the pass to look more like peephole-opt. This should be an alternative fix to skipping the commute when the operands are the same (#127562). The new test does not fail as-is, but it demonstrates failures in a future patch.
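For context, here is a minimal standalone sketch of the two strategies this patch reconciles: mutating a use operand in place versus recording a fold candidate on a list and applying everything from that list in one place. The types below are hypothetical stand-ins, not LLVM's MachineInstr/MachineOperand or the pass's real FoldCandidate; the point is only that with a single list, a question like "are we already folding into this instruction?" has one source of truth.

```cpp
#include <cstdint>
#include <vector>

struct Operand {
  bool IsImm = false;
  int64_t Imm = 0;
};

struct Instr {
  std::vector<Operand> Operands;
};

struct FoldCandidate {
  Instr *UseMI;   // instruction being folded into
  unsigned OpNo;  // operand index within UseMI
  int64_t Imm;    // immediate value to fold
};

// Direct-mutation style: the use is rewritten on the spot, so a later check
// that only scans the fold list cannot tell this instruction was touched.
void foldDirect(Instr &Use, unsigned OpNo, int64_t Imm) {
  Use.Operands[OpNo].IsImm = true;
  Use.Operands[OpNo].Imm = Imm;
}

// Fold-list style: record the candidate now, mutate later. Every fold is
// visible to a single "are we already folding into this instruction?" query.
void appendFoldCandidate(std::vector<FoldCandidate> &FoldList, Instr *Use,
                         unsigned OpNo, int64_t Imm) {
  FoldList.push_back({Use, OpNo, Imm});
}

void applyFolds(const std::vector<FoldCandidate> &FoldList) {
  for (const FoldCandidate &C : FoldList)
    foldDirect(*C.UseMI, C.OpNo, C.Imm);
}

int main() {
  Instr Add{{Operand{}, Operand{}}};
  std::vector<FoldCandidate> FoldList;
  appendFoldCandidate(FoldList, &Add, /*OpNo=*/0, /*Imm=*/0);
  applyFolds(FoldList); // all mutation happens here, after legality checks
}
```

In the actual change, tryToFoldACImm stops calling ChangeToImmediate directly and instead calls appendFoldCandidate, so its folds go through the same list as every other fold and the separate isUseMIInFoldList bookkeeping can be dropped.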
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

There were two parallel fold-check mechanisms, so consistently use the fold list. The worklist management here is still not good: other types of folds are not using it, and we should probably rewrite the pass to look more like peephole-opt.

This should be an alternative fix to skipping the commute when the operands are the same (#127562). The new test does not fail as-is, but it demonstrates failures in a future patch.

Full diff: https://github.com/llvm/llvm-project/pull/127612.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index ab396929162d0..c522eeab80fdb 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -114,7 +114,7 @@ class SIFoldOperandsImpl {
bool
getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
Register UseReg, uint8_t OpTy) const;
- bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
+ bool tryToFoldACImm(MachineOperand &OpToFold, MachineInstr *UseMI,
unsigned UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList) const;
void foldOperand(MachineOperand &OpToFold,
@@ -573,11 +573,6 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
return true;
}
-static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
- const MachineInstr *MI) {
- return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
-}
-
static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *FoldOp, bool Commuted = false,
@@ -678,12 +673,6 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
}
}
- // If we are already folding into another operand of MI, then
- // we can't commute the instruction, otherwise we risk making the
- // other fold illegal.
- if (isUseMIInFoldList(FoldList, MI))
- return false;
-
// Operand is not legal, so try to commute the instruction to
// see if this makes it possible to fold.
unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
@@ -814,7 +803,7 @@ bool SIFoldOperandsImpl::getRegSeqInit(
}
bool SIFoldOperandsImpl::tryToFoldACImm(
- const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
+ MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList) const {
const MCInstrDesc &Desc = UseMI->getDesc();
if (UseOpIdx >= Desc.getNumOperands())
@@ -825,7 +814,7 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
if (OpToFold.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
- UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
+ appendFoldCandidate(FoldList, UseMI, UseOpIdx, &OpToFold);
return true;
}
@@ -836,16 +825,13 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
if (!UseReg.isVirtual())
return false;
- if (isUseMIInFoldList(FoldList, UseMI))
- return false;
-
// Maybe it is just a COPY of an immediate itself.
MachineInstr *Def = MRI->getVRegDef(UseReg);
MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
MachineOperand &DefOp = Def->getOperand(1);
if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
- UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
+ appendFoldCandidate(FoldList, UseMI, UseOpIdx, &DefOp);
return true;
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-operands-commute-same-operands-assert.mir b/llvm/test/CodeGen/AMDGPU/si-fold-operands-commute-same-operands-assert.mir
new file mode 100644
index 0000000000000..18fbf3eaf9c8d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-operands-commute-same-operands-assert.mir
@@ -0,0 +1,63 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=peephole-opt,si-fold-operands -o - %s | FileCheck %s
+
+# Check for assert when trying later folds after commuting an
+# instruction where both operands are the same register. This depended
+# on use list ordering, so we need to run peephole-opt first to
+# reproduce.
+
+---
+name: commute_add_same_inputs_assert
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; CHECK-LABEL: name: commute_add_same_inputs_assert
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 0, killed [[COPY]], implicit-def $vcc, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADDC_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 0, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $vcc, implicit $exec
+ ; CHECK-NEXT: $vgpr4 = COPY [[V_ADDC_U32_e32_]]
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr4
+ %0:vgpr_32 = COPY $vgpr0
+ %1:sreg_64 = S_MOV_B64 0
+ %2:sreg_32 = COPY %1.sub0
+ %3:vgpr_32 = V_ADD_CO_U32_e32 %2, killed %0, implicit-def $vcc, implicit $exec
+ %4:vgpr_32 = COPY %2
+ %5:vgpr_32 = COPY %2
+ %6:vgpr_32 = V_ADDC_U32_e32 %4, %5, implicit-def $vcc, implicit $vcc, implicit $exec
+ $vgpr4 = COPY %6
+ SI_RETURN implicit $vgpr4
+
+...
+
+---
+name: commute_sub_same_inputs_assert
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; CHECK-LABEL: name: commute_sub_same_inputs_assert
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[V_SUB_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 0, killed [[COPY]], implicit-def $vcc, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[V_SUBBREV_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUBBREV_U32_e32 0, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $vcc, implicit $exec
+ ; CHECK-NEXT: $vgpr4 = COPY [[V_SUBBREV_U32_e32_]]
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr4
+ %0:vgpr_32 = COPY $vgpr0
+ %1:sreg_64 = S_MOV_B64 0
+ %2:sreg_32 = COPY %1.sub0
+ %3:vgpr_32 = V_SUB_CO_U32_e32 %2, killed %0, implicit-def $vcc, implicit $exec
+ %4:vgpr_32 = COPY %2
+ %5:vgpr_32 = COPY %2
+ %6:vgpr_32 = V_SUBB_U32_e32 %5, %4, implicit-def $vcc, implicit $vcc, implicit $exec
+ $vgpr4 = COPY %6
+ SI_RETURN implicit $vgpr4
+
+...
ping