
Commit c8f4c35

AMDGPU: Correctly handle folding immediates into subregister use operands (#129664)
This fixes a miscompile where a 64-bit materialize incorrectly folds into a sub1 use operand. We currently do not see many subregister use operands; incidentally, there are also SIFoldOperands bugs that prevent this fold from appearing here.

This pre-fixes folding of 32-bit subregister uses from 64-bit materializes, in preparation for future patches. The existing APIs were awkward, since they expect a fully formed instruction with operands to use, rather than something new that needs to be created.
1 parent 0f869cc commit c8f4c35
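
To make the miscompile concrete, here is a minimal, self-contained C++ sketch of the low/high-half split the fold must respect. `extractSubregFromImm64` is a hypothetical stand-in for `SIInstrInfo::extractSubregFromImm`, and the exact extension semantics are an assumption; the constants come from the updated test below.

```cpp
#include <cassert>
#include <cstdint>
#include <optional>

// Hypothetical stand-in for SIInstrInfo::extractSubregFromImm: split a
// 64-bit immediate into its 32-bit halves, where sub0 is the low half and
// sub1 the high half. Extension behavior of the real API is an assumption.
enum class SubRegIdx { Sub0, Sub1 };

std::optional<int64_t> extractSubregFromImm64(int64_t Imm, SubRegIdx Idx) {
  switch (Idx) {
  case SubRegIdx::Sub0:
    return static_cast<uint32_t>(Imm); // low 32 bits
  case SubRegIdx::Sub1:
    return static_cast<uint32_t>(static_cast<uint64_t>(Imm) >> 32); // high 32 bits
  }
  return std::nullopt; // unhandled subregister index: not foldable
}

int main() {
  // 0x700000003 is the 30064771075 materialize from the updated MIR test:
  // the sub0 use folds 3, the sub1 use folds 7.
  assert(*extractSubregFromImm64(0x700000003, SubRegIdx::Sub0) == 3);
  assert(*extractSubregFromImm64(0x700000003, SubRegIdx::Sub1) == 7);
  // The miscompile: a sub1 use of an S_MOV_B64 8 must fold 0, not 8.
  assert(*extractSubregFromImm64(8, SubRegIdx::Sub1) == 0);
}
```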

2 files changed: +57 -16

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Lines changed: 53 additions & 12 deletions

@@ -50,6 +50,11 @@ struct FoldCandidate {
     }
   }
 
+  FoldCandidate(MachineInstr *MI, unsigned OpNo, int64_t FoldImm,
+                bool Commuted_ = false, int ShrinkOp = -1)
+      : UseMI(MI), ImmToFold(FoldImm), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
+        Kind(MachineOperand::MO_Immediate), Commuted(Commuted_) {}
+
   bool isFI() const {
     return Kind == MachineOperand::MO_FrameIndex;
   }

@@ -578,16 +583,29 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
 }
 
 static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
-                                MachineInstr *MI, unsigned OpNo,
-                                MachineOperand *FoldOp, bool Commuted = false,
-                                int ShrinkOp = -1) {
+                                FoldCandidate &&Entry) {
   // Skip additional folding on the same operand.
   for (FoldCandidate &Fold : FoldList)
-    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
+    if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
       return;
-  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
-                    << " operand " << OpNo << "\n  " << *MI);
-  FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
+  LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
+                    << " operand " << Entry.UseOpNo << "\n  " << *Entry.UseMI);
+  FoldList.push_back(Entry);
+}
+
+static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
+                                MachineInstr *MI, unsigned OpNo,
+                                MachineOperand *FoldOp, bool Commuted = false,
+                                int ShrinkOp = -1) {
+  appendFoldCandidate(FoldList,
+                      FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
+}
+
+static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
+                                MachineInstr *MI, unsigned OpNo, int64_t ImmVal,
+                                bool Commuted = false, int ShrinkOp = -1) {
+  appendFoldCandidate(FoldList,
+                      FoldCandidate(MI, OpNo, ImmVal, Commuted, ShrinkOp));
 }
 
 bool SIFoldOperandsImpl::tryAddToFoldList(

@@ -847,11 +865,35 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
     return false;
 
   uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
-  if (OpToFold.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
-    appendFoldCandidate(FoldList, UseMI, UseOpIdx, &OpToFold);
-    return true;
+  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
+  if (OpToFold.isImm()) {
+    if (unsigned UseSubReg = UseOp.getSubReg()) {
+      std::optional<int64_t> SubImm =
+          SIInstrInfo::extractSubregFromImm(OpToFold.getImm(), UseSubReg);
+      if (!SubImm)
+        return false;
+
+      // TODO: Avoid the temporary MachineOperand
+      MachineOperand TmpOp = MachineOperand::CreateImm(*SubImm);
+      if (TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp)) {
+        appendFoldCandidate(FoldList, UseMI, UseOpIdx, *SubImm);
+        return true;
+      }
+
+      return false;
+    }
+
+    if (TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
+      appendFoldCandidate(FoldList, UseMI, UseOpIdx, &OpToFold);
+      return true;
+    }
   }
 
+  // TODO: Verify the following code handles subregisters correctly.
+  // TODO: Handle extract of global reference
+  if (UseOp.getSubReg())
+    return false;
+
   if (!OpToFold.isReg())
     return false;
 
@@ -861,8 +903,7 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
 
   // Maybe it is just a COPY of an immediate itself.
   MachineInstr *Def = MRI->getVRegDef(UseReg);
-  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
-  if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
+  if (Def && TII->isFoldableCopy(*Def)) {
     MachineOperand &DefOp = Def->getOperand(1);
     if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
       appendFoldCandidate(FoldList, UseMI, UseOpIdx, &DefOp);
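
Taken together, the new FoldCandidate constructor and the int64_t overload of appendFoldCandidate let a caller record an immediate fold without first materializing a MachineOperand. A sketch of the intended call pattern, following the tryToFoldACImm hunk above (AMDGPU::sub1 stands in here for whatever subregister index the use operand actually carries):

```cpp
// Fold only the matching half of a 64-bit materialize into a 32-bit use.
std::optional<int64_t> SubImm =
    SIInstrInfo::extractSubregFromImm(OpToFold.getImm(), AMDGPU::sub1);
if (SubImm)
  appendFoldCandidate(FoldList, UseMI, UseOpIdx, *SubImm);
```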

llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
Lines changed: 4 additions & 4 deletions

@@ -17,7 +17,7 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[COPY]].sub1, %subreg.sub1
     ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[REG_SEQUENCE]].sub0, 8, implicit-def $scc
-    ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[REG_SEQUENCE]].sub1, 8, implicit-def $scc, implicit $scc
+    ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[REG_SEQUENCE]].sub1, 0, implicit-def $scc, implicit $scc
     ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE1]]
     %0:sgpr_64 = COPY $sgpr8_sgpr9

@@ -42,8 +42,8 @@ body: |
     ; CHECK-NEXT: {{ $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr8_vgpr9
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[COPY]].sub1, %subreg.sub1
-    ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[REG_SEQUENCE]].sub0, 30064771075, 0, implicit $exec
-    ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[REG_SEQUENCE]].sub1, 30064771075, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[REG_SEQUENCE]].sub0, 3, 0, implicit $exec
+    ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[REG_SEQUENCE]].sub1, 7, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE1]]
     %0:vreg_64 = COPY $vgpr8_vgpr9

@@ -116,7 +116,7 @@ body: |
     ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 8
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B64_]].sub1, %subreg.sub0, [[S_MOV_B64_]].sub0, %subreg.sub1
     ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]].sub0, 8, implicit-def $scc
-    ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY]].sub1, 8, implicit-def $scc, implicit $scc
+    ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY]].sub1, 0, implicit-def $scc, implicit $scc
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_ADD_U32_]], implicit [[S_ADDC_U32_]]
     %0:sgpr_64 = COPY $sgpr8_sgpr9
     %1:sreg_64 = S_MOV_B64 8
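
A note on the updated constants: 30064771075 is 0x700000003, so the sub0 use now folds the low half (3) and the sub1 use the high half (7). Likewise, the S_ADDC_U32 uses of an 8 materialize now fold 0 (the high half of 0x0000000000000008) instead of 8, which was the miscompile.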
