Skip to content

Commit 87d884b

Browse files
authored
[AMDGPU] Fix folding of v2i16/v2f16 splat imms (#72709)
We can use inline constants with packed 16-bit operands, but these should use op_sel. Currently a splat of an inlinable constant is considered legal, which is not really true if we fail to fold it with op_sel and drop the high half. It may be legal as a literal but not as an inline constant, but then the usual literal checks must be performed. This patch makes these splat literals illegal but adds additional logic to the operand folding to keep current folds. This logic is somewhat heavy, though. This has fixed a constant bus violation in the fdot2 test.
1 parent a1b3b78 commit 87d884b

File tree

5 files changed

+125
-69
lines changed

5 files changed

+125
-69
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 85 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,10 @@ class SIFoldOperands : public MachineFunctionPass {
8080

8181
bool updateOperand(FoldCandidate &Fold) const;
8282

83+
bool canUseImmWithOpSel(FoldCandidate &Fold) const;
84+
85+
bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;
86+
8387
bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
8488
MachineInstr *MI, unsigned OpNo,
8589
MachineOperand *OpToFold) const;
@@ -196,62 +200,85 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
196200
return new SIFoldOperands();
197201
}
198202

199-
bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
203+
bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
200204
MachineInstr *MI = Fold.UseMI;
201205
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
202-
assert(Old.isReg());
206+
const uint64_t TSFlags = MI->getDesc().TSFlags;
203207

208+
assert(Old.isReg() && Fold.isImm());
204209

205-
const uint64_t TSFlags = MI->getDesc().TSFlags;
206-
if (Fold.isImm()) {
207-
if (TSFlags & SIInstrFlags::IsPacked && !(TSFlags & SIInstrFlags::IsMAI) &&
208-
AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
209-
ST->hasInv2PiInlineImm())) {
210-
if (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT))
211-
return false; // Prevent further folding of this operand without opsel.
212-
213-
// Set op_sel/op_sel_hi on this operand or bail out if op_sel is
214-
// already set.
215-
unsigned Opcode = MI->getOpcode();
216-
int OpNo = MI->getOperandNo(&Old);
217-
int ModIdx = -1;
218-
if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
219-
ModIdx = AMDGPU::OpName::src0_modifiers;
220-
else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
221-
ModIdx = AMDGPU::OpName::src1_modifiers;
222-
else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
223-
ModIdx = AMDGPU::OpName::src2_modifiers;
224-
assert(ModIdx != -1);
225-
ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
226-
MachineOperand &Mod = MI->getOperand(ModIdx);
227-
unsigned Val = Mod.getImm();
228-
if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
229-
// Only apply the following transformation if that operand requires
230-
// a packed immediate.
231-
switch (TII->get(Opcode).operands()[OpNo].OperandType) {
232-
case AMDGPU::OPERAND_REG_IMM_V2FP16:
233-
case AMDGPU::OPERAND_REG_IMM_V2INT16:
234-
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
235-
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
236-
// If upper part is all zero we do not need op_sel_hi.
237-
if (!isUInt<16>(Fold.ImmToFold)) {
238-
if (!(Fold.ImmToFold & 0xffff)) {
239-
Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
240-
Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
241-
Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
242-
return true;
243-
}
244-
Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
245-
Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
246-
return true;
247-
}
248-
break;
249-
default:
250-
break;
251-
}
252-
}
253-
}
210+
if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
211+
(ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) ||
212+
isUInt<16>(Fold.ImmToFold) ||
213+
!AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm()))
214+
return false;
215+
216+
unsigned Opcode = MI->getOpcode();
217+
int OpNo = MI->getOperandNo(&Old);
218+
uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
219+
switch (OpType) {
220+
default:
221+
return false;
222+
case AMDGPU::OPERAND_REG_IMM_V2FP16:
223+
case AMDGPU::OPERAND_REG_IMM_V2INT16:
224+
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
225+
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
226+
break;
227+
}
228+
229+
return true;
230+
}
231+
232+
bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
233+
MachineInstr *MI = Fold.UseMI;
234+
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
235+
unsigned Opcode = MI->getOpcode();
236+
int OpNo = MI->getOperandNo(&Old);
237+
238+
// Set op_sel/op_sel_hi on this operand or bail out if op_sel is
239+
// already set.
240+
int ModIdx = -1;
241+
if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
242+
ModIdx = AMDGPU::OpName::src0_modifiers;
243+
else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
244+
ModIdx = AMDGPU::OpName::src1_modifiers;
245+
else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
246+
ModIdx = AMDGPU::OpName::src2_modifiers;
247+
assert(ModIdx != -1);
248+
ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
249+
MachineOperand &Mod = MI->getOperand(ModIdx);
250+
unsigned Val = Mod.getImm();
251+
if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
252+
return false;
253+
254+
// Only apply the following transformation if that operand requires
255+
// a packed immediate.
256+
// If upper part is all zero we do not need op_sel_hi.
257+
if (!(Fold.ImmToFold & 0xffff)) {
258+
MachineOperand New =
259+
MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff);
260+
if (!TII->isOperandLegal(*MI, OpNo, &New))
261+
return false;
262+
Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
263+
Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
264+
Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
265+
return true;
254266
}
267+
MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff);
268+
if (!TII->isOperandLegal(*MI, OpNo, &New))
269+
return false;
270+
Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
271+
Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
272+
return true;
273+
}
274+
275+
bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
276+
MachineInstr *MI = Fold.UseMI;
277+
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
278+
assert(Old.isReg());
279+
280+
if (Fold.isImm() && canUseImmWithOpSel(Fold))
281+
return tryFoldImmWithOpSel(Fold);
255282

256283
if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
257284
MachineBasicBlock *MBB = MI->getParent();
@@ -383,7 +410,13 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
383410
return false;
384411
};
385412

386-
if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
413+
bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
414+
if (!IsLegal && OpToFold->isImm()) {
415+
FoldCandidate Fold(MI, OpNo, OpToFold);
416+
IsLegal = canUseImmWithOpSel(Fold);
417+
}
418+
419+
if (!IsLegal) {
387420
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
388421
unsigned NewOpc = macToMad(Opc);
389422
if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4144,12 +4144,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
41444144
case AMDGPU::OPERAND_REG_IMM_V2INT16:
41454145
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
41464146
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
4147-
// This suffers the same problem as the scalar 16-bit cases.
4148-
return AMDGPU::isInlinableIntLiteralV216(Imm);
4147+
return (isInt<16>(Imm) || isUInt<16>(Imm)) &&
4148+
AMDGPU::isInlinableIntLiteral((int16_t)Imm);
41494149
case AMDGPU::OPERAND_REG_IMM_FP16:
41504150
case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
41514151
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
4152-
case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
4152+
case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
4153+
case AMDGPU::OPERAND_REG_IMM_V2FP16:
4154+
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
4155+
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
41534156
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
41544157
// A few special case instructions have 16-bit operands on subtargets
41554158
// where 16-bit instructions are not legal.
@@ -4162,12 +4165,6 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
41624165

41634166
return false;
41644167
}
4165-
case AMDGPU::OPERAND_REG_IMM_V2FP16:
4166-
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
4167-
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
4168-
uint32_t Trunc = static_cast<uint32_t>(Imm);
4169-
return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
4170-
}
41714168
case AMDGPU::OPERAND_KIMM32:
41724169
case AMDGPU::OPERAND_KIMM16:
41734170
return false;

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2507,6 +2507,16 @@ bool isInlinableIntLiteralV216(int32_t Literal) {
25072507
return Lo16 == Hi16 && isInlinableIntLiteral(Lo16);
25082508
}
25092509

2510+
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType) {
2511+
switch (OpType) {
2512+
case AMDGPU::OPERAND_REG_IMM_V2FP16:
2513+
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
2514+
return isInlinableLiteralV216(Literal, HasInv2Pi);
2515+
default:
2516+
return isInlinableIntLiteralV216(Literal);
2517+
}
2518+
}
2519+
25102520
bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) {
25112521
assert(HasInv2Pi);
25122522

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1286,6 +1286,9 @@ bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);
12861286
LLVM_READNONE
12871287
bool isInlinableIntLiteralV216(int32_t Literal);
12881288

1289+
LLVM_READNONE
1290+
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType);
1291+
12891292
LLVM_READNONE
12901293
bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi);
12911294

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -72,16 +72,29 @@ entry:
7272
ret void
7373
}
7474

75-
; FIXME: This test violates constant bus restriction.
75+
; Make sure we do not violate the constant bus restriction with 3 scalar inputs and a seemingly inlinable literal.
7676

7777
define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis(
78-
; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis:
79-
; GFX11: ; %bb.0: ; %entry
80-
; GFX11-NEXT: v_dot2_bf16_bf16 v2, s0, 0x10001, s1
81-
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
82-
; GFX11-NEXT: s_nop 0
83-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
84-
; GFX11-NEXT: s_endpgm
78+
; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis:
79+
; SDAG-GFX11: ; %bb.0: ; %entry
80+
; SDAG-GFX11-NEXT: v_mov_b32_e32 v2, s1
81+
; SDAG-GFX11-NEXT: s_mov_b32 s1, 0x10001
82+
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
83+
; SDAG-GFX11-NEXT: v_dot2_bf16_bf16 v2, s0, s1, v2
84+
; SDAG-GFX11-NEXT: global_store_b16 v[0:1], v2, off
85+
; SDAG-GFX11-NEXT: s_nop 0
86+
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
87+
; SDAG-GFX11-NEXT: s_endpgm
88+
;
89+
; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis:
90+
; GISEL-GFX11: ; %bb.0: ; %entry
91+
; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, 0x10001
92+
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
93+
; GISEL-GFX11-NEXT: v_dot2_bf16_bf16 v2, s0, v2, s1
94+
; GISEL-GFX11-NEXT: global_store_b16 v[0:1], v2, off
95+
; GISEL-GFX11-NEXT: s_nop 0
96+
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
97+
; GISEL-GFX11-NEXT: s_endpgm
8598
ptr addrspace(1) %r,
8699
<2 x i16> inreg %a,
87100
i16 inreg %c) {

0 commit comments

Comments
 (0)