Skip to content

Commit 49b4920

Browse files
authored
AMDGPU: Fix packed 16-bit inline constants (#76522)
Consistently treat packed 16-bit operands as 32-bit values, because that's really what they are. The attempt to treat them differently was ultimately incorrect and lead to miscompiles, e.g. when using non-splat constants such as (1, 0) as operands. Recognize 32-bit float constants for i/u16 instructions. This is a bit odd conceptually, but it matches HW behavior and SP3. Remove isFoldableLiteralV216; there was too much magic in the dependency between it and its use in SIFoldOperands. Instead, we now simply rely on checking whether a constant is an inline constant, and trying a bunch of permutations of the low and high halves. This is more obviously correct and leads to some new cases where inline constants are used as shown by tests. Move the logic for switching packed add vs. sub into SIFoldOperands. This has two benefits: all logic that optimizes for inline constants in packed math is now in one place; and it applies to both SelectionDAG and GISel paths. Disable the use of opsel with v_dot* instructions on gfx11. They are documented to ignore opsel on src0 and src1. It may be interesting to re-enable to use of opsel on src2 as a future optimization. A similar "proper" fix of what inline constants mean could potentially be applied to unpacked 16-bit ops. However, it's less clear what the benefit would be, and there are surely places where we'd have to carefully audit whether values are properly sign- or zero-extended. It is best to keep such a change separate. Fixes: Corruption in FSR 2.0 (latent bug exposed by an LLPC change)
1 parent 49029f9 commit 49b4920

39 files changed

+1541
-1748
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -317,26 +317,16 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
317317
}
318318
}
319319

320-
bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
321-
bool Negated) const {
320+
bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
322321
if (N->isUndef())
323322
return true;
324323

325324
const SIInstrInfo *TII = Subtarget->getInstrInfo();
326-
if (Negated) {
327-
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
328-
return TII->isInlineConstant(-C->getAPIntValue());
325+
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
326+
return TII->isInlineConstant(C->getAPIntValue());
329327

330-
if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
331-
return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());
332-
333-
} else {
334-
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
335-
return TII->isInlineConstant(C->getAPIntValue());
336-
337-
if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
338-
return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
339-
}
328+
if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
329+
return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
340330

341331
return false;
342332
}

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -50,25 +50,20 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) {
5050
}
5151

5252
// TODO: Handle undef as zero
53-
static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
54-
bool Negate = false) {
53+
static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
5554
assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
5655
uint32_t LHSVal, RHSVal;
5756
if (getConstantValue(N->getOperand(0), LHSVal) &&
5857
getConstantValue(N->getOperand(1), RHSVal)) {
5958
SDLoc SL(N);
60-
uint32_t K = Negate ? (-LHSVal & 0xffff) | (-RHSVal << 16)
61-
: (LHSVal & 0xffff) | (RHSVal << 16);
59+
uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
6260
return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
6361
DAG.getTargetConstant(K, SL, MVT::i32));
6462
}
6563

6664
return nullptr;
6765
}
6866

69-
static inline SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
70-
return packConstantV2I16(N, DAG, true);
71-
}
7267
} // namespace
7368

7469
/// AMDGPU specific code to select AMDGPU machine instructions for
@@ -110,10 +105,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
110105

111106
private:
112107
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
113-
bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
114-
bool isNegInlineImmediate(const SDNode *N) const {
115-
return isInlineImmediate(N, true);
116-
}
108+
bool isInlineImmediate(const SDNode *N) const;
117109

118110
bool isInlineImmediate16(int64_t Imm) const {
119111
return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm());

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1865,6 +1865,9 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
18651865
case AMDGPU::OPERAND_REG_IMM_V2FP32:
18661866
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
18671867
case AMDGPU::OPERAND_REG_IMM_V2INT32:
1868+
case AMDGPU::OPERAND_REG_IMM_V2INT16:
1869+
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
1870+
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
18681871
case AMDGPU::OPERAND_KIMM32:
18691872
case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
18701873
return &APFloat::IEEEsingle();
@@ -1879,13 +1882,10 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
18791882
case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
18801883
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
18811884
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
1882-
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
18831885
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
18841886
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
18851887
case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
1886-
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
18871888
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
1888-
case AMDGPU::OPERAND_REG_IMM_V2INT16:
18891889
case AMDGPU::OPERAND_REG_IMM_V2FP16:
18901890
case AMDGPU::OPERAND_KIMM16:
18911891
return &APFloat::IEEEhalf();
@@ -2033,9 +2033,14 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
20332033
// We allow fp literals with f16x2 operands assuming that the specified
20342034
// literal goes into the lower half and the upper half is zero. We also
20352035
// require that the literal may be losslessly converted to f16.
2036-
MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 :
2037-
(type == MVT::v2i16)? MVT::i16 :
2038-
(type == MVT::v2f32)? MVT::f32 : type;
2036+
//
2037+
// For i16x2 operands, we assume that the specified literal is encoded as a
2038+
// single-precision float. This is pretty odd, but it matches SP3 and what
2039+
// happens in hardware.
2040+
MVT ExpectedType = (type == MVT::v2f16) ? MVT::f16
2041+
: (type == MVT::v2i16) ? MVT::f32
2042+
: (type == MVT::v2f32) ? MVT::f32
2043+
: type;
20392044

20402045
APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));
20412046
return canLosslesslyConvertToFPType(FPLiteral, ExpectedType);
@@ -3401,12 +3406,12 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
34013406
if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
34023407
OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 ||
34033408
OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16)
3404-
return AMDGPU::isInlinableIntLiteralV216(Val);
3409+
return AMDGPU::isInlinableLiteralV2I16(Val);
34053410

34063411
if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 ||
34073412
OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 ||
34083413
OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16)
3409-
return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm());
3414+
return AMDGPU::isInlinableLiteralV2F16(Val);
34103415

34113416
return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());
34123417
}

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,9 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val,
182182
DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, Imm, \
183183
false, ImmWidth)
184184

185+
#define DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(Name, OpWidth, ImmWidth) \
186+
DECODE_SrcOp(decodeOperand_##Name, 9, OpWidth, Imm, false, ImmWidth)
187+
185188
// Decoder for Src(9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set acc)
186189
// and decode using 'enum10' from decodeSrcOp.
187190
#define DECODE_OPERAND_SRC_REG_OR_IMM_A9(RegClass, OpWidth, ImmWidth) \
@@ -262,6 +265,9 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64)
262265
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32)
263266
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32)
264267

268+
DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2I16, OPW32, 32)
269+
DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2F16, OPW32, 16)
270+
265271
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_64, OPW64, 64)
266272
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_128, OPW128, 32)
267273
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_256, OPW256, 64)

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1096,7 +1096,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
10961096
bool hasDstSelForwardingHazard() const { return GFX940Insts; }
10971097

10981098
// Cannot use op_sel with v_dot instructions.
1099-
bool hasDOTOpSelHazard() const { return GFX940Insts; }
1099+
bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
11001100

11011101
// Does not have HW interlocs for VALU writing and then reading SGPRs.
11021102
bool hasVDecCoExecHazard() const {

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp

Lines changed: 78 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -460,56 +460,84 @@ void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm,
460460
}
461461
}
462462

463-
void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
464-
const MCSubtargetInfo &STI,
465-
raw_ostream &O) {
466-
int16_t SImm = static_cast<int16_t>(Imm);
467-
if (isInlinableIntLiteral(SImm)) {
468-
O << SImm;
469-
return;
470-
}
471-
463+
// This must accept a 32-bit immediate value to correctly handle packed 16-bit
464+
// operations.
465+
static bool printImmediateFloat16(uint32_t Imm, const MCSubtargetInfo &STI,
466+
raw_ostream &O) {
472467
if (Imm == 0x3C00)
473-
O<< "1.0";
468+
O << "1.0";
474469
else if (Imm == 0xBC00)
475-
O<< "-1.0";
470+
O << "-1.0";
476471
else if (Imm == 0x3800)
477-
O<< "0.5";
472+
O << "0.5";
478473
else if (Imm == 0xB800)
479-
O<< "-0.5";
474+
O << "-0.5";
480475
else if (Imm == 0x4000)
481-
O<< "2.0";
476+
O << "2.0";
482477
else if (Imm == 0xC000)
483-
O<< "-2.0";
478+
O << "-2.0";
484479
else if (Imm == 0x4400)
485-
O<< "4.0";
480+
O << "4.0";
486481
else if (Imm == 0xC400)
487-
O<< "-4.0";
488-
else if (Imm == 0x3118 &&
489-
STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) {
482+
O << "-4.0";
483+
else if (Imm == 0x3118 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
490484
O << "0.15915494";
491-
} else {
492-
uint64_t Imm16 = static_cast<uint16_t>(Imm);
493-
O << formatHex(Imm16);
494-
}
495-
}
485+
else
486+
return false;
496487

497-
void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm,
498-
const MCSubtargetInfo &STI,
499-
raw_ostream &O) {
500-
uint16_t Lo16 = static_cast<uint16_t>(Imm);
501-
printImmediate16(Lo16, STI, O);
488+
return true;
502489
}
503490

504-
void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
491+
void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
505492
const MCSubtargetInfo &STI,
506493
raw_ostream &O) {
494+
int16_t SImm = static_cast<int16_t>(Imm);
495+
if (isInlinableIntLiteral(SImm)) {
496+
O << SImm;
497+
return;
498+
}
499+
500+
uint16_t HImm = static_cast<uint16_t>(Imm);
501+
if (printImmediateFloat16(HImm, STI, O))
502+
return;
503+
504+
uint64_t Imm16 = static_cast<uint16_t>(Imm);
505+
O << formatHex(Imm16);
506+
}
507+
508+
void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType,
509+
const MCSubtargetInfo &STI,
510+
raw_ostream &O) {
507511
int32_t SImm = static_cast<int32_t>(Imm);
508-
if (SImm >= -16 && SImm <= 64) {
512+
if (isInlinableIntLiteral(SImm)) {
509513
O << SImm;
510514
return;
511515
}
512516

517+
switch (OpType) {
518+
case AMDGPU::OPERAND_REG_IMM_V2INT16:
519+
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
520+
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
521+
if (printImmediateFloat32(Imm, STI, O))
522+
return;
523+
break;
524+
case AMDGPU::OPERAND_REG_IMM_V2FP16:
525+
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
526+
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
527+
if (isUInt<16>(Imm) &&
528+
printImmediateFloat16(static_cast<uint16_t>(Imm), STI, O))
529+
return;
530+
break;
531+
default:
532+
llvm_unreachable("bad operand type");
533+
}
534+
535+
O << formatHex(static_cast<uint64_t>(Imm));
536+
}
537+
538+
bool AMDGPUInstPrinter::printImmediateFloat32(uint32_t Imm,
539+
const MCSubtargetInfo &STI,
540+
raw_ostream &O) {
513541
if (Imm == llvm::bit_cast<uint32_t>(0.0f))
514542
O << "0.0";
515543
else if (Imm == llvm::bit_cast<uint32_t>(1.0f))
@@ -532,7 +560,24 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
532560
STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
533561
O << "0.15915494";
534562
else
535-
O << formatHex(static_cast<uint64_t>(Imm));
563+
return false;
564+
565+
return true;
566+
}
567+
568+
void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
569+
const MCSubtargetInfo &STI,
570+
raw_ostream &O) {
571+
int32_t SImm = static_cast<int32_t>(Imm);
572+
if (isInlinableIntLiteral(SImm)) {
573+
O << SImm;
574+
return;
575+
}
576+
577+
if (printImmediateFloat32(Imm, STI, O))
578+
return;
579+
580+
O << formatHex(static_cast<uint64_t>(Imm));
536581
}
537582

538583
void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
@@ -755,25 +800,11 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
755800
break;
756801
case AMDGPU::OPERAND_REG_IMM_V2INT16:
757802
case AMDGPU::OPERAND_REG_IMM_V2FP16:
758-
if (!isUInt<16>(Op.getImm()) &&
759-
STI.hasFeature(AMDGPU::FeatureVOP3Literal)) {
760-
printImmediate32(Op.getImm(), STI, O);
761-
break;
762-
}
763-
764-
// Deal with 16-bit FP inline immediates not working.
765-
if (OpTy == AMDGPU::OPERAND_REG_IMM_V2FP16) {
766-
printImmediate16(static_cast<uint16_t>(Op.getImm()), STI, O);
767-
break;
768-
}
769-
[[fallthrough]];
770803
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
771804
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
772-
printImmediateInt16(static_cast<uint16_t>(Op.getImm()), STI, O);
773-
break;
774805
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
775806
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
776-
printImmediateV216(Op.getImm(), STI, O);
807+
printImmediateV216(Op.getImm(), OpTy, STI, O);
777808
break;
778809
case MCOI::OPERAND_UNKNOWN:
779810
case MCOI::OPERAND_PCREL:

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,10 @@ class AMDGPUInstPrinter : public MCInstPrinter {
8888
raw_ostream &O);
8989
void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
9090
raw_ostream &O);
91-
void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI,
92-
raw_ostream &O);
91+
void printImmediateV216(uint32_t Imm, uint8_t OpType,
92+
const MCSubtargetInfo &STI, raw_ostream &O);
93+
bool printImmediateFloat32(uint32_t Imm, const MCSubtargetInfo &STI,
94+
raw_ostream &O);
9395
void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
9496
raw_ostream &O);
9597
void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -284,22 +284,15 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO,
284284
// which does not have f16 support?
285285
return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
286286
case AMDGPU::OPERAND_REG_IMM_V2INT16:
287-
case AMDGPU::OPERAND_REG_IMM_V2FP16: {
288-
if (!isUInt<16>(Imm) && STI.hasFeature(AMDGPU::FeatureVOP3Literal))
289-
return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
290-
if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16)
291-
return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
292-
[[fallthrough]];
293-
}
294287
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
295288
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
296-
return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI);
289+
return AMDGPU::getInlineEncodingV2I16(static_cast<uint32_t>(Imm))
290+
.value_or(255);
291+
case AMDGPU::OPERAND_REG_IMM_V2FP16:
297292
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
298-
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
299-
uint16_t Lo16 = static_cast<uint16_t>(Imm);
300-
uint32_t Encoding = getLit16Encoding(Lo16, STI);
301-
return Encoding;
302-
}
293+
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
294+
return AMDGPU::getInlineEncodingV2F16(static_cast<uint32_t>(Imm))
295+
.value_or(255);
303296
case AMDGPU::OPERAND_KIMM32:
304297
case AMDGPU::OPERAND_KIMM16:
305298
return MO.getImm();

0 commit comments

Comments
 (0)