Skip to content

AMDGPU: Fix packed 16-bit inline constants #76522

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 5 additions & 15 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -317,26 +317,16 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
}
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
bool Negated) const {
bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
if (N->isUndef())
return true;

const SIInstrInfo *TII = Subtarget->getInstrInfo();
if (Negated) {
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
return TII->isInlineConstant(-C->getAPIntValue());
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
return TII->isInlineConstant(C->getAPIntValue());

if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());

} else {
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
return TII->isInlineConstant(C->getAPIntValue());

if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
}
if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());

return false;
}
Expand Down
14 changes: 3 additions & 11 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,25 +50,20 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) {
}

// TODO: Handle undef as zero
static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
bool Negate = false) {
static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
uint32_t LHSVal, RHSVal;
if (getConstantValue(N->getOperand(0), LHSVal) &&
getConstantValue(N->getOperand(1), RHSVal)) {
SDLoc SL(N);
uint32_t K = Negate ? (-LHSVal & 0xffff) | (-RHSVal << 16)
: (LHSVal & 0xffff) | (RHSVal << 16);
uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
DAG.getTargetConstant(K, SL, MVT::i32));
}

return nullptr;
}

static inline SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
return packConstantV2I16(N, DAG, true);
}
} // namespace

/// AMDGPU specific code to select AMDGPU machine instructions for
Expand Down Expand Up @@ -110,10 +105,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {

private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
bool isNegInlineImmediate(const SDNode *N) const {
return isInlineImmediate(N, true);
}
bool isInlineImmediate(const SDNode *N) const;

bool isInlineImmediate16(int64_t Imm) const {
return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm());
Expand Down
21 changes: 13 additions & 8 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1857,6 +1857,9 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_IMM_V2FP32:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
case AMDGPU::OPERAND_KIMM32:
case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
return &APFloat::IEEEsingle();
Expand All @@ -1871,13 +1874,10 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_KIMM16:
return &APFloat::IEEEhalf();
Expand Down Expand Up @@ -2025,9 +2025,14 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
// We allow fp literals with f16x2 operands assuming that the specified
// literal goes into the lower half and the upper half is zero. We also
// require that the literal may be losslessly converted to f16.
MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 :
(type == MVT::v2i16)? MVT::i16 :
(type == MVT::v2f32)? MVT::f32 : type;
//
// For i16x2 operands, we assume that the specified literal is encoded as a
// single-precision float. This is pretty odd, but it matches SP3 and what
// happens in hardware.
MVT ExpectedType = (type == MVT::v2f16) ? MVT::f16
: (type == MVT::v2i16) ? MVT::f32
: (type == MVT::v2f32) ? MVT::f32
: type;

APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));
return canLosslesslyConvertToFPType(FPLiteral, ExpectedType);
Expand Down Expand Up @@ -3393,12 +3398,12 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 ||
OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16)
return AMDGPU::isInlinableIntLiteralV216(Val);
return AMDGPU::isInlinableLiteralV2I16(Val);

if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 ||
OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 ||
OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16)
return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm());
return AMDGPU::isInlinableLiteralV2F16(Val);

return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());
}
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,9 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val,
DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, Imm, \
false, ImmWidth)

#define DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(Name, OpWidth, ImmWidth) \
DECODE_SrcOp(decodeOperand_##Name, 9, OpWidth, Imm, false, ImmWidth)

// Decoder for Src(9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set acc)
// and decode using 'enum10' from decodeSrcOp.
#define DECODE_OPERAND_SRC_REG_OR_IMM_A9(RegClass, OpWidth, ImmWidth) \
Expand Down Expand Up @@ -262,6 +265,9 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32)

DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2I16, OPW32, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2F16, OPW32, 16)

DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_128, OPW128, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_256, OPW256, 64)
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -1096,7 +1096,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasDstSelForwardingHazard() const { return GFX940Insts; }

// Cannot use op_sel with v_dot instructions.
bool hasDOTOpSelHazard() const { return GFX940Insts; }
bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a separate change.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume this is done in this patch to avoid having to special case these instructions in the fold logic. If so, its ok with me.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's right.


// Does not have HW interlocs for VALU writing and then reading SGPRs.
bool hasVDecCoExecHazard() const {
Expand Down
125 changes: 78 additions & 47 deletions llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -460,56 +460,84 @@ void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm,
}
}

void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
int16_t SImm = static_cast<int16_t>(Imm);
if (isInlinableIntLiteral(SImm)) {
O << SImm;
return;
}

// This must accept a 32-bit immediate value to correctly handle packed 16-bit
// operations.
static bool printImmediateFloat16(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O) {
if (Imm == 0x3C00)
O<< "1.0";
O << "1.0";
else if (Imm == 0xBC00)
O<< "-1.0";
O << "-1.0";
else if (Imm == 0x3800)
O<< "0.5";
O << "0.5";
else if (Imm == 0xB800)
O<< "-0.5";
O << "-0.5";
else if (Imm == 0x4000)
O<< "2.0";
O << "2.0";
else if (Imm == 0xC000)
O<< "-2.0";
O << "-2.0";
else if (Imm == 0x4400)
O<< "4.0";
O << "4.0";
else if (Imm == 0xC400)
O<< "-4.0";
else if (Imm == 0x3118 &&
STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) {
O << "-4.0";
else if (Imm == 0x3118 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
O << "0.15915494";
} else {
uint64_t Imm16 = static_cast<uint16_t>(Imm);
O << formatHex(Imm16);
}
}
else
return false;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just print it here and not handle it in both places below?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this one, there is actually a very subtle difference between how it's used in printImmediate16 vs. printImmediateV216:

  • printImmediate16 casts to uint16_t, discarding the MSBs before printing the hex value
  • printImmediateV216 doesn't do this cast and therefore doesn't discard the top 16 bits

(One might argue that perhaps printImmediate16 shouldn't discard the MSBs either, but that feels like a separate change.)


void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
uint16_t Lo16 = static_cast<uint16_t>(Imm);
printImmediate16(Lo16, STI, O);
return true;
}

void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
int16_t SImm = static_cast<int16_t>(Imm);
if (isInlinableIntLiteral(SImm)) {
O << SImm;
return;
}

uint16_t HImm = static_cast<uint16_t>(Imm);
if (printImmediateFloat16(HImm, STI, O))
return;

uint64_t Imm16 = static_cast<uint16_t>(Imm);
O << formatHex(Imm16);
}

void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType,
const MCSubtargetInfo &STI,
raw_ostream &O) {
int32_t SImm = static_cast<int32_t>(Imm);
if (SImm >= -16 && SImm <= 64) {
if (isInlinableIntLiteral(SImm)) {
O << SImm;
return;
}

switch (OpType) {
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
if (printImmediateFloat32(Imm, STI, O))
return;
break;
case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
if (isUInt<16>(Imm) &&
printImmediateFloat16(static_cast<uint16_t>(Imm), STI, O))
return;
break;
default:
llvm_unreachable("bad operand type");
}

O << formatHex(static_cast<uint64_t>(Imm));
}

bool AMDGPUInstPrinter::printImmediateFloat32(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
if (Imm == llvm::bit_cast<uint32_t>(0.0f))
O << "0.0";
else if (Imm == llvm::bit_cast<uint32_t>(1.0f))
Expand All @@ -532,7 +560,24 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
O << "0.15915494";
else
O << formatHex(static_cast<uint64_t>(Imm));
return false;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one doesn't have the casting problem, but I wanted to keep consistency between the two methods.


return true;
}

void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
int32_t SImm = static_cast<int32_t>(Imm);
if (isInlinableIntLiteral(SImm)) {
O << SImm;
return;
}

if (printImmediateFloat32(Imm, STI, O))
return;

O << formatHex(static_cast<uint64_t>(Imm));
}

void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
Expand Down Expand Up @@ -741,25 +786,11 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
break;
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
if (!isUInt<16>(Op.getImm()) &&
STI.hasFeature(AMDGPU::FeatureVOP3Literal)) {
printImmediate32(Op.getImm(), STI, O);
break;
}

// Deal with 16-bit FP inline immediates not working.
if (OpTy == AMDGPU::OPERAND_REG_IMM_V2FP16) {
printImmediate16(static_cast<uint16_t>(Op.getImm()), STI, O);
break;
}
[[fallthrough]];
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
printImmediateInt16(static_cast<uint16_t>(Op.getImm()), STI, O);
break;
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
printImmediateV216(Op.getImm(), STI, O);
printImmediateV216(Op.getImm(), OpTy, STI, O);
break;
case MCOI::OPERAND_UNKNOWN:
case MCOI::OPERAND_PCREL:
Expand Down
6 changes: 4 additions & 2 deletions llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,10 @@ class AMDGPUInstPrinter : public MCInstPrinter {
raw_ostream &O);
void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediateV216(uint32_t Imm, uint8_t OpType,
const MCSubtargetInfo &STI, raw_ostream &O);
bool printImmediateFloat32(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
Expand Down
19 changes: 6 additions & 13 deletions llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -284,22 +284,15 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO,
// which does not have f16 support?
return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16: {
if (!isUInt<16>(Imm) && STI.hasFeature(AMDGPU::FeatureVOP3Literal))
return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16)
return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
[[fallthrough]];
}
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI);
return AMDGPU::getInlineEncodingV2I16(static_cast<uint32_t>(Imm))
.value_or(255);
case AMDGPU::OPERAND_REG_IMM_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
uint16_t Lo16 = static_cast<uint16_t>(Imm);
uint32_t Encoding = getLit16Encoding(Lo16, STI);
return Encoding;
}
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
return AMDGPU::getInlineEncodingV2F16(static_cast<uint32_t>(Imm))
.value_or(255);
case AMDGPU::OPERAND_KIMM32:
case AMDGPU::OPERAND_KIMM16:
return MO.getImm();
Expand Down
Loading