Skip to content

Commit 75e528f

Browse files
authored
[AMDGPU] Extend zero initialization of return values for TFE (#85759)
buffer_load instructions that use TFE also need to zero-initialize their return values, similar to how the image instructions currently work. Add support for this: by default all result registers are zero-initialized, and when the enable-prt-strict-null subtarget feature is disabled only the TFE flag register is zero-initialized.
1 parent 0cf4788 commit 75e528f

12 files changed

+741
-105
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2045,38 +2045,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
20452045
if (BaseOpcode->HasD16)
20462046
MIB.addImm(IsD16 ? -1 : 0);
20472047

2048-
if (IsTexFail) {
2049-
// An image load instruction with TFE/LWE only conditionally writes to its
2050-
// result registers. Initialize them to zero so that we always get well
2051-
// defined result values.
2052-
assert(VDataOut && !VDataIn);
2053-
Register Tied = MRI->cloneVirtualRegister(VDataOut);
2054-
Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2055-
BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
2056-
.addImm(0);
2057-
auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
2058-
if (STI.usePRTStrictNull()) {
2059-
// With enable-prt-strict-null enabled, initialize all result registers to
2060-
// zero.
2061-
auto RegSeq =
2062-
BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
2063-
for (auto Sub : Parts)
2064-
RegSeq.addReg(Zero).addImm(Sub);
2065-
} else {
2066-
// With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
2067-
// result register.
2068-
Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2069-
BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
2070-
auto RegSeq =
2071-
BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
2072-
for (auto Sub : Parts.drop_back(1))
2073-
RegSeq.addReg(Undef).addImm(Sub);
2074-
RegSeq.addReg(Zero).addImm(Parts.back());
2075-
}
2076-
MIB.addReg(Tied, RegState::Implicit);
2077-
MIB->tieOperands(0, MIB->getNumOperands() - 1);
2078-
}
2079-
20802048
MI.eraseFromParent();
20812049
constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
20822050
TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);

llvm/lib/Target/AMDGPU/BUFInstructions.td

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ class BUF_Pseudo <string opName, dag outs, dag ins,
8686
bits<1> has_soffset = 1;
8787
bits<1> has_offset = 1;
8888
bits<1> has_slc = 1;
89-
bits<1> tfe = ?;
89+
bits<1> tfe = 0;
9090
bits<4> elements = 0;
9191
bits<1> has_sccb = 1;
9292
bits<1> sccb_value = 0;
@@ -323,6 +323,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
323323
Instruction BaseOpcode = !cast<Instruction>(MUBUFGetBaseOpcode<NAME>.ret);
324324
let MUBUF = 1;
325325
let AsmMatchConverter = "cvtMubuf";
326+
let usesCustomInserter = 1;
326327
}
327328

328329
class MUBUF_Real <MUBUF_Pseudo ps, string real_name = ps.Mnemonic> :
@@ -3369,7 +3370,7 @@ def MUBUFInfoTable : GenericTable {
33693370
let CppTypeName = "MUBUFInfo";
33703371
let Fields = [
33713372
"Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset",
3372-
"IsBufferInv"
3373+
"IsBufferInv", "tfe"
33733374
];
33743375

33753376
let PrimaryKey = ["Opcode"];

llvm/lib/Target/AMDGPU/MIMGInstructions.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@ class MIMG <dag outs, string dns = "">
210210
: MIMG_Base <outs, dns> {
211211

212212
let hasPostISelHook = 1;
213+
let usesCustomInserter = 1;
213214

214215
Instruction Opcode = !cast<Instruction>(NAME);
215216
MIMGBaseOpcode BaseOpcode;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 48 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -5410,6 +5410,11 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
54105410
return SplitBB;
54115411
}
54125412
default:
5413+
if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5414+
if (!MI.mayStore())
5415+
AddMemOpInit(MI);
5416+
return BB;
5417+
}
54135418
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
54145419
}
54155420
}
@@ -15034,60 +15039,67 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
1503415039
// result register that will be written in the case of a memory access failure.
1503515040
// The required code is also added to tie this init code to the result of the
1503615041
// img instruction.
15037-
void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
15042+
void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
1503815043
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
1503915044
const SIRegisterInfo &TRI = TII->getRegisterInfo();
1504015045
MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
1504115046
MachineBasicBlock &MBB = *MI.getParent();
1504215047

15043-
MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15044-
MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15045-
MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15046-
15047-
if (!TFE && !LWE) // intersect_ray
15048-
return;
15048+
int DstIdx =
15049+
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15050+
unsigned InitIdx = 0;
1504915051

15050-
unsigned TFEVal = TFE ? TFE->getImm() : 0;
15051-
unsigned LWEVal = LWE ? LWE->getImm() : 0;
15052-
unsigned D16Val = D16 ? D16->getImm() : 0;
15052+
if (TII->isImage(MI)) {
15053+
MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15054+
MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15055+
MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
1505315056

15054-
if (!TFEVal && !LWEVal)
15055-
return;
15057+
if (!TFE && !LWE) // intersect_ray
15058+
return;
1505615059

15057-
// At least one of TFE or LWE are non-zero
15058-
// We have to insert a suitable initialization of the result value and
15059-
// tie this to the dest of the image instruction.
15060+
unsigned TFEVal = TFE ? TFE->getImm() : 0;
15061+
unsigned LWEVal = LWE ? LWE->getImm() : 0;
15062+
unsigned D16Val = D16 ? D16->getImm() : 0;
1506015063

15061-
const DebugLoc &DL = MI.getDebugLoc();
15064+
if (!TFEVal && !LWEVal)
15065+
return;
1506215066

15063-
int DstIdx =
15064-
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15067+
// At least one of TFE or LWE are non-zero
15068+
// We have to insert a suitable initialization of the result value and
15069+
// tie this to the dest of the image instruction.
1506515070

15066-
// Calculate which dword we have to initialize to 0.
15067-
MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15071+
// Calculate which dword we have to initialize to 0.
15072+
MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
1506815073

15069-
// check that dmask operand is found.
15070-
assert(MO_Dmask && "Expected dmask operand in instruction");
15074+
// check that dmask operand is found.
15075+
assert(MO_Dmask && "Expected dmask operand in instruction");
1507115076

15072-
unsigned dmask = MO_Dmask->getImm();
15073-
// Determine the number of active lanes taking into account the
15074-
// Gather4 special case
15075-
unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15077+
unsigned dmask = MO_Dmask->getImm();
15078+
// Determine the number of active lanes taking into account the
15079+
// Gather4 special case
15080+
unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
1507615081

15077-
bool Packed = !Subtarget->hasUnpackedD16VMem();
15082+
bool Packed = !Subtarget->hasUnpackedD16VMem();
1507815083

15079-
unsigned InitIdx =
15080-
D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15084+
InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
1508115085

15082-
// Abandon attempt if the dst size isn't large enough
15083-
// - this is in fact an error but this is picked up elsewhere and
15084-
// reported correctly.
15085-
uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15086-
if (DstSize < InitIdx)
15086+
// Abandon attempt if the dst size isn't large enough
15087+
// - this is in fact an error but this is picked up elsewhere and
15088+
// reported correctly.
15089+
uint32_t DstSize =
15090+
TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15091+
if (DstSize < InitIdx)
15092+
return;
15093+
} else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15094+
InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15095+
} else {
1508715096
return;
15097+
}
15098+
15099+
const DebugLoc &DL = MI.getDebugLoc();
1508815100

1508915101
// Create a register for the initialization value.
15090-
Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15102+
Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
1509115103
unsigned NewDst = 0; // Final initialized value will be in here
1509215104

1509315105
// If PRTStrictNull feature is enabled (the default) then initialize
@@ -15185,11 +15197,8 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
1518515197
return;
1518615198
}
1518715199

15188-
if (TII->isImage(MI)) {
15189-
if (!MI.mayStore())
15190-
AddIMGInit(MI);
15200+
if (TII->isImage(MI))
1519115201
TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15192-
}
1519315202
}
1519415203

1519515204
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
466466

467467
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
468468
SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
469-
void AddIMGInit(MachineInstr &MI) const;
469+
void AddMemOpInit(MachineInstr &MI) const;
470470
void AdjustInstrPostInstrSelection(MachineInstr &MI,
471471
SDNode *Node) const override;
472472

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,7 @@ struct MUBUFInfo {
318318
bool has_srsrc;
319319
bool has_soffset;
320320
bool IsBufferInv;
321+
bool tfe;
321322
};
322323

323324
struct MTBUFInfo {
@@ -466,6 +467,11 @@ bool getMUBUFIsBufferInv(unsigned Opc) {
466467
return Info ? Info->IsBufferInv : false;
467468
}
468469

470+
bool getMUBUFTfe(unsigned Opc) {
471+
const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
472+
return Info ? Info->tfe : false;
473+
}
474+
469475
bool getSMEMIsBuffer(unsigned Opc) {
470476
const SMInfo *Info = getSMEMOpcodeHelper(Opc);
471477
return Info ? Info->IsBuffer : false;

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,9 @@ bool getMUBUFHasSoffset(unsigned Opc);
525525
LLVM_READONLY
526526
bool getMUBUFIsBufferInv(unsigned Opc);
527527

528+
LLVM_READONLY
529+
bool getMUBUFTfe(unsigned Opc);
530+
528531
LLVM_READONLY
529532
bool getSMEMIsBuffer(unsigned Opc);
530533

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
22
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
33
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
4+
; Note that TFE instructions don't have the result initialization to zero due to stopping before finalize-isel - which is where that's inserted
45

56
define amdgpu_ps float @struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
67
; GFX8-LABEL: name: struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
22
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
3+
; Note that TFE instructions don't have the result initialization to zero due to stopping before finalize-isel - which is where that's inserted
34

45
define amdgpu_ps float @struct_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
56
; CHECK-LABEL: name: struct_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset

0 commit comments

Comments
 (0)