Commit 467b5b9

AMDGPU/SI: Use v_readfirstlane to legalize SMRD with VGPR base pointer
Summary: Instead of trying to replace an SMRD instruction that has a VGPR base pointer with an equivalent MUBUF instruction, we now copy the base pointer into SGPRs using v_readfirstlane. This is safe because any load selected as an SMRD instruction has been proven to have a uniform base pointer, so every thread in the wave holds the same pointer value in its VGPRs. This fixes errors on VI that came from trying to replace SMRD instructions with addr64-enabled MUBUF instructions, which do not exist on that generation.

Reviewers: arsenm, cfang, nhaehnle

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D17305

llvm-svn: 261385
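The new legalizeOperandsSMRD hook added in this commit leans on the existing readlaneVGPRToSGPR helper (already present in SIInstrInfo.cpp; its body is not part of this diff). As a rough sketch of what such a helper typically does — the register-class lookup and sub-register iteration below are illustrative assumptions, not the verbatim upstream code — it reads the first lane of each 32-bit piece of the VGPR value and reassembles the pieces into an SGPR tuple:

// Sketch only: copy a wave-uniform value held in VGPRs into freshly created
// SGPRs by emitting one V_READFIRSTLANE_B32 per 32-bit sub-register and
// tying the results together with a REG_SEQUENCE. Details are assumptions.
unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
                                         MachineRegisterInfo &MRI) const {
  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
  unsigned DstReg = MRI.createVirtualRegister(SRC);
  unsigned SubRegs = VRC->getSize() / 4; // number of 32-bit pieces

  SmallVector<unsigned, 8> SRegs;
  for (unsigned i = 0; i < SubRegs; ++i) {
    unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    // Read the first active lane of this 32-bit piece into an SGPR.
    BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
        .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
    SRegs.push_back(SGPR);
  }

  // Reassemble the individual SGPRs into a register of the SGPR tuple class.
  MachineInstrBuilder MIB =
      BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
              get(AMDGPU::REG_SEQUENCE), DstReg);
  for (unsigned i = 0; i < SubRegs; ++i) {
    MIB.addReg(SRegs[i]);
    MIB.addImm(RI.getSubRegFromChannel(i));
  }
  return DstReg;
}

The new legalizeOperandsSMRD in the diff below simply calls this helper on the sbase operand whenever that operand is not already in an SGPR register class.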
1 parent e611698 commit 467b5b9

5 files changed: +43 −252 lines

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 20 additions & 229 deletions
@@ -1621,18 +1621,6 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
   case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
   case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
   case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
-  case AMDGPU::S_LOAD_DWORD_IMM:
-  case AMDGPU::S_LOAD_DWORD_SGPR:
-  case AMDGPU::S_LOAD_DWORD_IMM_ci:
-    return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
-  case AMDGPU::S_LOAD_DWORDX2_IMM:
-  case AMDGPU::S_LOAD_DWORDX2_SGPR:
-  case AMDGPU::S_LOAD_DWORDX2_IMM_ci:
-    return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
-  case AMDGPU::S_LOAD_DWORDX4_IMM:
-  case AMDGPU::S_LOAD_DWORDX4_SGPR:
-  case AMDGPU::S_LOAD_DWORDX4_IMM_ci:
-    return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
@@ -1993,6 +1981,20 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
   return DstReg;
 }
 
+void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
+                                       MachineInstr *MI) const {
+
+  // If the pointer is store in VGPRs, then we need to move them to
+  // SGPRs using v_readfirstlane. This is safe because we only select
+  // loads with uniform pointers to SMRD instruction so we know the
+  // pointer value is uniform.
+  MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
+  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
+    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+    SBase->setReg(SGPR);
+  }
+}
+
 void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

@@ -2008,6 +2010,12 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
     return;
   }
 
+  // Legalize SMRD
+  if (isSMRD(*MI)) {
+    legalizeOperandsSMRD(MRI, MI);
+    return;
+  }
+
   // Legalize REG_SEQUENCE and PHI
   // The register class of the operands much be the same type as the register
   // class of the output.
@@ -2280,219 +2288,6 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
   }
 }
 
-void SIInstrInfo::splitSMRD(MachineInstr *MI,
-                            const TargetRegisterClass *HalfRC,
-                            unsigned HalfImmOp, unsigned HalfSGPROp,
-                            MachineInstr *&Lo, MachineInstr *&Hi) const {
-
-  DebugLoc DL = MI->getDebugLoc();
-  MachineBasicBlock *MBB = MI->getParent();
-  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
-  unsigned RegLo = MRI.createVirtualRegister(HalfRC);
-  unsigned RegHi = MRI.createVirtualRegister(HalfRC);
-  unsigned HalfSize = HalfRC->getSize();
-  const MachineOperand *OffOp =
-      getNamedOperand(*MI, AMDGPU::OpName::offset);
-  const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
-
-  // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes
-  // on VI.
-
-  bool IsKill = SBase->isKill();
-  if (OffOp) {
-    bool isVI =
-        MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >=
-        AMDGPUSubtarget::VOLCANIC_ISLANDS;
-    unsigned OffScale = isVI ? 1 : 4;
-    // Handle the _IMM variant
-    unsigned LoOffset = OffOp->getImm() * OffScale;
-    unsigned HiOffset = LoOffset + HalfSize;
-    Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo)
-             // Use addReg instead of addOperand
-             // to make sure kill flag is cleared.
-             .addReg(SBase->getReg(), 0, SBase->getSubReg())
-             .addImm(LoOffset / OffScale);
-
-    if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) {
-      unsigned OffsetSGPR =
-          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR)
-          .addImm(HiOffset); // The offset in register is in bytes.
-      Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
-               .addReg(SBase->getReg(), getKillRegState(IsKill),
-                       SBase->getSubReg())
-               .addReg(OffsetSGPR);
-    } else {
-      Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi)
-               .addReg(SBase->getReg(), getKillRegState(IsKill),
-                       SBase->getSubReg())
-               .addImm(HiOffset / OffScale);
-    }
-  } else {
-    // Handle the _SGPR variant
-    MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff);
-    Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo)
-             .addReg(SBase->getReg(), 0, SBase->getSubReg())
-             .addOperand(*SOff);
-    unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-    BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
-        .addReg(SOff->getReg(), 0, SOff->getSubReg())
-        .addImm(HalfSize);
-    Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
-             .addReg(SBase->getReg(), getKillRegState(IsKill),
-                     SBase->getSubReg())
-             .addReg(OffsetSGPR);
-  }
-
-  unsigned SubLo, SubHi;
-  const TargetRegisterClass *NewDstRC;
-  switch (HalfSize) {
-  case 4:
-    SubLo = AMDGPU::sub0;
-    SubHi = AMDGPU::sub1;
-    NewDstRC = &AMDGPU::VReg_64RegClass;
-    break;
-  case 8:
-    SubLo = AMDGPU::sub0_sub1;
-    SubHi = AMDGPU::sub2_sub3;
-    NewDstRC = &AMDGPU::VReg_128RegClass;
-    break;
-  case 16:
-    SubLo = AMDGPU::sub0_sub1_sub2_sub3;
-    SubHi = AMDGPU::sub4_sub5_sub6_sub7;
-    NewDstRC = &AMDGPU::VReg_256RegClass;
-    break;
-  case 32:
-    SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
-    SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15;
-    NewDstRC = &AMDGPU::VReg_512RegClass;
-    break;
-  default:
-    llvm_unreachable("Unhandled HalfSize");
-  }
-
-  unsigned OldDst = MI->getOperand(0).getReg();
-  unsigned NewDst = MRI.createVirtualRegister(NewDstRC);
-
-  MRI.replaceRegWith(OldDst, NewDst);
-
-  BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst)
-      .addReg(RegLo)
-      .addImm(SubLo)
-      .addReg(RegHi)
-      .addImm(SubHi);
-}
-
-void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI,
-                                 MachineRegisterInfo &MRI,
-                                 SmallVectorImpl<MachineInstr *> &Worklist) const {
-  MachineBasicBlock *MBB = MI->getParent();
-  int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
-  assert(DstIdx != -1);
-  unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass;
-  switch(RI.getRegClass(DstRCID)->getSize()) {
-  case 4:
-  case 8:
-  case 16: {
-    unsigned NewOpcode = getVALUOp(*MI);
-    unsigned RegOffset;
-    unsigned ImmOffset;
-
-    if (MI->getOperand(2).isReg()) {
-      RegOffset = MI->getOperand(2).getReg();
-      ImmOffset = 0;
-    } else {
-      assert(MI->getOperand(2).isImm());
-      // SMRD instructions take a dword offsets on SI and byte offset on VI
-      // and MUBUF instructions always take a byte offset.
-      ImmOffset = MI->getOperand(2).getImm();
-      if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <=
-          AMDGPUSubtarget::SEA_ISLANDS)
-        ImmOffset <<= 2;
-      RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-
-      if (isUInt<12>(ImmOffset)) {
-        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
-                RegOffset)
-            .addImm(0);
-      } else {
-        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
-                RegOffset)
-            .addImm(ImmOffset);
-        ImmOffset = 0;
-      }
-    }
-
-    unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
-    unsigned DWord0 = RegOffset;
-    unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
-
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1)
-        .addImm(0);
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2)
-        .addImm(RsrcDataFormat & 0xFFFFFFFF);
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
-        .addImm(RsrcDataFormat >> 32);
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
-        .addReg(DWord0)
-        .addImm(AMDGPU::sub0)
-        .addReg(DWord1)
-        .addImm(AMDGPU::sub1)
-        .addReg(DWord2)
-        .addImm(AMDGPU::sub2)
-        .addReg(DWord3)
-        .addImm(AMDGPU::sub3);
-
-    const MCInstrDesc &NewInstDesc = get(NewOpcode);
-    const TargetRegisterClass *NewDstRC
-        = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass);
-    unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
-    unsigned DstReg = MI->getOperand(0).getReg();
-    MRI.replaceRegWith(DstReg, NewDstReg);
-
-    MachineInstr *NewInst =
-        BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg)
-            .addOperand(MI->getOperand(1)) // sbase
-            .addReg(SRsrc)
-            .addImm(0)
-            .addImm(ImmOffset)
-            .addImm(0) // glc
-            .addImm(0) // slc
-            .addImm(0) // tfe
-            .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
-    MI->eraseFromParent();
-
-    legalizeOperands(NewInst);
-    addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
-    break;
-  }
-  case 32: {
-    MachineInstr *Lo, *Hi;
-    addUsersToMoveToVALUWorklist(MI->getOperand(0).getReg(), MRI, Worklist);
-    splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM,
-              AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi);
-    MI->eraseFromParent();
-    moveSMRDToVALU(Lo, MRI, Worklist);
-    moveSMRDToVALU(Hi, MRI, Worklist);
-    break;
-  }
-
-  case 64: {
-    MachineInstr *Lo, *Hi;
-    addUsersToMoveToVALUWorklist(MI->getOperand(0).getReg(), MRI, Worklist);
-    splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM,
-              AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi);
-    MI->eraseFromParent();
-    moveSMRDToVALU(Lo, MRI, Worklist);
-    moveSMRDToVALU(Hi, MRI, Worklist);
-    break;
-  }
-  }
-}
-
 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
   SmallVector<MachineInstr *, 128> Worklist;
   Worklist.push_back(&TopInst);
@@ -2508,10 +2303,6 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
     // Handle some special cases
     switch (Opcode) {
     default:
-      if (isSMRD(*Inst)) {
-        moveSMRDToVALU(Inst, MRI, Worklist);
-        continue;
-      }
       break;
     case AMDGPU::S_AND_B64:
       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 2 additions & 9 deletions
@@ -406,19 +406,12 @@ class SIInstrInfo : public AMDGPUInstrInfo {
   unsigned readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
                               MachineRegisterInfo &MRI) const;
 
+  void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr *MI) const;
+
   /// \brief Legalize all operands in this instruction. This function may
   /// create new instruction and insert them before \p MI.
   void legalizeOperands(MachineInstr *MI) const;
 
-  /// \brief Split an SMRD instruction into two smaller loads of half the
-  // size storing the results in \p Lo and \p Hi.
-  void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC,
-                 unsigned HalfImmOp, unsigned HalfSGPROp,
-                 MachineInstr *&Lo, MachineInstr *&Hi) const;
-
-  void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI,
-                      SmallVectorImpl<MachineInstr *> &Worklist) const;
-
   /// \brief Replace this instruction's opcode with the equivalent VALU
   /// opcode. This function will also move the users of \p MI to the
   /// VALU if necessary.

llvm/test/CodeGen/AMDGPU/missing-store.ll

Lines changed: 3 additions & 1 deletion
@@ -8,7 +8,9 @@
 ; FUNC-LABEL: {{^}}missing_store_reduced:
 ; SI: ds_read_b64
 ; SI: buffer_store_dword
-; SI: buffer_load_dword
+; SI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; SI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
 ; SI: buffer_store_dword
 ; SI: s_endpgm
 define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {

llvm/test/CodeGen/AMDGPU/salu-to-valu.ll

Lines changed: 8 additions & 4 deletions
@@ -53,10 +53,14 @@ done: ; preds = %loop
 ; Test moving an SMRD instruction to the VALU
 
 ; GCN-LABEL: {{^}}smrd_valu:
-; FIXME: We should be using flat load for HSA.
-; GCN: buffer_load_dword [[OUT:v[0-9]+]]
-; GCN-NOHSA: buffer_store_dword [[OUT]]
-; GCN-HSA: flat_store_dword {{.*}}, [[OUT]]
+; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0
+; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]]
+; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8
+; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]
+; GCN-NOHSA: buffer_store_dword [[V_OUT]]
+; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]]
 define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
 entry:
   %tmp = icmp ne i32 %a, 0

llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll

Lines changed: 10 additions & 9 deletions
@@ -70,15 +70,14 @@ define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace
   ret void
 }
 
-; Technically we could reorder these, but just comparing the
-; instruction type of the load is insufficient.
-
-; FUNC-LABEL: @no_reorder_constant_load_global_store_constant_load
-; CI: buffer_load_dword
+; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
 ; CI: buffer_store_dword
-; CI: buffer_load_dword
+; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2
 ; CI: buffer_store_dword
-define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
 
   %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
@@ -95,8 +94,10 @@ define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1
 }
 
 ; FUNC-LABEL: @reorder_constant_load_local_store_constant_load
-; CI: buffer_load_dword
-; CI: buffer_load_dword
+; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2
 ; CI: ds_write_b32
 ; CI: buffer_store_dword
 define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
