Skip to content

Commit 0c0d406

Browse files
vangthao95 authored and dstutt committed
[AMDGPU] Fix negative immediate offset for unbuffered smem loads (prelim)
This is a cherry-pick of an unmerged upstream change, llvm#89165. Once the upstream change is finished and merged, this will need reverting. Original commit message: For unbuffered smem loads, it is illegal for the immediate offset to be negative if the resulting IOFFSET + (SGPR[Offset] or M0 or zero) is negative. New PR of llvm#79553. Change-Id: I235ac5d0de5da2a1544760ab3c9749665340310d
1 parent 79e3e42 commit 0c0d406

13 files changed

+352
-107
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1986,35 +1986,47 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
19861986
// offsets available on CI.
19871987
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
19881988
SDValue *SOffset, SDValue *Offset,
1989-
bool Imm32Only, bool IsBuffer) const {
1989+
bool Imm32Only, bool IsBuffer,
1990+
bool HasSOffset,
1991+
int64_t ImmOffset) const {
19901992
assert((!SOffset || !Offset) &&
19911993
"Cannot match both soffset and offset at the same time!");
19921994

19931995
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
19941996
if (!C) {
19951997
if (!SOffset)
19961998
return false;
1999+
bool Changed = false;
19972000
if (ByteOffsetNode.getValueType().isScalarInteger() &&
19982001
ByteOffsetNode.getValueType().getSizeInBits() == 32) {
19992002
*SOffset = ByteOffsetNode;
2000-
return true;
2001-
}
2002-
if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2003+
Changed = true;
2004+
} else if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
20032005
if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
20042006
*SOffset = ByteOffsetNode.getOperand(0);
2005-
return true;
2007+
Changed = true;
20062008
}
20072009
}
2008-
return false;
2010+
// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2011+
// negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
2012+
// Handle the case where the Immediate Offset + SOffset is negative.
2013+
if (AMDGPU::hasSMRDSignedImmOffset(*Subtarget) && Changed &&
2014+
!IsBuffer & !Imm32Only && ImmOffset < 0) {
2015+
KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2016+
if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2017+
return false;
2018+
}
2019+
2020+
return Changed;
20092021
}
20102022

20112023
SDLoc SL(ByteOffsetNode);
20122024

20132025
// GFX9 and GFX10 have signed byte immediate offsets. The immediate
20142026
// offset for S_BUFFER instructions is unsigned.
20152027
int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2016-
std::optional<int64_t> EncodedOffset =
2017-
AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer);
2028+
std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2029+
*Subtarget, ByteOffset, IsBuffer, HasSOffset);
20182030
if (EncodedOffset && Offset && !Imm32Only) {
20192031
*Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
20202032
return true;
@@ -2073,13 +2085,22 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
20732085
// true, match only 32-bit immediate offsets available on CI.
20742086
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
20752087
SDValue *SOffset, SDValue *Offset,
2076-
bool Imm32Only,
2077-
bool IsBuffer) const {
2088+
bool Imm32Only, bool IsBuffer,
2089+
bool HasSOffset,
2090+
int64_t ImmOffset) const {
20782091
if (SOffset && Offset) {
20792092
assert(!Imm32Only && !IsBuffer);
20802093
SDValue B;
2081-
return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) &&
2082-
SelectSMRDBaseOffset(B, SBase, SOffset, nullptr);
2094+
2095+
if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
2096+
return false;
2097+
2098+
int64_t ImmOff = 0;
2099+
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2100+
ImmOff = C->getSExtValue();
2101+
2102+
return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
2103+
ImmOff);
20832104
}
20842105

20852106
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
@@ -2098,11 +2119,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
20982119
}
20992120
if (!N0 || !N1)
21002121
return false;
2101-
if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) {
2122+
2123+
if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2124+
ImmOffset)) {
21022125
SBase = N0;
21032126
return true;
21042127
}
2105-
if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) {
2128+
if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2129+
ImmOffset)) {
21062130
SBase = N1;
21072131
return true;
21082132
}

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,11 +185,13 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
185185

186186
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset,
187187
SDValue *Offset, bool Imm32Only = false,
188-
bool IsBuffer = false) const;
188+
bool IsBuffer = false, bool HasSOffset = false,
189+
int64_t ImmOffset = 0) const;
189190
SDValue Expand32BitAddress(SDValue Addr) const;
190191
bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
191192
SDValue *Offset, bool Imm32Only = false,
192-
bool IsBuffer = false) const;
193+
bool IsBuffer = false, bool HasSOffset = false,
194+
int64_t ImmOffset = 0) const;
193195
bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
194196
SDValue *Offset, bool Imm32Only = false) const;
195197
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
@@ -201,6 +203,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
201203
bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const;
202204
bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
203205
SDValue &Offset) const;
206+
bool SelectSMRDPrefetchImm(SDValue Addr, SDValue &SBase,
207+
SDValue &Offset) const;
204208
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
205209

206210
bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4199,10 +4199,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
41994199
return false;
42004200

42014201
const GEPInfo &GEPI = AddrInfo[0];
4202-
std::optional<int64_t> EncodedImm =
4203-
AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false);
4202+
std::optional<int64_t> EncodedImm;
42044203

42054204
if (SOffset && Offset) {
4205+
EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4206+
/*HasSOffset=*/true);
42064207
if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
42074208
AddrInfo.size() > 1) {
42084209
const GEPInfo &GEPI2 = AddrInfo[1];
@@ -4212,13 +4213,26 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
42124213
Base = GEPI2.SgprParts[0];
42134214
*SOffset = OffsetReg;
42144215
*Offset = *EncodedImm;
4216+
if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
4217+
return true;
4218+
4219+
// For unbuffered smem loads, it is illegal for the Immediate Offset
4220+
// to be negative if the resulting (Offset + (M0 or SOffset or zero))
4221+
// is negative. Handle the case where the Immediate Offset + SOffset
4222+
// is negative.
4223+
auto SKnown = KB->getKnownBits(*SOffset);
4224+
if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4225+
return false;
4226+
42154227
return true;
42164228
}
42174229
}
42184230
}
42194231
return false;
42204232
}
42214233

4234+
EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4235+
/*HasSOffset=*/false);
42224236
if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
42234237
Base = GEPI.SgprParts[0];
42244238
*Offset = *EncodedImm;

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1325,6 +1325,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
13251325
// of sign-extending.
13261326
bool hasGetPCZeroExtension() const { return GFX12Insts; }
13271327

1328+
/// \returns true if the target supports signed immediate offset for SMRD
1329+
/// instructions.
1330+
bool hasSignedSMRDImmOffset() const { return getGeneration() >= GFX9; }
1331+
13281332
/// \returns SGPR allocation granularity supported by the subtarget.
13291333
unsigned getSGPRAllocGranule() const {
13301334
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,12 @@ namespace llvm {
158158

159159
namespace AMDGPU {
160160

161+
/// \returns true if the target supports signed immediate offset for SMRD
162+
/// instructions.
163+
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
164+
return isGFX9Plus(ST);
165+
}
166+
161167
/// \returns True if \p STI is AMDHSA.
162168
bool isHsaAbi(const MCSubtargetInfo &STI) {
163169
return STI.getTargetTriple().getOS() == Triple::AMDHSA;
@@ -2804,10 +2810,6 @@ static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) {
28042810
return isGCN3Encoding(ST) || isGFX10Plus(ST);
28052811
}
28062812

2807-
static bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
2808-
return isGFX9Plus(ST);
2809-
}
2810-
28112813
bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
28122814
int64_t EncodedOffset) {
28132815
if (isGFX12Plus(ST))
@@ -2842,7 +2844,14 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST,
28422844
}
28432845

28442846
std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
2845-
int64_t ByteOffset, bool IsBuffer) {
2847+
int64_t ByteOffset, bool IsBuffer,
2848+
bool HasSOffset) {
2849+
// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2850+
// negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
2851+
// Handle case where SOffset is not present.
2852+
if (!IsBuffer && hasSMRDSignedImmOffset(ST) && !HasSOffset && ByteOffset < 0)
2853+
return std::nullopt;
2854+
28462855
if (isGFX12Plus(ST)) // 24 bit signed offsets
28472856
return isInt<24>(ByteOffset) ? std::optional<int64_t>(ByteOffset)
28482857
: std::nullopt;

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1295,6 +1295,7 @@ bool hasVOPD(const MCSubtargetInfo &STI);
12951295
bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI);
12961296
int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR);
12971297
unsigned hasKernargPreload(const MCSubtargetInfo &STI);
1298+
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST);
12981299

12991300
/// Is Reg - scalar register
13001301
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -1467,7 +1468,8 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset);
14671468
/// S_LOAD instructions have a signed offset, on other subtargets it is
14681469
/// unsigned. S_BUFFER has an unsigned offset for all subtargets.
14691470
std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
1470-
int64_t ByteOffset, bool IsBuffer);
1471+
int64_t ByteOffset, bool IsBuffer,
1472+
bool HasSOffset = false);
14711473

14721474
/// \return The encoding that can be used for a 32-bit literal offset in an SMRD
14731475
/// instruction. This is only useful on CI.

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1234,7 +1234,15 @@ body: |
12341234
; GFX10: liveins: $sgpr0_sgpr1
12351235
; GFX10-NEXT: {{ $}}
12361236
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
1237-
; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], -1, 0 :: (load (s32), addrspace 4)
1237+
; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
1238+
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
1239+
; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
1240+
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
1241+
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
1242+
; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
1243+
; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
1244+
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
1245+
; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
12381246
; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
12391247
%0:sgpr(p4) = COPY $sgpr0_sgpr1
12401248
%1:sgpr(s64) = G_CONSTANT i64 -1
@@ -1304,7 +1312,15 @@ body: |
13041312
; GFX10: liveins: $sgpr0_sgpr1
13051313
; GFX10-NEXT: {{ $}}
13061314
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
1307-
; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], -524288, 0 :: (load (s32), addrspace 4)
1315+
; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -524288
1316+
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
1317+
; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
1318+
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
1319+
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
1320+
; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
1321+
; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
1322+
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
1323+
; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
13081324
; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
13091325
%0:sgpr(p4) = COPY $sgpr0_sgpr1
13101326
%1:sgpr(s64) = G_CONSTANT i64 -524288

llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,11 +88,13 @@ entry:
8888
ret void
8989
}
9090

91-
; GFX9_10 can use a signed immediate byte offset
91+
; GFX9+ can use a signed immediate byte offset but not without sgpr[offset]
9292
; GCN-LABEL: {{^}}smrd6:
9393
; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4
9494
; SICIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0
95-
; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], -0x4
95+
; GFX9_10: s_add_u32 s2, s2, -4
96+
; GFX9_10: s_addc_u32 s3, s3, -1
97+
; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0
9698
define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
9799
entry:
98100
%tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 -1

llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -297,20 +297,26 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
297297
; GFX9: ; %bb.0: ; %entry
298298
; GFX9-NEXT: .LBB5_1: ; %loop
299299
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
300-
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
301-
; GFX9-NEXT: s_load_dword s3, s[0:1], -0x190
302300
; GFX9-NEXT: s_add_i32 s2, s2, -1
301+
; GFX9-NEXT: s_add_u32 s4, s0, 0xfffffe70
302+
; GFX9-NEXT: s_addc_u32 s5, s1, -1
303+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
304+
; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0
303305
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
304306
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
305307
; GFX9-NEXT: ; %bb.2: ; %end
306308
; GFX9-NEXT: s_endpgm
307309
;
308310
; GFX12-LABEL: test_sink_smem_offset_neg400:
309311
; GFX12: ; %bb.0: ; %entry
312+
; GFX12-NEXT: s_movk_i32 s4, 0xfe70
313+
; GFX12-NEXT: s_mov_b32 s5, -1
314+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
315+
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
310316
; GFX12-NEXT: .LBB5_1: ; %loop
311317
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
312318
; GFX12-NEXT: s_wait_kmcnt 0x0
313-
; GFX12-NEXT: s_load_b32 s3, s[0:1], -0x190
319+
; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0
314320
; GFX12-NEXT: s_add_co_i32 s2, s2, -1
315321
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
316322
; GFX12-NEXT: s_cmp_lg_u32 s2, 0

llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll

Lines changed: 50 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,31 @@ define amdgpu_ps void @test_s_load_i8(ptr addrspace(4) inreg %in, ptr addrspace(
1919
}
2020

2121
define amdgpu_ps void @test_s_load_i8_imm(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
22-
; GCN-LABEL: test_s_load_i8_imm:
23-
; GCN: ; %bb.0:
24-
; GCN-NEXT: s_load_i8 s0, s[0:1], -0x64
25-
; GCN-NEXT: s_wait_kmcnt 0x0
26-
; GCN-NEXT: v_mov_b32_e32 v2, s0
27-
; GCN-NEXT: global_store_b32 v[0:1], v2, off
28-
; GCN-NEXT: s_nop 0
29-
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
30-
; GCN-NEXT: s_endpgm
22+
; DAG-LABEL: test_s_load_i8_imm:
23+
; DAG: ; %bb.0:
24+
; DAG-NEXT: s_movk_i32 s2, 0xff9c
25+
; DAG-NEXT: s_mov_b32 s3, -1
26+
; DAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
27+
; DAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
28+
; DAG-NEXT: s_load_i8 s0, s[0:1], 0x0
29+
; DAG-NEXT: s_wait_kmcnt 0x0
30+
; DAG-NEXT: v_mov_b32_e32 v2, s0
31+
; DAG-NEXT: global_store_b32 v[0:1], v2, off
32+
; DAG-NEXT: s_nop 0
33+
; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
34+
; DAG-NEXT: s_endpgm
35+
;
36+
; GISEL-LABEL: test_s_load_i8_imm:
37+
; GISEL: ; %bb.0:
38+
; GISEL-NEXT: s_add_co_u32 s0, s0, 0xffffff9c
39+
; GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
40+
; GISEL-NEXT: s_load_i8 s0, s[0:1], 0x0
41+
; GISEL-NEXT: s_wait_kmcnt 0x0
42+
; GISEL-NEXT: v_mov_b32_e32 v2, s0
43+
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
44+
; GISEL-NEXT: s_nop 0
45+
; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
46+
; GISEL-NEXT: s_endpgm
3147
%gep = getelementptr i8, ptr addrspace(4) %in, i64 -100
3248
%ld = load i8, ptr addrspace(4) %gep
3349
%sext = sext i8 %ld to i32
@@ -195,15 +211,31 @@ define amdgpu_ps void @test_s_load_i16(ptr addrspace(4) inreg %in, ptr addrspace
195211
}
196212

197213
define amdgpu_ps void @test_s_load_i16_imm(ptr addrspace(4) inreg %in, ptr addrspace(1) %out) {
198-
; GCN-LABEL: test_s_load_i16_imm:
199-
; GCN: ; %bb.0:
200-
; GCN-NEXT: s_load_i16 s0, s[0:1], -0xc8
201-
; GCN-NEXT: s_wait_kmcnt 0x0
202-
; GCN-NEXT: v_mov_b32_e32 v2, s0
203-
; GCN-NEXT: global_store_b32 v[0:1], v2, off
204-
; GCN-NEXT: s_nop 0
205-
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
206-
; GCN-NEXT: s_endpgm
214+
; DAG-LABEL: test_s_load_i16_imm:
215+
; DAG: ; %bb.0:
216+
; DAG-NEXT: s_movk_i32 s2, 0xff38
217+
; DAG-NEXT: s_mov_b32 s3, -1
218+
; DAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
219+
; DAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
220+
; DAG-NEXT: s_load_i16 s0, s[0:1], 0x0
221+
; DAG-NEXT: s_wait_kmcnt 0x0
222+
; DAG-NEXT: v_mov_b32_e32 v2, s0
223+
; DAG-NEXT: global_store_b32 v[0:1], v2, off
224+
; DAG-NEXT: s_nop 0
225+
; DAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
226+
; DAG-NEXT: s_endpgm
227+
;
228+
; GISEL-LABEL: test_s_load_i16_imm:
229+
; GISEL: ; %bb.0:
230+
; GISEL-NEXT: s_add_co_u32 s0, s0, 0xffffff38
231+
; GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
232+
; GISEL-NEXT: s_load_i16 s0, s[0:1], 0x0
233+
; GISEL-NEXT: s_wait_kmcnt 0x0
234+
; GISEL-NEXT: v_mov_b32_e32 v2, s0
235+
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
236+
; GISEL-NEXT: s_nop 0
237+
; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
238+
; GISEL-NEXT: s_endpgm
207239
%gep = getelementptr i16, ptr addrspace(4) %in, i64 -100
208240
%ld = load i16, ptr addrspace(4) %gep
209241
%sext = sext i16 %ld to i32

0 commit comments

Comments
 (0)