Skip to content

Commit 706c502

Browse files
committed
[NVPTX] Cleanup ld/st lowering
1 parent 01d58d9 commit 706c502

File tree

9 files changed

+229
-365
lines changed

9 files changed

+229
-365
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 174 additions & 276 deletions
Large diffs are not rendered by default.

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -75,7 +75,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
7575
void SelectTexSurfHandle(SDNode *N);
7676
bool tryLoad(SDNode *N);
7777
bool tryLoadVector(SDNode *N);
78-
bool tryLDGLDU(SDNode *N);
78+
bool tryLDU(SDNode *N);
79+
bool tryLDG(MemSDNode *N);
7980
bool tryStore(SDNode *N);
8081
bool tryStoreVector(SDNode *N);
8182
bool tryLoadParam(SDNode *N);

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -135,11 +135,7 @@ def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
135135
def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
136136
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
137137
def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
138-
def hasVote : Predicate<"Subtarget->hasVote()">;
139-
def hasDouble : Predicate<"Subtarget->hasDouble()">;
140138
def hasClusters : Predicate<"Subtarget->hasClusters()">;
141-
def hasLDG : Predicate<"Subtarget->hasLDG()">;
142-
def hasLDU : Predicate<"Subtarget->hasLDU()">;
143139
def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">;
144140
def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">;
145141
def hasOptEnabled : Predicate<"TM.getOptLevel() != CodeGenOptLevel::None">;

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 39 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -2143,15 +2143,12 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
21432143

21442144
class LDU_G<string TyStr, NVPTXRegClass regclass>
21452145
: NVPTXInst<(outs regclass:$result), (ins ADDR:$src),
2146-
"ldu.global." # TyStr # " \t$result, [$src];",
2147-
[]>, Requires<[hasLDU]>;
2146+
"ldu.global." # TyStr # " \t$result, [$src];", []>;
21482147

2149-
def INT_PTX_LDU_GLOBAL_i8 : LDU_G<"b8", Int16Regs>;
2150-
def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
2151-
def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
2152-
def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;
2153-
def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"b32", Float32Regs>;
2154-
def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"b64", Float64Regs>;
2148+
def LDU_GLOBAL_i8 : LDU_G<"b8", Int16Regs>;
2149+
def LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
2150+
def LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
2151+
def LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;
21552152

21562153
// vector
21572154

@@ -2168,19 +2165,14 @@ class VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass>
21682165
"ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
21692166

21702167

2171-
def INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"b8", Int16Regs>;
2172-
def INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"b16", Int16Regs>;
2173-
def INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"b32", Int32Regs>;
2174-
def INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"b32", Float32Regs>;
2175-
def INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"b64", Int64Regs>;
2176-
def INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"b64", Float64Regs>;
2168+
def LDU_GLOBAL_v2i8 : VLDU_G_ELE_V2<"b8", Int16Regs>;
2169+
def LDU_GLOBAL_v2i16 : VLDU_G_ELE_V2<"b16", Int16Regs>;
2170+
def LDU_GLOBAL_v2i32 : VLDU_G_ELE_V2<"b32", Int32Regs>;
2171+
def LDU_GLOBAL_v2i64 : VLDU_G_ELE_V2<"b64", Int64Regs>;
21772172

2178-
def INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"b8", Int16Regs>;
2179-
def INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
2180-
def INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
2181-
def INT_PTX_LDU_G_v4f16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
2182-
def INT_PTX_LDU_G_v4f16x2_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
2183-
def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"b32", Float32Regs>;
2173+
def LDU_GLOBAL_v4i8 : VLDU_G_ELE_V4<"b8", Int16Regs>;
2174+
def LDU_GLOBAL_v4i16 : VLDU_G_ELE_V4<"b16", Int16Regs>;
2175+
def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<"b32", Int32Regs>;
21842176

21852177

21862178
//-----------------------------------
@@ -2191,55 +2183,47 @@ def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"b32", Float32Regs>;
21912183
// non-coherent texture cache, and therefore the values read must be read-only
21922184
// during the lifetime of the kernel.
21932185

2194-
class LDG_G<string TyStr, NVPTXRegClass regclass>
2195-
: NVPTXInst<(outs regclass:$result), (ins ADDR:$src),
2196-
"ld.global.nc." # TyStr # " \t$result, [$src];",
2197-
[]>, Requires<[hasLDG]>;
2186+
class LDG_G<NVPTXRegClass regclass>
2187+
: NVPTXInst<(outs regclass:$result), (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
2188+
"ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];", []>;
21982189

2199-
def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"b8", Int16Regs>;
2200-
def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"b16", Int16Regs>;
2201-
def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"b32", Int32Regs>;
2202-
def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"b64", Int64Regs>;
2203-
def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"b32", Float32Regs>;
2204-
def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"b64", Float64Regs>;
2190+
def LD_GLOBAL_NC_i8 : LDG_G<Int16Regs>;
2191+
def LD_GLOBAL_NC_i16 : LDG_G<Int16Regs>;
2192+
def LD_GLOBAL_NC_i32 : LDG_G<Int32Regs>;
2193+
def LD_GLOBAL_NC_i64 : LDG_G<Int64Regs>;
22052194

22062195
// vector
22072196

22082197
// Elementized vector ldg
2209-
class VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> :
2198+
class VLDG_G_ELE_V2<NVPTXRegClass regclass> :
22102199
NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2211-
(ins ADDR:$src),
2212-
"ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
2200+
(ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
2201+
"ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];", []>;
22132202

22142203

2215-
class VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> :
2204+
class VLDG_G_ELE_V4<NVPTXRegClass regclass> :
22162205
NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2217-
(ins ADDR:$src),
2218-
"ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
2206+
(ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
2207+
"ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
22192208

2220-
class VLDG_G_ELE_V8<string TyStr, NVPTXRegClass regclass> :
2209+
class VLDG_G_ELE_V8<NVPTXRegClass regclass> :
22212210
NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4,
22222211
regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8),
2223-
(ins ADDR:$src),
2224-
"ld.global.nc.v8." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;
2212+
(ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
2213+
"ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;
22252214

22262215
// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
2227-
def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"b8", Int16Regs>;
2228-
def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"b16", Int16Regs>;
2229-
def INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"b32", Int32Regs>;
2230-
def INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"b32", Float32Regs>;
2231-
def INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"b64", Int64Regs>;
2232-
def INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"b64", Float64Regs>;
2233-
2234-
def INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"b8", Int16Regs>;
2235-
def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"b16", Int16Regs>;
2236-
def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"b32", Int32Regs>;
2237-
def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"b32", Float32Regs>;
2238-
2239-
def INT_PTX_LDG_G_v4i64_ELE : VLDG_G_ELE_V4<"b64", Int64Regs>;
2240-
def INT_PTX_LDG_G_v4f64_ELE : VLDG_G_ELE_V4<"b64", Float64Regs>;
2241-
def INT_PTX_LDG_G_v8i32_ELE : VLDG_G_ELE_V8<"b32", Int32Regs>;
2242-
def INT_PTX_LDG_G_v8f32_ELE : VLDG_G_ELE_V8<"b32", Float32Regs>;
2216+
def LD_GLOBAL_NC_v2i8 : VLDG_G_ELE_V2<Int16Regs>;
2217+
def LD_GLOBAL_NC_v2i16 : VLDG_G_ELE_V2<Int16Regs>;
2218+
def LD_GLOBAL_NC_v2i32 : VLDG_G_ELE_V2<Int32Regs>;
2219+
def LD_GLOBAL_NC_v2i64 : VLDG_G_ELE_V2<Int64Regs>;
2220+
2221+
def LD_GLOBAL_NC_v4i8 : VLDG_G_ELE_V4<Int16Regs>;
2222+
def LD_GLOBAL_NC_v4i16 : VLDG_G_ELE_V4<Int16Regs>;
2223+
def LD_GLOBAL_NC_v4i32 : VLDG_G_ELE_V4<Int32Regs>;
2224+
2225+
def LD_GLOBAL_NC_v4i64 : VLDG_G_ELE_V4<Int64Regs>;
2226+
def LD_GLOBAL_NC_v8i32 : VLDG_G_ELE_V8<Int32Regs>;
22432227

22442228
multiclass NG_TO_G<string Str, bit Supports32 = 1, list<Predicate> Preds = []> {
22452229
if Supports32 then

llvm/test/CodeGen/NVPTX/bug26185-2.ll

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,6 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p
1515
; CHECK-LABEL: spam(
1616
; CHECK: .maxntid 1, 1, 1
1717
; CHECK-NEXT: {
18-
; CHECK-NEXT: .reg .b16 %rs<2>;
1918
; CHECK-NEXT: .reg .b32 %r<2>;
2019
; CHECK-NEXT: .reg .b64 %rd<9>;
2120
; CHECK-EMPTY:
@@ -25,8 +24,7 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p
2524
; CHECK-NEXT: shl.b64 %rd3, %rd2, 1;
2625
; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3;
2726
; CHECK-NEXT: ld.param.b64 %rd5, [spam_param_1];
28-
; CHECK-NEXT: ld.global.nc.b16 %rs1, [%rd4+16];
29-
; CHECK-NEXT: cvt.s32.s16 %r1, %rs1;
27+
; CHECK-NEXT: ld.global.nc.s16 %r1, [%rd4+16];
3028
; CHECK-NEXT: mul.wide.s32 %rd6, %r1, %r1;
3129
; CHECK-NEXT: ld.global.b64 %rd7, [%rd5];
3230
; CHECK-NEXT: add.s64 %rd8, %rd6, %rd7;

llvm/test/CodeGen/NVPTX/bug26185.ll

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,6 @@ target triple = "nvptx64-unknown-unknown"
1111
define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) {
1212
; CHECK-LABEL: ex_zext(
1313
; CHECK: {
14-
; CHECK-NEXT: .reg .b16 %rs<2>;
1514
; CHECK-NEXT: .reg .b32 %r<2>;
1615
; CHECK-NEXT: .reg .b64 %rd<5>;
1716
; CHECK-EMPTY:
@@ -20,8 +19,7 @@ define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) {
2019
; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
2120
; CHECK-NEXT: ld.param.b64 %rd3, [ex_zext_param_1];
2221
; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
23-
; CHECK-NEXT: ld.global.nc.b8 %rs1, [%rd2];
24-
; CHECK-NEXT: cvt.u32.u8 %r1, %rs1;
22+
; CHECK-NEXT: ld.global.nc.b8 %r1, [%rd2];
2523
; CHECK-NEXT: st.global.b32 [%rd4], %r1;
2624
; CHECK-NEXT: ret;
2725
entry:
@@ -34,7 +32,6 @@ entry:
3432
define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) {
3533
; CHECK-LABEL: ex_sext(
3634
; CHECK: {
37-
; CHECK-NEXT: .reg .b16 %rs<2>;
3835
; CHECK-NEXT: .reg .b32 %r<2>;
3936
; CHECK-NEXT: .reg .b64 %rd<5>;
4037
; CHECK-EMPTY:
@@ -43,8 +40,7 @@ define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) {
4340
; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
4441
; CHECK-NEXT: ld.param.b64 %rd3, [ex_sext_param_1];
4542
; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
46-
; CHECK-NEXT: ld.global.nc.b8 %rs1, [%rd2];
47-
; CHECK-NEXT: cvt.s32.s8 %r1, %rs1;
43+
; CHECK-NEXT: ld.global.nc.s8 %r1, [%rd2];
4844
; CHECK-NEXT: st.global.b32 [%rd4], %r1;
4945
; CHECK-NEXT: ret;
5046
entry:

llvm/test/CodeGen/NVPTX/i1-ext-load.ll

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7,16 +7,14 @@ target triple = "nvptx-nvidia-cuda"
77

88
define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
99
; CHECK-LABEL: foo(
10-
; CHECK: .reg .b16 %rs<2>;
1110
; CHECK: .reg .b32 %r<4>;
1211
; CHECK: .reg .b64 %rd<5>;
1312
; CHECK-EMPTY:
1413
; CHECK: ld.param.b64 %rd1, [foo_param_0];
1514
; CHECK: cvta.to.global.u64 %rd2, %rd1;
1615
; CHECK: ld.param.b64 %rd3, [foo_param_1];
1716
; CHECK: cvta.to.global.u64 %rd4, %rd3;
18-
; CHECK: ld.global.nc.b8 %rs1, [%rd2];
19-
; CHECK: cvt.u32.u8 %r1, %rs1;
17+
; CHECK: ld.global.nc.b8 %r1, [%rd2];
2018
; CHECK: add.s32 %r2, %r1, 1;
2119
; CHECK: and.b32 %r3, %r2, 1;
2220
; CHECK: st.global.b32 [%rd4], %r3;

llvm/test/CodeGen/NVPTX/ldu-ldg.ll

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -163,14 +163,12 @@ define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
163163
define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
164164
; CHECK-LABEL: test_ldg_i8(
165165
; CHECK: {
166-
; CHECK-NEXT: .reg .b16 %rs<2>;
167166
; CHECK-NEXT: .reg .b32 %r<2>;
168167
; CHECK-NEXT: .reg .b64 %rd<2>;
169168
; CHECK-EMPTY:
170169
; CHECK-NEXT: // %bb.0:
171170
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_i8_param_0];
172-
; CHECK-NEXT: ld.global.nc.b8 %rs1, [%rd1];
173-
; CHECK-NEXT: cvt.u32.u8 %r1, %rs1;
171+
; CHECK-NEXT: ld.global.nc.b8 %r1, [%rd1];
174172
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
175173
; CHECK-NEXT: ret;
176174
%val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
@@ -180,14 +178,12 @@ define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
180178
define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
181179
; CHECK-LABEL: test_ldg_i16(
182180
; CHECK: {
183-
; CHECK-NEXT: .reg .b16 %rs<2>;
184181
; CHECK-NEXT: .reg .b32 %r<2>;
185182
; CHECK-NEXT: .reg .b64 %rd<2>;
186183
; CHECK-EMPTY:
187184
; CHECK-NEXT: // %bb.0:
188185
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_i16_param_0];
189-
; CHECK-NEXT: ld.global.nc.b16 %rs1, [%rd1];
190-
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
186+
; CHECK-NEXT: ld.global.nc.b16 %r1, [%rd1];
191187
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
192188
; CHECK-NEXT: ret;
193189
%val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)

llvm/test/CodeGen/NVPTX/variadics-backend.ll

Lines changed: 8 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -211,7 +211,7 @@ define dso_local i32 @bar() {
211211
; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot3[24];
212212
; CHECK-PTX-NEXT: .reg .b64 %SP;
213213
; CHECK-PTX-NEXT: .reg .b64 %SPL;
214-
; CHECK-PTX-NEXT: .reg .b16 %rs<8>;
214+
; CHECK-PTX-NEXT: .reg .b16 %rs<5>;
215215
; CHECK-PTX-NEXT: .reg .b32 %r<4>;
216216
; CHECK-PTX-NEXT: .reg .b64 %rd<5>;
217217
; CHECK-PTX-EMPTY:
@@ -220,18 +220,15 @@ define dso_local i32 @bar() {
220220
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
221221
; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0;
222222
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs1, [__const_$_bar_$_s1+7];
223-
; CHECK-PTX-NEXT: cvt.u16.u8 %rs2, %rs1;
224-
; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs2;
225-
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+6];
226-
; CHECK-PTX-NEXT: cvt.u16.u8 %rs4, %rs3;
227-
; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs4;
228-
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs5, [__const_$_bar_$_s1+5];
229-
; CHECK-PTX-NEXT: cvt.u16.u8 %rs6, %rs5;
230-
; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs6;
223+
; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs1;
224+
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs2, [__const_$_bar_$_s1+6];
225+
; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs2;
226+
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5];
227+
; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs3;
231228
; CHECK-PTX-NEXT: mov.b32 %r1, 1;
232229
; CHECK-PTX-NEXT: st.b32 [%SP+8], %r1;
233-
; CHECK-PTX-NEXT: mov.b16 %rs7, 1;
234-
; CHECK-PTX-NEXT: st.b8 [%SP+12], %rs7;
230+
; CHECK-PTX-NEXT: mov.b16 %rs4, 1;
231+
; CHECK-PTX-NEXT: st.b8 [%SP+12], %rs4;
235232
; CHECK-PTX-NEXT: mov.b64 %rd3, 1;
236233
; CHECK-PTX-NEXT: st.b64 [%SP+16], %rd3;
237234
; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 8;

0 commit comments

Comments
 (0)