Skip to content

Commit 08a45d3

Browse files
committed
[NVPTX] Cleanup ld/st lowering
1 parent f1885fd commit 08a45d3

File tree

9 files changed

+229
-365
lines changed

9 files changed

+229
-365
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 174 additions & 276 deletions
Large diffs are not rendered by default.

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
7575
void SelectTexSurfHandle(SDNode *N);
7676
bool tryLoad(SDNode *N);
7777
bool tryLoadVector(SDNode *N);
78-
bool tryLDGLDU(SDNode *N);
78+
bool tryLDU(SDNode *N);
79+
bool tryLDG(MemSDNode *N);
7980
bool tryStore(SDNode *N);
8081
bool tryStoreVector(SDNode *N);
8182
bool tryLoadParam(SDNode *N);

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -135,11 +135,7 @@ def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
135135
def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
136136
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
137137
def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
138-
def hasVote : Predicate<"Subtarget->hasVote()">;
139-
def hasDouble : Predicate<"Subtarget->hasDouble()">;
140138
def hasClusters : Predicate<"Subtarget->hasClusters()">;
141-
def hasLDG : Predicate<"Subtarget->hasLDG()">;
142-
def hasLDU : Predicate<"Subtarget->hasLDU()">;
143139
def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">;
144140
def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">;
145141
def hasOptEnabled : Predicate<"TM.getOptLevel() != CodeGenOptLevel::None">;

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 39 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -2170,15 +2170,12 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
21702170

21712171
class LDU_G<string TyStr, NVPTXRegClass regclass>
21722172
: NVPTXInst<(outs regclass:$result), (ins ADDR:$src),
2173-
"ldu.global." # TyStr # " \t$result, [$src];",
2174-
[]>, Requires<[hasLDU]>;
2173+
"ldu.global." # TyStr # " \t$result, [$src];", []>;
21752174

2176-
def INT_PTX_LDU_GLOBAL_i8 : LDU_G<"b8", Int16Regs>;
2177-
def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
2178-
def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
2179-
def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;
2180-
def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"b32", Float32Regs>;
2181-
def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"b64", Float64Regs>;
2175+
def LDU_GLOBAL_i8 : LDU_G<"b8", Int16Regs>;
2176+
def LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
2177+
def LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
2178+
def LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;
21822179

21832180
// vector
21842181

@@ -2195,19 +2192,14 @@ class VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass>
21952192
"ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
21962193

21972194

2198-
def INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"b8", Int16Regs>;
2199-
def INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"b16", Int16Regs>;
2200-
def INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"b32", Int32Regs>;
2201-
def INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"b32", Float32Regs>;
2202-
def INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"b64", Int64Regs>;
2203-
def INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"b64", Float64Regs>;
2195+
def LDU_GLOBAL_v2i8 : VLDU_G_ELE_V2<"b8", Int16Regs>;
2196+
def LDU_GLOBAL_v2i16 : VLDU_G_ELE_V2<"b16", Int16Regs>;
2197+
def LDU_GLOBAL_v2i32 : VLDU_G_ELE_V2<"b32", Int32Regs>;
2198+
def LDU_GLOBAL_v2i64 : VLDU_G_ELE_V2<"b64", Int64Regs>;
22042199

2205-
def INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"b8", Int16Regs>;
2206-
def INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
2207-
def INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
2208-
def INT_PTX_LDU_G_v4f16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
2209-
def INT_PTX_LDU_G_v4f16x2_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
2210-
def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"b32", Float32Regs>;
2200+
def LDU_GLOBAL_v4i8 : VLDU_G_ELE_V4<"b8", Int16Regs>;
2201+
def LDU_GLOBAL_v4i16 : VLDU_G_ELE_V4<"b16", Int16Regs>;
2202+
def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<"b32", Int32Regs>;
22112203

22122204

22132205
//-----------------------------------
@@ -2218,55 +2210,47 @@ def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"b32", Float32Regs>;
22182210
// non-coherent texture cache, and therefore the values read must be read-only
22192211
// during the lifetime of the kernel.
22202212

2221-
class LDG_G<string TyStr, NVPTXRegClass regclass>
2222-
: NVPTXInst<(outs regclass:$result), (ins ADDR:$src),
2223-
"ld.global.nc." # TyStr # " \t$result, [$src];",
2224-
[]>, Requires<[hasLDG]>;
2213+
class LDG_G<NVPTXRegClass regclass>
2214+
: NVPTXInst<(outs regclass:$result), (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
2215+
"ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];", []>;
22252216

2226-
def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"b8", Int16Regs>;
2227-
def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"b16", Int16Regs>;
2228-
def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"b32", Int32Regs>;
2229-
def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"b64", Int64Regs>;
2230-
def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"b32", Float32Regs>;
2231-
def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"b64", Float64Regs>;
2217+
def LD_GLOBAL_NC_i8 : LDG_G<Int16Regs>;
2218+
def LD_GLOBAL_NC_i16 : LDG_G<Int16Regs>;
2219+
def LD_GLOBAL_NC_i32 : LDG_G<Int32Regs>;
2220+
def LD_GLOBAL_NC_i64 : LDG_G<Int64Regs>;
22322221

22332222
// vector
22342223

22352224
// Elementized vector ldg
2236-
class VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> :
2225+
class VLDG_G_ELE_V2<NVPTXRegClass regclass> :
22372226
NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2238-
(ins ADDR:$src),
2239-
"ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
2227+
(ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
2228+
"ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];", []>;
22402229

22412230

2242-
class VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> :
2231+
class VLDG_G_ELE_V4<NVPTXRegClass regclass> :
22432232
NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2244-
(ins ADDR:$src),
2245-
"ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
2233+
(ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
2234+
"ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
22462235

2247-
class VLDG_G_ELE_V8<string TyStr, NVPTXRegClass regclass> :
2236+
class VLDG_G_ELE_V8<NVPTXRegClass regclass> :
22482237
NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4,
22492238
regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8),
2250-
(ins ADDR:$src),
2251-
"ld.global.nc.v8." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;
2239+
(ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
2240+
"ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;
22522241

22532242
// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
2254-
def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"b8", Int16Regs>;
2255-
def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"b16", Int16Regs>;
2256-
def INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"b32", Int32Regs>;
2257-
def INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"b32", Float32Regs>;
2258-
def INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"b64", Int64Regs>;
2259-
def INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"b64", Float64Regs>;
2260-
2261-
def INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"b8", Int16Regs>;
2262-
def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"b16", Int16Regs>;
2263-
def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"b32", Int32Regs>;
2264-
def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"b32", Float32Regs>;
2265-
2266-
def INT_PTX_LDG_G_v4i64_ELE : VLDG_G_ELE_V4<"b64", Int64Regs>;
2267-
def INT_PTX_LDG_G_v4f64_ELE : VLDG_G_ELE_V4<"b64", Float64Regs>;
2268-
def INT_PTX_LDG_G_v8i32_ELE : VLDG_G_ELE_V8<"b32", Int32Regs>;
2269-
def INT_PTX_LDG_G_v8f32_ELE : VLDG_G_ELE_V8<"b32", Float32Regs>;
2243+
def LD_GLOBAL_NC_v2i8 : VLDG_G_ELE_V2<Int16Regs>;
2244+
def LD_GLOBAL_NC_v2i16 : VLDG_G_ELE_V2<Int16Regs>;
2245+
def LD_GLOBAL_NC_v2i32 : VLDG_G_ELE_V2<Int32Regs>;
2246+
def LD_GLOBAL_NC_v2i64 : VLDG_G_ELE_V2<Int64Regs>;
2247+
2248+
def LD_GLOBAL_NC_v4i8 : VLDG_G_ELE_V4<Int16Regs>;
2249+
def LD_GLOBAL_NC_v4i16 : VLDG_G_ELE_V4<Int16Regs>;
2250+
def LD_GLOBAL_NC_v4i32 : VLDG_G_ELE_V4<Int32Regs>;
2251+
2252+
def LD_GLOBAL_NC_v4i64 : VLDG_G_ELE_V4<Int64Regs>;
2253+
def LD_GLOBAL_NC_v8i32 : VLDG_G_ELE_V8<Int32Regs>;
22702254

22712255
multiclass NG_TO_G<string Str, bit Supports32 = 1, list<Predicate> Preds = []> {
22722256
if Supports32 then

llvm/test/CodeGen/NVPTX/bug26185-2.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p
1515
; CHECK-LABEL: spam(
1616
; CHECK: .maxntid 1, 1, 1
1717
; CHECK-NEXT: {
18-
; CHECK-NEXT: .reg .b16 %rs<2>;
1918
; CHECK-NEXT: .reg .b32 %r<2>;
2019
; CHECK-NEXT: .reg .b64 %rd<9>;
2120
; CHECK-EMPTY:
@@ -25,8 +24,7 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p
2524
; CHECK-NEXT: shl.b64 %rd3, %rd2, 1;
2625
; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3;
2726
; CHECK-NEXT: ld.param.b64 %rd5, [spam_param_1];
28-
; CHECK-NEXT: ld.global.nc.b16 %rs1, [%rd4+16];
29-
; CHECK-NEXT: cvt.s32.s16 %r1, %rs1;
27+
; CHECK-NEXT: ld.global.nc.s16 %r1, [%rd4+16];
3028
; CHECK-NEXT: mul.wide.s32 %rd6, %r1, %r1;
3129
; CHECK-NEXT: ld.global.b64 %rd7, [%rd5];
3230
; CHECK-NEXT: add.s64 %rd8, %rd6, %rd7;

llvm/test/CodeGen/NVPTX/bug26185.ll

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ target triple = "nvptx64-unknown-unknown"
1111
define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) {
1212
; CHECK-LABEL: ex_zext(
1313
; CHECK: {
14-
; CHECK-NEXT: .reg .b16 %rs<2>;
1514
; CHECK-NEXT: .reg .b32 %r<2>;
1615
; CHECK-NEXT: .reg .b64 %rd<5>;
1716
; CHECK-EMPTY:
@@ -20,8 +19,7 @@ define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) {
2019
; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
2120
; CHECK-NEXT: ld.param.b64 %rd3, [ex_zext_param_1];
2221
; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
23-
; CHECK-NEXT: ld.global.nc.b8 %rs1, [%rd2];
24-
; CHECK-NEXT: cvt.u32.u8 %r1, %rs1;
22+
; CHECK-NEXT: ld.global.nc.b8 %r1, [%rd2];
2523
; CHECK-NEXT: st.global.b32 [%rd4], %r1;
2624
; CHECK-NEXT: ret;
2725
entry:
@@ -34,7 +32,6 @@ entry:
3432
define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) {
3533
; CHECK-LABEL: ex_sext(
3634
; CHECK: {
37-
; CHECK-NEXT: .reg .b16 %rs<2>;
3835
; CHECK-NEXT: .reg .b32 %r<2>;
3936
; CHECK-NEXT: .reg .b64 %rd<5>;
4037
; CHECK-EMPTY:
@@ -43,8 +40,7 @@ define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) {
4340
; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
4441
; CHECK-NEXT: ld.param.b64 %rd3, [ex_sext_param_1];
4542
; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
46-
; CHECK-NEXT: ld.global.nc.b8 %rs1, [%rd2];
47-
; CHECK-NEXT: cvt.s32.s8 %r1, %rs1;
43+
; CHECK-NEXT: ld.global.nc.s8 %r1, [%rd2];
4844
; CHECK-NEXT: st.global.b32 [%rd4], %r1;
4945
; CHECK-NEXT: ret;
5046
entry:

llvm/test/CodeGen/NVPTX/i1-ext-load.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,14 @@ target triple = "nvptx-nvidia-cuda"
77

88
define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
99
; CHECK-LABEL: foo(
10-
; CHECK: .reg .b16 %rs<2>;
1110
; CHECK: .reg .b32 %r<4>;
1211
; CHECK: .reg .b64 %rd<5>;
1312
; CHECK-EMPTY:
1413
; CHECK: ld.param.b64 %rd1, [foo_param_0];
1514
; CHECK: cvta.to.global.u64 %rd2, %rd1;
1615
; CHECK: ld.param.b64 %rd3, [foo_param_1];
1716
; CHECK: cvta.to.global.u64 %rd4, %rd3;
18-
; CHECK: ld.global.nc.b8 %rs1, [%rd2];
19-
; CHECK: cvt.u32.u8 %r1, %rs1;
17+
; CHECK: ld.global.nc.b8 %r1, [%rd2];
2018
; CHECK: add.s32 %r2, %r1, 1;
2119
; CHECK: and.b32 %r3, %r2, 1;
2220
; CHECK: st.global.b32 [%rd4], %r3;

llvm/test/CodeGen/NVPTX/ldu-ldg.ll

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -163,14 +163,12 @@ define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
163163
define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
164164
; CHECK-LABEL: test_ldg_i8(
165165
; CHECK: {
166-
; CHECK-NEXT: .reg .b16 %rs<2>;
167166
; CHECK-NEXT: .reg .b32 %r<2>;
168167
; CHECK-NEXT: .reg .b64 %rd<2>;
169168
; CHECK-EMPTY:
170169
; CHECK-NEXT: // %bb.0:
171170
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_i8_param_0];
172-
; CHECK-NEXT: ld.global.nc.b8 %rs1, [%rd1];
173-
; CHECK-NEXT: cvt.u32.u8 %r1, %rs1;
171+
; CHECK-NEXT: ld.global.nc.b8 %r1, [%rd1];
174172
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
175173
; CHECK-NEXT: ret;
176174
%val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
@@ -180,14 +178,12 @@ define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
180178
define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
181179
; CHECK-LABEL: test_ldg_i16(
182180
; CHECK: {
183-
; CHECK-NEXT: .reg .b16 %rs<2>;
184181
; CHECK-NEXT: .reg .b32 %r<2>;
185182
; CHECK-NEXT: .reg .b64 %rd<2>;
186183
; CHECK-EMPTY:
187184
; CHECK-NEXT: // %bb.0:
188185
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_i16_param_0];
189-
; CHECK-NEXT: ld.global.nc.b16 %rs1, [%rd1];
190-
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
186+
; CHECK-NEXT: ld.global.nc.b16 %r1, [%rd1];
191187
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
192188
; CHECK-NEXT: ret;
193189
%val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)

llvm/test/CodeGen/NVPTX/variadics-backend.ll

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ define dso_local i32 @bar() {
211211
; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot3[24];
212212
; CHECK-PTX-NEXT: .reg .b64 %SP;
213213
; CHECK-PTX-NEXT: .reg .b64 %SPL;
214-
; CHECK-PTX-NEXT: .reg .b16 %rs<8>;
214+
; CHECK-PTX-NEXT: .reg .b16 %rs<5>;
215215
; CHECK-PTX-NEXT: .reg .b32 %r<4>;
216216
; CHECK-PTX-NEXT: .reg .b64 %rd<5>;
217217
; CHECK-PTX-EMPTY:
@@ -220,18 +220,15 @@ define dso_local i32 @bar() {
220220
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
221221
; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0;
222222
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs1, [__const_$_bar_$_s1+7];
223-
; CHECK-PTX-NEXT: cvt.u16.u8 %rs2, %rs1;
224-
; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs2;
225-
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+6];
226-
; CHECK-PTX-NEXT: cvt.u16.u8 %rs4, %rs3;
227-
; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs4;
228-
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs5, [__const_$_bar_$_s1+5];
229-
; CHECK-PTX-NEXT: cvt.u16.u8 %rs6, %rs5;
230-
; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs6;
223+
; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs1;
224+
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs2, [__const_$_bar_$_s1+6];
225+
; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs2;
226+
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5];
227+
; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs3;
231228
; CHECK-PTX-NEXT: mov.b32 %r1, 1;
232229
; CHECK-PTX-NEXT: st.b32 [%SP+8], %r1;
233-
; CHECK-PTX-NEXT: mov.b16 %rs7, 1;
234-
; CHECK-PTX-NEXT: st.b8 [%SP+12], %rs7;
230+
; CHECK-PTX-NEXT: mov.b16 %rs4, 1;
231+
; CHECK-PTX-NEXT: st.b8 [%SP+12], %rs4;
235232
; CHECK-PTX-NEXT: mov.b64 %rd3, 1;
236233
; CHECK-PTX-NEXT: st.b64 [%SP+16], %rd3;
237234
; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 8;

0 commit comments

Comments
 (0)