Skip to content

[NVPTX] Cleanup ld/st lowering #143936

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
450 changes: 174 additions & 276 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
void SelectTexSurfHandle(SDNode *N);
bool tryLoad(SDNode *N);
bool tryLoadVector(SDNode *N);
bool tryLDGLDU(SDNode *N);
bool tryLDU(SDNode *N);
bool tryLDG(MemSDNode *N);
bool tryStore(SDNode *N);
bool tryStoreVector(SDNode *N);
bool tryLoadParam(SDNode *N);
Expand Down
4 changes: 0 additions & 4 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -135,11 +135,7 @@ def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote : Predicate<"Subtarget->hasVote()">;
def hasDouble : Predicate<"Subtarget->hasDouble()">;
def hasClusters : Predicate<"Subtarget->hasClusters()">;
def hasLDG : Predicate<"Subtarget->hasLDG()">;
def hasLDU : Predicate<"Subtarget->hasLDU()">;
def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">;
def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">;
def hasOptEnabled : Predicate<"TM.getOptLevel() != CodeGenOptLevel::None">;
Expand Down
94 changes: 39 additions & 55 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -2143,15 +2143,12 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;

class LDU_G<string TyStr, NVPTXRegClass regclass>
: NVPTXInst<(outs regclass:$result), (ins ADDR:$src),
"ldu.global." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDU]>;
"ldu.global." # TyStr # " \t$result, [$src];", []>;

def INT_PTX_LDU_GLOBAL_i8 : LDU_G<"b8", Int16Regs>;
def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;
def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"b32", Float32Regs>;
def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"b64", Float64Regs>;
def LDU_GLOBAL_i8 : LDU_G<"b8", Int16Regs>;
def LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
def LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
def LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;

// vector

Expand All @@ -2168,19 +2165,14 @@ class VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass>
"ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;


def INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"b8", Int16Regs>;
def INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"b16", Int16Regs>;
def INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"b32", Int32Regs>;
def INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"b32", Float32Regs>;
def INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"b64", Int64Regs>;
def INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"b64", Float64Regs>;
def LDU_GLOBAL_v2i8 : VLDU_G_ELE_V2<"b8", Int16Regs>;
def LDU_GLOBAL_v2i16 : VLDU_G_ELE_V2<"b16", Int16Regs>;
def LDU_GLOBAL_v2i32 : VLDU_G_ELE_V2<"b32", Int32Regs>;
def LDU_GLOBAL_v2i64 : VLDU_G_ELE_V2<"b64", Int64Regs>;

def INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"b8", Int16Regs>;
def INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
def INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
def INT_PTX_LDU_G_v4f16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
def INT_PTX_LDU_G_v4f16x2_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"b32", Float32Regs>;
def LDU_GLOBAL_v4i8 : VLDU_G_ELE_V4<"b8", Int16Regs>;
def LDU_GLOBAL_v4i16 : VLDU_G_ELE_V4<"b16", Int16Regs>;
def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<"b32", Int32Regs>;


//-----------------------------------
Expand All @@ -2191,55 +2183,47 @@ def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"b32", Float32Regs>;
// non-coherent texture cache, and therefore the values read must be read-only
// during the lifetime of the kernel.

class LDG_G<string TyStr, NVPTXRegClass regclass>
: NVPTXInst<(outs regclass:$result), (ins ADDR:$src),
"ld.global.nc." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDG]>;
class LDG_G<NVPTXRegClass regclass>
: NVPTXInst<(outs regclass:$result), (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
"ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];", []>;

def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"b8", Int16Regs>;
def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"b16", Int16Regs>;
def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"b32", Int32Regs>;
def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"b64", Int64Regs>;
def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"b32", Float32Regs>;
def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"b64", Float64Regs>;
def LD_GLOBAL_NC_i8 : LDG_G<Int16Regs>;
def LD_GLOBAL_NC_i16 : LDG_G<Int16Regs>;
def LD_GLOBAL_NC_i32 : LDG_G<Int32Regs>;
def LD_GLOBAL_NC_i64 : LDG_G<Int64Regs>;

// vector

// Elementized vector ldg
class VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> :
class VLDG_G_ELE_V2<NVPTXRegClass regclass> :
NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins ADDR:$src),
"ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
(ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
"ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];", []>;


class VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> :
class VLDG_G_ELE_V4<NVPTXRegClass regclass> :
NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins ADDR:$src),
"ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
(ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
"ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;

class VLDG_G_ELE_V8<string TyStr, NVPTXRegClass regclass> :
class VLDG_G_ELE_V8<NVPTXRegClass regclass> :
NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4,
regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8),
(ins ADDR:$src),
"ld.global.nc.v8." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;
(ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
"ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;

// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"b8", Int16Regs>;
def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"b16", Int16Regs>;
def INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"b32", Int32Regs>;
def INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"b32", Float32Regs>;
def INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"b64", Int64Regs>;
def INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"b64", Float64Regs>;

def INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"b8", Int16Regs>;
def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"b16", Int16Regs>;
def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"b32", Int32Regs>;
def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"b32", Float32Regs>;

def INT_PTX_LDG_G_v4i64_ELE : VLDG_G_ELE_V4<"b64", Int64Regs>;
def INT_PTX_LDG_G_v4f64_ELE : VLDG_G_ELE_V4<"b64", Float64Regs>;
def INT_PTX_LDG_G_v8i32_ELE : VLDG_G_ELE_V8<"b32", Int32Regs>;
def INT_PTX_LDG_G_v8f32_ELE : VLDG_G_ELE_V8<"b32", Float32Regs>;
def LD_GLOBAL_NC_v2i8 : VLDG_G_ELE_V2<Int16Regs>;
def LD_GLOBAL_NC_v2i16 : VLDG_G_ELE_V2<Int16Regs>;
def LD_GLOBAL_NC_v2i32 : VLDG_G_ELE_V2<Int32Regs>;
def LD_GLOBAL_NC_v2i64 : VLDG_G_ELE_V2<Int64Regs>;

def LD_GLOBAL_NC_v4i8 : VLDG_G_ELE_V4<Int16Regs>;
def LD_GLOBAL_NC_v4i16 : VLDG_G_ELE_V4<Int16Regs>;
def LD_GLOBAL_NC_v4i32 : VLDG_G_ELE_V4<Int32Regs>;

def LD_GLOBAL_NC_v4i64 : VLDG_G_ELE_V4<Int64Regs>;
def LD_GLOBAL_NC_v8i32 : VLDG_G_ELE_V8<Int32Regs>;

multiclass NG_TO_G<string Str, bit Supports32 = 1, list<Predicate> Preds = []> {
if Supports32 then
Expand Down
22 changes: 19 additions & 3 deletions llvm/test/CodeGen/NVPTX/bug26185-2.ll
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}

Expand All @@ -10,14 +11,29 @@
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

; CHECK-LABEL: spam
define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1, i64 %arg2, i64 %arg3) #0 {
; CHECK-LABEL: spam(
; CHECK: .maxntid 1, 1, 1
; CHECK-NEXT: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %bb
; CHECK-NEXT: ld.param.b64 %rd1, [spam_param_0];
; CHECK-NEXT: ld.param.b64 %rd2, [spam_param_3];
; CHECK-NEXT: shl.b64 %rd3, %rd2, 1;
; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3;
; CHECK-NEXT: ld.param.b64 %rd5, [spam_param_1];
; CHECK-NEXT: ld.global.nc.s16 %r1, [%rd4+16];
; CHECK-NEXT: mul.wide.s32 %rd6, %r1, %r1;
; CHECK-NEXT: ld.global.b64 %rd7, [%rd5];
; CHECK-NEXT: add.s64 %rd8, %rd6, %rd7;
; CHECK-NEXT: st.global.b64 [%rd5], %rd8;
; CHECK-NEXT: ret;
bb:
%tmp5 = add nsw i64 %arg3, 8
%tmp6 = getelementptr i16, ptr addrspace(1) %arg, i64 %tmp5
; CHECK: ld.global.nc.b16
%tmp7 = load i16, ptr addrspace(1) %tmp6, align 2
; CHECK: cvt.s32.s16
%tmp8 = sext i16 %tmp7 to i64
%tmp9 = mul nsw i64 %tmp8, %tmp8
%tmp10 = load i64, ptr addrspace(1) %arg1, align 8
Expand Down
73 changes: 61 additions & 12 deletions llvm/test/CodeGen/NVPTX/bug26185.ll
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}

Expand All @@ -7,45 +8,93 @@
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-unknown-unknown"

; CHECK-LABEL: ex_zext
define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) {
; CHECK-LABEL: ex_zext(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b64 %rd1, [ex_zext_param_0];
; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
; CHECK-NEXT: ld.param.b64 %rd3, [ex_zext_param_1];
; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
; CHECK-NEXT: ld.global.nc.b8 %r1, [%rd2];
; CHECK-NEXT: st.global.b32 [%rd4], %r1;
; CHECK-NEXT: ret;
entry:
; CHECK: ld.global.nc.b8
%val = load i8, ptr %data
; CHECK: cvt.u32.u8
%valext = zext i8 %val to i32
store i32 %valext, ptr %res
ret void
}

; CHECK-LABEL: ex_sext
define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) {
; CHECK-LABEL: ex_sext(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b64 %rd1, [ex_sext_param_0];
; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
; CHECK-NEXT: ld.param.b64 %rd3, [ex_sext_param_1];
; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
; CHECK-NEXT: ld.global.nc.s8 %r1, [%rd2];
; CHECK-NEXT: st.global.b32 [%rd4], %r1;
; CHECK-NEXT: ret;
entry:
; CHECK: ld.global.nc.b8
%val = load i8, ptr %data
; CHECK: cvt.s32.s8
%valext = sext i8 %val to i32
store i32 %valext, ptr %res
ret void
}

; CHECK-LABEL: ex_zext_v2
define ptx_kernel void @ex_zext_v2(ptr noalias readonly %data, ptr %res) {
; CHECK-LABEL: ex_zext_v2(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b64 %rd1, [ex_zext_v2_param_0];
; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
; CHECK-NEXT: ld.param.b64 %rd3, [ex_zext_v2_param_1];
; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
; CHECK-NEXT: ld.global.nc.v2.b8 {%rs1, %rs2}, [%rd2];
; CHECK-NEXT: cvt.u32.u16 %r1, %rs2;
; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
; CHECK-NEXT: st.global.v2.b32 [%rd4], {%r2, %r1};
; CHECK-NEXT: ret;
entry:
; CHECK: ld.global.nc.v2.b8
%val = load <2 x i8>, ptr %data
; CHECK: cvt.u32.u16
%valext = zext <2 x i8> %val to <2 x i32>
store <2 x i32> %valext, ptr %res
ret void
}

; CHECK-LABEL: ex_sext_v2
define ptx_kernel void @ex_sext_v2(ptr noalias readonly %data, ptr %res) {
; CHECK-LABEL: ex_sext_v2(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b64 %rd1, [ex_sext_v2_param_0];
; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1;
; CHECK-NEXT: ld.param.b64 %rd3, [ex_sext_v2_param_1];
; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3;
; CHECK-NEXT: ld.global.nc.v2.b8 {%rs1, %rs2}, [%rd2];
; CHECK-NEXT: cvt.u32.u16 %r1, %rs2;
; CHECK-NEXT: cvt.s32.s8 %r2, %r1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs1;
; CHECK-NEXT: cvt.s32.s8 %r4, %r3;
; CHECK-NEXT: st.global.v2.b32 [%rd4], {%r4, %r2};
; CHECK-NEXT: ret;
entry:
; CHECK: ld.global.nc.v2.b8
%val = load <2 x i8>, ptr %data
; CHECK: cvt.s32.s8
%valext = sext <2 x i8> %val to <2 x i32>
store <2 x i32> %valext, ptr %res
ret void
Expand Down
4 changes: 1 addition & 3 deletions llvm/test/CodeGen/NVPTX/i1-ext-load.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,14 @@ target triple = "nvptx-nvidia-cuda"

define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
; CHECK-LABEL: foo(
; CHECK: .reg .b16 %rs<2>;
; CHECK: .reg .b32 %r<4>;
; CHECK: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK: ld.param.b64 %rd1, [foo_param_0];
; CHECK: cvta.to.global.u64 %rd2, %rd1;
; CHECK: ld.param.b64 %rd3, [foo_param_1];
; CHECK: cvta.to.global.u64 %rd4, %rd3;
; CHECK: ld.global.nc.b8 %rs1, [%rd2];
; CHECK: cvt.u32.u8 %r1, %rs1;
; CHECK: ld.global.nc.b8 %r1, [%rd2];
; CHECK: add.s32 %r2, %r1, 1;
; CHECK: and.b32 %r3, %r2, 1;
; CHECK: st.global.b32 [%rd4], %r3;
Expand Down
8 changes: 2 additions & 6 deletions llvm/test/CodeGen/NVPTX/ldu-ldg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -163,14 +163,12 @@ define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
; CHECK-LABEL: test_ldg_i8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_i8_param_0];
; CHECK-NEXT: ld.global.nc.b8 %rs1, [%rd1];
; CHECK-NEXT: cvt.u32.u8 %r1, %rs1;
; CHECK-NEXT: ld.global.nc.b8 %r1, [%rd1];
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
Expand All @@ -180,14 +178,12 @@ define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
; CHECK-LABEL: test_ldg_i16(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_i16_param_0];
; CHECK-NEXT: ld.global.nc.b16 %rs1, [%rd1];
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
; CHECK-NEXT: ld.global.nc.b16 %r1, [%rd1];
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
Expand Down
19 changes: 8 additions & 11 deletions llvm/test/CodeGen/NVPTX/variadics-backend.ll
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ define dso_local i32 @bar() {
; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot3[24];
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b16 %rs<8>;
; CHECK-PTX-NEXT: .reg .b16 %rs<5>;
; CHECK-PTX-NEXT: .reg .b32 %r<4>;
; CHECK-PTX-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX-EMPTY:
Expand All @@ -220,18 +220,15 @@ define dso_local i32 @bar() {
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0;
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs1, [__const_$_bar_$_s1+7];
; CHECK-PTX-NEXT: cvt.u16.u8 %rs2, %rs1;
; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs2;
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+6];
; CHECK-PTX-NEXT: cvt.u16.u8 %rs4, %rs3;
; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs4;
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs5, [__const_$_bar_$_s1+5];
; CHECK-PTX-NEXT: cvt.u16.u8 %rs6, %rs5;
; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs6;
; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs1;
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs2, [__const_$_bar_$_s1+6];
; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs2;
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5];
; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs3;
; CHECK-PTX-NEXT: mov.b32 %r1, 1;
; CHECK-PTX-NEXT: st.b32 [%SP+8], %r1;
; CHECK-PTX-NEXT: mov.b16 %rs7, 1;
; CHECK-PTX-NEXT: st.b8 [%SP+12], %rs7;
; CHECK-PTX-NEXT: mov.b16 %rs4, 1;
; CHECK-PTX-NEXT: st.b8 [%SP+12], %rs4;
; CHECK-PTX-NEXT: mov.b64 %rd3, 1;
; CHECK-PTX-NEXT: st.b64 [%SP+16], %rd3;
; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 8;
Expand Down
Loading