Skip to content

[NVPTX] Remove redundant addressing mode instrs #128044

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
443 changes: 62 additions & 381 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Large diffs are not rendered by default.

6 changes: 1 addition & 5 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,18 +107,14 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
// Match direct address complex pattern.
bool SelectDirectAddr(SDValue N, SDValue &Address);

bool SelectADDRri_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
void SelectADDRri_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
SDValue &Offset, MVT VT);
bool SelectADDRri(SDNode *OpNode, SDValue Addr, SDValue &Base,
SDValue &Offset);
bool SelectADDRri64(SDNode *OpNode, SDValue Addr, SDValue &Base,
SDValue &Offset);
bool SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
SDValue &Offset, MVT VT);
bool SelectADDRsi(SDNode *OpNode, SDValue Addr, SDValue &Base,
SDValue &Offset);
bool SelectADDRsi64(SDNode *OpNode, SDValue Addr, SDValue &Base,
SDValue &Offset);

bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;

Expand Down
114 changes: 0 additions & 114 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2754,24 +2754,6 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
// Load / Store Handling
//
multiclass LD<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs regclass:$dst),
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg : NVPTXInst<
(outs regclass:$dst),
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg_64 : NVPTXInst<
(outs regclass:$dst),
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _ari : NVPTXInst<
(outs regclass:$dst),
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
Expand Down Expand Up @@ -2802,24 +2784,6 @@ let mayLoad=1, hasSideEffects=0 in {
}

multiclass ST<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs),
(ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg : NVPTXInst<
(outs),
(ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg_64 : NVPTXInst<
(outs),
(ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _ari : NVPTXInst<
(outs),
(ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
Expand Down Expand Up @@ -2856,24 +2820,6 @@ let mayStore=1, hasSideEffects=0 in {
// elementization happens at the machine instruction level, so the following
// instructions never appear in the DAG.
multiclass LD_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
Expand All @@ -2892,24 +2838,6 @@ multiclass LD_VEC<NVPTXRegClass regclass> {
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset),
"ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr$offset];", []>;
def _v4_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
Expand Down Expand Up @@ -2939,27 +2867,6 @@ let mayLoad=1, hasSideEffects=0 in {
}

multiclass ST_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
imem:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
Int32Regs:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
Int64Regs:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_ari : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
Expand All @@ -2981,27 +2888,6 @@ multiclass ST_VEC<NVPTXRegClass regclass> {
imem:$addr, Offseti32imm:$offset),
"st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr$offset], {{$src1, $src2}};", []>;
def _v4_avar : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
Expand Down
30 changes: 0 additions & 30 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -2693,12 +2693,6 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
// Scalar

multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
!strconcat("ldu.global.", TyStr),
[]>, Requires<[hasLDU]>;
def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
!strconcat("ldu.global.", TyStr),
[]>, Requires<[hasLDU]>;
def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
!strconcat("ldu.global.", TyStr),
[]>, Requires<[hasLDU]>;
Expand All @@ -2721,12 +2715,6 @@ defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;

// Elementized vector ldu
multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins Int32Regs:$src),
!strconcat("ldu.global.", TyStr), []>;
def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins Int64Regs:$src),
!strconcat("ldu.global.", TyStr), []>;
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri:$src),
!strconcat("ldu.global.", TyStr), []>;
Expand All @@ -2739,12 +2727,6 @@ multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
}

multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins Int32Regs:$src),
!strconcat("ldu.global.", TyStr), []>;
def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins Int64Regs:$src),
!strconcat("ldu.global.", TyStr), []>;
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri:$src),
!strconcat("ldu.global.", TyStr), []>;
Expand Down Expand Up @@ -2796,12 +2778,6 @@ defm INT_PTX_LDU_G_v4f32_ELE
// during the lifetime of the kernel.

multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
!strconcat("ld.global.nc.", TyStr),
[]>, Requires<[hasLDG]>;
def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
!strconcat("ld.global.nc.", TyStr),
[]>, Requires<[hasLDG]>;
def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
!strconcat("ld.global.nc.", TyStr),
[]>, Requires<[hasLDG]>;
Expand Down Expand Up @@ -2830,12 +2806,6 @@ defm INT_PTX_LDG_GLOBAL_f64

// Elementized vector ldg
multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins Int32Regs:$src),
!strconcat("ld.global.nc.", TyStr), []>;
def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins Int64Regs:$src),
!strconcat("ld.global.nc.", TyStr), []>;
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri:$src),
!strconcat("ld.global.nc.", TyStr), []>;
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1809,7 +1809,7 @@ findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) {
MachineInstr &TexHandleDef = *MRI.getVRegDef(Op.getReg());

switch (TexHandleDef.getOpcode()) {
case NVPTX::LD_i64_avar: {
case NVPTX::LD_i64_asi: {
// The handle is a parameter value being loaded, replace with the
// parameter symbol
const NVPTXTargetMachine &TM =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ registers:
- { id: 1, class: float32regs }
body: |
bb.0.entry:
%0 = LD_f32_avar 0, 4, 1, 2, 32, &test_param_0
%0 = LD_f32_asi 0, 4, 1, 2, 32, &test_param_0, 0
; CHECK: [[@LINE+1]]:33: expected a floating point literal
%1 = FADD_rnf32ri %0, float 3
StoreRetvalF32 %1, 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ registers:
- { id: 7, class: float32regs }
body: |
bb.0.entry:
%0 = LD_f32_avar 0, 0, 4, 1, 2, 32, &test_param_0
%0 = LD_f32_asi 0, 0, 4, 1, 2, 32, &test_param_0, 0
%1 = CVT_f64_f32 %0, 0
%2 = LD_i32_avar 0, 0, 4, 1, 0, 32, &test_param_1
%2 = LD_i32_asi 0, 0, 4, 1, 0, 32, &test_param_1, 0
; CHECK: %3:float64regs = FADD_rnf64ri %1, double 3.250000e+00
%3 = FADD_rnf64ri %1, double 3.250000e+00
%4 = CVT_f32_f64 %3, 5
Expand All @@ -66,9 +66,9 @@ registers:
- { id: 7, class: float32regs }
body: |
bb.0.entry:
%0 = LD_f32_avar 0, 0, 4, 1, 2, 32, &test2_param_0
%0 = LD_f32_asi 0, 0, 4, 1, 2, 32, &test2_param_0, 0
%1 = CVT_f64_f32 %0, 0
%2 = LD_i32_avar 0, 0, 4, 1, 0, 32, &test2_param_1
%2 = LD_i32_asi 0, 0, 4, 1, 0, 32, &test2_param_1, 0
; CHECK: %3:float64regs = FADD_rnf64ri %1, double 0x7FF8000000000000
%3 = FADD_rnf64ri %1, double 0x7FF8000000000000
%4 = CVT_f32_f64 %3, 5
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ registers:
- { id: 1, class: float32regs }
body: |
bb.0.entry:
%0 = LD_f32_avar 0, 4, 1, 2, 32, &test_param_0
%0 = LD_f32_asi 0, 4, 1, 2, 32, &test_param_0, 0
; CHECK: [[@LINE+1]]:33: floating point constant does not have type 'float'
%1 = FADD_rnf32ri %0, float 0xH3C00
StoreRetvalF32 %1, 0
Expand Down
36 changes: 16 additions & 20 deletions llvm/test/CodeGen/NVPTX/variadics-backend.ll
Original file line number Diff line number Diff line change
Expand Up @@ -215,21 +215,18 @@ define dso_local i32 @bar() {
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b16 %rs<10>;
; CHECK-PTX-NEXT: .reg .b32 %r<4>;
; CHECK-PTX-NEXT: .reg .b64 %rd<7>;
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot3;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: mov.u64 %rd1, __const_$_bar_$_s1;
; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 7;
; CHECK-PTX-NEXT: ld.global.nc.u8 %rs1, [%rd2];
; CHECK-PTX-NEXT: ld.global.nc.u8 %rs1, [%rd1+7];
; CHECK-PTX-NEXT: cvt.u16.u8 %rs2, %rs1;
; CHECK-PTX-NEXT: st.u8 [%SP+2], %rs2;
; CHECK-PTX-NEXT: add.s64 %rd3, %rd1, 5;
; CHECK-PTX-NEXT: ld.global.nc.u8 %rs3, [%rd3];
; CHECK-PTX-NEXT: ld.global.nc.u8 %rs3, [%rd1+5];
; CHECK-PTX-NEXT: cvt.u16.u8 %rs4, %rs3;
; CHECK-PTX-NEXT: add.s64 %rd4, %rd1, 6;
; CHECK-PTX-NEXT: ld.global.nc.u8 %rs5, [%rd4];
; CHECK-PTX-NEXT: ld.global.nc.u8 %rs5, [%rd1+6];
; CHECK-PTX-NEXT: cvt.u16.u8 %rs6, %rs5;
; CHECK-PTX-NEXT: shl.b16 %rs7, %rs6, 8;
; CHECK-PTX-NEXT: or.b16 %rs8, %rs7, %rs4;
Expand All @@ -238,14 +235,14 @@ define dso_local i32 @bar() {
; CHECK-PTX-NEXT: st.u32 [%SP+8], %r1;
; CHECK-PTX-NEXT: mov.b16 %rs9, 1;
; CHECK-PTX-NEXT: st.u8 [%SP+12], %rs9;
; CHECK-PTX-NEXT: mov.b64 %rd5, 1;
; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd5;
; CHECK-PTX-NEXT: add.u64 %rd6, %SP, 8;
; CHECK-PTX-NEXT: mov.b64 %rd2, 1;
; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd2;
; CHECK-PTX-NEXT: add.u64 %rd3, %SP, 8;
; CHECK-PTX-NEXT: { // callseq 1, 0
; CHECK-PTX-NEXT: .param .b32 param0;
; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: .param .b64 param1;
; CHECK-PTX-NEXT: st.param.b64 [param1], %rd6;
; CHECK-PTX-NEXT: st.param.b64 [param1], %rd3;
; CHECK-PTX-NEXT: .param .b32 retval0;
; CHECK-PTX-NEXT: call.uni (retval0),
; CHECK-PTX-NEXT: variadics2,
Expand Down Expand Up @@ -384,26 +381,25 @@ define dso_local void @qux() {
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b32 %r<3>;
; CHECK-PTX-NEXT: .reg .b64 %rd<7>;
; CHECK-PTX-NEXT: .reg .b64 %rd<6>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot7;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: ld.global.nc.u64 %rd1, [__const_$_qux_$_s];
; CHECK-PTX-NEXT: st.u64 [%SP], %rd1;
; CHECK-PTX-NEXT: mov.u64 %rd2, __const_$_qux_$_s;
; CHECK-PTX-NEXT: add.s64 %rd3, %rd2, 8;
; CHECK-PTX-NEXT: ld.global.nc.u64 %rd4, [%rd3];
; CHECK-PTX-NEXT: st.u64 [%SP+8], %rd4;
; CHECK-PTX-NEXT: mov.b64 %rd5, 1;
; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd5;
; CHECK-PTX-NEXT: add.u64 %rd6, %SP, 16;
; CHECK-PTX-NEXT: ld.global.nc.u64 %rd3, [%rd2+8];
; CHECK-PTX-NEXT: st.u64 [%SP+8], %rd3;
; CHECK-PTX-NEXT: mov.b64 %rd4, 1;
; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd4;
; CHECK-PTX-NEXT: add.u64 %rd5, %SP, 16;
; CHECK-PTX-NEXT: { // callseq 3, 0
; CHECK-PTX-NEXT: .param .align 8 .b8 param0[16];
; CHECK-PTX-NEXT: st.param.b64 [param0], %rd1;
; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd4;
; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd3;
; CHECK-PTX-NEXT: .param .b64 param1;
; CHECK-PTX-NEXT: st.param.b64 [param1], %rd6;
; CHECK-PTX-NEXT: st.param.b64 [param1], %rd5;
; CHECK-PTX-NEXT: .param .b32 retval0;
; CHECK-PTX-NEXT: call.uni (retval0),
; CHECK-PTX-NEXT: variadics4,
Expand Down