Skip to content

Commit d3dae84

Browse files
authored
[NVPTX] Switch to imm offset variants for LDG and LDU (#128270)
1 parent f5d80c3 commit d3dae84

File tree

4 files changed

+339
-211
lines changed

4 files changed

+339
-211
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 38 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1106,9 +1106,6 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
11061106
std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
11071107
break;
11081108
}
1109-
if (!Opcode)
1110-
return false;
1111-
Ops.append({Base, Offset, Chain});
11121109
} else {
11131110
if (PointerSize == 64) {
11141111
SelectADDRri64(Op1.getNode(), Op1, Base, Offset);
@@ -1148,10 +1145,10 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
11481145
break;
11491146
}
11501147
}
1151-
if (!Opcode)
1152-
return false;
1153-
Ops.append({Base, Offset, Chain});
11541148
}
1149+
if (!Opcode)
1150+
return false;
1151+
Ops.append({Base, Offset, Chain});
11551152
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
11561153

11571154
MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
@@ -1202,63 +1199,59 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
12021199
std::optional<unsigned> Opcode;
12031200
SDLoc DL(N);
12041201
SDNode *LD;
1205-
SDValue Base, Offset, Addr;
1202+
SDValue Base, Offset;
12061203

1207-
if (SelectDirectAddr(Op1, Addr)) {
1204+
if (SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
12081205
switch (N->getOpcode()) {
12091206
default:
12101207
return false;
12111208
case ISD::LOAD:
12121209
Opcode = pickOpcodeForVT(
1213-
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1214-
NVPTX::INT_PTX_LDG_GLOBAL_i16avar, NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1215-
NVPTX::INT_PTX_LDG_GLOBAL_i64avar, NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1216-
NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1210+
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8asi,
1211+
NVPTX::INT_PTX_LDG_GLOBAL_i16asi, NVPTX::INT_PTX_LDG_GLOBAL_i32asi,
1212+
NVPTX::INT_PTX_LDG_GLOBAL_i64asi, NVPTX::INT_PTX_LDG_GLOBAL_f32asi,
1213+
NVPTX::INT_PTX_LDG_GLOBAL_f64asi);
12171214
break;
12181215
case ISD::INTRINSIC_W_CHAIN:
12191216
Opcode = pickOpcodeForVT(
1220-
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1221-
NVPTX::INT_PTX_LDU_GLOBAL_i16avar, NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1222-
NVPTX::INT_PTX_LDU_GLOBAL_i64avar, NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1223-
NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1217+
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8asi,
1218+
NVPTX::INT_PTX_LDU_GLOBAL_i16asi, NVPTX::INT_PTX_LDU_GLOBAL_i32asi,
1219+
NVPTX::INT_PTX_LDU_GLOBAL_i64asi, NVPTX::INT_PTX_LDU_GLOBAL_f32asi,
1220+
NVPTX::INT_PTX_LDU_GLOBAL_f64asi);
12241221
break;
12251222
case NVPTXISD::LoadV2:
12261223
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1227-
NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1228-
NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1229-
NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1230-
NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1231-
NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1232-
NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1224+
NVPTX::INT_PTX_LDG_G_v2i8_ELE_asi,
1225+
NVPTX::INT_PTX_LDG_G_v2i16_ELE_asi,
1226+
NVPTX::INT_PTX_LDG_G_v2i32_ELE_asi,
1227+
NVPTX::INT_PTX_LDG_G_v2i64_ELE_asi,
1228+
NVPTX::INT_PTX_LDG_G_v2f32_ELE_asi,
1229+
NVPTX::INT_PTX_LDG_G_v2f64_ELE_asi);
12331230
break;
12341231
case NVPTXISD::LDUV2:
12351232
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1236-
NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1237-
NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1238-
NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1239-
NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1240-
NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1241-
NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1233+
NVPTX::INT_PTX_LDU_G_v2i8_ELE_asi,
1234+
NVPTX::INT_PTX_LDU_G_v2i16_ELE_asi,
1235+
NVPTX::INT_PTX_LDU_G_v2i32_ELE_asi,
1236+
NVPTX::INT_PTX_LDU_G_v2i64_ELE_asi,
1237+
NVPTX::INT_PTX_LDU_G_v2f32_ELE_asi,
1238+
NVPTX::INT_PTX_LDU_G_v2f64_ELE_asi);
12421239
break;
12431240
case NVPTXISD::LoadV4:
12441241
Opcode = pickOpcodeForVT(
1245-
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1246-
NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1247-
NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
1248-
NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
1242+
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_asi,
1243+
NVPTX::INT_PTX_LDG_G_v4i16_ELE_asi,
1244+
NVPTX::INT_PTX_LDG_G_v4i32_ELE_asi, std::nullopt,
1245+
NVPTX::INT_PTX_LDG_G_v4f32_ELE_asi, std::nullopt);
12491246
break;
12501247
case NVPTXISD::LDUV4:
12511248
Opcode = pickOpcodeForVT(
1252-
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1253-
NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1254-
NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
1255-
NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
1249+
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_asi,
1250+
NVPTX::INT_PTX_LDU_G_v4i16_ELE_asi,
1251+
NVPTX::INT_PTX_LDU_G_v4i32_ELE_asi, std::nullopt,
1252+
NVPTX::INT_PTX_LDU_G_v4f32_ELE_asi, std::nullopt);
12561253
break;
12571254
}
1258-
if (!Opcode)
1259-
return false;
1260-
SDValue Ops[] = { Addr, Chain };
1261-
LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
12621255
} else {
12631256
if (TM.is64Bit()) {
12641257
SelectADDRri64(Op1.getNode(), Op1, Base, Offset);
@@ -1369,11 +1362,11 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
13691362
break;
13701363
}
13711364
}
1372-
if (!Opcode)
1373-
return false;
1374-
SDValue Ops[] = {Base, Offset, Chain};
1375-
LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
13761365
}
1366+
if (!Opcode)
1367+
return false;
1368+
SDValue Ops[] = {Base, Offset, Chain};
1369+
LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
13771370

13781371
// For automatic generation of LDG (through SelectLoad[Vector], not the
13791372
// intrinsics), we may have an extending load like:
@@ -1577,7 +1570,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
15771570
std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
15781571
break;
15791572
}
1580-
Ops.append({Base, Offset});
15811573
} else {
15821574
if (PointerSize == 64) {
15831575
SelectADDRri64(N2.getNode(), N2, Base, Offset);
@@ -1617,12 +1609,10 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
16171609
break;
16181610
}
16191611
}
1620-
Ops.append({Base, Offset});
16211612
}
16221613
if (!Opcode)
16231614
return false;
1624-
1625-
Ops.push_back(Chain);
1615+
Ops.append({Base, Offset, Chain});
16261616

16271617
ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
16281618

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 66 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -2718,80 +2718,64 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
27182718
// Scalar
27192719

27202720
multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
2721-
def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
2722-
!strconcat("ldu.global.", TyStr),
2721+
def asi: NVPTXInst<(outs regclass:$result), (ins imemAny:$src, Offseti32imm:$offset),
2722+
"ldu.global." # TyStr # " \t$result, [$src$offset];",
27232723
[]>, Requires<[hasLDU]>;
27242724
def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
2725-
!strconcat("ldu.global.", TyStr),
2725+
"ldu.global." # TyStr # " \t$result, [$src];",
27262726
[]>, Requires<[hasLDU]>;
27272727
def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
2728-
!strconcat("ldu.global.", TyStr),
2728+
"ldu.global." # TyStr # " \t$result, [$src];",
27292729
[]>, Requires<[hasLDU]>;
27302730
}
27312731

2732-
defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
2733-
defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
2734-
defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
2735-
defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
2736-
defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
2737-
defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
2732+
defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8", Int16Regs>;
2733+
defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16", Int16Regs>;
2734+
defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32", Int32Regs>;
2735+
defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64", Int64Regs>;
2736+
defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32", Float32Regs>;
2737+
defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64", Float64Regs>;
27382738

27392739
// vector
27402740

27412741
// Elementized vector ldu
27422742
multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
27432743
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
27442744
(ins MEMri:$src),
2745-
!strconcat("ldu.global.", TyStr), []>;
2745+
"ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
27462746
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
27472747
(ins MEMri64:$src),
2748-
!strconcat("ldu.global.", TyStr), []>;
2749-
def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2750-
(ins imemAny:$src),
2751-
!strconcat("ldu.global.", TyStr), []>;
2748+
"ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
2749+
def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2750+
(ins imemAny:$src, Offseti32imm:$offset),
2751+
"ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src$offset];", []>;
27522752
}
27532753

27542754
multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
27552755
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
27562756
regclass:$dst4), (ins MEMri:$src),
2757-
!strconcat("ldu.global.", TyStr), []>;
2757+
"ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
27582758
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
27592759
regclass:$dst4), (ins MEMri64:$src),
2760-
!strconcat("ldu.global.", TyStr), []>;
2761-
def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2762-
regclass:$dst4), (ins imemAny:$src),
2763-
!strconcat("ldu.global.", TyStr), []>;
2764-
}
2765-
2766-
defm INT_PTX_LDU_G_v2i8_ELE
2767-
: VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
2768-
defm INT_PTX_LDU_G_v2i16_ELE
2769-
: VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
2770-
defm INT_PTX_LDU_G_v2i32_ELE
2771-
: VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
2772-
defm INT_PTX_LDU_G_v2f32_ELE
2773-
: VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
2774-
defm INT_PTX_LDU_G_v2i64_ELE
2775-
: VLDU_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
2776-
defm INT_PTX_LDU_G_v2f64_ELE
2777-
: VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
2778-
defm INT_PTX_LDU_G_v4i8_ELE
2779-
: VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
2780-
defm INT_PTX_LDU_G_v4i16_ELE
2781-
: VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
2782-
Int16Regs>;
2783-
defm INT_PTX_LDU_G_v4i32_ELE
2784-
: VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
2785-
Int32Regs>;
2786-
defm INT_PTX_LDU_G_v4f16_ELE
2787-
: VLDU_G_ELE_V4<"v4.b16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
2788-
Int16Regs>;
2789-
defm INT_PTX_LDU_G_v4f16x2_ELE
2790-
: VLDU_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
2791-
Int32Regs>;
2792-
defm INT_PTX_LDU_G_v4f32_ELE
2793-
: VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
2794-
Float32Regs>;
2760+
"ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
2761+
def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2762+
regclass:$dst4), (ins imemAny:$src, Offseti32imm:$offset),
2763+
"ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src$offset];", []>;
2764+
}
2765+
2766+
defm INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"u8", Int16Regs>;
2767+
defm INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"u16", Int16Regs>;
2768+
defm INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"u32", Int32Regs>;
2769+
defm INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"f32", Float32Regs>;
2770+
defm INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"u64", Int64Regs>;
2771+
defm INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"f64", Float64Regs>;
2772+
2773+
defm INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"u8", Int16Regs>;
2774+
defm INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"u16", Int16Regs>;
2775+
defm INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"u32", Int32Regs>;
2776+
defm INT_PTX_LDU_G_v4f16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
2777+
defm INT_PTX_LDU_G_v4f16x2_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
2778+
defm INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"f32", Float32Regs>;
27952779

27962780

27972781
//-----------------------------------
@@ -2803,84 +2787,63 @@ defm INT_PTX_LDU_G_v4f32_ELE
28032787
// during the lifetime of the kernel.
28042788

28052789
multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
2806-
def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
2807-
!strconcat("ld.global.nc.", TyStr),
2790+
def asi: NVPTXInst<(outs regclass:$result), (ins imemAny:$src, Offseti32imm:$offset),
2791+
"ld.global.nc." # TyStr # " \t$result, [$src$offset];",
28082792
[]>, Requires<[hasLDG]>;
28092793
def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
2810-
!strconcat("ld.global.nc.", TyStr),
2794+
"ld.global.nc." # TyStr # " \t$result, [$src];",
28112795
[]>, Requires<[hasLDG]>;
28122796
def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
2813-
!strconcat("ld.global.nc.", TyStr),
2797+
"ld.global.nc." # TyStr # " \t$result, [$src];",
28142798
[]>, Requires<[hasLDG]>;
28152799
}
28162800

2817-
defm INT_PTX_LDG_GLOBAL_i8
2818-
: LDG_G<"u8 \t$result, [$src];", Int16Regs>;
2819-
defm INT_PTX_LDG_GLOBAL_i16
2820-
: LDG_G<"u16 \t$result, [$src];", Int16Regs>;
2821-
defm INT_PTX_LDG_GLOBAL_i32
2822-
: LDG_G<"u32 \t$result, [$src];", Int32Regs>;
2823-
defm INT_PTX_LDG_GLOBAL_i64
2824-
: LDG_G<"u64 \t$result, [$src];", Int64Regs>;
2825-
defm INT_PTX_LDG_GLOBAL_f32
2826-
: LDG_G<"f32 \t$result, [$src];", Float32Regs>;
2827-
defm INT_PTX_LDG_GLOBAL_f64
2828-
: LDG_G<"f64 \t$result, [$src];", Float64Regs>;
2801+
defm INT_PTX_LDG_GLOBAL_i8 : LDG_G<"u8", Int16Regs>;
2802+
defm INT_PTX_LDG_GLOBAL_i16 : LDG_G<"u16", Int16Regs>;
2803+
defm INT_PTX_LDG_GLOBAL_i32 : LDG_G<"u32", Int32Regs>;
2804+
defm INT_PTX_LDG_GLOBAL_i64 : LDG_G<"u64", Int64Regs>;
2805+
defm INT_PTX_LDG_GLOBAL_f32 : LDG_G<"f32", Float32Regs>;
2806+
defm INT_PTX_LDG_GLOBAL_f64 : LDG_G<"f64", Float64Regs>;
28292807

28302808
// vector
28312809

28322810
// Elementized vector ldg
28332811
multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
28342812
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
28352813
(ins MEMri:$src),
2836-
!strconcat("ld.global.nc.", TyStr), []>;
2814+
"ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
28372815
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
28382816
(ins MEMri64:$src),
2839-
!strconcat("ld.global.nc.", TyStr), []>;
2840-
def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2841-
(ins imemAny:$src),
2842-
!strconcat("ld.global.nc.", TyStr), []>;
2817+
"ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
2818+
def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2819+
(ins imemAny:$src, Offseti32imm:$offset),
2820+
"ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src$offset];", []>;
28432821
}
28442822

28452823
multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
2846-
def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2847-
regclass:$dst4), (ins Int32Regs:$src),
2848-
!strconcat("ld.global.nc.", TyStr), []>;
2849-
def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2850-
regclass:$dst4), (ins Int64Regs:$src),
2851-
!strconcat("ld.global.nc.", TyStr), []>;
28522824
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
28532825
regclass:$dst4), (ins MEMri:$src),
2854-
!strconcat("ld.global.nc.", TyStr), []>;
2826+
"ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
28552827
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
28562828
regclass:$dst4), (ins MEMri64:$src),
2857-
!strconcat("ld.global.nc.", TyStr), []>;
2858-
def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2859-
regclass:$dst4), (ins imemAny:$src),
2860-
!strconcat("ld.global.nc.", TyStr), []>;
2829+
"ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
2830+
def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2831+
regclass:$dst4), (ins imemAny:$src, Offseti32imm:$offset),
2832+
"ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src$offset];", []>;
28612833
}
28622834

28632835
// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
2864-
defm INT_PTX_LDG_G_v2i8_ELE
2865-
: VLDG_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
2866-
defm INT_PTX_LDG_G_v2i16_ELE
2867-
: VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
2868-
defm INT_PTX_LDG_G_v2i32_ELE
2869-
: VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
2870-
defm INT_PTX_LDG_G_v2f32_ELE
2871-
: VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
2872-
defm INT_PTX_LDG_G_v2i64_ELE
2873-
: VLDG_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
2874-
defm INT_PTX_LDG_G_v2f64_ELE
2875-
: VLDG_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
2876-
defm INT_PTX_LDG_G_v4i8_ELE
2877-
: VLDG_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
2878-
defm INT_PTX_LDG_G_v4i16_ELE
2879-
: VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
2880-
defm INT_PTX_LDG_G_v4i32_ELE
2881-
: VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
2882-
defm INT_PTX_LDG_G_v4f32_ELE
2883-
: VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
2836+
defm INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"u8", Int16Regs>;
2837+
defm INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"u16", Int16Regs>;
2838+
defm INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"u32", Int32Regs>;
2839+
defm INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"f32", Float32Regs>;
2840+
defm INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"u64", Int64Regs>;
2841+
defm INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"f64", Float64Regs>;
2842+
2843+
defm INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"u8", Int16Regs>;
2844+
defm INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"u16", Int16Regs>;
2845+
defm INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"u32", Int32Regs>;
2846+
defm INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"f32", Float32Regs>;
28842847

28852848

28862849
multiclass NG_TO_G<string Str> {

0 commit comments

Comments
 (0)