Skip to content

Commit a773356

Browse files
authored
[NVPTX][NFC] Move more TMA lowering to tablegen (#140914)
This patch migrates the lowering of the non-tensor TMA intrinsics to table-gen based. Also, use ADDR nodes for the pointer operands wherever applicable. Signed-off-by: Durgadoss R <[email protected]>
1 parent f5c6b7b commit a773356

File tree

4 files changed

+111
-128
lines changed

4 files changed

+111
-128
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 0 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -2686,69 +2686,6 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
26862686
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
26872687
}
26882688

2689-
void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) {
2690-
// We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
2691-
// {dst, mbar, src, size, multicast, cache_hint,
2692-
// multicast_flag, cache_hint_flag}
2693-
// NumOperands = {Chain, IID} + {Actual intrinsic args}
2694-
// = {2} + {8}
2695-
size_t NumOps = N->getNumOperands();
2696-
bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2697-
bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1;
2698-
size_t NumBaseArgs = 4; // dst, mbar, src, size
2699-
size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID
2700-
2701-
SDLoc DL(N);
2702-
SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
2703-
2704-
// Push MultiCast operand, if available
2705-
if (IsMultiCast)
2706-
Ops.push_back(N->getOperand(MultiCastIdx));
2707-
2708-
// Push CacheHint operand, if available
2709-
if (IsCacheHint)
2710-
Ops.push_back(N->getOperand(MultiCastIdx + 1));
2711-
2712-
// Finally, the chain operand
2713-
Ops.push_back(N->getOperand(0));
2714-
2715-
bool IsShared32 =
2716-
CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
2717-
unsigned Opcode = [&]() {
2718-
if (IsMultiCast && IsCacheHint)
2719-
return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_MC_CH
2720-
: NVPTX::CP_ASYNC_BULK_G2S_MC_CH;
2721-
if (IsMultiCast)
2722-
return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_MC
2723-
: NVPTX::CP_ASYNC_BULK_G2S_MC;
2724-
if (IsCacheHint)
2725-
return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_CH
2726-
: NVPTX::CP_ASYNC_BULK_G2S_CH;
2727-
return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32
2728-
: NVPTX::CP_ASYNC_BULK_G2S;
2729-
}();
2730-
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2731-
}
2732-
2733-
void NVPTXDAGToDAGISel::SelectCpAsyncBulkPrefetchL2(SDNode *N) {
2734-
// We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
2735-
// src, size, cache_hint, cache_hint_flag
2736-
// NumOperands = {Chain, IID} + {Actual intrinsic args}
2737-
// = {2} + {4}
2738-
size_t NumOps = N->getNumOperands();
2739-
bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2740-
size_t NumArgs = IsCacheHint ? 3 : 2; // src, size, cache_hint
2741-
2742-
SDLoc DL(N);
2743-
SmallVector<SDValue, 4> Ops(N->ops().slice(2, NumArgs));
2744-
Ops.push_back(N->getOperand(0)); // Chain operand
2745-
2746-
unsigned Opcode = IsCacheHint
2747-
? NVPTX::CP_ASYNC_BULK_PREFETCH_CH
2748-
: NVPTX::CP_ASYNC_BULK_PREFETCH;
2749-
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2750-
}
2751-
27522689
#define TCGEN05_ST_OPCODE(SHAPE, NUM) \
27532690
(enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK \
27542691
: NVPTX::TCGEN05_ST_##SHAPE##_##NUM)
@@ -2865,12 +2802,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
28652802
switch (IID) {
28662803
default:
28672804
return false;
2868-
case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster:
2869-
SelectCpAsyncBulkG2S(N);
2870-
return true;
2871-
case Intrinsic::nvvm_cp_async_bulk_prefetch_L2:
2872-
SelectCpAsyncBulkPrefetchL2(N);
2873-
return true;
28742805
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
28752806
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
28762807
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
9292
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
9393
void SelectV2I64toI128(SDNode *N);
9494
void SelectI128toV2I64(SDNode *N);
95-
void SelectCpAsyncBulkG2S(SDNode *N);
96-
void SelectCpAsyncBulkPrefetchL2(SDNode *N);
9795
void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
9896
void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
9997
void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false);

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 39 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -544,52 +544,50 @@ multiclass CP_ASYNC_BULK_S2G_INTR<bit has_ch> {
544544
[(int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0), i16:$mask)]>,
545545
Requires<[hasPTX<86>, hasSM<100>]>;
546546
}
547-
defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G_INTR<0>;
548-
defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR<1>;
547+
defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G_INTR<has_ch = 0>;
548+
defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR<has_ch = 1>;
549549

550-
multiclass CP_ASYNC_BULK_G2S<NVPTXRegClass rc> {
551-
def NAME: NVPTXInst<(outs),
552-
(ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size),
553-
!strconcat(CpAsyncBulkStr<0, 0>.G2S, " [$dst], [$src], $size, [$mbar];"), []>,
554-
Requires<[hasPTX<80>, hasSM<90>]>;
555-
def NAME # _MC: NVPTXInst<(outs),
556-
(ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int16Regs:$mc),
557-
!strconcat(CpAsyncBulkStr<1, 0>.G2S, " [$dst], [$src], $size, [$mbar], $mc;"), []>,
558-
Requires<[hasPTX<80>, hasSM<90>]>;
559-
def NAME # _CH: NVPTXInst<(outs),
560-
(ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch),
561-
!strconcat(CpAsyncBulkStr<0, 1>.G2S, " [$dst], [$src], $size, [$mbar], $ch;"), []>,
562-
Requires<[hasPTX<80>, hasSM<90>]>;
563-
def NAME # _MC_CH: NVPTXInst<(outs),
564-
(ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int16Regs:$mc, Int64Regs:$ch),
565-
!strconcat(CpAsyncBulkStr<1, 1>.G2S, " [$dst], [$src], $size, [$mbar], $mc, $ch;"), []>,
566-
Requires<[hasPTX<80>, hasSM<90>]>;
550+
multiclass CP_ASYNC_BULK_G2S_INTR<bit has_ch> {
551+
defvar Intr = int_nvvm_cp_async_bulk_global_to_shared_cluster;
552+
553+
def NAME : NVPTXInst<(outs),
554+
(ins ADDR:$dst, ADDR:$mbar, ADDR:$src,
555+
Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
556+
!if(has_ch,
557+
CpAsyncBulkStr<0, 1>.G2S # " [$dst], [$src], $size, [$mbar], $ch;",
558+
CpAsyncBulkStr<0, 0>.G2S # " [$dst], [$src], $size, [$mbar];"),
559+
[(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i16:$mask, i64:$ch, 0, !if(has_ch, -1, 0))]>,
560+
Requires<[hasPTX<80>, hasSM<90>]>;
561+
562+
def NAME # _MC : NVPTXInst<(outs),
563+
(ins ADDR:$dst, ADDR:$mbar, ADDR:$src,
564+
Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
565+
!if(has_ch,
566+
CpAsyncBulkStr<1, 1>.G2S # " [$dst], [$src], $size, [$mbar], $mask, $ch;",
567+
CpAsyncBulkStr<1, 0>.G2S # " [$dst], [$src], $size, [$mbar], $mask;"),
568+
[(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i16:$mask, i64:$ch, -1, !if(has_ch, -1, 0))]>,
569+
Requires<[hasPTX<80>, hasSM<90>]>;
567570
}
568-
defm CP_ASYNC_BULK_G2S : CP_ASYNC_BULK_G2S<Int64Regs>;
569-
defm CP_ASYNC_BULK_G2S_SHARED32 : CP_ASYNC_BULK_G2S<Int32Regs>;
571+
defm CP_ASYNC_BULK_G2S : CP_ASYNC_BULK_G2S_INTR<has_ch = 0>;
572+
defm CP_ASYNC_BULK_G2S_CH : CP_ASYNC_BULK_G2S_INTR<has_ch = 1>;
570573

571-
multiclass CP_ASYNC_BULK_CTA_TO_CLUSTER<NVPTXRegClass rc> {
572-
def NAME: NVPTXInst<(outs),
573-
(ins rc:$dst, rc:$mbar, rc:$src, Int32Regs:$size),
574-
!strconcat(CpAsyncBulkStr<0, 0>.C2C, " [$dst], [$src], $size, [$mbar];"),
575-
[(int_nvvm_cp_async_bulk_shared_cta_to_cluster rc:$dst, rc:$mbar, rc:$src, Int32Regs:$size)]>,
576-
Requires<[hasPTX<80>, hasSM<90>]>;
574+
def CP_ASYNC_BULK_CTA_TO_CLUSTER : NVPTXInst<(outs),
575+
(ins ADDR:$dst, ADDR:$mbar, ADDR:$src, Int32Regs:$size),
576+
CpAsyncBulkStr<0, 0>.C2C # " [$dst], [$src], $size, [$mbar];",
577+
[(int_nvvm_cp_async_bulk_shared_cta_to_cluster addr:$dst, addr:$mbar, addr:$src, i32:$size)]>,
578+
Requires<[hasPTX<80>, hasSM<90>]>;
579+
580+
multiclass CP_ASYNC_BULK_PREFETCH_INTR<bit has_ch> {
581+
def NAME : NVPTXInst<(outs), (ins ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
582+
!if(has_ch,
583+
"cp.async.bulk.prefetch.L2.global.L2::cache_hint" # " [$src], $size, $ch;",
584+
"cp.async.bulk.prefetch.L2.global" # " [$src], $size;"),
585+
[(int_nvvm_cp_async_bulk_prefetch_L2 addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>,
586+
Requires<[hasPTX<80>, hasSM<90>]>;
577587
}
578-
defm CP_ASYNC_BULK_CTA_TO_CLUSTER : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int64Regs>;
579-
defm CP_ASYNC_BULK_CTA_TO_CLUSTER_SHARED32 : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int32Regs>;
588+
defm CP_ASYNC_BULK_PREFETCH : CP_ASYNC_BULK_PREFETCH_INTR<has_ch = 0>;
589+
defm CP_ASYNC_BULK_PREFETCH_CH : CP_ASYNC_BULK_PREFETCH_INTR<has_ch = 1>;
580590

581-
//------------------------------
582-
// Bulk Copy Prefetch Functions
583-
//------------------------------
584-
def CP_ASYNC_BULK_PREFETCH : NVPTXInst<(outs),
585-
(ins Int64Regs:$src, Int32Regs:$size),
586-
"cp.async.bulk.prefetch.L2.global [$src], $size;", []>,
587-
Requires<[hasPTX<80>, hasSM<90>]>;
588-
589-
def CP_ASYNC_BULK_PREFETCH_CH : NVPTXInst<(outs),
590-
(ins Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch),
591-
"cp.async.bulk.prefetch.L2.global.L2::cache_hint [$src], $size, $ch;", []>,
592-
Requires<[hasPTX<80>, hasSM<90>]>;
593591
//-------------------------------------
594592
// TMA Async Bulk Tensor Copy Functions
595593
//-------------------------------------

0 commit comments

Comments
 (0)