@@ -544,52 +544,50 @@ multiclass CP_ASYNC_BULK_S2G_INTR<bit has_ch> {
544
544
[(int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0), i16:$mask)]>,
545
545
Requires<[hasPTX<86>, hasSM<100>]>;
546
546
}
547
- defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G_INTR<0>;
548
- defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR<1>;
547
+ defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G_INTR<has_ch = 0>;
548
+ defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR<has_ch = 1>;
549
549
550
- multiclass CP_ASYNC_BULK_G2S<NVPTXRegClass rc> {
551
- def NAME: NVPTXInst<(outs),
552
- (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size),
553
- !strconcat(CpAsyncBulkStr<0, 0>.G2S, " [$dst], [$src], $size, [$mbar];"), []>,
554
- Requires<[hasPTX<80>, hasSM<90>]>;
555
- def NAME # _MC: NVPTXInst<(outs),
556
- (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int16Regs:$mc),
557
- !strconcat(CpAsyncBulkStr<1, 0>.G2S, " [$dst], [$src], $size, [$mbar], $mc;"), []>,
558
- Requires<[hasPTX<80>, hasSM<90>]>;
559
- def NAME # _CH: NVPTXInst<(outs),
560
- (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch),
561
- !strconcat(CpAsyncBulkStr<0, 1>.G2S, " [$dst], [$src], $size, [$mbar], $ch;"), []>,
562
- Requires<[hasPTX<80>, hasSM<90>]>;
563
- def NAME # _MC_CH: NVPTXInst<(outs),
564
- (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int16Regs:$mc, Int64Regs:$ch),
565
- !strconcat(CpAsyncBulkStr<1, 1>.G2S, " [$dst], [$src], $size, [$mbar], $mc, $ch;"), []>,
566
- Requires<[hasPTX<80>, hasSM<90>]>;
550
+ multiclass CP_ASYNC_BULK_G2S_INTR<bit has_ch> {
551
+ defvar Intr = int_nvvm_cp_async_bulk_global_to_shared_cluster;
552
+
553
+ def NAME : NVPTXInst<(outs),
554
+ (ins ADDR:$dst, ADDR:$mbar, ADDR:$src,
555
+ Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
556
+ !if(has_ch,
557
+ CpAsyncBulkStr<0, 1>.G2S # " [$dst], [$src], $size, [$mbar], $ch;",
558
+ CpAsyncBulkStr<0, 0>.G2S # " [$dst], [$src], $size, [$mbar];"),
559
+ [(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i16:$mask, i64:$ch, 0, !if(has_ch, -1, 0))]>,
560
+ Requires<[hasPTX<80>, hasSM<90>]>;
561
+
562
+ def NAME # _MC : NVPTXInst<(outs),
563
+ (ins ADDR:$dst, ADDR:$mbar, ADDR:$src,
564
+ Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
565
+ !if(has_ch,
566
+ CpAsyncBulkStr<1, 1>.G2S # " [$dst], [$src], $size, [$mbar], $mask, $ch;",
567
+ CpAsyncBulkStr<1, 0>.G2S # " [$dst], [$src], $size, [$mbar], $mask;"),
568
+ [(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i16:$mask, i64:$ch, -1, !if(has_ch, -1, 0))]>,
569
+ Requires<[hasPTX<80>, hasSM<90>]>;
567
570
}
568
- defm CP_ASYNC_BULK_G2S : CP_ASYNC_BULK_G2S<Int64Regs >;
569
- defm CP_ASYNC_BULK_G2S_SHARED32 : CP_ASYNC_BULK_G2S<Int32Regs >;
571
+ defm CP_ASYNC_BULK_G2S : CP_ASYNC_BULK_G2S_INTR<has_ch = 0 >;
572
+ defm CP_ASYNC_BULK_G2S_CH : CP_ASYNC_BULK_G2S_INTR<has_ch = 1 >;
570
573
571
- multiclass CP_ASYNC_BULK_CTA_TO_CLUSTER<NVPTXRegClass rc> {
572
- def NAME: NVPTXInst<(outs),
573
- (ins rc:$dst, rc:$mbar, rc:$src, Int32Regs:$size),
574
- !strconcat(CpAsyncBulkStr<0, 0>.C2C, " [$dst], [$src], $size, [$mbar];"),
575
- [(int_nvvm_cp_async_bulk_shared_cta_to_cluster rc:$dst, rc:$mbar, rc:$src, Int32Regs:$size)]>,
576
- Requires<[hasPTX<80>, hasSM<90>]>;
574
+ def CP_ASYNC_BULK_CTA_TO_CLUSTER : NVPTXInst<(outs),
575
+ (ins ADDR:$dst, ADDR:$mbar, ADDR:$src, Int32Regs:$size),
576
+ CpAsyncBulkStr<0, 0>.C2C # " [$dst], [$src], $size, [$mbar];",
577
+ [(int_nvvm_cp_async_bulk_shared_cta_to_cluster addr:$dst, addr:$mbar, addr:$src, i32:$size)]>,
578
+ Requires<[hasPTX<80>, hasSM<90>]>;
579
+
580
+ multiclass CP_ASYNC_BULK_PREFETCH_INTR<bit has_ch> {
581
+ def NAME : NVPTXInst<(outs), (ins ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
582
+ !if(has_ch,
583
+ "cp.async.bulk.prefetch.L2.global.L2::cache_hint" # " [$src], $size, $ch;",
584
+ "cp.async.bulk.prefetch.L2.global" # " [$src], $size;"),
585
+ [(int_nvvm_cp_async_bulk_prefetch_L2 addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>,
586
+ Requires<[hasPTX<80>, hasSM<90>]>;
577
587
}
578
- defm CP_ASYNC_BULK_CTA_TO_CLUSTER : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int64Regs >;
579
- defm CP_ASYNC_BULK_CTA_TO_CLUSTER_SHARED32 : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int32Regs >;
588
+ defm CP_ASYNC_BULK_PREFETCH : CP_ASYNC_BULK_PREFETCH_INTR<has_ch = 0 >;
589
+ defm CP_ASYNC_BULK_PREFETCH_CH : CP_ASYNC_BULK_PREFETCH_INTR<has_ch = 1 >;
580
590
581
- //------------------------------
582
- // Bulk Copy Prefetch Functions
583
- //------------------------------
584
- def CP_ASYNC_BULK_PREFETCH : NVPTXInst<(outs),
585
- (ins Int64Regs:$src, Int32Regs:$size),
586
- "cp.async.bulk.prefetch.L2.global [$src], $size;", []>,
587
- Requires<[hasPTX<80>, hasSM<90>]>;
588
-
589
- def CP_ASYNC_BULK_PREFETCH_CH : NVPTXInst<(outs),
590
- (ins Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch),
591
- "cp.async.bulk.prefetch.L2.global.L2::cache_hint [$src], $size, $ch;", []>,
592
- Requires<[hasPTX<80>, hasSM<90>]>;
593
591
//-------------------------------------
594
592
// TMA Async Bulk Tensor Copy Functions
595
593
//-------------------------------------
0 commit comments