@@ -2617,6 +2617,30 @@ def Tcgen05GroupKindAttr :
2617
2617
let assemblyFormat = "`<` $value `>`";
2618
2618
}
2619
2619
2620
+ def Tcgen05FenceBefore : I32EnumAttrCase<"BEFORE_THREAD_SYNC", 0, "before">;
2621
+ def Tcgen05FenceAfter : I32EnumAttrCase<"AFTER_THREAD_SYNC", 1, "after">;
2622
+ def Tcgen05FenceKind : I32EnumAttr<"Tcgen05FenceKind", "NVVM Tcgen05 fence kind",
2623
+ [Tcgen05FenceBefore, Tcgen05FenceAfter]> {
2624
+ let genSpecializedAttr = 0;
2625
+ let cppNamespace = "::mlir::NVVM";
2626
+ }
2627
+ def Tcgen05FenceKindAttr :
2628
+ EnumAttr<NVVM_Dialect, Tcgen05FenceKind, "tcgen05_fence"> {
2629
+ let assemblyFormat = "`<` $value `>`";
2630
+ }
2631
+
2632
+ def Tcgen05WaitLoad : I32EnumAttrCase<"LOAD", 0, "load">;
2633
+ def Tcgen05WaitStore : I32EnumAttrCase<"STORE", 1, "store">;
2634
+ def Tcgen05WaitKind : I32EnumAttr<"Tcgen05WaitKind", "NVVM Tcgen05 wait kind",
2635
+ [Tcgen05WaitLoad, Tcgen05WaitStore]> {
2636
+ let genSpecializedAttr = 0;
2637
+ let cppNamespace = "::mlir::NVVM";
2638
+ }
2639
+ def Tcgen05WaitKindAttr :
2640
+ EnumAttr<NVVM_Dialect, Tcgen05WaitKind, "tcgen05_wait"> {
2641
+ let assemblyFormat = "`<` $value `>`";
2642
+ }
2643
+
2620
2644
def NVVM_Tcgen05AllocOp : NVVM_Op<"tcgen05.alloc"> {
2621
2645
let summary = "Tcgen05 alloc operation";
2622
2646
let description = [{
@@ -2701,6 +2725,91 @@ def NVVM_Tcgen05RelinquishAllocPermitOp : NVVM_Op<"tcgen05.relinquish_alloc_perm
2701
2725
}];
2702
2726
}
2703
2727
2728
+ def NVVM_Tcgen05FenceOp : NVVM_Op<"tcgen05.fence"> {
2729
+ let summary = "Tcgen05 fence operations";
2730
+ let description = [{
2731
+ The `tcgen05.fence<before>` orders all prior async tcgen05 operations
2732
+ with respect to the subsequent tcgen05 and execution ordering operations.
2733
+ The `tcgen05.fence<after>` orders all subsequent async tcgen05 operations
2734
+ with respect to the prior tcgen05 and execution ordering operations.
2735
+
2736
+ [For more information refer to the PTX ISA]
2737
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/#tensorcore-5th-generation-instructions-tcgen05-fence)
2738
+ }];
2739
+
2740
+ let arguments = (ins Tcgen05FenceKindAttr:$kind);
2741
+ let assemblyFormat = "$kind attr-dict";
2742
+
2743
+ string llvmBuilder = [{
2744
+ auto id = ($kind == NVVM::Tcgen05FenceKind::BEFORE_THREAD_SYNC)
2745
+ ? llvm::Intrinsic::nvvm_tcgen05_fence_before_thread_sync
2746
+ : llvm::Intrinsic::nvvm_tcgen05_fence_after_thread_sync;
2747
+ createIntrinsicCall(builder, id);
2748
+ }];
2749
+ }
2750
+
2751
+ def NVVM_Tcgen05WaitOp : NVVM_Op<"tcgen05.wait"> {
2752
+ let summary = "Tcgen05 wait operations";
2753
+ let description = [{
2754
+ The `tcgen05.wait<load>` causes the executing thread to block until
2755
+ all prior `tcgen05.ld` operations issued by the executing thread
2756
+ have completed. Similarly, the `tcgen05.wait<store>` causes the executing
2757
+ thread to block until all prior `tcgen05.st` operations issued by the
2758
+ executing thread have completed.
2759
+ [For more information refer PTX ISA]
2760
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-wait)
2761
+ }];
2762
+
2763
+ let arguments = (ins Tcgen05WaitKindAttr:$kind);
2764
+ let assemblyFormat = "$kind attr-dict";
2765
+
2766
+ string llvmBuilder = [{
2767
+ auto id = ($kind == NVVM::Tcgen05WaitKind::LOAD)
2768
+ ? llvm::Intrinsic::nvvm_tcgen05_wait_ld
2769
+ : llvm::Intrinsic::nvvm_tcgen05_wait_st;
2770
+ createIntrinsicCall(builder, id);
2771
+ }];
2772
+ }
2773
+
2774
+ def NVVM_Tcgen05CommitOp : NVVM_Op<"tcgen05.commit"> {
2775
+ let summary = "Tcgen05 commit operations";
2776
+ let description = [{
2777
+ The `tcgen05.commit` makes the mbarrier object, specified by
2778
+ the operand `addr`, track the completion of all the prior
2779
+ async-tcgen05 operations initiated by the executing thread.
2780
+ The multicast variants allow signaling on the mbarrier objects
2781
+ of multiple CTAs within the cluster. Operand `multicastMask`,
2782
+ when present, specifies the destination CTAs in the cluster such
2783
+ that each bit position in the 16-bit `multicastMask` operand
2784
+ corresponds to the `nvvm.read.ptx.sreg.ctaid` of the destination CTA.
2785
+ [For more information refer PTX ISA]
2786
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen-async-sync-operations-commit)
2787
+ }];
2788
+
2789
+ let arguments = (ins
2790
+ AnyTypeOf<[LLVM_AnyPointer, LLVM_PointerShared]>:$addr,
2791
+ Optional<I16>:$multicastMask,
2792
+ DefaultValuedAttr<Tcgen05GroupKindAttr, "Tcgen05GroupKind::CTA_1">:$group);
2793
+
2794
+ let assemblyFormat = [{
2795
+ $addr (`,` `multicast_mask` `=` $multicastMask^)?
2796
+ attr-dict `:` type(operands)
2797
+ }];
2798
+
2799
+ let extraClassDeclaration = [{
2800
+ static llvm::Intrinsic::ID
2801
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
2802
+ llvm::SmallVector<llvm::Value *> &args);
2803
+ }];
2804
+
2805
+ string llvmBuilder = [{
2806
+ llvm::SmallVector<llvm::Value *> args;
2807
+ auto id = NVVM::Tcgen05CommitOp::getIntrinsicIDAndArgs(
2808
+ *op, moduleTranslation, args);
2809
+ createIntrinsicCall(builder, id, args);
2810
+ }];
2811
+ }
2812
+
2704
2813
//===----------------------------------------------------------------------===//
2705
2814
// NVVM target attribute.
2706
2815
//===----------------------------------------------------------------------===//
0 commit comments