-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[MLIR][NVVM] Add tcgen05 wait/fence Ops #126265
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MLIR][NVVM] Add tcgen05 wait/fence Ops #126265
Conversation
PR llvm#126091 adds intrinsics for tcgen05 wait/fence/commit operations. This patch adds NVVM Dialect Ops for them. Signed-off-by: Durgadoss R <[email protected]>
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-llvm Author: Durgadoss R (durga4github) ChangesPR #126091 adds intrinsics for tcgen05 Full diff: https://github.com/llvm/llvm-project/pull/126265.diff 3 Files Affected:
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 11226dae2c3f375..fe15a524ec3b5cb 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -2617,6 +2617,30 @@ def Tcgen05GroupKindAttr :
let assemblyFormat = "`<` $value `>`";
}
+def Tcgen05FenceBefore : I32EnumAttrCase<"BEFORE_THREAD_SYNC", 0, "before">;
+def Tcgen05FenceAfter : I32EnumAttrCase<"AFTER_THREAD_SYNC", 1, "after">;
+def Tcgen05FenceKind : I32EnumAttr<"Tcgen05FenceKind", "NVVM Tcgen05 fence kind",
+ [Tcgen05FenceBefore, Tcgen05FenceAfter]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::NVVM";
+}
+def Tcgen05FenceKindAttr :
+ EnumAttr<NVVM_Dialect, Tcgen05FenceKind, "tcgen05_fence"> {
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def Tcgen05WaitLoad : I32EnumAttrCase<"LOAD", 0, "load">;
+def Tcgen05WaitStore : I32EnumAttrCase<"STORE", 1, "store">;
+def Tcgen05WaitKind : I32EnumAttr<"Tcgen05WaitKind", "NVVM Tcgen05 wait kind",
+ [Tcgen05WaitLoad, Tcgen05WaitStore]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::NVVM";
+}
+def Tcgen05WaitKindAttr :
+ EnumAttr<NVVM_Dialect, Tcgen05WaitKind, "tcgen05_wait"> {
+ let assemblyFormat = "`<` $value `>`";
+}
+
def NVVM_Tcgen05AllocOp : NVVM_Op<"tcgen05.alloc"> {
let summary = "Tcgen05 alloc operation";
let description = [{
@@ -2701,6 +2725,91 @@ def NVVM_Tcgen05RelinquishAllocPermitOp : NVVM_Op<"tcgen05.relinquish_alloc_perm
}];
}
+def NVVM_Tcgen05FenceOp : NVVM_Op<"tcgen05.fence"> {
+ let summary = "Tcgen05 fence operations";
+ let description = [{
+ The `tcgen05.fence<before>` orders all prior async tcgen05 operations
+ with respect to the subsequent tcgen05 and execution ordering operations.
+ The `tcgen05.fence<after>` orders all subsequent async tcgen05 operations
+ with respect to the prior tcgen05 and execution ordering operations.
+
+ [For more information refer to the PTX ISA]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/#tensorcore-5th-generation-instructions-tcgen05-fence)
+ }];
+
+ let arguments = (ins Tcgen05FenceKindAttr:$kind);
+ let assemblyFormat = "$kind attr-dict";
+
+ string llvmBuilder = [{
+ auto id = ($kind == NVVM::Tcgen05FenceKind::BEFORE_THREAD_SYNC)
+ ? llvm::Intrinsic::nvvm_tcgen05_fence_before_thread_sync
+ : llvm::Intrinsic::nvvm_tcgen05_fence_after_thread_sync;
+ createIntrinsicCall(builder, id);
+ }];
+}
+
+def NVVM_Tcgen05WaitOp : NVVM_Op<"tcgen05.wait"> {
+ let summary = "Tcgen05 wait operations";
+ let description = [{
+ The `tcgen05.wait<load>` causes the executing thread to block until
+ all prior `tcgen05.ld` operations issued by the executing thread
+ have completed. Similarly, the `tcgen05.wait<store>` causes the executing
+ thread to block until all prior `tcgen05.st` operations issued by the
+ executing thread have completed.
+ [For more information refer PTX ISA]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-wait)
+ }];
+
+ let arguments = (ins Tcgen05WaitKindAttr:$kind);
+ let assemblyFormat = "$kind attr-dict";
+
+ string llvmBuilder = [{
+ auto id = ($kind == NVVM::Tcgen05WaitKind::LOAD)
+ ? llvm::Intrinsic::nvvm_tcgen05_wait_ld
+ : llvm::Intrinsic::nvvm_tcgen05_wait_st;
+ createIntrinsicCall(builder, id);
+ }];
+}
+
+def NVVM_Tcgen05CommitOp : NVVM_Op<"tcgen05.commit"> {
+ let summary = "Tcgen05 commit operations";
+ let description = [{
+ The `tcgen05.commit` makes the mbarrier object, specified by
+ the operand `addr`, track the completion of all the prior
+ async-tcgen05 operations initiated by the executing thread.
+ The multicast variants allow signaling on the mbarrier objects
+ of multiple CTAs within the cluster. Operand `multicastMask`,
+ when present, specifies the destination CTAs in the cluster such
+ that each bit position in the 16-bit `multicastMask` operand
+ corresponds to the `nvvm.read.ptx.sreg.ctaid` of the destination CTA.
+ [For more information refer PTX ISA]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen-async-sync-operations-commit)
+ }];
+
+ let arguments = (ins
+ AnyTypeOf<[LLVM_AnyPointer, LLVM_PointerShared]>:$addr,
+ Optional<I16>:$multicastMask,
+ DefaultValuedAttr<Tcgen05GroupKindAttr, "Tcgen05GroupKind::CTA_1">:$group);
+
+ let assemblyFormat = [{
+ $addr (`,` `multicast_mask` `=` $multicastMask^)?
+ attr-dict `:` type(operands)
+ }];
+
+ let extraClassDeclaration = [{
+ static llvm::Intrinsic::ID
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::SmallVector<llvm::Value *> &args);
+ }];
+
+ string llvmBuilder = [{
+ llvm::SmallVector<llvm::Value *> args;
+ auto id = NVVM::Tcgen05CommitOp::getIntrinsicIDAndArgs(
+ *op, moduleTranslation, args);
+ createIntrinsicCall(builder, id, args);
+ }];
+}
+
//===----------------------------------------------------------------------===//
// NVVM target attribute.
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 241b25c6caf128e..62f0c213381111c 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1284,6 +1284,36 @@ llvm::Intrinsic::ID Tcgen05DeallocOp::getIntrinsicIDAndArgs(
return id;
}
+#define TCGEN05_COMMIT_IMPL(cg, is_shared, mc) \
+ is_shared ? llvm::Intrinsic::nvvm_tcgen05_commit##mc##_shared##_##cg \
+ : llvm::Intrinsic::nvvm_tcgen05_commit##mc##_##cg
+
+#define GET_TCGEN05_COMMIT_ID(cta_group, is_shared, has_mc) \
+ has_mc ? TCGEN05_COMMIT_IMPL(cta_group, is_shared, _mc) \
+ : TCGEN05_COMMIT_IMPL(cta_group, is_shared, )
+
+llvm::Intrinsic::ID
+Tcgen05CommitOp::getIntrinsicIDAndArgs(Operation &op,
+ LLVM::ModuleTranslation &mt,
+ llvm::SmallVector<llvm::Value *> &args) {
+ auto curOp = cast<NVVM::Tcgen05CommitOp>(op);
+ unsigned AS = llvm::cast<LLVM::LLVMPointerType>(curOp.getAddr().getType())
+ .getAddressSpace();
+ bool isShared = AS == NVVMMemorySpace::kSharedMemorySpace;
+ bool hasMulticast = curOp.getMulticastMask() ? true : false;
+ bool is2CTAMode = curOp.getGroup() == Tcgen05GroupKind::CTA_2;
+
+ auto id = is2CTAMode ? GET_TCGEN05_COMMIT_ID(cg2, isShared, hasMulticast)
+ : GET_TCGEN05_COMMIT_ID(cg1, isShared, hasMulticast);
+
+ // Fill the Intrinsic Args
+ args.push_back(mt.lookupValue(curOp.getAddr()));
+ if (hasMulticast)
+ args.push_back(mt.lookupValue(curOp.getMulticastMask()));
+
+ return id;
+}
+
/// Infer the result ranges for the NVVM SpecialRangeableRegisterOp that might
/// have ConstantRangeAttr.
static void nvvmInferResultRanges(Operation *op, Value result,
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-barriers.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-barriers.mlir
new file mode 100644
index 000000000000000..7536a4567e34e58
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-barriers.mlir
@@ -0,0 +1,56 @@
+// RUN: mlir-opt -split-input-file -verify-diagnostics %s
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-LLVM
+
+// CHECK-LABEL: @llvm_nvvm_tcgen05_fence
+llvm.func @llvm_nvvm_tcgen05_fence() {
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.fence.before.thread.sync()
+ nvvm.tcgen05.fence #nvvm.tcgen05_fence<before>
+
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.fence.after.thread.sync()
+ nvvm.tcgen05.fence #nvvm.tcgen05_fence<after>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @llvm_nvvm_tcgen05_wait
+llvm.func @llvm_nvvm_tcgen05_wait() {
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.wait.ld()
+ nvvm.tcgen05.wait #nvvm.tcgen05_wait<load>
+
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.wait.st()
+ nvvm.tcgen05.wait #nvvm.tcgen05_wait<store>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @llvm_nvvm_tcgen05_commit_generic
+llvm.func @llvm_nvvm_tcgen05_commit_generic(%barrier : !llvm.ptr, %cta_mask : i16) {
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.cg1(ptr %{{.*}})
+ nvvm.tcgen05.commit %barrier : !llvm.ptr
+
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.cg2(ptr %{{.*}})
+ nvvm.tcgen05.commit %barrier {group = #nvvm.tcgen05_group<cta_2>} : !llvm.ptr
+
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.mc.cg1(ptr %{{.*}}, i16 %{{.*}})
+ nvvm.tcgen05.commit %barrier, multicast_mask = %cta_mask : !llvm.ptr, i16
+
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.mc.cg2(ptr %{{.*}}, i16 %{{.*}})
+ nvvm.tcgen05.commit %barrier, multicast_mask = %cta_mask {group = #nvvm.tcgen05_group<cta_2>} : !llvm.ptr, i16
+ llvm.return
+}
+
+// CHECK-LABEL: @llvm_nvvm_tcgen05_commit_shared
+llvm.func @llvm_nvvm_tcgen05_commit_shared(%barrier : !llvm.ptr<3>, %cta_mask : i16) {
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.shared.cg1(ptr addrspace(3) %{{.*}})
+ nvvm.tcgen05.commit %barrier : !llvm.ptr<3>
+
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.shared.cg2(ptr addrspace(3) %{{.*}})
+ nvvm.tcgen05.commit %barrier {group = #nvvm.tcgen05_group<cta_2>} : !llvm.ptr<3>
+
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.mc.shared.cg1(ptr addrspace(3) %{{.*}}, i16 %{{.*}})
+ nvvm.tcgen05.commit %barrier, multicast_mask = %cta_mask : !llvm.ptr<3>, i16
+
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.mc.shared.cg2(ptr addrspace(3) %{{.*}}, i16 %{{.*}})
+ nvvm.tcgen05.commit %barrier, multicast_mask = %cta_mask {group = #nvvm.tcgen05_group<cta_2>} : !llvm.ptr<3>, i16
+ llvm.return
+}
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/35/builds/7104 Here is the relevant piece of the build log for the reference
|
PR llvm#126091 adds intrinsics for tcgen05 wait/fence/commit operations. This patch adds NVVM Dialect Ops for them. Signed-off-by: Durgadoss R <[email protected]>
PR #126091 adds intrinsics for tcgen05
wait/fence/commit operations. This patch
adds NVVM Dialect Ops for them.