-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[MLIR][NVVM] Add support for aligned variants of cluster barriers #78142
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This patch adds: * Support for the 'aligned' variants of the cluster barrier Ops, by extending the existing Op with an 'aligned' attribute. * Docs for these Ops. * Test cases to verify the lowering to the corresponding intrinsics. Signed-off-by: Durgadoss R <[email protected]>
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-llvm Author: Durgadoss R (durga4github) Changes: This patch adds:
Full diff: https://github.com/llvm/llvm-project/pull/78142.diff 3 Files Affected:
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index c5f68a2ebe3952..7140e614412f98 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -378,22 +378,74 @@ def NVVM_Barrier0Op : NVVM_Op<"barrier0"> {
}
def NVVM_ClusterArriveOp : NVVM_Op<"cluster.arrive"> {
+ let arguments = (ins OptionalAttr<UnitAttr>:$aligned);
+
+ let summary = "Cluster Barrier Arrive Op";
+ let description = [{
+ The `cluster.arrive` can be used by the threads within the cluster for synchronization and
+ communication. The `cluster.arrive` instruction marks the warps' arrival at the barrier
+ without causing the executing thread to wait for other participating threads.
+
+ The `aligned` attribute, when provided, generates the .aligned version of the PTX instruction.
+
+ [For more information, see PTX ISA]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
+ }];
+
string llvmBuilder = [{
- createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive);
+ if ($aligned)
+ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive_aligned);
+ else
+ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive);
}];
let assemblyFormat = "attr-dict";
}
def NVVM_ClusterArriveRelaxedOp : NVVM_Op<"cluster.arrive.relaxed"> {
+ let arguments = (ins OptionalAttr<UnitAttr>:$aligned);
+
+ let summary = "Cluster Barrier Relaxed Arrive Op";
+ let description = [{
+    The `cluster.arrive.relaxed` can be used by the threads within the cluster for synchronization and
+    communication. The `cluster.arrive.relaxed` instruction marks the warps' arrival at the barrier
+    without causing the executing thread to wait for other participating threads.
+
+ The `aligned` attribute, when provided, generates the .aligned version of the PTX instruction.
+ The .relaxed qualifier on `cluster.arrive` specifies that there are no memory
+ ordering and visibility guarantees provided for the memory accesses performed prior to
+ `cluster.arrive`.
+
+ [For more information, see PTX ISA]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
+ }];
+
string llvmBuilder = [{
- createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive_relaxed);
+ if ($aligned)
+ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive_relaxed_aligned);
+ else
+ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive_relaxed);
}];
let assemblyFormat = "attr-dict";
}
def NVVM_ClusterWaitOp : NVVM_Op<"cluster.wait"> {
+ let arguments = (ins OptionalAttr<UnitAttr>:$aligned);
+
+ let summary = "Cluster Barrier Wait Op";
+ let description = [{
+ The `cluster.wait` causes the executing thread to wait for all non-exited threads
+ of the cluster to perform `cluster.arrive`. The `aligned` attribute, when provided,
+ generates the .aligned version of the PTX instruction.
+
+ [For more information, see PTX ISA]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
+ }];
+
string llvmBuilder = [{
- createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_wait);
+ if ($aligned)
+ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_wait_aligned);
+ else
+ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_wait);
}];
let assemblyFormat = "attr-dict";
}
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index 39516b5090d07b..ce483ddab22a0e 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -47,6 +47,8 @@ func.func @llvm_nvvm_barrier0() {
func.func @llvm_nvvm_cluster_arrive() {
// CHECK: nvvm.cluster.arrive
nvvm.cluster.arrive
+ // CHECK: nvvm.cluster.arrive {aligned}
+ nvvm.cluster.arrive {aligned}
llvm.return
}
@@ -54,6 +56,8 @@ func.func @llvm_nvvm_cluster_arrive() {
func.func @llvm_nvvm_cluster_arrive_relaxed() {
// CHECK: nvvm.cluster.arrive.relaxed
nvvm.cluster.arrive.relaxed
+ // CHECK: nvvm.cluster.arrive.relaxed {aligned}
+ nvvm.cluster.arrive.relaxed {aligned}
llvm.return
}
@@ -61,6 +65,8 @@ func.func @llvm_nvvm_cluster_arrive_relaxed() {
func.func @llvm_nvvm_cluster_wait() {
// CHECK: nvvm.cluster.wait
nvvm.cluster.wait
+ // CHECK: nvvm.cluster.wait {aligned}
+ nvvm.cluster.wait {aligned}
llvm.return
}
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 423b1a133a4ae2..8c5e3524a848f6 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -80,6 +80,33 @@ llvm.func @llvm_nvvm_barrier0() {
llvm.return
}
+// CHECK-LABEL: @llvm_nvvm_cluster_arrive
+llvm.func @llvm_nvvm_cluster_arrive() {
+ // CHECK: call void @llvm.nvvm.barrier.cluster.arrive()
+ nvvm.cluster.arrive
+ // CHECK: call void @llvm.nvvm.barrier.cluster.arrive.aligned()
+ nvvm.cluster.arrive {aligned}
+ llvm.return
+}
+
+// CHECK-LABEL: @llvm_nvvm_cluster_arrive_relaxed
+llvm.func @llvm_nvvm_cluster_arrive_relaxed() {
+ // CHECK: call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
+ nvvm.cluster.arrive.relaxed
+ // CHECK: call void @llvm.nvvm.barrier.cluster.arrive.relaxed.aligned()
+ nvvm.cluster.arrive.relaxed {aligned}
+ llvm.return
+}
+
+// CHECK-LABEL: @llvm_nvvm_cluster_wait
+llvm.func @llvm_nvvm_cluster_wait() {
+ // CHECK: call void @llvm.nvvm.barrier.cluster.wait()
+ nvvm.cluster.wait
+ // CHECK: call void @llvm.nvvm.barrier.cluster.wait.aligned()
+ nvvm.cluster.wait {aligned}
+ llvm.return
+}
+
// CHECK-LABEL: @nvvm_shfl
llvm.func @nvvm_shfl(
%0 : i32, %1 : i32, %2 : i32,
|
@grypp , Kindly help review. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks great! Thanks for the patch.
Build seems clean. @grypp , Please help with the merge. |
I'll merge this, but before can we remove ':' on the title |
Sure, updated it |
[MLIR][NVVM] Add support for aligned variants of cluster barriers (llvm#78142) This patch adds: * Support for the 'aligned' variants of the cluster barrier Ops, by extending the existing Op with an 'aligned' attribute. * Docs for these Ops. * Test cases to verify the lowering to the corresponding intrinsics. Signed-off-by: Durgadoss R <[email protected]>
This patch adds: