Skip to content

[MLIR][NVVM] Add support for aligned variants of cluster barriers #78142

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 55 additions & 3 deletions mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -378,22 +378,74 @@ def NVVM_Barrier0Op : NVVM_Op<"barrier0"> {
}

// Cluster barrier "arrive" op. The optional `aligned` unit attribute selects
// the `.aligned` intrinsic variant when the op is translated to LLVM IR.
def NVVM_ClusterArriveOp : NVVM_Op<"cluster.arrive"> {
  let arguments = (ins OptionalAttr<UnitAttr>:$aligned);

  let summary = "Cluster Barrier Arrive Op";
  let description = [{
    The `cluster.arrive` can be used by the threads within the cluster for synchronization and
    communication. The `cluster.arrive` instruction marks the warps' arrival at the barrier
    without causing the executing thread to wait for other participating threads.

    The `aligned` attribute, when provided, generates the .aligned version of the PTX instruction.

    [For more information, see PTX ISA]
    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
  }];

  // Exactly one intrinsic call must be emitted per op: the aligned variant
  // when the attribute is present, the plain variant otherwise. A leading
  // unconditional call would emit the barrier arrive twice.
  string llvmBuilder = [{
    if ($aligned)
      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive_aligned);
    else
      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive);
  }];
  let assemblyFormat = "attr-dict";
}

// Relaxed cluster barrier "arrive" op. The optional `aligned` unit attribute
// selects the `.aligned` intrinsic variant when the op is translated to LLVM IR.
def NVVM_ClusterArriveRelaxedOp : NVVM_Op<"cluster.arrive.relaxed"> {
  let arguments = (ins OptionalAttr<UnitAttr>:$aligned);

  let summary = "Cluster Barrier Relaxed Arrive Op";
  let description = [{
    The `cluster.arrive` can be used by the threads within the cluster for synchronization and
    communication. The `cluster.arrive` instruction marks the warps' arrival at the barrier
    without causing the executing thread to wait for other participating threads.

    The `aligned` attribute, when provided, generates the .aligned version of the PTX instruction.
    The .relaxed qualifier on `cluster.arrive` specifies that there are no memory
    ordering and visibility guarantees provided for the memory accesses performed prior to
    `cluster.arrive`.

    [For more information, see PTX ISA]
    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
  }];

  // Exactly one intrinsic call must be emitted per op: the aligned variant
  // when the attribute is present, the plain variant otherwise. A leading
  // unconditional call would emit the barrier arrive twice.
  string llvmBuilder = [{
    if ($aligned)
      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive_relaxed_aligned);
    else
      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_arrive_relaxed);
  }];
  let assemblyFormat = "attr-dict";
}

// Cluster barrier "wait" op. The optional `aligned` unit attribute selects
// the `.aligned` intrinsic variant when the op is translated to LLVM IR.
def NVVM_ClusterWaitOp : NVVM_Op<"cluster.wait"> {
  let arguments = (ins OptionalAttr<UnitAttr>:$aligned);

  let summary = "Cluster Barrier Wait Op";
  let description = [{
    The `cluster.wait` causes the executing thread to wait for all non-exited threads
    of the cluster to perform `cluster.arrive`. The `aligned` attribute, when provided,
    generates the .aligned version of the PTX instruction.

    [For more information, see PTX ISA]
    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)
  }];

  // Exactly one intrinsic call must be emitted per op: the aligned variant
  // when the attribute is present, the plain variant otherwise. A leading
  // unconditional call would emit the barrier wait twice.
  string llvmBuilder = [{
    if ($aligned)
      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_wait_aligned);
    else
      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_barrier_cluster_wait);
  }];
  let assemblyFormat = "attr-dict";
}
Expand Down
6 changes: 6 additions & 0 deletions mlir/test/Dialect/LLVMIR/nvvm.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -47,20 +47,26 @@ func.func @llvm_nvvm_barrier0() {
// Exercises `nvvm.cluster.arrive` in two forms: with no attributes and with
// the `aligned` unit attribute. Each form is expected to appear in the tool
// output with the same spelling (RUN line not visible in this hunk --
// presumably a parse/print round-trip; confirm against the file header).
func.func @llvm_nvvm_cluster_arrive() {
// CHECK: nvvm.cluster.arrive
nvvm.cluster.arrive
// CHECK: nvvm.cluster.arrive {aligned}
nvvm.cluster.arrive {aligned}
llvm.return
}

// Exercises `nvvm.cluster.arrive.relaxed` in two forms: with no attributes and
// with the `aligned` unit attribute. Each form is expected to appear in the
// tool output with the same spelling (RUN line not visible in this hunk --
// presumably a parse/print round-trip; confirm against the file header).
// CHECK-LABEL: @llvm_nvvm_cluster_arrive_relaxed
func.func @llvm_nvvm_cluster_arrive_relaxed() {
// CHECK: nvvm.cluster.arrive.relaxed
nvvm.cluster.arrive.relaxed
// CHECK: nvvm.cluster.arrive.relaxed {aligned}
nvvm.cluster.arrive.relaxed {aligned}
llvm.return
}

// Exercises `nvvm.cluster.wait` in two forms: with no attributes and with the
// `aligned` unit attribute. Each form is expected to appear in the tool output
// with the same spelling (RUN line not visible in this hunk -- presumably a
// parse/print round-trip; confirm against the file header).
// CHECK-LABEL: @llvm_nvvm_cluster_wait
func.func @llvm_nvvm_cluster_wait() {
// CHECK: nvvm.cluster.wait
nvvm.cluster.wait
// CHECK: nvvm.cluster.wait {aligned}
nvvm.cluster.wait {aligned}
llvm.return
}

Expand Down
27 changes: 27 additions & 0 deletions mlir/test/Target/LLVMIR/nvvmir.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,33 @@ llvm.func @llvm_nvvm_barrier0() {
llvm.return
}

// Translation test for `nvvm.cluster.arrive`: the plain op is expected to
// become a call to `llvm.nvvm.barrier.cluster.arrive`, and the `{aligned}`
// form a call to `llvm.nvvm.barrier.cluster.arrive.aligned`, in that order.
// CHECK-LABEL: @llvm_nvvm_cluster_arrive
llvm.func @llvm_nvvm_cluster_arrive() {
// CHECK: call void @llvm.nvvm.barrier.cluster.arrive()
nvvm.cluster.arrive
// CHECK: call void @llvm.nvvm.barrier.cluster.arrive.aligned()
nvvm.cluster.arrive {aligned}
llvm.return
}

// Translation test for `nvvm.cluster.arrive.relaxed`: the plain op is expected
// to become a call to `llvm.nvvm.barrier.cluster.arrive.relaxed`, and the
// `{aligned}` form a call to `llvm.nvvm.barrier.cluster.arrive.relaxed.aligned`,
// in that order.
// CHECK-LABEL: @llvm_nvvm_cluster_arrive_relaxed
llvm.func @llvm_nvvm_cluster_arrive_relaxed() {
// CHECK: call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
nvvm.cluster.arrive.relaxed
// CHECK: call void @llvm.nvvm.barrier.cluster.arrive.relaxed.aligned()
nvvm.cluster.arrive.relaxed {aligned}
llvm.return
}

// Translation test for `nvvm.cluster.wait`: the plain op is expected to become
// a call to `llvm.nvvm.barrier.cluster.wait`, and the `{aligned}` form a call
// to `llvm.nvvm.barrier.cluster.wait.aligned`, in that order.
// CHECK-LABEL: @llvm_nvvm_cluster_wait
llvm.func @llvm_nvvm_cluster_wait() {
// CHECK: call void @llvm.nvvm.barrier.cluster.wait()
nvvm.cluster.wait
// CHECK: call void @llvm.nvvm.barrier.cluster.wait.aligned()
nvvm.cluster.wait {aligned}
llvm.return
}

// CHECK-LABEL: @nvvm_shfl
llvm.func @nvvm_shfl(
%0 : i32, %1 : i32, %2 : i32,
Expand Down