Skip to content

[MLIR][NVVM] [NFC] Update test cmd-lines and doc links #128207

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 6 additions & 12 deletions mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -2648,8 +2648,7 @@ def NVVM_Tcgen05AllocOp : NVVM_Op<"tcgen05.alloc"> {
the amount specified by `nCols` and writes the destination
address to the `addr` argument. The `nCols` operand specifies the
number of columns to be allocated and it must be a power-of-two.
[For more information, refer to the PTX ISA]
(https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
}];

let arguments = (ins
Expand Down Expand Up @@ -2679,8 +2678,7 @@ def NVVM_Tcgen05DeallocOp : NVVM_Op<"tcgen05.dealloc"> {
specified by `tmemAddr`, which must be from a previous tensor
memory allocation. The `nCols` operand specifies the number
of columns to be de-allocated, and it must be a power-of-two.
[For more information, refer to the PTX ISA]
(https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
}];

let arguments = (ins LLVM_PointerTensor:$taddr, I32:$nCols,
Expand Down Expand Up @@ -2708,8 +2706,7 @@ def NVVM_Tcgen05RelinquishAllocPermitOp : NVVM_Op<"tcgen05.relinquish_alloc_perm
of the executing thread is relinquishing the right to allocate
Tensor Memory. So, it is illegal for a CTA to perform `tcgen05.alloc`
after any of its constituent threads execute `tcgen05.relinquish_alloc_permit`.
[For more information, refer to the PTX ISA]
(https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
}];

let arguments = (ins
Expand All @@ -2733,8 +2730,7 @@ def NVVM_Tcgen05FenceOp : NVVM_Op<"tcgen05.fence"> {
The `tcgen05.fence<after>` orders all subsequent async tcgen05 operations
with respect to the prior tcgen05 and execution ordering operations.

[For more information refer to the PTX ISA]
(https://docs.nvidia.com/cuda/parallel-thread-execution/#tensorcore-5th-generation-instructions-tcgen05-fence)
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tensorcore-5th-generation-instructions-tcgen05-fence)
}];

let arguments = (ins Tcgen05FenceKindAttr:$kind);
Expand All @@ -2756,8 +2752,7 @@ def NVVM_Tcgen05WaitOp : NVVM_Op<"tcgen05.wait"> {
have completed. Similarly, the `tcgen05.wait<store>` causes the executing
thread to block until all prior `tcgen05.st` operations issued by the
executing thread have completed.
[For more information refer PTX ISA]
(https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-wait)
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-wait)
}];

let arguments = (ins Tcgen05WaitKindAttr:$kind);
Expand All @@ -2782,8 +2777,7 @@ def NVVM_Tcgen05CommitOp : NVVM_Op<"tcgen05.commit"> {
when present, specifies the destination CTAs in the cluster such
that each bit position in the 16-bit `multicastMask` operand
corresponds to the `nvvm.read.ptx.sreg.ctaid` of the destination CTA.
[For more information refer PTX ISA]
(https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen-async-sync-operations-commit)
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen-async-sync-operations-commit)
}];

let arguments = (ins
Expand Down
2 changes: 1 addition & 1 deletion mlir/test/Target/LLVMIR/nvvm/cvt_tf32.mlir
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: mlir-translate -mlir-to-llvmir %s -split-input-file --verify-diagnostics | FileCheck %s
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s

// CHECK-LABEL: @convert_float_to_tf32_rna
llvm.func @convert_float_to_tf32_rna(%src : f32) -> i32 {
Expand Down
3 changes: 1 addition & 2 deletions mlir/test/Target/LLVMIR/nvvm/tcgen05-alloc.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// RUN: mlir-opt -split-input-file -verify-diagnostics %s
// RUN: mlir-translate -mlir-to-llvmir -split-input-file -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-LLVM
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --check-prefix=CHECK-LLVM

// CHECK-LABEL: @llvm_nvvm_tcgen05_alloc
llvm.func @llvm_nvvm_tcgen05_alloc(%addr : !llvm.ptr, %ncols : i32) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,27 +1,4 @@
// RUN: mlir-opt -split-input-file -verify-diagnostics %s
// RUN: mlir-translate -mlir-to-llvmir -split-input-file -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-LLVM

// CHECK-LABEL: @llvm_nvvm_tcgen05_fence
llvm.func @llvm_nvvm_tcgen05_fence() {
// CHECK-LLVM: call void @llvm.nvvm.tcgen05.fence.before.thread.sync()
nvvm.tcgen05.fence #nvvm.tcgen05_fence<before>

// CHECK-LLVM: call void @llvm.nvvm.tcgen05.fence.after.thread.sync()
nvvm.tcgen05.fence #nvvm.tcgen05_fence<after>

llvm.return
}

// CHECK-LABEL: @llvm_nvvm_tcgen05_wait
llvm.func @llvm_nvvm_tcgen05_wait() {
// CHECK-LLVM: call void @llvm.nvvm.tcgen05.wait.ld()
nvvm.tcgen05.wait #nvvm.tcgen05_wait<load>

// CHECK-LLVM: call void @llvm.nvvm.tcgen05.wait.st()
nvvm.tcgen05.wait #nvvm.tcgen05_wait<store>

llvm.return
}
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --check-prefix=CHECK-LLVM

// CHECK-LABEL: @llvm_nvvm_tcgen05_commit_generic
llvm.func @llvm_nvvm_tcgen05_commit_generic(%barrier : !llvm.ptr, %cta_mask : i16) {
Expand Down
2 changes: 1 addition & 1 deletion mlir/test/Target/LLVMIR/nvvm/tcgen05-cp.mlir
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s

// CHECK-LABEL: @nvvm_tcgen05_cp_128x256b
llvm.func @nvvm_tcgen05_cp_128x256b(%taddr : !llvm.ptr<6>, %smem_desc : i64) {
Expand Down
23 changes: 23 additions & 0 deletions mlir/test/Target/LLVMIR/nvvm/tcgen05-fence-wait.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --check-prefix=CHECK-LLVM

// CHECK-LABEL: @llvm_nvvm_tcgen05_fence
llvm.func @llvm_nvvm_tcgen05_fence() {
// CHECK-LLVM: call void @llvm.nvvm.tcgen05.fence.before.thread.sync()
nvvm.tcgen05.fence #nvvm.tcgen05_fence<before>

// CHECK-LLVM: call void @llvm.nvvm.tcgen05.fence.after.thread.sync()
nvvm.tcgen05.fence #nvvm.tcgen05_fence<after>

llvm.return
}

// CHECK-LABEL: @llvm_nvvm_tcgen05_wait
llvm.func @llvm_nvvm_tcgen05_wait() {
// CHECK-LLVM: call void @llvm.nvvm.tcgen05.wait.ld()
nvvm.tcgen05.wait #nvvm.tcgen05_wait<load>

// CHECK-LLVM: call void @llvm.nvvm.tcgen05.wait.st()
nvvm.tcgen05.wait #nvvm.tcgen05_wait<store>

llvm.return
}
2 changes: 1 addition & 1 deletion mlir/test/Target/LLVMIR/nvvm/tcgen05-shift.mlir
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s

// CHECK-LABEL: @llvm_nvvm_tcgen05_shift
llvm.func @llvm_nvvm_tcgen05_shift(%taddr : !llvm.ptr<6>) {
Expand Down
3 changes: 1 addition & 2 deletions mlir/test/Target/LLVMIR/nvvm/tma_bulk_copy.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// RUN: mlir-opt -split-input-file -verify-diagnostics %s
// RUN: mlir-translate -mlir-to-llvmir -split-input-file -verify-diagnostics %s | FileCheck %s
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s

// CHECK-LABEL: @llvm_nvvm_cp_async_bulk_global_to_shared_cluster
llvm.func @llvm_nvvm_cp_async_bulk_global_to_shared_cluster(%dst : !llvm.ptr<3>, %src : !llvm.ptr<1>, %mbar : !llvm.ptr<3>, %size : i32, %mc : i16, %ch : i64) {
Expand Down
2 changes: 1 addition & 1 deletion mlir/test/Target/LLVMIR/nvvm/tma_prefetch.mlir
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: mlir-translate -mlir-to-llvmir %s -split-input-file --verify-diagnostics | FileCheck %s
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s

// CHECK-LABEL: @tma_prefetch_1d
llvm.func @tma_prefetch_1d(%tma_desc : !llvm.ptr, %d0 : i32, %ch : i64) {
Expand Down
2 changes: 1 addition & 1 deletion mlir/test/Target/LLVMIR/nvvm/tma_store_reduce.mlir
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: mlir-translate -mlir-to-llvmir -split-input-file --verify-diagnostics %s | FileCheck %s
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s

// CHECK-LABEL: define void @tma_store_reduce_1d(
llvm.func @tma_store_reduce_1d(%src : !llvm.ptr<3>, %tma_desc : !llvm.ptr, %d0 : i32, %ch : i64) {
Expand Down