[NVPTX] Add float to tf32 conversion intrinsics #124316

durga4github · 2025-01-24T17:43:46Z

This patch adds the set of f32 -> tf32 cvt intrinsics introduced
in sm100 with ptx8.6. This builds on top of the recent PR #121507.

Tests are verified with a 12.8 ptxas executable.

PTX ISA link:
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt

llvmbot · 2025-01-24T17:44:18Z

@llvm/pr-subscribers-llvm-ir

@llvm/pr-subscribers-backend-nvptx

Author: Durgadoss R (durga4github)

Changes

This patch adds the set of f32 -> tf32 cvt intrinsics introduced
in sm100 with ptx8.6. This builds on top of the recent PR #121507.

Tests are verified with a 12.8 ptxas executable.

This patch also updates the lit.cfg.py to include the latest PTXAS_EXE versions.

PTX ISA link:
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt

Full diff: https://github.com/llvm/llvm-project/pull/124316.diff

4 Files Affected:

(modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+8)
(modified) llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (+5)
(added) llvm/test/CodeGen/NVPTX/convert-sm100.ll (+68)
(modified) llvm/test/lit.cfg.py (+3)

diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 68c2373a1a4541..9a2f38d760e659 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1444,10 +1444,18 @@ let TargetPrefix = "nvvm" in {
       Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
   def int_nvvm_f2tf32_rn_relu : ClangBuiltin<"__nvvm_f2tf32_rn_relu">,
       Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rn_satfinite : ClangBuiltin<"__nvvm_f2tf32_rn_satfinite">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rn_relu_satfinite : ClangBuiltin<"__nvvm_f2tf32_rn_relu_satfinite">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
   def int_nvvm_f2tf32_rz : ClangBuiltin<"__nvvm_f2tf32_rz">,
       Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
   def int_nvvm_f2tf32_rz_relu : ClangBuiltin<"__nvvm_f2tf32_rz_relu">,
       Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rz_satfinite : ClangBuiltin<"__nvvm_f2tf32_rz_satfinite">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rz_relu_satfinite : ClangBuiltin<"__nvvm_f2tf32_rz_relu_satfinite">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
 
   def int_nvvm_ff_to_e4m3x2_rn : ClangBuiltin<"__nvvm_ff_to_e4m3x2_rn">,
       Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index f17799c1300153..633a99d0fc1be3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -764,6 +764,11 @@ let hasSideEffects = false in {
   defm CVT_to_tf32_rz_relu  : CVT_TO_TF32<"rz.relu">;
   defm CVT_to_tf32_rna      : CVT_TO_TF32<"rna", [hasPTX<70>, hasSM<80>]>;
   defm CVT_to_tf32_rna_satf : CVT_TO_TF32<"rna.satfinite", [hasPTX<81>, hasSM<89>]>;
+
+  defm CVT_to_tf32_rn_satf : CVT_TO_TF32<"rn.satfinite", [hasPTX<86>, hasSM<100>]>;
+  defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, hasSM<100>]>;
+  defm CVT_to_tf32_rn_relu_satf  : CVT_TO_TF32<"rn.relu.satfinite", [hasPTX<86>, hasSM<100>]>;
+  defm CVT_to_tf32_rz_relu_satf  : CVT_TO_TF32<"rz.relu.satfinite", [hasPTX<86>, hasSM<100>]>;
 }
 
 def fpround_oneuse : PatFrag<(ops node:$a), (fpround node:$a), [{
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100.ll b/llvm/test/CodeGen/NVPTX/convert-sm100.ll
new file mode 100644
index 00000000000000..dff5b35dfa9139
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/convert-sm100.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %}
+
+declare i32 @llvm.nvvm.f2tf32.rn.satfinite(float %f1)
+declare i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float %f1)
+declare i32 @llvm.nvvm.f2tf32.rz.satfinite(float %f1)
+declare i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float %f1)
+
+define i32 @cvt_rn_tf32_f32(float %f1) {
+; CHECK-LABEL: cvt_rn_tf32_f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_tf32_f32_param_0];
+; CHECK-NEXT:    cvt.rn.satfinite.tf32.f32 %r1, %f1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %val = call i32 @llvm.nvvm.f2tf32.rn.satfinite(float %f1)
+  ret i32 %val
+}
+
+define i32 @cvt_rn_relu_tf32_f32(float %f1) {
+; CHECK-LABEL: cvt_rn_relu_tf32_f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_relu_tf32_f32_param_0];
+; CHECK-NEXT:    cvt.rn.relu.satfinite.tf32.f32 %r1, %f1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %val = call i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float %f1)
+  ret i32 %val
+}
+
+define i32 @cvt_rz_tf32_f32(float %f1) {
+; CHECK-LABEL: cvt_rz_tf32_f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rz_tf32_f32_param_0];
+; CHECK-NEXT:    cvt.rz.satfinite.tf32.f32 %r1, %f1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %val = call i32 @llvm.nvvm.f2tf32.rz.satfinite(float %f1)
+  ret i32 %val
+}
+
+define i32 @cvt_rz_relu_tf32_f32(float %f1) {
+; CHECK-LABEL: cvt_rz_relu_tf32_f32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rz_relu_tf32_f32_param_0];
+; CHECK-NEXT:    cvt.rz.relu.satfinite.tf32.f32 %r1, %f1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %val = call i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float %f1)
+  ret i32 %val
+}
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index b17d41fa11af7c..3c0069d10412a5 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -311,6 +311,9 @@ def enable_ptxas(ptxas_executable):
             (12, 2),
             (12, 3),
             (12, 4),
+            (12, 5),
+            (12, 6),
+            (12, 8),
         ]
 
         def version_int(ver):

llvm/test/lit.cfg.py

This patch adds the set of f32->tf32 cvt intrinsics introduced in sm100 with ptx8.6. This builds on top of the recent PR llvm#121507. Tests are verified with a 12.8 ptxas executable. PTX ISA link: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt Signed-off-by: Durgadoss R <[email protected]>

llvm-ci · 2025-01-27T10:47:16Z

LLVM Buildbot has detected a new failure on builder openmp-offload-libc-amdgpu-runtime running on omp-vega20-1 while building llvm at step 7 "Add check check-offload".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/12474

Here is the relevant piece of the build log for the reference

Step 7 (Add check check-offload) failure: test (failure)
******************** TEST 'libomptarget :: amdgcn-amd-amdhsa :: mapping/data_member_ref.cpp' FAILED ********************
Exit Code: 2

Command Output (stdout):
--
# RUN: at line 1
/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/clang++ -fopenmp    -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src  -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib  -fopenmp-targets=amdgcn-amd-amdhsa /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/mapping/data_member_ref.cpp -o /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/mapping/Output/data_member_ref.cpp.tmp -Xoffload-linker -lc -Xoffload-linker -lm /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a && /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/mapping/Output/data_member_ref.cpp.tmp | /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/mapping/data_member_ref.cpp
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/clang++ -fopenmp -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib -fopenmp-targets=amdgcn-amd-amdhsa /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/mapping/data_member_ref.cpp -o /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/mapping/Output/data_member_ref.cpp.tmp -Xoffload-linker -lc -Xoffload-linker -lm /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/mapping/Output/data_member_ref.cpp.tmp
# note: command had no output on stdout or stderr
# error: command failed with exit status: -11
# executed command: /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/mapping/data_member_ref.cpp
# .---command stderr------------
# | FileCheck error: '<stdin>' is empty.
# | FileCheck command line:  /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-libc-amdgpu-runtime/llvm.src/offload/test/mapping/data_member_ref.cpp
# `-----------------------------
# error: command failed with exit status: 2

--

********************

llvm-ci · 2025-01-27T10:59:21Z

LLVM Buildbot has detected a new failure on builder flang-aarch64-dylib running on linaro-flang-aarch64-dylib while building llvm at step 5 "build-unified-tree".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/50/builds/9443

Here is the relevant piece of the build log for the reference

Step 5 (build-unified-tree) failure: build (failure)
...
1918.390 [1993/1/4828] Building CXX object tools/mlir/lib/Dialect/Linalg/Transforms/CMakeFiles/obj.MLIRLinalgTransforms.dir/BubbleUpExtractSlice.cpp.o
1918.594 [1992/1/4829] Building CXX object tools/mlir/lib/Conversion/FuncToSPIRV/CMakeFiles/obj.MLIRFuncToSPIRV.dir/FuncToSPIRVPass.cpp.o
1918.702 [1991/1/4830] Building CXX object tools/mlir/lib/Dialect/Transform/Transforms/CMakeFiles/obj.MLIRTransformDialectTransforms.dir/CheckUses.cpp.o
1918.878 [1990/1/4831] Building CXX object tools/mlir/lib/CAPI/IR/CMakeFiles/obj.MLIRCAPIIR.dir/AffineMap.cpp.o
1918.986 [1989/1/4832] Building CXX object tools/mlir/lib/CAPI/IR/CMakeFiles/obj.MLIRCAPIIR.dir/BuiltinAttributes.cpp.o
1919.062 [1988/1/4833] Building CXX object tools/mlir/lib/CAPI/Transforms/CMakeFiles/obj.MLIRCAPITransforms.dir/Rewrite.cpp.o
1919.119 [1987/1/4834] Building CXX object tools/mlir/test/lib/Conversion/VectorToSPIRV/CMakeFiles/MLIRTestVectorToSPIRV.dir/TestVectorReductionToSPIRVDotProd.cpp.o
1919.153 [1986/1/4835] Building CXX object tools/mlir/tools/mlir-parser-fuzzer/bytecode/CMakeFiles/mlir-bytecode-parser-fuzzer.dir/DummyParserFuzzer.cpp.o
1919.226 [1985/1/4836] Building CXX object tools/mlir/test/lib/IR/CMakeFiles/MLIRTestIR.dir/TestVisitors.cpp.o
1927.804 [1984/1/4837] Building CXX object tools/mlir/test/lib/IR/CMakeFiles/MLIRTestIR.dir/TestVisitorsGeneric.cpp.o
FAILED: tools/mlir/test/lib/IR/CMakeFiles/MLIRTestIR.dir/TestVisitorsGeneric.cpp.o 
/usr/local/bin/c++ -DGTEST_HAS_RTTI=0 -DMLIR_INCLUDE_TESTS -D_DEBUG -D_GLIBCXX_ASSERTIONS -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/home/tcwg-buildbot/worker/flang-aarch64-dylib/build/tools/mlir/test/lib/IR -I/home/tcwg-buildbot/worker/flang-aarch64-dylib/llvm-project/mlir/test/lib/IR -I/home/tcwg-buildbot/worker/flang-aarch64-dylib/build/tools/mlir/include -I/home/tcwg-buildbot/worker/flang-aarch64-dylib/llvm-project/mlir/include -I/home/tcwg-buildbot/worker/flang-aarch64-dylib/build/include -I/home/tcwg-buildbot/worker/flang-aarch64-dylib/llvm-project/llvm/include -I/home/tcwg-buildbot/worker/flang-aarch64-dylib/llvm-project/mlir/test/lib/IR/../Dialect/Test -I/home/tcwg-buildbot/worker/flang-aarch64-dylib/build/tools/mlir/test/lib/IR/../Dialect/Test -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -pedantic -Wno-long-long -Wc++98-compat-extra-semi -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -Wundef -Werror=mismatched-tags -O3 -DNDEBUG -std=c++17  -fno-exceptions -funwind-tables -fno-rtti -UNDEBUG -MD -MT tools/mlir/test/lib/IR/CMakeFiles/MLIRTestIR.dir/TestVisitorsGeneric.cpp.o -MF tools/mlir/test/lib/IR/CMakeFiles/MLIRTestIR.dir/TestVisitorsGeneric.cpp.o.d -o tools/mlir/test/lib/IR/CMakeFiles/MLIRTestIR.dir/TestVisitorsGeneric.cpp.o -c /home/tcwg-buildbot/worker/flang-aarch64-dylib/llvm-project/mlir/test/lib/IR/TestVisitorsGeneric.cpp
In file included from /home/tcwg-buildbot/worker/flang-aarch64-dylib/llvm-project/mlir/test/lib/IR/TestVisitorsGeneric.cpp:9:
/home/tcwg-buildbot/worker/flang-aarch64-dylib/llvm-project/mlir/test/lib/IR/../Dialect/Test/TestOps.h:148:10: fatal error: 'TestOps.h.inc' file not found
  148 | #include "TestOps.h.inc"
      |          ^~~~~~~~~~~~~~~
1 error generated.
ninja: build stopped: subcommand failed.

llvmbot added backend:NVPTX llvm:ir labels Jan 24, 2025

durga4github force-pushed the durgadossr/nvptx_cvt_tf32_sf_rn_rz branch from 33d50c5 to 067a163 Compare January 24, 2025 17:46

durga4github requested a review from Artem-B January 24, 2025 17:48

Artem-B approved these changes Jan 25, 2025

View reviewed changes

llvm/test/lit.cfg.py Outdated Show resolved Hide resolved

durga4github force-pushed the durgadossr/nvptx_cvt_tf32_sf_rn_rz branch from 067a163 to 4db3c56 Compare January 25, 2025 05:59

durga4github merged commit 3b5e9ee into llvm:main Jan 27, 2025
8 checks passed

durga4github deleted the durgadossr/nvptx_cvt_tf32_sf_rn_rz branch January 27, 2025 10:22

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[NVPTX] Add float to tf32 conversion intrinsics #124316

[NVPTX] Add float to tf32 conversion intrinsics #124316

Uh oh!

durga4github commented Jan 24, 2025 •

edited

Loading

Uh oh!

llvmbot commented Jan 24, 2025 •

edited

Loading

Uh oh!

Uh oh!

Uh oh!

llvm-ci commented Jan 27, 2025

Uh oh!

llvm-ci commented Jan 27, 2025

Uh oh!

Uh oh!

[NVPTX] Add float to tf32 conversion intrinsics #124316

[NVPTX] Add float to tf32 conversion intrinsics #124316

Uh oh!

Conversation

durga4github commented Jan 24, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Jan 24, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

Uh oh!

llvm-ci commented Jan 27, 2025

Uh oh!

llvm-ci commented Jan 27, 2025

Uh oh!

Uh oh!

durga4github commented Jan 24, 2025 •

edited

Loading

llvmbot commented Jan 24, 2025 •

edited

Loading