-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[NVPTX] Add float to tf32 conversion intrinsics #124316
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[NVPTX] Add float to tf32 conversion intrinsics #124316
Conversation
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-backend-nvptx Author: Durgadoss R (durga4github) ChangesThis patch adds the set of f32 -> tf32 cvt intrinsics introduced Tests are verified with a 12.8 ptxas executable. This patch also updates the lit.cfg.py to include the latest PTXAS_EXE versions. PTX ISA link: Full diff: https://github.com/llvm/llvm-project/pull/124316.diff 4 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 68c2373a1a4541..9a2f38d760e659 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1444,10 +1444,18 @@ let TargetPrefix = "nvvm" in {
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
def int_nvvm_f2tf32_rn_relu : ClangBuiltin<"__nvvm_f2tf32_rn_relu">,
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+ def int_nvvm_f2tf32_rn_satfinite : ClangBuiltin<"__nvvm_f2tf32_rn_satfinite">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+ def int_nvvm_f2tf32_rn_relu_satfinite : ClangBuiltin<"__nvvm_f2tf32_rn_relu_satfinite">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
def int_nvvm_f2tf32_rz : ClangBuiltin<"__nvvm_f2tf32_rz">,
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
def int_nvvm_f2tf32_rz_relu : ClangBuiltin<"__nvvm_f2tf32_rz_relu">,
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+ def int_nvvm_f2tf32_rz_satfinite : ClangBuiltin<"__nvvm_f2tf32_rz_satfinite">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+ def int_nvvm_f2tf32_rz_relu_satfinite : ClangBuiltin<"__nvvm_f2tf32_rz_relu_satfinite">,
+ Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
def int_nvvm_ff_to_e4m3x2_rn : ClangBuiltin<"__nvvm_ff_to_e4m3x2_rn">,
Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index f17799c1300153..633a99d0fc1be3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -764,6 +764,11 @@ let hasSideEffects = false in {
defm CVT_to_tf32_rz_relu : CVT_TO_TF32<"rz.relu">;
defm CVT_to_tf32_rna : CVT_TO_TF32<"rna", [hasPTX<70>, hasSM<80>]>;
defm CVT_to_tf32_rna_satf : CVT_TO_TF32<"rna.satfinite", [hasPTX<81>, hasSM<89>]>;
+
+ defm CVT_to_tf32_rn_satf : CVT_TO_TF32<"rn.satfinite", [hasPTX<86>, hasSM<100>]>;
+ defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, hasSM<100>]>;
+ defm CVT_to_tf32_rn_relu_satf : CVT_TO_TF32<"rn.relu.satfinite", [hasPTX<86>, hasSM<100>]>;
+ defm CVT_to_tf32_rz_relu_satf : CVT_TO_TF32<"rz.relu.satfinite", [hasPTX<86>, hasSM<100>]>;
}
def fpround_oneuse : PatFrag<(ops node:$a), (fpround node:$a), [{
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100.ll b/llvm/test/CodeGen/NVPTX/convert-sm100.ll
new file mode 100644
index 00000000000000..dff5b35dfa9139
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/convert-sm100.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %}
+
+declare i32 @llvm.nvvm.f2tf32.rn.satfinite(float %f1)
+declare i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float %f1)
+declare i32 @llvm.nvvm.f2tf32.rz.satfinite(float %f1)
+declare i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float %f1)
+
+define i32 @cvt_rn_tf32_f32(float %f1) {
+; CHECK-LABEL: cvt_rn_tf32_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_tf32_f32_param_0];
+; CHECK-NEXT: cvt.rn.satfinite.tf32.f32 %r1, %f1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %val = call i32 @llvm.nvvm.f2tf32.rn.satfinite(float %f1)
+ ret i32 %val
+}
+
+define i32 @cvt_rn_relu_tf32_f32(float %f1) {
+; CHECK-LABEL: cvt_rn_relu_tf32_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_tf32_f32_param_0];
+; CHECK-NEXT: cvt.rn.relu.satfinite.tf32.f32 %r1, %f1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %val = call i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float %f1)
+ ret i32 %val
+}
+
+define i32 @cvt_rz_tf32_f32(float %f1) {
+; CHECK-LABEL: cvt_rz_tf32_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_tf32_f32_param_0];
+; CHECK-NEXT: cvt.rz.satfinite.tf32.f32 %r1, %f1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %val = call i32 @llvm.nvvm.f2tf32.rz.satfinite(float %f1)
+ ret i32 %val
+}
+
+define i32 @cvt_rz_relu_tf32_f32(float %f1) {
+; CHECK-LABEL: cvt_rz_relu_tf32_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .f32 %f<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_relu_tf32_f32_param_0];
+; CHECK-NEXT: cvt.rz.relu.satfinite.tf32.f32 %r1, %f1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %val = call i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float %f1)
+ ret i32 %val
+}
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index b17d41fa11af7c..3c0069d10412a5 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -311,6 +311,9 @@ def enable_ptxas(ptxas_executable):
(12, 2),
(12, 3),
(12, 4),
+ (12, 5),
+ (12, 6),
+ (12, 8),
]
def version_int(ver):
|
33d50c5
to
067a163
Compare
This patch adds the set of f32->tf32 cvt intrinsics introduced in sm100 with ptx8.6. This builds on top of the recent PR llvm#121507. Tests are verified with a 12.8 ptxas executable. PTX ISA link: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt Signed-off-by: Durgadoss R <[email protected]>
067a163
to
4db3c56
Compare
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/12474 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/50/builds/9443 Here is the relevant piece of the build log for the reference
|
This patch adds the set of f32 -> tf32 cvt intrinsics introduced
in sm100 with ptx8.6. This builds on top of the recent PR #121507.
Tests are verified with a 12.8 ptxas executable.
PTX ISA link:
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt