Skip to content

Commit 4db3c56

Browse files
committed
[NVPTX] Add float to tf32 conversion intrinsics
This patch adds the f32 -> tf32 `cvt` intrinsics with the `.satfinite` modifier (rn and rz rounding, each with and without relu), which were introduced for sm_100 with PTX 8.6. It builds on top of the recent PR llvm#121507. The tests were verified with a ptxas 12.8 executable.

PTX ISA link: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt

Signed-off-by: Durgadoss R <[email protected]>
1 parent 134401d commit 4db3c56

File tree

3 files changed

+81
-0
lines changed

3 files changed

+81
-0
lines changed

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1444,10 +1444,18 @@ let TargetPrefix = "nvvm" in {
14441444
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
14451445
def int_nvvm_f2tf32_rn_relu : ClangBuiltin<"__nvvm_f2tf32_rn_relu">,
14461446
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
1447+
// f32 -> tf32 conversion intrinsics for the "rn" variants with satfinite
// (plain and relu forms). Each takes one float operand, returns an i32, is
// marked IntrNoMem and IntrNoCallback, and is exposed through the Clang
// builtin named in its ClangBuiltin<> string.
def int_nvvm_f2tf32_rn_satfinite : ClangBuiltin<"__nvvm_f2tf32_rn_satfinite">,
1448+
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
1449+
def int_nvvm_f2tf32_rn_relu_satfinite : ClangBuiltin<"__nvvm_f2tf32_rn_relu_satfinite">,
1450+
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
14471451
def int_nvvm_f2tf32_rz : ClangBuiltin<"__nvvm_f2tf32_rz">,
14481452
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
14491453
def int_nvvm_f2tf32_rz_relu : ClangBuiltin<"__nvvm_f2tf32_rz_relu">,
14501454
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
1455+
// f32 -> tf32 conversion intrinsics for the "rz" variants with satfinite
// (plain and relu forms). Same signature and attributes as the other f2tf32
// intrinsics in this group: one float in, i32 out, IntrNoMem + IntrNoCallback.
def int_nvvm_f2tf32_rz_satfinite : ClangBuiltin<"__nvvm_f2tf32_rz_satfinite">,
1456+
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
1457+
def int_nvvm_f2tf32_rz_relu_satfinite : ClangBuiltin<"__nvvm_f2tf32_rz_relu_satfinite">,
1458+
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
14511459

14521460
def int_nvvm_ff_to_e4m3x2_rn : ClangBuiltin<"__nvvm_ff_to_e4m3x2_rn">,
14531461
Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -764,6 +764,11 @@ let hasSideEffects = false in {
764764
defm CVT_to_tf32_rz_relu : CVT_TO_TF32<"rz.relu">;
765765
defm CVT_to_tf32_rna : CVT_TO_TF32<"rna", [hasPTX<70>, hasSM<80>]>;
766766
defm CVT_to_tf32_rna_satf : CVT_TO_TF32<"rna.satfinite", [hasPTX<81>, hasSM<89>]>;
767+
768+
// satfinite forms of the rn/rz tf32 conversions, with and without relu.
// The string argument is the modifier text passed to CVT_TO_TF32; every
// variant here is gated on the predicates hasPTX<86> and hasSM<100>.
defm CVT_to_tf32_rn_satf : CVT_TO_TF32<"rn.satfinite", [hasPTX<86>, hasSM<100>]>;
769+
defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, hasSM<100>]>;
770+
defm CVT_to_tf32_rn_relu_satf : CVT_TO_TF32<"rn.relu.satfinite", [hasPTX<86>, hasSM<100>]>;
771+
defm CVT_to_tf32_rz_relu_satf : CVT_TO_TF32<"rz.relu.satfinite", [hasPTX<86>, hasSM<100>]>;
767772
}
768773

769774
def fpround_oneuse : PatFrag<(ops node:$a), (fpround node:$a), [{
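[Newly added llc/FileCheck test file for the NVPTX target; its path was not captured in this extract]
Lines changed: 68 additions & 0 deletions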
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK %s
3+
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %}
4+
5+
; Declarations of the four f2tf32 satfinite intrinsics exercised by the
; functions below; each takes a float and returns an i32.
declare i32 @llvm.nvvm.f2tf32.rn.satfinite(float %f1)
6+
declare i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float %f1)
7+
declare i32 @llvm.nvvm.f2tf32.rz.satfinite(float %f1)
8+
declare i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float %f1)
9+
10+
; Passes the float argument to llvm.nvvm.f2tf32.rn.satfinite and returns the
; i32 result; the expected PTX contains a single cvt.rn.satfinite.tf32.f32.
define i32 @cvt_rn_satf_tf32_f32(float %f1) {
11+
; CHECK-LABEL: cvt_rn_satf_tf32_f32(
12+
; CHECK: {
13+
; CHECK-NEXT: .reg .b32 %r<2>;
14+
; CHECK-NEXT: .reg .f32 %f<2>;
15+
; CHECK-EMPTY:
16+
; CHECK-NEXT: // %bb.0:
17+
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_satf_tf32_f32_param_0];
18+
; CHECK-NEXT: cvt.rn.satfinite.tf32.f32 %r1, %f1;
19+
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
20+
; CHECK-NEXT: ret;
21+
%val = call i32 @llvm.nvvm.f2tf32.rn.satfinite(float %f1)
22+
ret i32 %val
23+
}
24+
25+
; Passes the float argument to llvm.nvvm.f2tf32.rn.relu.satfinite and returns
; the i32 result; the expected PTX contains cvt.rn.relu.satfinite.tf32.f32.
define i32 @cvt_rn_relu_satf_tf32_f32(float %f1) {
26+
; CHECK-LABEL: cvt_rn_relu_satf_tf32_f32(
27+
; CHECK: {
28+
; CHECK-NEXT: .reg .b32 %r<2>;
29+
; CHECK-NEXT: .reg .f32 %f<2>;
30+
; CHECK-EMPTY:
31+
; CHECK-NEXT: // %bb.0:
32+
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_satf_tf32_f32_param_0];
33+
; CHECK-NEXT: cvt.rn.relu.satfinite.tf32.f32 %r1, %f1;
34+
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
35+
; CHECK-NEXT: ret;
36+
%val = call i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float %f1)
37+
ret i32 %val
38+
}
39+
40+
; Passes the float argument to llvm.nvvm.f2tf32.rz.satfinite and returns the
; i32 result; the expected PTX contains a single cvt.rz.satfinite.tf32.f32.
define i32 @cvt_rz_satf_tf32_f32(float %f1) {
41+
; CHECK-LABEL: cvt_rz_satf_tf32_f32(
42+
; CHECK: {
43+
; CHECK-NEXT: .reg .b32 %r<2>;
44+
; CHECK-NEXT: .reg .f32 %f<2>;
45+
; CHECK-EMPTY:
46+
; CHECK-NEXT: // %bb.0:
47+
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_satf_tf32_f32_param_0];
48+
; CHECK-NEXT: cvt.rz.satfinite.tf32.f32 %r1, %f1;
49+
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
50+
; CHECK-NEXT: ret;
51+
%val = call i32 @llvm.nvvm.f2tf32.rz.satfinite(float %f1)
52+
ret i32 %val
53+
}
54+
55+
; Passes the float argument to llvm.nvvm.f2tf32.rz.relu.satfinite and returns
; the i32 result; the expected PTX contains cvt.rz.relu.satfinite.tf32.f32.
define i32 @cvt_rz_relu_satf_tf32_f32(float %f1) {
56+
; CHECK-LABEL: cvt_rz_relu_satf_tf32_f32(
57+
; CHECK: {
58+
; CHECK-NEXT: .reg .b32 %r<2>;
59+
; CHECK-NEXT: .reg .f32 %f<2>;
60+
; CHECK-EMPTY:
61+
; CHECK-NEXT: // %bb.0:
62+
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_relu_satf_tf32_f32_param_0];
63+
; CHECK-NEXT: cvt.rz.relu.satfinite.tf32.f32 %r1, %f1;
64+
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
65+
; CHECK-NEXT: ret;
66+
%val = call i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float %f1)
67+
ret i32 %val
68+
}

0 commit comments

Comments
 (0)