Skip to content

[NVPTX] Cleanup and document nvvm.fabs intrinsics, adding f16 support #135644

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Apr 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions clang/include/clang/Basic/BuiltinsNVPTX.td
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,11 @@ def __nvvm_fabs_ftz_f : NVPTXBuiltin<"float(float)">;
def __nvvm_fabs_f : NVPTXBuiltin<"float(float)">;
def __nvvm_fabs_d : NVPTXBuiltin<"double(double)">;

def __nvvm_fabs_f16 : NVPTXBuiltinSMAndPTX<"__fp16(__fp16)", SM_53, PTX65>;
def __nvvm_fabs_f16x2 : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(_Vector<2, __fp16>)", SM_53, PTX65>;
def __nvvm_fabs_ftz_f16 : NVPTXBuiltinSMAndPTX<"__fp16(__fp16)", SM_53, PTX65>;
def __nvvm_fabs_ftz_f16x2 : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(_Vector<2, __fp16>)", SM_53, PTX65>;

// Round

def __nvvm_round_ftz_f : NVPTXBuiltin<"float(float)">;
Expand Down
15 changes: 15 additions & 0 deletions clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1034,6 +1034,21 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
case NVPTX::BI__nvvm_fmin_xorsign_abs_f16x2:
return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E,
*this);
case NVPTX::BI__nvvm_fabs_f:
case NVPTX::BI__nvvm_abs_bf16:
case NVPTX::BI__nvvm_abs_bf16x2:
case NVPTX::BI__nvvm_fabs_f16:
case NVPTX::BI__nvvm_fabs_f16x2:
return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs,
EmitScalarExpr(E->getArg(0)));
case NVPTX::BI__nvvm_fabs_ftz_f:
case NVPTX::BI__nvvm_fabs_ftz_f16:
case NVPTX::BI__nvvm_fabs_ftz_f16x2:
return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs_ftz,
EmitScalarExpr(E->getArg(0)));
case NVPTX::BI__nvvm_fabs_d:
return Builder.CreateUnaryIntrinsic(Intrinsic::fabs,
EmitScalarExpr(E->getArg(0)));
case NVPTX::BI__nvvm_ldg_h:
case NVPTX::BI__nvvm_ldg_h2:
return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this);
Expand Down
41 changes: 29 additions & 12 deletions clang/test/CodeGen/builtins-nvptx-native-half-type.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 %s

// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
// RUN: sm_53 -target-feature +ptx42 -fcuda-is-device -fnative-half-type \
// RUN: sm_53 -target-feature +ptx65 -fcuda-is-device -fnative-half-type \
// RUN: -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX42_SM53 %s
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX65_SM53 %s

// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \
// RUN: -target-cpu sm_53 -target-feature +ptx42 -fcuda-is-device \
// RUN: -target-cpu sm_53 -target-feature +ptx65 -fcuda-is-device \
// RUN: -fnative-half-type -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX42_SM53 %s
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX65_SM53 %s

#define __device__ __attribute__((device))

Expand Down Expand Up @@ -108,25 +108,25 @@ __device__ void nvvm_fma_f16_f16x2_sm80() {
// CHECK-LABEL: nvvm_fma_f16_f16x2_sm53
__device__ void nvvm_fma_f16_f16x2_sm53() {
#if __CUDA_ARCH__ >= 530
// CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.f16
// CHECK_PTX65_SM53: call half @llvm.nvvm.fma.rn.f16
__nvvm_fma_rn_f16(0.1f16, 0.1f16, 0.1f16);
// CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.ftz.f16
// CHECK_PTX65_SM53: call half @llvm.nvvm.fma.rn.ftz.f16
__nvvm_fma_rn_ftz_f16(0.1f16, 0.1f16, 0.1f16);
// CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.sat.f16
// CHECK_PTX65_SM53: call half @llvm.nvvm.fma.rn.sat.f16
__nvvm_fma_rn_sat_f16(0.1f16, 0.1f16, 0.1f16);
// CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.ftz.sat.f16
// CHECK_PTX65_SM53: call half @llvm.nvvm.fma.rn.ftz.sat.f16
__nvvm_fma_rn_ftz_sat_f16(0.1f16, 0.1f16, 0.1f16);

// CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.f16x2
// CHECK_PTX65_SM53: call <2 x half> @llvm.nvvm.fma.rn.f16x2
__nvvm_fma_rn_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
{0.1f16, 0.7f16});
// CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2
// CHECK_PTX65_SM53: call <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2
__nvvm_fma_rn_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
{0.1f16, 0.7f16});
// CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.sat.f16x2
// CHECK_PTX65_SM53: call <2 x half> @llvm.nvvm.fma.rn.sat.f16x2
__nvvm_fma_rn_sat_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
{0.1f16, 0.7f16});
// CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.ftz.sat.f16x2
// CHECK_PTX65_SM53: call <2 x half> @llvm.nvvm.fma.rn.ftz.sat.f16x2
__nvvm_fma_rn_ftz_sat_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
{0.1f16, 0.7f16});
#endif
Expand Down Expand Up @@ -173,6 +173,23 @@ __device__ void nvvm_min_max_sm86() {
// CHECK: ret void
}

// CHECK-LABEL: nvvm_fabs_f16
__device__ void nvvm_fabs_f16() {
#if __CUDA_ARCH__ >= 530
// CHECK: call half @llvm.nvvm.fabs.f16
__nvvm_fabs_f16(0.1f16);
// CHECK: call half @llvm.nvvm.fabs.ftz.f16
__nvvm_fabs_ftz_f16(0.1f16);
// CHECK: call <2 x half> @llvm.nvvm.fabs.v2f16
__nvvm_fabs_f16x2({0.1f16, 0.7f16});
// CHECK: call <2 x half> @llvm.nvvm.fabs.ftz.v2f16
__nvvm_fabs_ftz_f16x2({0.1f16, 0.7f16});
#endif
// CHECK: ret void
}



typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));

// CHECK-LABEL: nvvm_ldg_native_half_types
Expand Down
12 changes: 10 additions & 2 deletions clang/test/CodeGen/builtins-nvptx.c
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,14 @@ __device__ void nvvm_math(float f1, float f2, double d1, double d2) {
// CHECK: call double @llvm.nvvm.rcp.rn.d
double td4 = __nvvm_rcp_rn_d(d2);

// CHECK: call float @llvm.nvvm.fabs.f32
float t6 = __nvvm_fabs_f(f1);
// CHECK: call float @llvm.nvvm.fabs.ftz.f32
float t7 = __nvvm_fabs_ftz_f(f2);

// CHECK: call double @llvm.fabs.f64
double td5 = __nvvm_fabs_d(d1);

// CHECK: call void @llvm.nvvm.membar.cta()
__nvvm_membar_cta();
// CHECK: call void @llvm.nvvm.membar.gl()
Expand Down Expand Up @@ -1038,9 +1046,9 @@ __device__ void nvvm_cvt_sm89() {
__device__ void nvvm_abs_neg_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800

// CHECK_PTX70_SM80: call bfloat @llvm.nvvm.abs.bf16(bfloat 0xR3DCD)
// CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fabs.bf16(bfloat 0xR3DCD)
__nvvm_abs_bf16(BF16);
// CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.abs.bf16x2(<2 x bfloat> splat (bfloat 0xR3DCD))
// CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fabs.v2bf16(<2 x bfloat> splat (bfloat 0xR3DCD))
__nvvm_abs_bf16x2(BF16X2);

// CHECK_PTX70_SM80: call bfloat @llvm.nvvm.neg.bf16(bfloat 0xR3DCD)
Expand Down
53 changes: 53 additions & 0 deletions llvm/docs/NVPTXUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,59 @@ space casted to this space), 1 is returned, otherwise 0 is returned.
Arithmetic Intrinsics
---------------------

'``llvm.nvvm.fabs.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

declare float @llvm.nvvm.fabs.f32(float %a)
declare double @llvm.nvvm.fabs.f64(double %a)
declare half @llvm.nvvm.fabs.f16(half %a)
declare <2 x half> @llvm.nvvm.fabs.v2f16(<2 x half> %a)
declare bfloat @llvm.nvvm.fabs.bf16(bfloat %a)
declare <2 x bfloat> @llvm.nvvm.fabs.v2bf16(<2 x bfloat> %a)

Overview:
"""""""""

The '``llvm.nvvm.fabs.*``' intrinsics return the absolute value of the operand.

Semantics:
""""""""""

Unlike, '``llvm.fabs.*``', these intrinsics do not perfectly preserve NaN
values. Instead, a NaN input yeilds an unspecified NaN output.


'``llvm.nvvm.fabs.ftz.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

declare float @llvm.nvvm.fabs.ftz.f32(float %a)
declare half @llvm.nvvm.fabs.ftz.f16(half %a)
declare <2 x half> @llvm.nvvm.fabs.ftz.v2f16(<2 x half> %a)

Overview:
"""""""""

The '``llvm.nvvm.fabs.ftz.*``' intrinsics return the absolute value of the
operand, flushing subnormals to sign preserving zero.

Semantics:
""""""""""

Before the absolute value is taken, the input is flushed to sign preserving
zero if it is a subnormal. In addtion, unlike '``llvm.fabs.*``', a NaN input
yields an unspecified NaN output.


'``llvm.nvvm.idp2a.[us].[us]``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
14 changes: 7 additions & 7 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
Original file line number Diff line number Diff line change
Expand Up @@ -1039,18 +1039,18 @@ let TargetPrefix = "nvvm" in {
// Abs
//

def int_nvvm_fabs_ftz_f : ClangBuiltin<"__nvvm_fabs_ftz_f">,
DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_fabs_f : ClangBuiltin<"__nvvm_fabs_f">,
DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_fabs_d : ClangBuiltin<"__nvvm_fabs_d">,
DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
def int_nvvm_fabs_ftz :
DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrSpeculatable]>;

def int_nvvm_fabs :
DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrSpeculatable]>;
//
// Abs, Neg bf16, bf16x2
//

foreach unary = ["abs", "neg"] in {
foreach unary = ["neg"] in {
def int_nvvm_ # unary # _bf16 :
ClangBuiltin<!strconcat("__nvvm_", unary, "_bf16")>,
DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_bfloat_ty], [IntrNoMem]>;
Expand Down
16 changes: 9 additions & 7 deletions llvm/lib/IR/AutoUpgrade.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -939,12 +939,6 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
}

static Intrinsic::ID shouldUpgradeNVPTXBF16Intrinsic(StringRef Name) {
if (Name.consume_front("abs."))
return StringSwitch<Intrinsic::ID>(Name)
.Case("bf16", Intrinsic::nvvm_abs_bf16)
.Case("bf16x2", Intrinsic::nvvm_abs_bf16x2)
.Default(Intrinsic::not_intrinsic);

if (Name.consume_front("fma.rn."))
return StringSwitch<Intrinsic::ID>(Name)
.Case("bf16", Intrinsic::nvvm_fma_rn_bf16)
Expand Down Expand Up @@ -1291,7 +1285,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
bool Expand = false;
if (Name.consume_front("abs."))
// nvvm.abs.{i,ii}
Expand = Name == "i" || Name == "ll";
Expand =
Name == "i" || Name == "ll" || Name == "bf16" || Name == "bf16x2";
else if (Name == "clz.ll" || Name == "popc.ll" || Name == "h2f" ||
Name == "swap.lo.hi.b64")
Expand = true;
Expand Down Expand Up @@ -2311,6 +2306,13 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
Value *Cmp = Builder.CreateICmpSGE(
Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond");
Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs");
} else if (Name == "abs.bf16" || Name == "abs.bf16x2") {
Type *Ty = (Name == "abs.bf16")
? Builder.getBFloatTy()
: FixedVectorType::get(Builder.getBFloatTy(), 2);
Value *Arg = Builder.CreateBitCast(CI->getArgOperand(0), Ty);
Value *Abs = Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs, Arg);
Rep = Builder.CreateBitCast(Abs, CI->getType());
} else if (Name.starts_with("atomic.load.add.f32.p") ||
Name.starts_with("atomic.load.add.f64.p")) {
Value *Ptr = CI->getArgOperand(0);
Expand Down
19 changes: 11 additions & 8 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -226,14 +226,17 @@ class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm, SDNode imm_node,
int Size = ty.Size;
}

def I16RT : RegTyInfo<i16, Int16Regs, i16imm, imm>;
def I32RT : RegTyInfo<i32, Int32Regs, i32imm, imm>;
def I64RT : RegTyInfo<i64, Int64Regs, i64imm, imm>;

def F32RT : RegTyInfo<f32, Float32Regs, f32imm, fpimm>;
def F64RT : RegTyInfo<f64, Float64Regs, f64imm, fpimm>;
def F16RT : RegTyInfo<f16, Int16Regs, f16imm, fpimm, supports_imm = 0>;
def BF16RT : RegTyInfo<bf16, Int16Regs, bf16imm, fpimm, supports_imm = 0>;
def I16RT : RegTyInfo<i16, Int16Regs, i16imm, imm>;
def I32RT : RegTyInfo<i32, Int32Regs, i32imm, imm>;
def I64RT : RegTyInfo<i64, Int64Regs, i64imm, imm>;

def F32RT : RegTyInfo<f32, Float32Regs, f32imm, fpimm>;
def F64RT : RegTyInfo<f64, Float64Regs, f64imm, fpimm>;
def F16RT : RegTyInfo<f16, Int16Regs, f16imm, fpimm, supports_imm = 0>;
def BF16RT : RegTyInfo<bf16, Int16Regs, bf16imm, fpimm, supports_imm = 0>;

def F16X2RT : RegTyInfo<v2f16, Int32Regs, ?, ?, supports_imm = 0>;
def BF16X2RT : RegTyInfo<v2bf16, Int32Regs, ?, ?, supports_imm = 0>;
Comment on lines +238 to +239
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting. I never thought of passing ? as an argument. That can indeed be convenient in some cases.



// Template for instructions which take three int64, int32, or int16 args.
Expand Down
Loading
Loading