-
Notifications
You must be signed in to change notification settings - Fork 788
[SYCL][CUDA][libclc] Add bf16 builtins and optimize half builtins for fma, fmin, fmax and exp2 #5724
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
[SYCL][CUDA][libclc] Add bf16 builtins and optimize half builtins for fma, fmin, fmax and exp2 #5724
Changes from all commits
Commits
Show all changes
21 commits
Select commit
Hold shift + click to select a range
1a53588
added fma,fmax,fmin for half, bf16 and bf16x2 and approx exp2 for bf1…
t4c1 32789b4
Added missing includes and forward declarations
t4c1 f20a4bd
removed exp2 bf16 implementations that do not have builtins
t4c1 a780a06
added optimized half2 overloads for fma, fmin and fmax
t4c1 21d98c1
added fma_relu for half, bf16 and bf16x2
t4c1 59758bd
[LIBCLC][NVPTX] Add support for half tys native_exp2
jchlanda b362369
removed redundant undefs
t4c1 86a6f42
changed prefix for fma_relu to __clc
t4c1 0802831
changed bf16 builtins to use __clc prefix
t4c1 be46eb4
added bf16 fabs builtins
t4c1 c63c5ba
Merge branch 'sycl' into libclc_bf16
t4c1 f00535e
[NVPTX] Expose float tys min, max, abs, neg as builtins
jchlanda 4057339
[NVPTX] Add more FMA intrinsics/builtins
jchlanda c061090
[NVPTX] Add ex2.approx.f16/f16x2 support
npmiller 5ea7a87
bugfixes
t4c1 ab922cc
Merge branch 'libclc_bf16' into libclc_bf16_tmp
t4c1 9bbf569
Merge branch 'jakub/native_exp2' into libclc_bf16_tmp
t4c1 1b49010
Apply suggestions from code review
t4c1 50945d0
change fmax and fmin fallback implementation back to libdevice float …
t4c1 4658aac
fix libdevice builtin names
t4c1 cab6150
removed native_exp2.cl
t4c1 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
// REQUIRES: nvptx-registered-target | ||
// | ||
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \ | ||
// RUN: sm_75 -target-feature +ptx70 -fcuda-is-device -fnative-half-type -S \ | ||
// RUN: -emit-llvm -o - -x cuda %s \ | ||
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM75 %s | ||
|
||
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \ | ||
// RUN: sm_80 -target-feature +ptx70 -fcuda-is-device -fnative-half-type -S \ | ||
// RUN: -emit-llvm -o - -x cuda %s \ | ||
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 %s | ||
|
||
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \ | ||
// RUN: -target-cpu sm_80 -target-feature +ptx70 -fcuda-is-device \ | ||
// RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \ | ||
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 %s | ||
|
||
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \ | ||
// RUN: sm_86 -target-feature +ptx72 -fcuda-is-device -fnative-half-type -S \ | ||
// RUN: -emit-llvm -o - -x cuda %s \ | ||
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 %s | ||
|
||
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \ | ||
// RUN: -target-cpu sm_86 -target-feature +ptx72 -fcuda-is-device \ | ||
// RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \ | ||
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 %s | ||
|
||
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \ | ||
// RUN: sm_53 -target-feature +ptx42 -fcuda-is-device -fnative-half-type -S \ | ||
// RUN: -emit-llvm -o - -x cuda %s \ | ||
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX42_SM53 %s | ||
|
||
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \ | ||
// RUN: -target-cpu sm_53 -target-feature +ptx42 -fcuda-is-device \ | ||
// RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \ | ||
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX42_SM53 %s | ||
|
||
#define __device__ __attribute__((device)) | ||
|
||
// Verifies that the ex2.approx half builtins lower to the matching llvm.nvvm
// intrinsics in scalar half and <2 x half> forms. The calls require sm_75+
// (__CUDA_ARCH__ >= 750), so the body compiles away on older targets and only
// the PTX70/SM75 run line checks the calls; every run line checks `ret void`.
// CHECK-LABEL added for consistency with the sibling test functions, so
// FileCheck anchors these matches to this function's body.
// CHECK-LABEL: nvvm_ex2_sm75
__device__ void nvvm_ex2_sm75() {
#if __CUDA_ARCH__ >= 750
  // CHECK_PTX70_SM75: call half @llvm.nvvm.ex2.approx.f16
  __nvvm_ex2_approx_f16(0.1f16);
  // CHECK_PTX70_SM75: call <2 x half> @llvm.nvvm.ex2.approx.f16x2
  __nvvm_ex2_approx_f16x2({0.1f16, 0.7f16});
#endif
  // CHECK: ret void
}
|
||
// Verifies that the sm_80 half-precision fmin/fmax builtins lower to the
// matching llvm.nvvm intrinsics: plain, ftz, nan, and ftz.nan variants, each
// in scalar half and <2 x half> form. The calls require __CUDA_ARCH__ >= 800,
// so only the PTX70/SM80 run lines exercise them; every run line still
// matches the trailing `ret void`. Directive order below mirrors the expected
// IR emission order, so do not reorder the call/check pairs.
// CHECK-LABEL: nvvm_min_max_sm80
__device__ void nvvm_min_max_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.f16
  __nvvm_fmin_f16(0.1f16, 0.1f16);
  // CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.ftz.f16
  __nvvm_fmin_ftz_f16(0.1f16, 0.1f16);
  // CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.nan.f16
  __nvvm_fmin_nan_f16(0.1f16, 0.1f16);
  // CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.ftz.nan.f16
  __nvvm_fmin_ftz_nan_f16(0.1f16, 0.1f16);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.f16x2
  __nvvm_fmin_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.ftz.f16x2
  __nvvm_fmin_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.nan.f16x2
  __nvvm_fmin_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2
  __nvvm_fmin_ftz_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});

  // CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.f16
  __nvvm_fmax_f16(0.1f16, 0.1f16);
  // CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.ftz.f16
  __nvvm_fmax_ftz_f16(0.1f16, 0.1f16);
  // CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.nan.f16
  __nvvm_fmax_nan_f16(0.1f16, 0.1f16);
  // CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.ftz.nan.f16
  __nvvm_fmax_ftz_nan_f16(0.1f16, 0.1f16);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.f16x2
  __nvvm_fmax_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.ftz.f16x2
  __nvvm_fmax_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.nan.f16x2
  __nvvm_fmax_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2
  __nvvm_fmax_ftz_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
#endif
  // CHECK: ret void
}
|
||
// Verifies that the sm_80 fused multiply-add relu builtins (fma with a ReLU
// clamp of the result) lower to llvm.nvvm.fma.rn.relu / fma.rn.ftz.relu in
// scalar half and <2 x half> forms. Requires __CUDA_ARCH__ >= 800, so only
// the PTX70/SM80 run lines check the calls; all run lines match `ret void`.
// CHECK-LABEL: nvvm_fma_f16_f16x2_sm80
__device__ void nvvm_fma_f16_f16x2_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call half @llvm.nvvm.fma.rn.relu.f16
  __nvvm_fma_rn_relu_f16(0.1f16, 0.1f16, 0.1f16);
  // CHECK_PTX70_SM80: call half @llvm.nvvm.fma.rn.ftz.relu.f16
  __nvvm_fma_rn_ftz_relu_f16(0.1f16, 0.1f16, 0.1f16);

  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fma.rn.relu.f16x2
  __nvvm_fma_rn_relu_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
                           {0.1f16, 0.7f16});
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fma.rn.ftz.relu.f16x2
  __nvvm_fma_rn_ftz_relu_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
                               {0.1f16, 0.7f16});
#endif
  // CHECK: ret void
}
|
||
// Verifies that the baseline half fma builtins available from sm_53 lower to
// llvm.nvvm.fma.rn (plain, ftz, sat, ftz.sat variants) in scalar half and
// <2 x half> forms. Requires __CUDA_ARCH__ >= 530, so the PTX42/SM53 run
// lines check the calls; all run lines match the trailing `ret void`.
// CHECK-LABEL: nvvm_fma_f16_f16x2_sm53
__device__ void nvvm_fma_f16_f16x2_sm53() {
#if __CUDA_ARCH__ >= 530
  // CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.f16
  __nvvm_fma_rn_f16(0.1f16, 0.1f16, 0.1f16);
  // CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.ftz.f16
  __nvvm_fma_rn_ftz_f16(0.1f16, 0.1f16, 0.1f16);
  // CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.sat.f16
  __nvvm_fma_rn_sat_f16(0.1f16, 0.1f16, 0.1f16);
  // CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.ftz.sat.f16
  __nvvm_fma_rn_ftz_sat_f16(0.1f16, 0.1f16, 0.1f16);

  // CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.f16x2
  __nvvm_fma_rn_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
                      {0.1f16, 0.7f16});
  // CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2
  __nvvm_fma_rn_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
                          {0.1f16, 0.7f16});
  // CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.sat.f16x2
  __nvvm_fma_rn_sat_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
                          {0.1f16, 0.7f16});
  // CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.ftz.sat.f16x2
  __nvvm_fma_rn_ftz_sat_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
                              {0.1f16, 0.7f16});
#endif
  // CHECK: ret void
}
|
||
// Verifies that the sm_86 xorsign.abs fmin/fmax half builtins lower to the
// matching llvm.nvvm intrinsics: plain, ftz, nan, and ftz.nan variants of
// the xorsign.abs forms, each as scalar half and <2 x half>. Requires
// __CUDA_ARCH__ >= 860 (and ptx72 per the run lines), so only the
// PTX72/SM86 run lines check the calls; all run lines match `ret void`.
// CHECK-LABEL: nvvm_min_max_sm86
__device__ void nvvm_min_max_sm86() {
#if __CUDA_ARCH__ >= 860
  // CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.xorsign.abs.f16
  __nvvm_fmin_xorsign_abs_f16(0.1f16, 0.1f16);
  // CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.ftz.xorsign.abs.f16
  __nvvm_fmin_ftz_xorsign_abs_f16(0.1f16, 0.1f16);
  // CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.nan.xorsign.abs.f16
  __nvvm_fmin_nan_xorsign_abs_f16(0.1f16, 0.1f16);
  // CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16
  __nvvm_fmin_ftz_nan_xorsign_abs_f16(0.1f16, 0.1f16);
  // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.xorsign.abs.f16x2
  __nvvm_fmin_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
  // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.ftz.xorsign.abs.f16x2
  __nvvm_fmin_ftz_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
  // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.nan.xorsign.abs.f16x2
  __nvvm_fmin_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
  // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16x2
  __nvvm_fmin_ftz_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});

  // CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.xorsign.abs.f16
  __nvvm_fmax_xorsign_abs_f16(0.1f16, 0.1f16);
  // CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.ftz.xorsign.abs.f16
  __nvvm_fmax_ftz_xorsign_abs_f16(0.1f16, 0.1f16);
  // CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.nan.xorsign.abs.f16
  __nvvm_fmax_nan_xorsign_abs_f16(0.1f16, 0.1f16);
  // CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16
  __nvvm_fmax_ftz_nan_xorsign_abs_f16(0.1f16, 0.1f16);
  // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.xorsign.abs.f16x2
  __nvvm_fmax_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
  // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.ftz.xorsign.abs.f16x2
  __nvvm_fmax_ftz_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
  // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.nan.xorsign.abs.f16x2
  __nvvm_fmax_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
  // CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16x2
  __nvvm_fmax_ftz_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
#endif
  // CHECK: ret void
}
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.