-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AArch64] Implement FP8 floating-point mode helper intrinsics #100608
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-clang Author: Momchil Velikov (momchil-velikov) ChangesFull diff: https://github.com/llvm/llvm-project/pull/100608.diff 2 Files Affected:
diff --git a/clang/test/CodeGen/aarch64-fpm-helpers.c b/clang/test/CodeGen/aarch64-fpm-helpers.c
new file mode 100644
index 0000000000000..dba79cebae547
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-fpm-helpers.c
@@ -0,0 +1,162 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+
+// RUN: %clang_cc1 -O2 -triple aarch64 -emit-llvm -x c -DUSE_NEON_H %s -o - | FileCheck %s
+// RUN: %clang_cc1 -O2 -triple aarch64 -emit-llvm -x c -DUSE_SVE_H %s -o - | FileCheck %s
+// RUN: %clang_cc1 -O2 -triple aarch64 -emit-llvm -x c -DUSE_SME_H %s -o - | FileCheck %s
+// RUN: %clang_cc1 -O2 -triple aarch64 -emit-llvm -x c++ -DUSE_NEON_H %s -o - | FileCheck %s
+// RUN: %clang_cc1 -O2 -triple aarch64 -emit-llvm -x c++ -DUSE_SVE_H %s -o - | FileCheck %s
+// RUN: %clang_cc1 -O2 -triple aarch64 -emit-llvm -x c++ -DUSE_SME_H %s -o - | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#ifdef USE_NEON_H
+#include "arm_neon.h"
+#endif
+
+#ifdef USE_SVE_H
+#include "arm_sve.h"
+#endif
+
+#ifdef USE_SME_H
+#include "arm_sme.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// CHECK-LABEL: define dso_local noundef i64 @test_init(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 0
+//
+fpm_t test_init() { return __arm_fpm_init(); }
+
+// CHECK-LABEL: define dso_local noundef range(i64 0, 2) i64 @test_src1_1(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 0
+//
+fpm_t test_src1_1() {
+ return __arm_set_fpm_src1_format(__arm_fpm_init(), __ARM_FPM_E5M2);
+}
+
+// CHECK-LABEL: define dso_local noundef range(i64 0, 2) i64 @test_src1_2(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 1
+//
+fpm_t test_src1_2() {
+ return __arm_set_fpm_src1_format(__arm_fpm_init(), __ARM_FPM_E4M3);
+}
+
+// CHECK-LABEL: define dso_local noundef range(i64 0, 16) i64 @test_src2_1(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 0
+//
+fpm_t test_src2_1() {
+ return __arm_set_fpm_src2_format(__arm_fpm_init(), __ARM_FPM_E5M2);
+}
+
+// CHECK-LABEL: define dso_local noundef range(i64 0, 16) i64 @test_src2_2(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 8
+//
+fpm_t test_src2_2() {
+ return __arm_set_fpm_src2_format(__arm_fpm_init(), __ARM_FPM_E4M3);
+}
+
+// CHECK-LABEL: define dso_local noundef range(i64 0, 128) i64 @test_dst1_1(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 0
+//
+fpm_t test_dst1_1() {
+ return __arm_set_fpm_dst_format(__arm_fpm_init(), __ARM_FPM_E5M2);
+}
+
+// CHECK-LABEL: define dso_local noundef range(i64 0, 128) i64 @test_dst2_2(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 64
+//
+fpm_t test_dst2_2() {
+ return __arm_set_fpm_dst_format(__arm_fpm_init(), __ARM_FPM_E4M3);
+}
+
+// CHECK-LABEL: define dso_local noundef range(i64 0, 32768) i64 @test_of_mul_1(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 0
+//
+fpm_t test_of_mul_1() {
+ return __arm_set_fpm_overflow_mul(__arm_fpm_init(), __ARM_FPM_INFNAN);
+}
+
+// CHECK-LABEL: define dso_local noundef range(i64 0, 32768) i64 @test_of_mul_2(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 16384
+//
+fpm_t test_of_mul_2() {
+ return __arm_set_fpm_overflow_mul(__arm_fpm_init(), __ARM_FPM_SATURATE);
+}
+
+// CHECK-LABEL: define dso_local noundef range(i64 0, 65536) i64 @test_of_cvt_1(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 0
+//
+fpm_t test_of_cvt_1() {
+ return __arm_set_fpm_overflow_cvt(__arm_fpm_init(), __ARM_FPM_INFNAN);
+}
+
+// CHECK-LABEL: define dso_local noundef range(i64 0, 65536) i64 @test_of_cvt_2(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 32768
+//
+fpm_t test_of_cvt_2() {
+ return __arm_set_fpm_overflow_cvt(__arm_fpm_init(), __ARM_FPM_SATURATE);
+}
+
+// CHECK-LABEL: define dso_local noundef i64 @test_lscale(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 8323072
+//
+fpm_t test_lscale() { return __arm_set_fpm_lscale(__arm_fpm_init(), 127); }
+
+// CHECK-LABEL: define dso_local noundef i64 @test_lscale2(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 270582939648
+//
+fpm_t test_lscale2() { return __arm_set_fpm_lscale2(__arm_fpm_init(), 63); }
+
+// CHECK-LABEL: define dso_local noundef range(i64 0, 4294967296) i64 @test_nscale_1(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 2147483648
+//
+fpm_t test_nscale_1() { return __arm_set_fpm_nscale(__arm_fpm_init(), -128); }
+
+// CHECK-LABEL: define dso_local noundef range(i64 0, 4294967296) i64 @test_nscale_2(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 2130706432
+//
+fpm_t test_nscale_2() { return __arm_set_fpm_nscale(__arm_fpm_init(), 127); }
+
+// CHECK-LABEL: define dso_local noundef range(i64 0, 4294967296) i64 @test_nscale_3(
+// CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 4278190080
+//
+fpm_t test_nscale_3() { return __arm_set_fpm_nscale(__arm_fpm_init(), -1); }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 30fbb8c5d65e5..3596c4a8a9ada 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -2581,6 +2581,60 @@ void NeonEmitter::runVectorTypes(raw_ostream &OS) {
OS << "typedef double float64_t;\n";
OS << "#endif\n\n";
+ OS << R"(
+typedef uint64_t fpm_t;
+
+enum __ARM_FPM_FORMAT { __ARM_FPM_E5M2, __ARM_FPM_E4M3 };
+
+enum __ARM_FPM_OVERFLOW { __ARM_FPM_INFNAN, __ARM_FPM_SATURATE };
+
+static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
+__arm_fpm_init(void) {
+ return 0;
+}
+
+static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
+__arm_set_fpm_src1_format(fpm_t __fpm, enum __ARM_FPM_FORMAT __format) {
+ return (__fpm & ~7ull) | (fpm_t)__format;
+}
+
+static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
+__arm_set_fpm_src2_format(fpm_t __fpm, enum __ARM_FPM_FORMAT __format) {
+ return (__fpm & ~0x38ull) | ((fpm_t)__format << 3u);
+}
+
+static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
+__arm_set_fpm_dst_format(fpm_t __fpm, enum __ARM_FPM_FORMAT __format) {
+ return (__fpm & ~0x1c0ull) | ((fpm_t)__format << 6u);
+}
+
+static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
+__arm_set_fpm_overflow_mul(fpm_t __fpm, enum __ARM_FPM_OVERFLOW __behaviour) {
+ return (__fpm & ~0x4000ull) | ((fpm_t)__behaviour << 14u);
+}
+
+static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
+__arm_set_fpm_overflow_cvt(fpm_t __fpm, enum __ARM_FPM_OVERFLOW __behaviour) {
+ return (__fpm & ~0x8000ull) | ((fpm_t)__behaviour << 15u);
+}
+
+static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
+__arm_set_fpm_lscale(fpm_t __fpm, uint64_t __scale) {
+ return (__fpm & ~0x7f0000ull) | (__scale << 16u);
+}
+
+static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
+__arm_set_fpm_nscale(fpm_t __fpm, int64_t __scale) {
+ return (__fpm & ~0xff000000ull) | (((fpm_t)__scale & 0xffu) << 24u);
+}
+
+static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
+__arm_set_fpm_lscale2(fpm_t __fpm, uint64_t __scale) {
+ return (uint32_t)__fpm | (__scale << 32u);
+}
+
+)";
+
emitNeonTypeDefs("cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfdQd", OS);
emitNeonTypeDefs("bQb", OS);
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm unsure as to whether the fpm format will ever be changed or extended, but there are some things to consider if it is:
- using a 64 to 32-bit downcast to zero-out bits 32-37 might be a difficult bug to find, and the
__scale
parameter here is not masked to keep it in range. - passing
__arm_fpm_init()
as the__fpm
to these CodeGen tests cannot check whether the correct value is used to zero-out the target bits.
Good point, in the tests where we set a field to zero we should start with a non-zero in that field. |
bd28142
to
01548e2
Compare
Tweaked the test a bit. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for updating the tests! Just a few questions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Cheers, it LGTM.
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/195/builds/249 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/7640 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/139/builds/5486 Here is the relevant piece of the build log for the reference
|
Are you also planning to do the feature test macros, e.g. |
…00608) Implement FP8 mode helper intrinsics (as inline functions) as specified in ACLE 2024Q3 "14.2 Helper intrinsics" https://github.com/ARM-software/acle/releases/download/r2024Q3/acle-2024Q3.pdf
In principle, yes, but once the ACLE intrinsics are implemented, since the presence of the macro indicates that |
Implement FP8 mode helper intrinsics (as inline functions) as
specified in ACLE 2024Q3 "14.2 Helper intrinsics"
https://github.com/ARM-software/acle/releases/download/r2024Q3/acle-2024Q3.pdf