-
Notifications
You must be signed in to change notification settings - Fork 14.3k
Reland "[X86][AVX10.2] Support AVX10.2 option and VMPSADBW/VADDP[D,H,S] new instructions (#101452)" #101616
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-llvm-support Author: Phoebe Wang (phoebewang) ChangesRef.: https://cdrdv2.intel.com/v1/dl/getContent/828965 Patch is 106.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101616.diff 49 Files Affected:
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 866adefd5d3c4..183adb9e003f2 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -216,6 +216,8 @@ X86 Support
functions defined by the ``*mmintrin.h`` headers. A mapping can be
found in the file ``clang/www/builtins.py``.
+- Support ISA of ``AVX10.2``.
+
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 06ca30d65f5bd..f028711a807c0 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -1959,6 +1959,14 @@ TARGET_HEADER_BUILTIN(__readgsword, "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES,
TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
+// AVX10.2 VMPSADBW
+TARGET_BUILTIN(__builtin_ia32_mpsadbw512, "V32sV64cV64cIc", "ncV:512:", "avx10.2-512")
+
+// AVX10.2 YMM Rounding
+TARGET_BUILTIN(__builtin_ia32_vaddpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vaddph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vaddps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
+
// AVX-VNNI-INT16
TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index f690467bb82cd..b5c19ebaaffab 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6205,6 +6205,12 @@ def mavx10_1_512 : Flag<["-"], "mavx10.1-512">, Group<m_x86_AVX10_Features_Group
def mno_avx10_1_512 : Flag<["-"], "mno-avx10.1-512">, Group<m_x86_AVX10_Features_Group>;
def mavx10_1 : Flag<["-"], "mavx10.1">, Alias<mavx10_1_256>;
def mno_avx10_1 : Flag<["-"], "mno-avx10.1">, Alias<mno_avx10_1_256>;
+def mavx10_2_256 : Flag<["-"], "mavx10.2-256">, Group<m_x86_AVX10_Features_Group>;
+def mno_avx10_2_256 : Flag<["-"], "mno-avx10.2-256">, Group<m_x86_AVX10_Features_Group>;
+def mavx10_2_512 : Flag<["-"], "mavx10.2-512">, Group<m_x86_AVX10_Features_Group>;
+def mno_avx10_2_512 : Flag<["-"], "mno-avx10.2-512">, Group<m_x86_AVX10_Features_Group>;
+def mavx10_2 : Flag<["-"], "mavx10.2">, Alias<mavx10_2_256>;
+def mno_avx10_2 : Flag<["-"], "mno-avx10.2">, Alias<mno_avx10_2_256>;
def mavx2 : Flag<["-"], "mavx2">, Group<m_x86_Features_Group>;
def mno_avx2 : Flag<["-"], "mno-avx2">, Group<m_x86_Features_Group>;
def mavx512f : Flag<["-"], "mavx512f">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 18e6dbf03e00d..3fb3587eb5914 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -304,6 +304,10 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAVX10_1 = true;
} else if (Feature == "+avx10.1-512") {
HasAVX10_1_512 = true;
+ } else if (Feature == "+avx10.2-256") {
+ HasAVX10_2 = true;
+ } else if (Feature == "+avx10.2-512") {
+ HasAVX10_2_512 = true;
} else if (Feature == "+avx512cd") {
HasAVX512CD = true;
} else if (Feature == "+avx512vpopcntdq") {
@@ -824,6 +828,10 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AVX10_1__");
if (HasAVX10_1_512)
Builder.defineMacro("__AVX10_1_512__");
+ if (HasAVX10_2)
+ Builder.defineMacro("__AVX10_2__");
+ if (HasAVX10_2_512)
+ Builder.defineMacro("__AVX10_2_512__");
if (HasAVX512CD)
Builder.defineMacro("__AVX512CD__");
if (HasAVX512VPOPCNTDQ)
@@ -1056,6 +1064,8 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("avx", true)
.Case("avx10.1-256", true)
.Case("avx10.1-512", true)
+ .Case("avx10.2-256", true)
+ .Case("avx10.2-512", true)
.Case("avx2", true)
.Case("avx512f", true)
.Case("avx512cd", true)
@@ -1171,6 +1181,8 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("avx", SSELevel >= AVX)
.Case("avx10.1-256", HasAVX10_1)
.Case("avx10.1-512", HasAVX10_1_512)
+ .Case("avx10.2-256", HasAVX10_2)
+ .Case("avx10.2-512", HasAVX10_2_512)
.Case("avx2", SSELevel >= AVX2)
.Case("avx512f", SSELevel >= AVX512F)
.Case("avx512cd", HasAVX512CD)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index ba34ab2c7f336..79fd5867cf667 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -92,6 +92,8 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasF16C = false;
bool HasAVX10_1 = false;
bool HasAVX10_1_512 = false;
+ bool HasAVX10_2 = false;
+ bool HasAVX10_2_512 = false;
bool HasEVEX512 = false;
bool HasAVX512CD = false;
bool HasAVX512VPOPCNTDQ = false;
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
index dc6c8695488bb..b2109e11038fe 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
@@ -241,7 +241,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
assert(Name.starts_with("avx10.") && "Invalid AVX10 feature name.");
StringRef Version, Width;
std::tie(Version, Width) = Name.substr(6).split('-');
- assert(Version == "1" && "Invalid AVX10 feature name.");
+ assert((Version == "1" || Version == "2") && "Invalid AVX10 feature name.");
assert((Width == "256" || Width == "512") && "Invalid AVX10 feature name.");
#endif
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 89fa0ecd45eb4..b17ab24d625a0 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -147,6 +147,8 @@ set(x86_files
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
+ avx10_2_512niintrin.h
+ avx10_2niintrin.h
avx2intrin.h
avx512bf16intrin.h
avx512bitalgintrin.h
diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h
new file mode 100644
index 0000000000000..5ad6993b45433
--- /dev/null
+++ b/clang/lib/Headers/avx10_2_512niintrin.h
@@ -0,0 +1,35 @@
+/*===---- avx10_2_512niintrin.h - AVX10.2-512 new instruction intrinsics ---===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx10_2_512niintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifdef __SSE2__
+
+#ifndef __AVX10_2_512NIINTRIN_H
+#define __AVX10_2_512NIINTRIN_H
+
+/* VMPSADBW */
+#define _mm512_mpsadbw_epu8(A, B, imm) \
+ ((__m512i)__builtin_ia32_mpsadbw512((__v64qi)(__m512i)(A), \
+ (__v64qi)(__m512i)(B), (int)(imm)))
+
+#define _mm512_mask_mpsadbw_epu8(W, U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectw_512( \
+ (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
+ (__v32hi)(__m512i)(W)))
+
+#define _mm512_maskz_mpsadbw_epu8(U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectw_512( \
+ (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
+ (__v32hi)_mm512_setzero_si512()))
+
+#endif /* __SSE2__ */
+#endif /* __AVX10_2_512NIINTRIN_H */
diff --git a/clang/lib/Headers/avx10_2niintrin.h b/clang/lib/Headers/avx10_2niintrin.h
new file mode 100644
index 0000000000000..3527e0eaf5c89
--- /dev/null
+++ b/clang/lib/Headers/avx10_2niintrin.h
@@ -0,0 +1,83 @@
+/*===---- avx10_2niintrin.h - AVX10.2 new instruction intrinsics -----------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx10_2niintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifdef __SSE2__
+
+#ifndef __AVX10_2NIINTRIN_H
+#define __AVX10_2NIINTRIN_H
+
+/* VMPSADBW */
+#define _mm_mask_mpsadbw_epu8(W, U, A, B, imm) \
+ ((__m128i)__builtin_ia32_selectw_128( \
+ (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
+ (__v8hi)(__m128i)(W)))
+
+#define _mm_maskz_mpsadbw_epu8(U, A, B, imm) \
+ ((__m128i)__builtin_ia32_selectw_128( \
+ (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
+ (__v8hi)_mm_setzero_si128()))
+
+#define _mm256_mask_mpsadbw_epu8(W, U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectw_256( \
+ (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
+ (__v16hi)(__m256i)(W)))
+
+#define _mm256_maskz_mpsadbw_epu8(U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectw_256( \
+ (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
+ (__v16hi)_mm256_setzero_si256()))
+
+/* YMM Rounding */
+#define _mm256_add_round_pd(A, B, R) \
+ ((__m256d)__builtin_ia32_vaddpd256_round((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), (int)(R)))
+
+#define _mm256_mask_add_round_pd(W, U, A, B, R) \
+ ((__m256d)__builtin_ia32_selectpd_256( \
+ (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \
+ (__v4df)(__m256d)(W)))
+
+#define _mm256_maskz_add_round_pd(U, A, B, R) \
+ ((__m256d)__builtin_ia32_selectpd_256( \
+ (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \
+ (__v4df)_mm256_setzero_pd()))
+
+#define _mm256_add_round_ph(A, B, R) \
+ ((__m256h)__builtin_ia32_vaddph256_round((__v16hf)(__m256h)(A), \
+ (__v16hf)(__m256h)(B), (int)(R)))
+
+#define _mm256_mask_add_round_ph(W, U, A, B, R) \
+ ((__m256h)__builtin_ia32_selectph_256( \
+ (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
+ (__v16hf)(__m256h)(W)))
+
+#define _mm256_maskz_add_round_ph(U, A, B, R) \
+ ((__m256h)__builtin_ia32_selectph_256( \
+ (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
+ (__v16hf)_mm256_setzero_ph()))
+
+#define _mm256_add_round_ps(A, B, R) \
+ ((__m256)__builtin_ia32_vaddps256_round((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), (int)(R)))
+
+#define _mm256_mask_add_round_ps(W, U, A, B, R) \
+ ((__m256)__builtin_ia32_selectps_256( \
+ (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
+ (__v8sf)(__m256)(W)))
+
+#define _mm256_maskz_add_round_ps(U, A, B, R) \
+ ((__m256)__builtin_ia32_selectps_256( \
+ (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
+ (__v8sf)_mm256_setzero_ps()))
+
+#endif /* __AVX10_2NIINTRIN_H */
+#endif /* __SSE2__ */
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index cd6cf09b90cad..e0957257ed5c7 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -648,6 +648,14 @@ _storebe_i64(void * __P, long long __D) {
#include <avx512vlvp2intersectintrin.h>
#endif
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__)
+#include <avx10_2niintrin.h>
+#endif
+
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2_512__)
+#include <avx10_2_512niintrin.h>
+#endif
+
#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
#include <enqcmdintrin.h>
#endif
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index 8f9057bbaf259..bf2d2d8ac8f42 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -162,6 +162,9 @@ bool SemaX86::CheckBuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_mulps512:
case X86::BI__builtin_ia32_subpd512:
case X86::BI__builtin_ia32_subps512:
+ case X86::BI__builtin_ia32_vaddpd256_round:
+ case X86::BI__builtin_ia32_vaddph256_round:
+ case X86::BI__builtin_ia32_vaddps256_round:
case X86::BI__builtin_ia32_cvtsi2sd64:
case X86::BI__builtin_ia32_cvtsi2ss32:
case X86::BI__builtin_ia32_cvtsi2ss64:
diff --git a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
new file mode 100644
index 0000000000000..b7982e6ecca84
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2-512 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i686 -target-feature +avx10.2-512 -emit-llvm -o - | FileCheck %s
+
+#include <immintrin.h>
+
+// VMPSADBW
+__m512i test_mm512_mpsadbw_epu8(__m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mpsadbw_epu8
+// CHECK: @llvm.x86.avx10.vmpsadbw.512
+ return _mm512_mpsadbw_epu8(__A, __B, 17);
+}
+
+__m512i test_mm512_mask_mpsadbw_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mask_mpsadbw_epu8
+// CHECK: @llvm.x86.avx10.vmpsadbw.512
+// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_mask_mpsadbw_epu8(__W, __U, __A, __B, 17);
+}
+
+__m512i test_mm512_maskz_mpsadbw_epu8(__mmask32 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_maskz_mpsadbw_epu8
+// CHECK: @llvm.x86.avx10.vmpsadbw.512
+// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_maskz_mpsadbw_epu8(__U, __A, __B, 17);
+}
diff --git a/clang/test/CodeGen/X86/avx10_2ni-builtins.c b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
new file mode 100644
index 0000000000000..baf3a35a9a191
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
@@ -0,0 +1,106 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2-256 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i686 -target-feature +avx10.2-256 -emit-llvm -o - | FileCheck %s
+
+#include <immintrin.h>
+
+// VMPSADBW
+__m128i test_mm_mpsadbw_epu8(__m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mpsadbw_epu8
+// CHECK: @llvm.x86.sse41.mpsadbw
+ return _mm_mpsadbw_epu8(__A, __B, 170);
+}
+
+__m128i test_mm_mask_mpsadbw_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mask_mpsadbw_epu8
+// CHECK: @llvm.x86.sse41.mpsadbw
+// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+ return _mm_mask_mpsadbw_epu8(__W, __U, __A, __B, 170);
+}
+
+__m128i test_mm_maskz_mpsadbw_epu8(__mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_maskz_mpsadbw_epu8
+// CHECK: @llvm.x86.sse41.mpsadbw
+// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+ return _mm_maskz_mpsadbw_epu8(__U, __A, __B, 170);
+}
+
+__m256i test_mm256_mpsadbw_epu8(__m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mpsadbw_epu8
+// CHECK: @llvm.x86.avx2.mpsadbw
+ return _mm256_mpsadbw_epu8(__A, __B, 170);
+}
+
+__m256i test_mm256_mask_mpsadbw_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mask_mpsadbw_epu8
+// CHECK: @llvm.x86.avx2.mpsadbw
+// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+ return _mm256_mask_mpsadbw_epu8(__W, __U, __A, __B, 170);
+}
+
+__m256i test_mm256_maskz_mpsadbw_epu8(__mmask16 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_maskz_mpsadbw_epu8
+// CHECK: @llvm.x86.avx2.mpsadbw
+// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+ return _mm256_maskz_mpsadbw_epu8(__U, __A, __B, 170);
+}
+
+// YMM Rounding
+__m256d test_mm256_add_round_pd(__m256d __A, __m256d __B) {
+// CHECK-LABEL: @test_mm256_add_round_pd
+// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 11)
+ return _mm256_add_round_pd(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m256d test_mm256_mask_add_round_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+// CHECK-LABEL: @test_mm256_mask_add_round_pd
+// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 10)
+// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+ return _mm256_mask_add_round_pd(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256d test_mm256_maskz_add_round_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+// CHECK-LABEL: @test_mm256_maskz_add_round_pd
+// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 9)
+// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+ return _mm256_maskz_add_round_pd(__U, __A, __B, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256h test_mm256_add_round_ph(__m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_add_round_ph
+// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 11)
+ return _mm256_add_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m256h test_mm256_mask_add_round_ph(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_mask_add_round_ph
+// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 10)
+// CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_mask_add_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256h test_mm256_maskz_add_round_ph(__mmask8 __U, __m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_maskz_add_round_ph
+// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 9)
+// CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_maskz_add_round_ph(__U, __A, __B, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256 test_mm256_add_round_ps(__m256 __A, __m256 __B) {
+// CHECK-LABEL: @test_mm256_add_round_ps
+// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 11)
+ return _mm256_add_round_ps(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m256 test_mm256_mask_add_round_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+// CHECK-LABEL: @test_mm256_mask_add_round_ps
+// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 10)
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+ return _mm256_mask_add_round_ps(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256 test_mm256_maskz_add_round_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+// CHECK-LABEL: @test_mm256_maskz_add_round_ps
+// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 9)
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+ return _mm256_maskz_add_round_ps(__U, __A, __B, _MM_...
[truncated]
|
@llvm/pr-subscribers-mc Author: Phoebe Wang (phoebewang) ChangesRef.: https://cdrdv2.intel.com/v1/dl/getContent/828965 Patch is 106.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101616.diff 49 Files Affected:
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 866adefd5d3c4..183adb9e003f2 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -216,6 +216,8 @@ X86 Support
functions defined by the ``*mmintrin.h`` headers. A mapping can be
found in the file ``clang/www/builtins.py``.
+- Support ISA of ``AVX10.2``.
+
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 06ca30d65f5bd..f028711a807c0 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -1959,6 +1959,14 @@ TARGET_HEADER_BUILTIN(__readgsword, "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES,
TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
+// AVX10.2 VMPSADBW
+TARGET_BUILTIN(__builtin_ia32_mpsadbw512, "V32sV64cV64cIc", "ncV:512:", "avx10.2-512")
+
+// AVX10.2 YMM Rounding
+TARGET_BUILTIN(__builtin_ia32_vaddpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vaddph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vaddps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
+
// AVX-VNNI-INT16
TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index f690467bb82cd..b5c19ebaaffab 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6205,6 +6205,12 @@ def mavx10_1_512 : Flag<["-"], "mavx10.1-512">, Group<m_x86_AVX10_Features_Group
def mno_avx10_1_512 : Flag<["-"], "mno-avx10.1-512">, Group<m_x86_AVX10_Features_Group>;
def mavx10_1 : Flag<["-"], "mavx10.1">, Alias<mavx10_1_256>;
def mno_avx10_1 : Flag<["-"], "mno-avx10.1">, Alias<mno_avx10_1_256>;
+def mavx10_2_256 : Flag<["-"], "mavx10.2-256">, Group<m_x86_AVX10_Features_Group>;
+def mno_avx10_2_256 : Flag<["-"], "mno-avx10.2-256">, Group<m_x86_AVX10_Features_Group>;
+def mavx10_2_512 : Flag<["-"], "mavx10.2-512">, Group<m_x86_AVX10_Features_Group>;
+def mno_avx10_2_512 : Flag<["-"], "mno-avx10.2-512">, Group<m_x86_AVX10_Features_Group>;
+def mavx10_2 : Flag<["-"], "mavx10.2">, Alias<mavx10_2_256>;
+def mno_avx10_2 : Flag<["-"], "mno-avx10.2">, Alias<mno_avx10_2_256>;
def mavx2 : Flag<["-"], "mavx2">, Group<m_x86_Features_Group>;
def mno_avx2 : Flag<["-"], "mno-avx2">, Group<m_x86_Features_Group>;
def mavx512f : Flag<["-"], "mavx512f">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 18e6dbf03e00d..3fb3587eb5914 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -304,6 +304,10 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAVX10_1 = true;
} else if (Feature == "+avx10.1-512") {
HasAVX10_1_512 = true;
+ } else if (Feature == "+avx10.2-256") {
+ HasAVX10_2 = true;
+ } else if (Feature == "+avx10.2-512") {
+ HasAVX10_2_512 = true;
} else if (Feature == "+avx512cd") {
HasAVX512CD = true;
} else if (Feature == "+avx512vpopcntdq") {
@@ -824,6 +828,10 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AVX10_1__");
if (HasAVX10_1_512)
Builder.defineMacro("__AVX10_1_512__");
+ if (HasAVX10_2)
+ Builder.defineMacro("__AVX10_2__");
+ if (HasAVX10_2_512)
+ Builder.defineMacro("__AVX10_2_512__");
if (HasAVX512CD)
Builder.defineMacro("__AVX512CD__");
if (HasAVX512VPOPCNTDQ)
@@ -1056,6 +1064,8 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("avx", true)
.Case("avx10.1-256", true)
.Case("avx10.1-512", true)
+ .Case("avx10.2-256", true)
+ .Case("avx10.2-512", true)
.Case("avx2", true)
.Case("avx512f", true)
.Case("avx512cd", true)
@@ -1171,6 +1181,8 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("avx", SSELevel >= AVX)
.Case("avx10.1-256", HasAVX10_1)
.Case("avx10.1-512", HasAVX10_1_512)
+ .Case("avx10.2-256", HasAVX10_2)
+ .Case("avx10.2-512", HasAVX10_2_512)
.Case("avx2", SSELevel >= AVX2)
.Case("avx512f", SSELevel >= AVX512F)
.Case("avx512cd", HasAVX512CD)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index ba34ab2c7f336..79fd5867cf667 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -92,6 +92,8 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasF16C = false;
bool HasAVX10_1 = false;
bool HasAVX10_1_512 = false;
+ bool HasAVX10_2 = false;
+ bool HasAVX10_2_512 = false;
bool HasEVEX512 = false;
bool HasAVX512CD = false;
bool HasAVX512VPOPCNTDQ = false;
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
index dc6c8695488bb..b2109e11038fe 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
@@ -241,7 +241,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
assert(Name.starts_with("avx10.") && "Invalid AVX10 feature name.");
StringRef Version, Width;
std::tie(Version, Width) = Name.substr(6).split('-');
- assert(Version == "1" && "Invalid AVX10 feature name.");
+ assert((Version == "1" || Version == "2") && "Invalid AVX10 feature name.");
assert((Width == "256" || Width == "512") && "Invalid AVX10 feature name.");
#endif
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 89fa0ecd45eb4..b17ab24d625a0 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -147,6 +147,8 @@ set(x86_files
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
+ avx10_2_512niintrin.h
+ avx10_2niintrin.h
avx2intrin.h
avx512bf16intrin.h
avx512bitalgintrin.h
diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h
new file mode 100644
index 0000000000000..5ad6993b45433
--- /dev/null
+++ b/clang/lib/Headers/avx10_2_512niintrin.h
@@ -0,0 +1,35 @@
+/*===---- avx10_2_512niintrin.h - AVX10.2-512 new instruction intrinsics ---===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx10_2_512niintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifdef __SSE2__
+
+#ifndef __AVX10_2_512NIINTRIN_H
+#define __AVX10_2_512NIINTRIN_H
+
+/* VMPSADBW */
+#define _mm512_mpsadbw_epu8(A, B, imm) \
+ ((__m512i)__builtin_ia32_mpsadbw512((__v64qi)(__m512i)(A), \
+ (__v64qi)(__m512i)(B), (int)(imm)))
+
+#define _mm512_mask_mpsadbw_epu8(W, U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectw_512( \
+ (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
+ (__v32hi)(__m512i)(W)))
+
+#define _mm512_maskz_mpsadbw_epu8(U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectw_512( \
+ (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
+ (__v32hi)_mm512_setzero_si512()))
+
+#endif /* __SSE2__ */
+#endif /* __AVX10_2_512NIINTRIN_H */
diff --git a/clang/lib/Headers/avx10_2niintrin.h b/clang/lib/Headers/avx10_2niintrin.h
new file mode 100644
index 0000000000000..3527e0eaf5c89
--- /dev/null
+++ b/clang/lib/Headers/avx10_2niintrin.h
@@ -0,0 +1,83 @@
+/*===---- avx10_2niintrin.h - AVX10.2 new instruction intrinsics -----------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx10_2niintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifdef __SSE2__
+
+#ifndef __AVX10_2NIINTRIN_H
+#define __AVX10_2NIINTRIN_H
+
+/* VMPSADBW */
+#define _mm_mask_mpsadbw_epu8(W, U, A, B, imm) \
+ ((__m128i)__builtin_ia32_selectw_128( \
+ (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
+ (__v8hi)(__m128i)(W)))
+
+#define _mm_maskz_mpsadbw_epu8(U, A, B, imm) \
+ ((__m128i)__builtin_ia32_selectw_128( \
+ (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
+ (__v8hi)_mm_setzero_si128()))
+
+#define _mm256_mask_mpsadbw_epu8(W, U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectw_256( \
+ (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
+ (__v16hi)(__m256i)(W)))
+
+#define _mm256_maskz_mpsadbw_epu8(U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectw_256( \
+ (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
+ (__v16hi)_mm256_setzero_si256()))
+
+/* YMM Rounding */
+#define _mm256_add_round_pd(A, B, R) \
+ ((__m256d)__builtin_ia32_vaddpd256_round((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), (int)(R)))
+
+#define _mm256_mask_add_round_pd(W, U, A, B, R) \
+ ((__m256d)__builtin_ia32_selectpd_256( \
+ (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \
+ (__v4df)(__m256d)(W)))
+
+#define _mm256_maskz_add_round_pd(U, A, B, R) \
+ ((__m256d)__builtin_ia32_selectpd_256( \
+ (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \
+ (__v4df)_mm256_setzero_pd()))
+
+#define _mm256_add_round_ph(A, B, R) \
+ ((__m256h)__builtin_ia32_vaddph256_round((__v16hf)(__m256h)(A), \
+ (__v16hf)(__m256h)(B), (int)(R)))
+
+#define _mm256_mask_add_round_ph(W, U, A, B, R) \
+ ((__m256h)__builtin_ia32_selectph_256( \
+ (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
+ (__v16hf)(__m256h)(W)))
+
+#define _mm256_maskz_add_round_ph(U, A, B, R) \
+ ((__m256h)__builtin_ia32_selectph_256( \
+ (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
+ (__v16hf)_mm256_setzero_ph()))
+
+#define _mm256_add_round_ps(A, B, R) \
+ ((__m256)__builtin_ia32_vaddps256_round((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), (int)(R)))
+
+#define _mm256_mask_add_round_ps(W, U, A, B, R) \
+ ((__m256)__builtin_ia32_selectps_256( \
+ (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
+ (__v8sf)(__m256)(W)))
+
+#define _mm256_maskz_add_round_ps(U, A, B, R) \
+ ((__m256)__builtin_ia32_selectps_256( \
+ (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
+ (__v8sf)_mm256_setzero_ps()))
+
+#endif /* __AVX10_2NIINTRIN_H */
+#endif /* __SSE2__ */
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index cd6cf09b90cad..e0957257ed5c7 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -648,6 +648,14 @@ _storebe_i64(void * __P, long long __D) {
#include <avx512vlvp2intersectintrin.h>
#endif
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__)
+#include <avx10_2niintrin.h>
+#endif
+
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2_512__)
+#include <avx10_2_512niintrin.h>
+#endif
+
#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
#include <enqcmdintrin.h>
#endif
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index 8f9057bbaf259..bf2d2d8ac8f42 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -162,6 +162,9 @@ bool SemaX86::CheckBuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_mulps512:
case X86::BI__builtin_ia32_subpd512:
case X86::BI__builtin_ia32_subps512:
+ case X86::BI__builtin_ia32_vaddpd256_round:
+ case X86::BI__builtin_ia32_vaddph256_round:
+ case X86::BI__builtin_ia32_vaddps256_round:
case X86::BI__builtin_ia32_cvtsi2sd64:
case X86::BI__builtin_ia32_cvtsi2ss32:
case X86::BI__builtin_ia32_cvtsi2ss64:
diff --git a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
new file mode 100644
index 0000000000000..b7982e6ecca84
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2-512 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i686 -target-feature +avx10.2-512 -emit-llvm -o - | FileCheck %s
+
+#include <immintrin.h>
+
+// VMPSADBW
+__m512i test_mm512_mpsadbw_epu8(__m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mpsadbw_epu8
+// CHECK: @llvm.x86.avx10.vmpsadbw.512
+ return _mm512_mpsadbw_epu8(__A, __B, 17);
+}
+
+__m512i test_mm512_mask_mpsadbw_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mask_mpsadbw_epu8
+// CHECK: @llvm.x86.avx10.vmpsadbw.512
+// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_mask_mpsadbw_epu8(__W, __U, __A, __B, 17);
+}
+
+__m512i test_mm512_maskz_mpsadbw_epu8(__mmask32 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_maskz_mpsadbw_epu8
+// CHECK: @llvm.x86.avx10.vmpsadbw.512
+// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_maskz_mpsadbw_epu8(__U, __A, __B, 17);
+}
diff --git a/clang/test/CodeGen/X86/avx10_2ni-builtins.c b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
new file mode 100644
index 0000000000000..baf3a35a9a191
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
@@ -0,0 +1,106 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2-256 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i686 -target-feature +avx10.2-256 -emit-llvm -o - | FileCheck %s
+
+#include <immintrin.h>
+
+// VMPSADBW
+__m128i test_mm_mpsadbw_epu8(__m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mpsadbw_epu8
+// CHECK: @llvm.x86.sse41.mpsadbw
+ return _mm_mpsadbw_epu8(__A, __B, 170);
+}
+
+__m128i test_mm_mask_mpsadbw_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mask_mpsadbw_epu8
+// CHECK: @llvm.x86.sse41.mpsadbw
+// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+ return _mm_mask_mpsadbw_epu8(__W, __U, __A, __B, 170);
+}
+
+__m128i test_mm_maskz_mpsadbw_epu8(__mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_maskz_mpsadbw_epu8
+// CHECK: @llvm.x86.sse41.mpsadbw
+// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+ return _mm_maskz_mpsadbw_epu8(__U, __A, __B, 170);
+}
+
+__m256i test_mm256_mpsadbw_epu8(__m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mpsadbw_epu8
+// CHECK: @llvm.x86.avx2.mpsadbw
+ return _mm256_mpsadbw_epu8(__A, __B, 170);
+}
+
+__m256i test_mm256_mask_mpsadbw_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mask_mpsadbw_epu8
+// CHECK: @llvm.x86.avx2.mpsadbw
+// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+ return _mm256_mask_mpsadbw_epu8(__W, __U, __A, __B, 170);
+}
+
+__m256i test_mm256_maskz_mpsadbw_epu8(__mmask16 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_maskz_mpsadbw_epu8
+// CHECK: @llvm.x86.avx2.mpsadbw
+// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+ return _mm256_maskz_mpsadbw_epu8(__U, __A, __B, 170);
+}
+
+// YMM Rounding
+__m256d test_mm256_add_round_pd(__m256d __A, __m256d __B) {
+// CHECK-LABEL: @test_mm256_add_round_pd
+// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 11)
+ return _mm256_add_round_pd(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m256d test_mm256_mask_add_round_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+// CHECK-LABEL: @test_mm256_mask_add_round_pd
+// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 10)
+// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+ return _mm256_mask_add_round_pd(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256d test_mm256_maskz_add_round_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+// CHECK-LABEL: @test_mm256_maskz_add_round_pd
+// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 9)
+// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+ return _mm256_maskz_add_round_pd(__U, __A, __B, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256h test_mm256_add_round_ph(__m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_add_round_ph
+// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 11)
+ return _mm256_add_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m256h test_mm256_mask_add_round_ph(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_mask_add_round_ph
+// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 10)
+// CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_mask_add_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256h test_mm256_maskz_add_round_ph(__mmask8 __U, __m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_maskz_add_round_ph
+// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 9)
+// CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_maskz_add_round_ph(__U, __A, __B, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256 test_mm256_add_round_ps(__m256 __A, __m256 __B) {
+// CHECK-LABEL: @test_mm256_add_round_ps
+// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 11)
+ return _mm256_add_round_ps(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m256 test_mm256_mask_add_round_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+// CHECK-LABEL: @test_mm256_mask_add_round_ps
+// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 10)
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+ return _mm256_mask_add_round_ps(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256 test_mm256_maskz_add_round_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+// CHECK-LABEL: @test_mm256_maskz_add_round_ps
+// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 9)
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+ return _mm256_maskz_add_round_ps(__U, __A, __B, _MM_...
[truncated]
|
@llvm/pr-subscribers-clang Author: Phoebe Wang (phoebewang) ChangesRef.: https://cdrdv2.intel.com/v1/dl/getContent/828965 Patch is 106.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101616.diff 49 Files Affected:
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 866adefd5d3c4..183adb9e003f2 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -216,6 +216,8 @@ X86 Support
functions defined by the ``*mmintrin.h`` headers. A mapping can be
found in the file ``clang/www/builtins.py``.
+- Support ISA of ``AVX10.2``.
+
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 06ca30d65f5bd..f028711a807c0 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -1959,6 +1959,14 @@ TARGET_HEADER_BUILTIN(__readgsword, "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES,
TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
+// AVX10.2 VMPSADBW
+TARGET_BUILTIN(__builtin_ia32_mpsadbw512, "V32sV64cV64cIc", "ncV:512:", "avx10.2-512")
+
+// AVX10.2 YMM Rounding
+TARGET_BUILTIN(__builtin_ia32_vaddpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vaddph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vaddps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
+
// AVX-VNNI-INT16
TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index f690467bb82cd..b5c19ebaaffab 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6205,6 +6205,12 @@ def mavx10_1_512 : Flag<["-"], "mavx10.1-512">, Group<m_x86_AVX10_Features_Group
def mno_avx10_1_512 : Flag<["-"], "mno-avx10.1-512">, Group<m_x86_AVX10_Features_Group>;
def mavx10_1 : Flag<["-"], "mavx10.1">, Alias<mavx10_1_256>;
def mno_avx10_1 : Flag<["-"], "mno-avx10.1">, Alias<mno_avx10_1_256>;
+def mavx10_2_256 : Flag<["-"], "mavx10.2-256">, Group<m_x86_AVX10_Features_Group>;
+def mno_avx10_2_256 : Flag<["-"], "mno-avx10.2-256">, Group<m_x86_AVX10_Features_Group>;
+def mavx10_2_512 : Flag<["-"], "mavx10.2-512">, Group<m_x86_AVX10_Features_Group>;
+def mno_avx10_2_512 : Flag<["-"], "mno-avx10.2-512">, Group<m_x86_AVX10_Features_Group>;
+def mavx10_2 : Flag<["-"], "mavx10.2">, Alias<mavx10_2_256>;
+def mno_avx10_2 : Flag<["-"], "mno-avx10.2">, Alias<mno_avx10_2_256>;
def mavx2 : Flag<["-"], "mavx2">, Group<m_x86_Features_Group>;
def mno_avx2 : Flag<["-"], "mno-avx2">, Group<m_x86_Features_Group>;
def mavx512f : Flag<["-"], "mavx512f">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 18e6dbf03e00d..3fb3587eb5914 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -304,6 +304,10 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAVX10_1 = true;
} else if (Feature == "+avx10.1-512") {
HasAVX10_1_512 = true;
+ } else if (Feature == "+avx10.2-256") {
+ HasAVX10_2 = true;
+ } else if (Feature == "+avx10.2-512") {
+ HasAVX10_2_512 = true;
} else if (Feature == "+avx512cd") {
HasAVX512CD = true;
} else if (Feature == "+avx512vpopcntdq") {
@@ -824,6 +828,10 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AVX10_1__");
if (HasAVX10_1_512)
Builder.defineMacro("__AVX10_1_512__");
+ if (HasAVX10_2)
+ Builder.defineMacro("__AVX10_2__");
+ if (HasAVX10_2_512)
+ Builder.defineMacro("__AVX10_2_512__");
if (HasAVX512CD)
Builder.defineMacro("__AVX512CD__");
if (HasAVX512VPOPCNTDQ)
@@ -1056,6 +1064,8 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("avx", true)
.Case("avx10.1-256", true)
.Case("avx10.1-512", true)
+ .Case("avx10.2-256", true)
+ .Case("avx10.2-512", true)
.Case("avx2", true)
.Case("avx512f", true)
.Case("avx512cd", true)
@@ -1171,6 +1181,8 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("avx", SSELevel >= AVX)
.Case("avx10.1-256", HasAVX10_1)
.Case("avx10.1-512", HasAVX10_1_512)
+ .Case("avx10.2-256", HasAVX10_2)
+ .Case("avx10.2-512", HasAVX10_2_512)
.Case("avx2", SSELevel >= AVX2)
.Case("avx512f", SSELevel >= AVX512F)
.Case("avx512cd", HasAVX512CD)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index ba34ab2c7f336..79fd5867cf667 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -92,6 +92,8 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasF16C = false;
bool HasAVX10_1 = false;
bool HasAVX10_1_512 = false;
+ bool HasAVX10_2 = false;
+ bool HasAVX10_2_512 = false;
bool HasEVEX512 = false;
bool HasAVX512CD = false;
bool HasAVX512VPOPCNTDQ = false;
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
index dc6c8695488bb..b2109e11038fe 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
@@ -241,7 +241,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
assert(Name.starts_with("avx10.") && "Invalid AVX10 feature name.");
StringRef Version, Width;
std::tie(Version, Width) = Name.substr(6).split('-');
- assert(Version == "1" && "Invalid AVX10 feature name.");
+ assert((Version == "1" || Version == "2") && "Invalid AVX10 feature name.");
assert((Width == "256" || Width == "512") && "Invalid AVX10 feature name.");
#endif
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 89fa0ecd45eb4..b17ab24d625a0 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -147,6 +147,8 @@ set(x86_files
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
+ avx10_2_512niintrin.h
+ avx10_2niintrin.h
avx2intrin.h
avx512bf16intrin.h
avx512bitalgintrin.h
diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h
new file mode 100644
index 0000000000000..5ad6993b45433
--- /dev/null
+++ b/clang/lib/Headers/avx10_2_512niintrin.h
@@ -0,0 +1,35 @@
+/*===---- avx10_2_512niintrin.h - AVX10.2-512 new instruction intrinsics ---===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx10_2_512niintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifdef __SSE2__
+
+#ifndef __AVX10_2_512NIINTRIN_H
+#define __AVX10_2_512NIINTRIN_H
+
+/* VMPSADBW */
+#define _mm512_mpsadbw_epu8(A, B, imm) \
+ ((__m512i)__builtin_ia32_mpsadbw512((__v64qi)(__m512i)(A), \
+ (__v64qi)(__m512i)(B), (int)(imm)))
+
+#define _mm512_mask_mpsadbw_epu8(W, U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectw_512( \
+ (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
+ (__v32hi)(__m512i)(W)))
+
+#define _mm512_maskz_mpsadbw_epu8(U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectw_512( \
+ (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
+ (__v32hi)_mm512_setzero_si512()))
+
+#endif /* __SSE2__ */
+#endif /* __AVX10_2_512NIINTRIN_H */
diff --git a/clang/lib/Headers/avx10_2niintrin.h b/clang/lib/Headers/avx10_2niintrin.h
new file mode 100644
index 0000000000000..3527e0eaf5c89
--- /dev/null
+++ b/clang/lib/Headers/avx10_2niintrin.h
@@ -0,0 +1,83 @@
+/*===---- avx10_2niintrin.h - AVX10.2 new instruction intrinsics -----------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx10_2niintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifdef __SSE2__
+
+#ifndef __AVX10_2NIINTRIN_H
+#define __AVX10_2NIINTRIN_H
+
+/* VMPSADBW */
+#define _mm_mask_mpsadbw_epu8(W, U, A, B, imm) \
+ ((__m128i)__builtin_ia32_selectw_128( \
+ (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
+ (__v8hi)(__m128i)(W)))
+
+#define _mm_maskz_mpsadbw_epu8(U, A, B, imm) \
+ ((__m128i)__builtin_ia32_selectw_128( \
+ (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
+ (__v8hi)_mm_setzero_si128()))
+
+#define _mm256_mask_mpsadbw_epu8(W, U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectw_256( \
+ (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
+ (__v16hi)(__m256i)(W)))
+
+#define _mm256_maskz_mpsadbw_epu8(U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectw_256( \
+ (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
+ (__v16hi)_mm256_setzero_si256()))
+
+/* YMM Rounding */
+#define _mm256_add_round_pd(A, B, R) \
+ ((__m256d)__builtin_ia32_vaddpd256_round((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), (int)(R)))
+
+#define _mm256_mask_add_round_pd(W, U, A, B, R) \
+ ((__m256d)__builtin_ia32_selectpd_256( \
+ (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \
+ (__v4df)(__m256d)(W)))
+
+#define _mm256_maskz_add_round_pd(U, A, B, R) \
+ ((__m256d)__builtin_ia32_selectpd_256( \
+ (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \
+ (__v4df)_mm256_setzero_pd()))
+
+#define _mm256_add_round_ph(A, B, R) \
+ ((__m256h)__builtin_ia32_vaddph256_round((__v16hf)(__m256h)(A), \
+ (__v16hf)(__m256h)(B), (int)(R)))
+
+#define _mm256_mask_add_round_ph(W, U, A, B, R) \
+ ((__m256h)__builtin_ia32_selectph_256( \
+ (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
+ (__v16hf)(__m256h)(W)))
+
+#define _mm256_maskz_add_round_ph(U, A, B, R) \
+ ((__m256h)__builtin_ia32_selectph_256( \
+ (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
+ (__v16hf)_mm256_setzero_ph()))
+
+#define _mm256_add_round_ps(A, B, R) \
+ ((__m256)__builtin_ia32_vaddps256_round((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), (int)(R)))
+
+#define _mm256_mask_add_round_ps(W, U, A, B, R) \
+ ((__m256)__builtin_ia32_selectps_256( \
+ (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
+ (__v8sf)(__m256)(W)))
+
+#define _mm256_maskz_add_round_ps(U, A, B, R) \
+ ((__m256)__builtin_ia32_selectps_256( \
+ (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
+ (__v8sf)_mm256_setzero_ps()))
+
+#endif /* __AVX10_2NIINTRIN_H */
+#endif /* __SSE2__ */
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index cd6cf09b90cad..e0957257ed5c7 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -648,6 +648,14 @@ _storebe_i64(void * __P, long long __D) {
#include <avx512vlvp2intersectintrin.h>
#endif
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__)
+#include <avx10_2niintrin.h>
+#endif
+
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2_512__)
+#include <avx10_2_512niintrin.h>
+#endif
+
#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
#include <enqcmdintrin.h>
#endif
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index 8f9057bbaf259..bf2d2d8ac8f42 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -162,6 +162,9 @@ bool SemaX86::CheckBuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_mulps512:
case X86::BI__builtin_ia32_subpd512:
case X86::BI__builtin_ia32_subps512:
+ case X86::BI__builtin_ia32_vaddpd256_round:
+ case X86::BI__builtin_ia32_vaddph256_round:
+ case X86::BI__builtin_ia32_vaddps256_round:
case X86::BI__builtin_ia32_cvtsi2sd64:
case X86::BI__builtin_ia32_cvtsi2ss32:
case X86::BI__builtin_ia32_cvtsi2ss64:
diff --git a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
new file mode 100644
index 0000000000000..b7982e6ecca84
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2-512 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i686 -target-feature +avx10.2-512 -emit-llvm -o - | FileCheck %s
+
+#include <immintrin.h>
+
+// VMPSADBW
+__m512i test_mm512_mpsadbw_epu8(__m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mpsadbw_epu8
+// CHECK: @llvm.x86.avx10.vmpsadbw.512
+ return _mm512_mpsadbw_epu8(__A, __B, 17);
+}
+
+__m512i test_mm512_mask_mpsadbw_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mask_mpsadbw_epu8
+// CHECK: @llvm.x86.avx10.vmpsadbw.512
+// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_mask_mpsadbw_epu8(__W, __U, __A, __B, 17);
+}
+
+__m512i test_mm512_maskz_mpsadbw_epu8(__mmask32 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_maskz_mpsadbw_epu8
+// CHECK: @llvm.x86.avx10.vmpsadbw.512
+// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_maskz_mpsadbw_epu8(__U, __A, __B, 17);
+}
diff --git a/clang/test/CodeGen/X86/avx10_2ni-builtins.c b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
new file mode 100644
index 0000000000000..baf3a35a9a191
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
@@ -0,0 +1,106 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2-256 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i686 -target-feature +avx10.2-256 -emit-llvm -o - | FileCheck %s
+
+#include <immintrin.h>
+
+// VMPSADBW
+__m128i test_mm_mpsadbw_epu8(__m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mpsadbw_epu8
+// CHECK: @llvm.x86.sse41.mpsadbw
+ return _mm_mpsadbw_epu8(__A, __B, 170);
+}
+
+__m128i test_mm_mask_mpsadbw_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mask_mpsadbw_epu8
+// CHECK: @llvm.x86.sse41.mpsadbw
+// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+ return _mm_mask_mpsadbw_epu8(__W, __U, __A, __B, 170);
+}
+
+__m128i test_mm_maskz_mpsadbw_epu8(__mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_maskz_mpsadbw_epu8
+// CHECK: @llvm.x86.sse41.mpsadbw
+// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+ return _mm_maskz_mpsadbw_epu8(__U, __A, __B, 170);
+}
+
+__m256i test_mm256_mpsadbw_epu8(__m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mpsadbw_epu8
+// CHECK: @llvm.x86.avx2.mpsadbw
+ return _mm256_mpsadbw_epu8(__A, __B, 170);
+}
+
+__m256i test_mm256_mask_mpsadbw_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mask_mpsadbw_epu8
+// CHECK: @llvm.x86.avx2.mpsadbw
+// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+ return _mm256_mask_mpsadbw_epu8(__W, __U, __A, __B, 170);
+}
+
+__m256i test_mm256_maskz_mpsadbw_epu8(__mmask16 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_maskz_mpsadbw_epu8
+// CHECK: @llvm.x86.avx2.mpsadbw
+// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+ return _mm256_maskz_mpsadbw_epu8(__U, __A, __B, 170);
+}
+
+// YMM Rounding
+__m256d test_mm256_add_round_pd(__m256d __A, __m256d __B) {
+// CHECK-LABEL: @test_mm256_add_round_pd
+// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 11)
+ return _mm256_add_round_pd(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m256d test_mm256_mask_add_round_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+// CHECK-LABEL: @test_mm256_mask_add_round_pd
+// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 10)
+// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+ return _mm256_mask_add_round_pd(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256d test_mm256_maskz_add_round_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+// CHECK-LABEL: @test_mm256_maskz_add_round_pd
+// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 9)
+// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+ return _mm256_maskz_add_round_pd(__U, __A, __B, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256h test_mm256_add_round_ph(__m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_add_round_ph
+// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 11)
+ return _mm256_add_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m256h test_mm256_mask_add_round_ph(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_mask_add_round_ph
+// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 10)
+// CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_mask_add_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256h test_mm256_maskz_add_round_ph(__mmask8 __U, __m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_maskz_add_round_ph
+// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 9)
+// CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_maskz_add_round_ph(__U, __A, __B, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256 test_mm256_add_round_ps(__m256 __A, __m256 __B) {
+// CHECK-LABEL: @test_mm256_add_round_ps
+// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 11)
+ return _mm256_add_round_ps(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m256 test_mm256_mask_add_round_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+// CHECK-LABEL: @test_mm256_mask_add_round_ps
+// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 10)
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+ return _mm256_mask_add_round_ps(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256 test_mm256_maskz_add_round_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+// CHECK-LABEL: @test_mm256_maskz_add_round_ps
+// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 9)
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+ return _mm256_maskz_add_round_ps(__U, __A, __B, _MM_...
[truncated]
|
@llvm/pr-subscribers-clang-driver Author: Phoebe Wang (phoebewang) ChangesRef.: https://cdrdv2.intel.com/v1/dl/getContent/828965 Patch is 106.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101616.diff 49 Files Affected:
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 866adefd5d3c4..183adb9e003f2 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -216,6 +216,8 @@ X86 Support
functions defined by the ``*mmintrin.h`` headers. A mapping can be
found in the file ``clang/www/builtins.py``.
+- Support ISA of ``AVX10.2``.
+
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 06ca30d65f5bd..f028711a807c0 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -1959,6 +1959,14 @@ TARGET_HEADER_BUILTIN(__readgsword, "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES,
TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
+// AVX10.2 VMPSADBW
+TARGET_BUILTIN(__builtin_ia32_mpsadbw512, "V32sV64cV64cIc", "ncV:512:", "avx10.2-512")
+
+// AVX10.2 YMM Rounding
+TARGET_BUILTIN(__builtin_ia32_vaddpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vaddph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vaddps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
+
// AVX-VNNI-INT16
TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index f690467bb82cd..b5c19ebaaffab 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6205,6 +6205,12 @@ def mavx10_1_512 : Flag<["-"], "mavx10.1-512">, Group<m_x86_AVX10_Features_Group
def mno_avx10_1_512 : Flag<["-"], "mno-avx10.1-512">, Group<m_x86_AVX10_Features_Group>;
def mavx10_1 : Flag<["-"], "mavx10.1">, Alias<mavx10_1_256>;
def mno_avx10_1 : Flag<["-"], "mno-avx10.1">, Alias<mno_avx10_1_256>;
+def mavx10_2_256 : Flag<["-"], "mavx10.2-256">, Group<m_x86_AVX10_Features_Group>;
+def mno_avx10_2_256 : Flag<["-"], "mno-avx10.2-256">, Group<m_x86_AVX10_Features_Group>;
+def mavx10_2_512 : Flag<["-"], "mavx10.2-512">, Group<m_x86_AVX10_Features_Group>;
+def mno_avx10_2_512 : Flag<["-"], "mno-avx10.2-512">, Group<m_x86_AVX10_Features_Group>;
+def mavx10_2 : Flag<["-"], "mavx10.2">, Alias<mavx10_2_256>;
+def mno_avx10_2 : Flag<["-"], "mno-avx10.2">, Alias<mno_avx10_2_256>;
def mavx2 : Flag<["-"], "mavx2">, Group<m_x86_Features_Group>;
def mno_avx2 : Flag<["-"], "mno-avx2">, Group<m_x86_Features_Group>;
def mavx512f : Flag<["-"], "mavx512f">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 18e6dbf03e00d..3fb3587eb5914 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -304,6 +304,10 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAVX10_1 = true;
} else if (Feature == "+avx10.1-512") {
HasAVX10_1_512 = true;
+ } else if (Feature == "+avx10.2-256") {
+ HasAVX10_2 = true;
+ } else if (Feature == "+avx10.2-512") {
+ HasAVX10_2_512 = true;
} else if (Feature == "+avx512cd") {
HasAVX512CD = true;
} else if (Feature == "+avx512vpopcntdq") {
@@ -824,6 +828,10 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AVX10_1__");
if (HasAVX10_1_512)
Builder.defineMacro("__AVX10_1_512__");
+ if (HasAVX10_2)
+ Builder.defineMacro("__AVX10_2__");
+ if (HasAVX10_2_512)
+ Builder.defineMacro("__AVX10_2_512__");
if (HasAVX512CD)
Builder.defineMacro("__AVX512CD__");
if (HasAVX512VPOPCNTDQ)
@@ -1056,6 +1064,8 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("avx", true)
.Case("avx10.1-256", true)
.Case("avx10.1-512", true)
+ .Case("avx10.2-256", true)
+ .Case("avx10.2-512", true)
.Case("avx2", true)
.Case("avx512f", true)
.Case("avx512cd", true)
@@ -1171,6 +1181,8 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("avx", SSELevel >= AVX)
.Case("avx10.1-256", HasAVX10_1)
.Case("avx10.1-512", HasAVX10_1_512)
+ .Case("avx10.2-256", HasAVX10_2)
+ .Case("avx10.2-512", HasAVX10_2_512)
.Case("avx2", SSELevel >= AVX2)
.Case("avx512f", SSELevel >= AVX512F)
.Case("avx512cd", HasAVX512CD)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index ba34ab2c7f336..79fd5867cf667 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -92,6 +92,8 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasF16C = false;
bool HasAVX10_1 = false;
bool HasAVX10_1_512 = false;
+ bool HasAVX10_2 = false;
+ bool HasAVX10_2_512 = false;
bool HasEVEX512 = false;
bool HasAVX512CD = false;
bool HasAVX512VPOPCNTDQ = false;
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
index dc6c8695488bb..b2109e11038fe 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
@@ -241,7 +241,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
assert(Name.starts_with("avx10.") && "Invalid AVX10 feature name.");
StringRef Version, Width;
std::tie(Version, Width) = Name.substr(6).split('-');
- assert(Version == "1" && "Invalid AVX10 feature name.");
+ assert((Version == "1" || Version == "2") && "Invalid AVX10 feature name.");
assert((Width == "256" || Width == "512") && "Invalid AVX10 feature name.");
#endif
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 89fa0ecd45eb4..b17ab24d625a0 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -147,6 +147,8 @@ set(x86_files
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
+ avx10_2_512niintrin.h
+ avx10_2niintrin.h
avx2intrin.h
avx512bf16intrin.h
avx512bitalgintrin.h
diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h
new file mode 100644
index 0000000000000..5ad6993b45433
--- /dev/null
+++ b/clang/lib/Headers/avx10_2_512niintrin.h
@@ -0,0 +1,35 @@
+/*===---- avx10_2_512niintrin.h - AVX10.2-512 new instruction intrinsics ---===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx10_2_512niintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifdef __SSE2__
+
+#ifndef __AVX10_2_512NIINTRIN_H
+#define __AVX10_2_512NIINTRIN_H
+
+/* VMPSADBW */
+#define _mm512_mpsadbw_epu8(A, B, imm) \
+ ((__m512i)__builtin_ia32_mpsadbw512((__v64qi)(__m512i)(A), \
+ (__v64qi)(__m512i)(B), (int)(imm)))
+
+#define _mm512_mask_mpsadbw_epu8(W, U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectw_512( \
+ (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
+ (__v32hi)(__m512i)(W)))
+
+#define _mm512_maskz_mpsadbw_epu8(U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectw_512( \
+ (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
+ (__v32hi)_mm512_setzero_si512()))
+
+#endif /* __SSE2__ */
+#endif /* __AVX10_2_512NIINTRIN_H */
diff --git a/clang/lib/Headers/avx10_2niintrin.h b/clang/lib/Headers/avx10_2niintrin.h
new file mode 100644
index 0000000000000..3527e0eaf5c89
--- /dev/null
+++ b/clang/lib/Headers/avx10_2niintrin.h
@@ -0,0 +1,83 @@
+/*===---- avx10_2niintrin.h - AVX10.2 new instruction intrinsics -----------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx10_2niintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifdef __SSE2__
+
+#ifndef __AVX10_2NIINTRIN_H
+#define __AVX10_2NIINTRIN_H
+
+/* VMPSADBW */
+#define _mm_mask_mpsadbw_epu8(W, U, A, B, imm) \
+ ((__m128i)__builtin_ia32_selectw_128( \
+ (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
+ (__v8hi)(__m128i)(W)))
+
+#define _mm_maskz_mpsadbw_epu8(U, A, B, imm) \
+ ((__m128i)__builtin_ia32_selectw_128( \
+ (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
+ (__v8hi)_mm_setzero_si128()))
+
+#define _mm256_mask_mpsadbw_epu8(W, U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectw_256( \
+ (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
+ (__v16hi)(__m256i)(W)))
+
+#define _mm256_maskz_mpsadbw_epu8(U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectw_256( \
+ (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
+ (__v16hi)_mm256_setzero_si256()))
+
+/* YMM Rounding */
+#define _mm256_add_round_pd(A, B, R) \
+ ((__m256d)__builtin_ia32_vaddpd256_round((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), (int)(R)))
+
+#define _mm256_mask_add_round_pd(W, U, A, B, R) \
+ ((__m256d)__builtin_ia32_selectpd_256( \
+ (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \
+ (__v4df)(__m256d)(W)))
+
+#define _mm256_maskz_add_round_pd(U, A, B, R) \
+ ((__m256d)__builtin_ia32_selectpd_256( \
+ (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \
+ (__v4df)_mm256_setzero_pd()))
+
+#define _mm256_add_round_ph(A, B, R) \
+ ((__m256h)__builtin_ia32_vaddph256_round((__v16hf)(__m256h)(A), \
+ (__v16hf)(__m256h)(B), (int)(R)))
+
+#define _mm256_mask_add_round_ph(W, U, A, B, R) \
+ ((__m256h)__builtin_ia32_selectph_256( \
+ (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
+ (__v16hf)(__m256h)(W)))
+
+#define _mm256_maskz_add_round_ph(U, A, B, R) \
+ ((__m256h)__builtin_ia32_selectph_256( \
+ (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
+ (__v16hf)_mm256_setzero_ph()))
+
+#define _mm256_add_round_ps(A, B, R) \
+ ((__m256)__builtin_ia32_vaddps256_round((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), (int)(R)))
+
+#define _mm256_mask_add_round_ps(W, U, A, B, R) \
+ ((__m256)__builtin_ia32_selectps_256( \
+ (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
+ (__v8sf)(__m256)(W)))
+
+#define _mm256_maskz_add_round_ps(U, A, B, R) \
+ ((__m256)__builtin_ia32_selectps_256( \
+ (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
+ (__v8sf)_mm256_setzero_ps()))
+
+#endif /* __AVX10_2NIINTRIN_H */
+#endif /* __SSE2__ */
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index cd6cf09b90cad..e0957257ed5c7 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -648,6 +648,14 @@ _storebe_i64(void * __P, long long __D) {
#include <avx512vlvp2intersectintrin.h>
#endif
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__)
+#include <avx10_2niintrin.h>
+#endif
+
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2_512__)
+#include <avx10_2_512niintrin.h>
+#endif
+
#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
#include <enqcmdintrin.h>
#endif
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index 8f9057bbaf259..bf2d2d8ac8f42 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -162,6 +162,9 @@ bool SemaX86::CheckBuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_mulps512:
case X86::BI__builtin_ia32_subpd512:
case X86::BI__builtin_ia32_subps512:
+ case X86::BI__builtin_ia32_vaddpd256_round:
+ case X86::BI__builtin_ia32_vaddph256_round:
+ case X86::BI__builtin_ia32_vaddps256_round:
case X86::BI__builtin_ia32_cvtsi2sd64:
case X86::BI__builtin_ia32_cvtsi2ss32:
case X86::BI__builtin_ia32_cvtsi2ss64:
diff --git a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
new file mode 100644
index 0000000000000..b7982e6ecca84
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2-512 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i686 -target-feature +avx10.2-512 -emit-llvm -o - | FileCheck %s
+
+#include <immintrin.h>
+
+// VMPSADBW
+__m512i test_mm512_mpsadbw_epu8(__m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mpsadbw_epu8
+// CHECK: @llvm.x86.avx10.vmpsadbw.512
+ return _mm512_mpsadbw_epu8(__A, __B, 17);
+}
+
+__m512i test_mm512_mask_mpsadbw_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_mask_mpsadbw_epu8
+// CHECK: @llvm.x86.avx10.vmpsadbw.512
+// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_mask_mpsadbw_epu8(__W, __U, __A, __B, 17);
+}
+
+__m512i test_mm512_maskz_mpsadbw_epu8(__mmask32 __U, __m512i __A, __m512i __B) {
+// CHECK-LABEL: @test_mm512_maskz_mpsadbw_epu8
+// CHECK: @llvm.x86.avx10.vmpsadbw.512
+// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_maskz_mpsadbw_epu8(__U, __A, __B, 17);
+}
diff --git a/clang/test/CodeGen/X86/avx10_2ni-builtins.c b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
new file mode 100644
index 0000000000000..baf3a35a9a191
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
@@ -0,0 +1,106 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2-256 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i686 -target-feature +avx10.2-256 -emit-llvm -o - | FileCheck %s
+
+#include <immintrin.h>
+
+// VMPSADBW
+__m128i test_mm_mpsadbw_epu8(__m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mpsadbw_epu8
+// CHECK: @llvm.x86.sse41.mpsadbw
+ return _mm_mpsadbw_epu8(__A, __B, 170);
+}
+
+__m128i test_mm_mask_mpsadbw_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_mask_mpsadbw_epu8
+// CHECK: @llvm.x86.sse41.mpsadbw
+// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+ return _mm_mask_mpsadbw_epu8(__W, __U, __A, __B, 170);
+}
+
+__m128i test_mm_maskz_mpsadbw_epu8(__mmask8 __U, __m128i __A, __m128i __B) {
+// CHECK-LABEL: @test_mm_maskz_mpsadbw_epu8
+// CHECK: @llvm.x86.sse41.mpsadbw
+// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+ return _mm_maskz_mpsadbw_epu8(__U, __A, __B, 170);
+}
+
+__m256i test_mm256_mpsadbw_epu8(__m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mpsadbw_epu8
+// CHECK: @llvm.x86.avx2.mpsadbw
+ return _mm256_mpsadbw_epu8(__A, __B, 170);
+}
+
+__m256i test_mm256_mask_mpsadbw_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_mask_mpsadbw_epu8
+// CHECK: @llvm.x86.avx2.mpsadbw
+// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+ return _mm256_mask_mpsadbw_epu8(__W, __U, __A, __B, 170);
+}
+
+__m256i test_mm256_maskz_mpsadbw_epu8(__mmask16 __U, __m256i __A, __m256i __B) {
+// CHECK-LABEL: @test_mm256_maskz_mpsadbw_epu8
+// CHECK: @llvm.x86.avx2.mpsadbw
+// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+ return _mm256_maskz_mpsadbw_epu8(__U, __A, __B, 170);
+}
+
+// YMM Rounding
+__m256d test_mm256_add_round_pd(__m256d __A, __m256d __B) {
+// CHECK-LABEL: @test_mm256_add_round_pd
+// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 11)
+ return _mm256_add_round_pd(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m256d test_mm256_mask_add_round_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+// CHECK-LABEL: @test_mm256_mask_add_round_pd
+// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 10)
+// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+ return _mm256_mask_add_round_pd(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256d test_mm256_maskz_add_round_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+// CHECK-LABEL: @test_mm256_maskz_add_round_pd
+// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 9)
+// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+ return _mm256_maskz_add_round_pd(__U, __A, __B, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256h test_mm256_add_round_ph(__m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_add_round_ph
+// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 11)
+ return _mm256_add_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m256h test_mm256_mask_add_round_ph(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_mask_add_round_ph
+// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 10)
+// CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_mask_add_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256h test_mm256_maskz_add_round_ph(__mmask8 __U, __m256h __A, __m256h __B) {
+// CHECK-LABEL: @test_mm256_maskz_add_round_ph
+// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 9)
+// CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
+ return _mm256_maskz_add_round_ph(__U, __A, __B, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256 test_mm256_add_round_ps(__m256 __A, __m256 __B) {
+// CHECK-LABEL: @test_mm256_add_round_ps
+// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 11)
+ return _mm256_add_round_ps(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+}
+
+__m256 test_mm256_mask_add_round_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+// CHECK-LABEL: @test_mm256_mask_add_round_ps
+// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 10)
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+ return _mm256_mask_add_round_ps(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+__m256 test_mm256_maskz_add_round_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+// CHECK-LABEL: @test_mm256_maskz_add_round_ps
+// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 9)
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+ return _mm256_maskz_add_round_ps(__U, __A, __B, _MM_...
[truncated]
|
@@ -223,6 +227,10 @@ InstructionContext RecognizableInstr::insnContext() const { | |||
insnContext = EVEX_KB_U(IC_EVEX_XD); | |||
else if (OpPrefix == X86Local::PS) | |||
insnContext = EVEX_KB_U(IC_EVEX); | |||
else { | |||
errs() << "Instruction does not use a prefix: " << Name << "\n"; | |||
llvm_unreachable("Invalid prefix"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The default op prefix for EVEX
is PS
in compiler. 6e20df1
Should we use the message like
llvm_unreachable(("Invalid prefix for" + Name).c_str())
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you mean remove the errs() << ...
? How about the others? Both VEX and EVEX have the 4 prefixes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I meant there is always a default prefix opcode PS
for EVEX prefix, so does not use a prefix
is misleading.
Yes, I suggest merging the errs() <<
into llvm_unreachable
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, I've asked myself some time ago if there is a difference between llvm_unreachable
and errs()
. The deal is in production build. The message from llvm_unreachable
is not shown.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think if the errs()
is followed by llvm_unreachable
, the message will be optimized away too in prod build.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No sure about the iostream operations, but I remember llvm_unreachable
won't optimizate function call like printf.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So I lean toward to keep both.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Update the error message?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are only 2 out of 4141 grep -rwn llvm_unreachable llvm/lib/ | wc
use +
to show more detail. I think llvm_unreachable
is mainly used to show the position to developer, so no need detailed information.
@@ -214,6 +214,10 @@ InstructionContext RecognizableInstr::insnContext() const { | |||
insnContext = EVEX_KB_U(IC_EVEX_W_XD); | |||
else if (OpPrefix == X86Local::PS) | |||
insnContext = EVEX_KB_U(IC_EVEX_W); | |||
else { | |||
errs() << "Instruction does not use a prefix: " << Name << "\n"; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ditto
Is this fail related to your PR?
|
I think so, investigating.. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
if (Has512Len) | ||
setFeature(FEATURE_AVX10_2_512); | ||
} | ||
if (AVX10Ver >= 1) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the reason of relanding that compiler-rt parses the literal cpuid without any implications?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No. There're two problems from compiling:
- compiler-rt doesn't support
llvm_unreachable
; - some buildbot will warn about
[[fallthrough]]
But yes, I found the implications problem during revisiting it, i.e., an old version compiler-rt runs on future HW.
@@ -223,6 +227,10 @@ InstructionContext RecognizableInstr::insnContext() const { | |||
insnContext = EVEX_KB_U(IC_EVEX_XD); | |||
else if (OpPrefix == X86Local::PS) | |||
insnContext = EVEX_KB_U(IC_EVEX); | |||
else { | |||
errs() << "Instruction does not use a prefix: " << Name << "\n"; | |||
llvm_unreachable("Invalid prefix"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, I've asked myself some time ago if there is a difference between llvm_unreachable
and errs()
. The deal is in production build. The message from llvm_unreachable
is not shown.
✅ With the latest revision this PR passed the C/C++ code formatter. |
This is an interesting issue. The problem is due to https://github.com/llvm/llvm-project/blob/main/llvm/unittests/tools/llvm-cfi-verify/FileAnalysis.cpp#L242, which is an illegal encoding. While I switched the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with one minor suggestion
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965