Skip to content

Commit 259ca9e

Browse files
authored
Reland "[X86][AVX10.2] Support AVX10.2 option and VMPSADBW/VADDP[D,H,S] new instructions (#101452)" (#101616)
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965
1 parent a43677c commit 259ca9e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+1446
-42
lines changed

clang/docs/ReleaseNotes.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,8 @@ X86 Support
217217
functions defined by the ``*mmintrin.h`` headers. A mapping can be
218218
found in the file ``clang/www/builtins.py``.
219219

220+
- Support ISA of ``AVX10.2``.
221+
220222
Arm and AArch64 Support
221223
^^^^^^^^^^^^^^^^^^^^^^^
222224

clang/include/clang/Basic/BuiltinsX86.def

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1959,6 +1959,14 @@ TARGET_HEADER_BUILTIN(__readgsword, "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES,
19591959
TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
19601960
TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
19611961

1962+
// AVX10.2 VMPSADBW
1963+
TARGET_BUILTIN(__builtin_ia32_mpsadbw512, "V32sV64cV64cIc", "ncV:512:", "avx10.2-512")
1964+
1965+
// AVX10.2 YMM Rounding
1966+
TARGET_BUILTIN(__builtin_ia32_vaddpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
1967+
TARGET_BUILTIN(__builtin_ia32_vaddph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
1968+
TARGET_BUILTIN(__builtin_ia32_vaddps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")
1969+
19621970
// AVX-VNNI-INT16
19631971
TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
19641972
TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")

clang/include/clang/Driver/Options.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6205,6 +6205,12 @@ def mavx10_1_512 : Flag<["-"], "mavx10.1-512">, Group<m_x86_AVX10_Features_Group
62056205
def mno_avx10_1_512 : Flag<["-"], "mno-avx10.1-512">, Group<m_x86_AVX10_Features_Group>;
62066206
def mavx10_1 : Flag<["-"], "mavx10.1">, Alias<mavx10_1_256>;
62076207
def mno_avx10_1 : Flag<["-"], "mno-avx10.1">, Alias<mno_avx10_1_256>;
6208+
def mavx10_2_256 : Flag<["-"], "mavx10.2-256">, Group<m_x86_AVX10_Features_Group>;
6209+
def mno_avx10_2_256 : Flag<["-"], "mno-avx10.2-256">, Group<m_x86_AVX10_Features_Group>;
6210+
def mavx10_2_512 : Flag<["-"], "mavx10.2-512">, Group<m_x86_AVX10_Features_Group>;
6211+
def mno_avx10_2_512 : Flag<["-"], "mno-avx10.2-512">, Group<m_x86_AVX10_Features_Group>;
6212+
def mavx10_2 : Flag<["-"], "mavx10.2">, Alias<mavx10_2_256>;
6213+
def mno_avx10_2 : Flag<["-"], "mno-avx10.2">, Alias<mno_avx10_2_256>;
62086214
def mavx2 : Flag<["-"], "mavx2">, Group<m_x86_Features_Group>;
62096215
def mno_avx2 : Flag<["-"], "mno-avx2">, Group<m_x86_Features_Group>;
62106216
def mavx512f : Flag<["-"], "mavx512f">, Group<m_x86_Features_Group>;

clang/lib/Basic/Targets/X86.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,10 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
304304
HasAVX10_1 = true;
305305
} else if (Feature == "+avx10.1-512") {
306306
HasAVX10_1_512 = true;
307+
} else if (Feature == "+avx10.2-256") {
308+
HasAVX10_2 = true;
309+
} else if (Feature == "+avx10.2-512") {
310+
HasAVX10_2_512 = true;
307311
} else if (Feature == "+avx512cd") {
308312
HasAVX512CD = true;
309313
} else if (Feature == "+avx512vpopcntdq") {
@@ -824,6 +828,10 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
824828
Builder.defineMacro("__AVX10_1__");
825829
if (HasAVX10_1_512)
826830
Builder.defineMacro("__AVX10_1_512__");
831+
if (HasAVX10_2)
832+
Builder.defineMacro("__AVX10_2__");
833+
if (HasAVX10_2_512)
834+
Builder.defineMacro("__AVX10_2_512__");
827835
if (HasAVX512CD)
828836
Builder.defineMacro("__AVX512CD__");
829837
if (HasAVX512VPOPCNTDQ)
@@ -1056,6 +1064,8 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
10561064
.Case("avx", true)
10571065
.Case("avx10.1-256", true)
10581066
.Case("avx10.1-512", true)
1067+
.Case("avx10.2-256", true)
1068+
.Case("avx10.2-512", true)
10591069
.Case("avx2", true)
10601070
.Case("avx512f", true)
10611071
.Case("avx512cd", true)
@@ -1171,6 +1181,8 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
11711181
.Case("avx", SSELevel >= AVX)
11721182
.Case("avx10.1-256", HasAVX10_1)
11731183
.Case("avx10.1-512", HasAVX10_1_512)
1184+
.Case("avx10.2-256", HasAVX10_2)
1185+
.Case("avx10.2-512", HasAVX10_2_512)
11741186
.Case("avx2", SSELevel >= AVX2)
11751187
.Case("avx512f", SSELevel >= AVX512F)
11761188
.Case("avx512cd", HasAVX512CD)

clang/lib/Basic/Targets/X86.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
9292
bool HasF16C = false;
9393
bool HasAVX10_1 = false;
9494
bool HasAVX10_1_512 = false;
95+
bool HasAVX10_2 = false;
96+
bool HasAVX10_2_512 = false;
9597
bool HasEVEX512 = false;
9698
bool HasAVX512CD = false;
9799
bool HasAVX512VPOPCNTDQ = false;

clang/lib/Driver/ToolChains/Arch/X86.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
241241
assert(Name.starts_with("avx10.") && "Invalid AVX10 feature name.");
242242
StringRef Version, Width;
243243
std::tie(Version, Width) = Name.substr(6).split('-');
244-
assert(Version == "1" && "Invalid AVX10 feature name.");
244+
assert((Version == "1" || Version == "2") && "Invalid AVX10 feature name.");
245245
assert((Width == "256" || Width == "512") && "Invalid AVX10 feature name.");
246246
#endif
247247

clang/lib/Headers/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,8 @@ set(x86_files
147147
amxcomplexintrin.h
148148
amxfp16intrin.h
149149
amxintrin.h
150+
avx10_2_512niintrin.h
151+
avx10_2niintrin.h
150152
avx2intrin.h
151153
avx512bf16intrin.h
152154
avx512bitalgintrin.h
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/*===---- avx10_2_512niintrin.h - AVX10.2-512 new instruction intrinsics ---===
2+
*
3+
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
* See https://llvm.org/LICENSE.txt for license information.
5+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
*
7+
*===-----------------------------------------------------------------------===
8+
*/
9+
/* Refuse direct inclusion: this header is only meant to be pulled in through
 * <immintrin.h>, which defines __IMMINTRIN_H. */
#ifndef __IMMINTRIN_H
10+
#error \
11+
"Never use <avx10_2_512niintrin.h> directly; include <immintrin.h> instead."
12+
#endif
13+
14+
#ifdef __SSE2__
15+
16+
#ifndef __AVX10_2_512NIINTRIN_H
17+
#define __AVX10_2_512NIINTRIN_H
18+
19+
/* VMPSADBW
 *
 * 512-bit form: casts A and B to __v64qi and imm to int, forwards them to
 * __builtin_ia32_mpsadbw512, and casts the result to __m512i.
 * NOTE(review): the builtin is registered with an immediate last operand, so
 * imm is presumably required to be a compile-time constant -- confirm. */
20+
#define _mm512_mpsadbw_epu8(A, B, imm) \
21+
((__m512i)__builtin_ia32_mpsadbw512((__v64qi)(__m512i)(A), \
22+
(__v64qi)(__m512i)(B), (int)(imm)))
23+
24+
/* Merge-masking form: passes the result of _mm512_mpsadbw_epu8 (viewed as
 * __v32hi) and W to __builtin_ia32_selectw_512 under the 32-bit mask U.
 * NOTE(review): presumably a set bit in U keeps the computed 16-bit lane and a
 * clear bit keeps the lane from W -- confirm against the select builtin. */
#define _mm512_mask_mpsadbw_epu8(W, U, A, B, imm) \
25+
((__m512i)__builtin_ia32_selectw_512( \
26+
(__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
27+
(__v32hi)(__m512i)(W)))
28+
29+
/* Zero-masking form: same selection as above, but the alternative operand is
 * _mm512_setzero_si512() instead of a caller-supplied vector. */
#define _mm512_maskz_mpsadbw_epu8(U, A, B, imm) \
30+
((__m512i)__builtin_ia32_selectw_512( \
31+
(__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
32+
(__v32hi)_mm512_setzero_si512()))
33+
34+
#endif /* __AVX10_2_512NIINTRIN_H */
35+
#endif /* __SSE2__ */

clang/lib/Headers/avx10_2niintrin.h

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
/*===---- avx10_2niintrin.h - AVX10.2 new instruction intrinsics -----------===
2+
*
3+
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
* See https://llvm.org/LICENSE.txt for license information.
5+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
*
7+
*===-----------------------------------------------------------------------===
8+
*/
9+
/* Refuse direct inclusion: this header is only meant to be pulled in through
 * <immintrin.h>, which defines __IMMINTRIN_H. */
#ifndef __IMMINTRIN_H
10+
#error "Never use <avx10_2niintrin.h> directly; include <immintrin.h> instead."
11+
#endif
12+
13+
#ifdef __SSE2__
14+
15+
#ifndef __AVX10_2NIINTRIN_H
16+
#define __AVX10_2NIINTRIN_H
17+
18+
/* VMPSADBW
 *
 * Masked 128-bit and 256-bit forms. Each macro wraps the unmasked
 * _mm_mpsadbw_epu8 / _mm256_mpsadbw_epu8, which are NOT defined in this
 * header and must already be visible where these macros are expanded.
 * The "mask" variants hand the computed result and W to the word-select
 * builtin under mask U; the "maskz" variants use an all-zero vector in place
 * of W.
 * NOTE(review): presumably a set bit in U keeps the computed 16-bit lane --
 * confirm against the select builtin. */
19+
#define _mm_mask_mpsadbw_epu8(W, U, A, B, imm) \
20+
((__m128i)__builtin_ia32_selectw_128( \
21+
(__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
22+
(__v8hi)(__m128i)(W)))
23+
24+
#define _mm_maskz_mpsadbw_epu8(U, A, B, imm) \
25+
((__m128i)__builtin_ia32_selectw_128( \
26+
(__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
27+
(__v8hi)_mm_setzero_si128()))
28+
29+
/* 256-bit variants: the mask is cast to __mmask16 and the data is viewed as
 * __v16hi. */
#define _mm256_mask_mpsadbw_epu8(W, U, A, B, imm) \
30+
((__m256i)__builtin_ia32_selectw_256( \
31+
(__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
32+
(__v16hi)(__m256i)(W)))
33+
34+
#define _mm256_maskz_mpsadbw_epu8(U, A, B, imm) \
35+
((__m256i)__builtin_ia32_selectw_256( \
36+
(__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
37+
(__v16hi)_mm256_setzero_si256()))
38+
39+
/* YMM Rounding
 *
 * 256-bit additions that take an explicit rounding operand R. R is cast to int
 * and forwarded unchanged as the last argument of the matching
 * __builtin_ia32_vadd*256_round builtin. For each element type there is an
 * unmasked macro, a merge-masking macro (select between the sum and W under
 * mask U) and a zero-masking macro (select between the sum and zero).
 * NOTE(review): R is presumably a compile-time _MM_FROUND_* combination that
 * the compiler validates -- confirm against the Sema rounding check. */
40+
#define _mm256_add_round_pd(A, B, R) \
41+
((__m256d)__builtin_ia32_vaddpd256_round((__v4df)(__m256d)(A), \
42+
(__v4df)(__m256d)(B), (int)(R)))
43+
44+
#define _mm256_mask_add_round_pd(W, U, A, B, R) \
45+
((__m256d)__builtin_ia32_selectpd_256( \
46+
(__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \
47+
(__v4df)(__m256d)(W)))
48+
49+
#define _mm256_maskz_add_round_pd(U, A, B, R) \
50+
((__m256d)__builtin_ia32_selectpd_256( \
51+
(__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \
52+
(__v4df)_mm256_setzero_pd()))
53+
54+
/* Half-precision variants: operands are viewed as __v16hf, so the masked
 * forms cast U to __mmask16 rather than __mmask8. */
#define _mm256_add_round_ph(A, B, R) \
55+
((__m256h)__builtin_ia32_vaddph256_round((__v16hf)(__m256h)(A), \
56+
(__v16hf)(__m256h)(B), (int)(R)))
57+
58+
#define _mm256_mask_add_round_ph(W, U, A, B, R) \
59+
((__m256h)__builtin_ia32_selectph_256( \
60+
(__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
61+
(__v16hf)(__m256h)(W)))
62+
63+
#define _mm256_maskz_add_round_ph(U, A, B, R) \
64+
((__m256h)__builtin_ia32_selectph_256( \
65+
(__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
66+
(__v16hf)_mm256_setzero_ph()))
67+
68+
/* Single-precision variants: operands are viewed as __v8sf and the masked
 * forms cast U to __mmask8. */
#define _mm256_add_round_ps(A, B, R) \
69+
((__m256)__builtin_ia32_vaddps256_round((__v8sf)(__m256)(A), \
70+
(__v8sf)(__m256)(B), (int)(R)))
71+
72+
#define _mm256_mask_add_round_ps(W, U, A, B, R) \
73+
((__m256)__builtin_ia32_selectps_256( \
74+
(__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
75+
(__v8sf)(__m256)(W)))
76+
77+
#define _mm256_maskz_add_round_ps(U, A, B, R) \
78+
((__m256)__builtin_ia32_selectps_256( \
79+
(__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
80+
(__v8sf)_mm256_setzero_ps()))
81+
82+
#endif /* __AVX10_2NIINTRIN_H */
83+
#endif /* __SSE2__ */

clang/lib/Headers/immintrin.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -648,6 +648,14 @@ _storebe_i64(void * __P, long long __D) {
648648
#include <avx512vlvp2intersectintrin.h>
649649
#endif
650650

651+
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__)
652+
#include <avx10_2niintrin.h>
653+
#endif
654+
655+
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2_512__)
656+
#include <avx10_2_512niintrin.h>
657+
#endif
658+
651659
#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
652660
#include <enqcmdintrin.h>
653661
#endif

clang/lib/Sema/SemaX86.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,9 @@ bool SemaX86::CheckBuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
162162
case X86::BI__builtin_ia32_mulps512:
163163
case X86::BI__builtin_ia32_subpd512:
164164
case X86::BI__builtin_ia32_subps512:
165+
case X86::BI__builtin_ia32_vaddpd256_round:
166+
case X86::BI__builtin_ia32_vaddph256_round:
167+
case X86::BI__builtin_ia32_vaddps256_round:
165168
case X86::BI__builtin_ia32_cvtsi2sd64:
166169
case X86::BI__builtin_ia32_cvtsi2ss32:
167170
case X86::BI__builtin_ia32_cvtsi2ss64:
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2-512 -emit-llvm -o - | FileCheck %s
2+
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i686 -target-feature +avx10.2-512 -emit-llvm -o - | FileCheck %s
3+
4+
#include <immintrin.h>
5+
6+
// VMPSADBW
7+
__m512i test_mm512_mpsadbw_epu8(__m512i __A, __m512i __B) {
8+
// CHECK-LABEL: @test_mm512_mpsadbw_epu8
9+
// CHECK: @llvm.x86.avx10.vmpsadbw.512
10+
return _mm512_mpsadbw_epu8(__A, __B, 17);
11+
}
12+
13+
__m512i test_mm512_mask_mpsadbw_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
14+
// CHECK-LABEL: @test_mm512_mask_mpsadbw_epu8
15+
// CHECK: @llvm.x86.avx10.vmpsadbw.512
16+
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
17+
return _mm512_mask_mpsadbw_epu8(__W, __U, __A, __B, 17);
18+
}
19+
20+
__m512i test_mm512_maskz_mpsadbw_epu8(__mmask32 __U, __m512i __A, __m512i __B) {
21+
// CHECK-LABEL: @test_mm512_maskz_mpsadbw_epu8
22+
// CHECK: @llvm.x86.avx10.vmpsadbw.512
23+
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
24+
return _mm512_maskz_mpsadbw_epu8(__U, __A, __B, 17);
25+
}
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2-256 -emit-llvm -o - | FileCheck %s
2+
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i686 -target-feature +avx10.2-256 -emit-llvm -o - | FileCheck %s
3+
4+
#include <immintrin.h>
5+
6+
// VMPSADBW
7+
__m128i test_mm_mpsadbw_epu8(__m128i __A, __m128i __B) {
8+
// CHECK-LABEL: @test_mm_mpsadbw_epu8
9+
// CHECK: @llvm.x86.sse41.mpsadbw
10+
return _mm_mpsadbw_epu8(__A, __B, 170);
11+
}
12+
13+
__m128i test_mm_mask_mpsadbw_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
14+
// CHECK-LABEL: @test_mm_mask_mpsadbw_epu8
15+
// CHECK: @llvm.x86.sse41.mpsadbw
16+
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
17+
return _mm_mask_mpsadbw_epu8(__W, __U, __A, __B, 170);
18+
}
19+
20+
__m128i test_mm_maskz_mpsadbw_epu8(__mmask8 __U, __m128i __A, __m128i __B) {
21+
// CHECK-LABEL: @test_mm_maskz_mpsadbw_epu8
22+
// CHECK: @llvm.x86.sse41.mpsadbw
23+
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
24+
return _mm_maskz_mpsadbw_epu8(__U, __A, __B, 170);
25+
}
26+
27+
__m256i test_mm256_mpsadbw_epu8(__m256i __A, __m256i __B) {
28+
// CHECK-LABEL: @test_mm256_mpsadbw_epu8
29+
// CHECK: @llvm.x86.avx2.mpsadbw
30+
return _mm256_mpsadbw_epu8(__A, __B, 170);
31+
}
32+
33+
__m256i test_mm256_mask_mpsadbw_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
34+
// CHECK-LABEL: @test_mm256_mask_mpsadbw_epu8
35+
// CHECK: @llvm.x86.avx2.mpsadbw
36+
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
37+
return _mm256_mask_mpsadbw_epu8(__W, __U, __A, __B, 170);
38+
}
39+
40+
__m256i test_mm256_maskz_mpsadbw_epu8(__mmask16 __U, __m256i __A, __m256i __B) {
41+
// CHECK-LABEL: @test_mm256_maskz_mpsadbw_epu8
42+
// CHECK: @llvm.x86.avx2.mpsadbw
43+
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
44+
return _mm256_maskz_mpsadbw_epu8(__U, __A, __B, 170);
45+
}
46+
47+
// YMM Rounding
48+
__m256d test_mm256_add_round_pd(__m256d __A, __m256d __B) {
49+
// CHECK-LABEL: @test_mm256_add_round_pd
50+
// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 11)
51+
return _mm256_add_round_pd(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
52+
}
53+
54+
__m256d test_mm256_mask_add_round_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
55+
// CHECK-LABEL: @test_mm256_mask_add_round_pd
56+
// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 10)
57+
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
58+
return _mm256_mask_add_round_pd(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
59+
}
60+
61+
__m256d test_mm256_maskz_add_round_pd(__mmask8 __U, __m256d __A, __m256d __B) {
62+
// CHECK-LABEL: @test_mm256_maskz_add_round_pd
63+
// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 9)
64+
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
65+
return _mm256_maskz_add_round_pd(__U, __A, __B, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
66+
}
67+
68+
__m256h test_mm256_add_round_ph(__m256h __A, __m256h __B) {
69+
// CHECK-LABEL: @test_mm256_add_round_ph
70+
// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 11)
71+
return _mm256_add_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
72+
}
73+
74+
__m256h test_mm256_mask_add_round_ph(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
75+
// CHECK-LABEL: @test_mm256_mask_add_round_ph
76+
// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 10)
77+
// CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
78+
return _mm256_mask_add_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
79+
}
80+
81+
__m256h test_mm256_maskz_add_round_ph(__mmask8 __U, __m256h __A, __m256h __B) {
82+
// CHECK-LABEL: @test_mm256_maskz_add_round_ph
83+
// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 9)
84+
// CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
85+
return _mm256_maskz_add_round_ph(__U, __A, __B, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
86+
}
87+
88+
__m256 test_mm256_add_round_ps(__m256 __A, __m256 __B) {
89+
// CHECK-LABEL: @test_mm256_add_round_ps
90+
// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 11)
91+
return _mm256_add_round_ps(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
92+
}
93+
94+
__m256 test_mm256_mask_add_round_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
95+
// CHECK-LABEL: @test_mm256_mask_add_round_ps
96+
// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 10)
97+
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
98+
return _mm256_mask_add_round_ps(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
99+
}
100+
101+
__m256 test_mm256_maskz_add_round_ps(__mmask8 __U, __m256 __A, __m256 __B) {
102+
// CHECK-LABEL: @test_mm256_maskz_add_round_ps
103+
// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 9)
104+
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
105+
return _mm256_maskz_add_round_ps(__U, __A, __B, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
106+
}

0 commit comments

Comments
 (0)