Skip to content

Reland "[X86][AVX10.2] Support AVX10.2 option and VMPSADBW/VADDP[D,H,S] new instructions (#101452)" #101616

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Aug 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,8 @@ X86 Support
functions defined by the ``*mmintrin.h`` headers. A mapping can be
found in the file ``clang/www/builtins.py``.

- Support ISA of ``AVX10.2``.

Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
8 changes: 8 additions & 0 deletions clang/include/clang/Basic/BuiltinsX86.def
Original file line number Diff line number Diff line change
Expand Up @@ -1959,6 +1959,14 @@ TARGET_HEADER_BUILTIN(__readgsword, "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES,
TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")

// AVX10.2 VMPSADBW
TARGET_BUILTIN(__builtin_ia32_mpsadbw512, "V32sV64cV64cIc", "ncV:512:", "avx10.2-512")

// AVX10.2 YMM Rounding
TARGET_BUILTIN(__builtin_ia32_vaddpd256_round, "V4dV4dV4dIi", "nV:256:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vaddph256_round, "V16xV16xV16xIi", "nV:256:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vaddps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")

// AVX-VNNI-INT16
TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
Expand Down
6 changes: 6 additions & 0 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -6205,6 +6205,12 @@ def mavx10_1_512 : Flag<["-"], "mavx10.1-512">, Group<m_x86_AVX10_Features_Group
def mno_avx10_1_512 : Flag<["-"], "mno-avx10.1-512">, Group<m_x86_AVX10_Features_Group>;
def mavx10_1 : Flag<["-"], "mavx10.1">, Alias<mavx10_1_256>;
def mno_avx10_1 : Flag<["-"], "mno-avx10.1">, Alias<mno_avx10_1_256>;
def mavx10_2_256 : Flag<["-"], "mavx10.2-256">, Group<m_x86_AVX10_Features_Group>;
def mno_avx10_2_256 : Flag<["-"], "mno-avx10.2-256">, Group<m_x86_AVX10_Features_Group>;
def mavx10_2_512 : Flag<["-"], "mavx10.2-512">, Group<m_x86_AVX10_Features_Group>;
def mno_avx10_2_512 : Flag<["-"], "mno-avx10.2-512">, Group<m_x86_AVX10_Features_Group>;
def mavx10_2 : Flag<["-"], "mavx10.2">, Alias<mavx10_2_256>;
def mno_avx10_2 : Flag<["-"], "mno-avx10.2">, Alias<mno_avx10_2_256>;
def mavx2 : Flag<["-"], "mavx2">, Group<m_x86_Features_Group>;
def mno_avx2 : Flag<["-"], "mno-avx2">, Group<m_x86_Features_Group>;
def mavx512f : Flag<["-"], "mavx512f">, Group<m_x86_Features_Group>;
Expand Down
12 changes: 12 additions & 0 deletions clang/lib/Basic/Targets/X86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,10 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAVX10_1 = true;
} else if (Feature == "+avx10.1-512") {
HasAVX10_1_512 = true;
} else if (Feature == "+avx10.2-256") {
HasAVX10_2 = true;
} else if (Feature == "+avx10.2-512") {
HasAVX10_2_512 = true;
} else if (Feature == "+avx512cd") {
HasAVX512CD = true;
} else if (Feature == "+avx512vpopcntdq") {
Expand Down Expand Up @@ -824,6 +828,10 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AVX10_1__");
if (HasAVX10_1_512)
Builder.defineMacro("__AVX10_1_512__");
if (HasAVX10_2)
Builder.defineMacro("__AVX10_2__");
if (HasAVX10_2_512)
Builder.defineMacro("__AVX10_2_512__");
if (HasAVX512CD)
Builder.defineMacro("__AVX512CD__");
if (HasAVX512VPOPCNTDQ)
Expand Down Expand Up @@ -1056,6 +1064,8 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("avx", true)
.Case("avx10.1-256", true)
.Case("avx10.1-512", true)
.Case("avx10.2-256", true)
.Case("avx10.2-512", true)
.Case("avx2", true)
.Case("avx512f", true)
.Case("avx512cd", true)
Expand Down Expand Up @@ -1171,6 +1181,8 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("avx", SSELevel >= AVX)
.Case("avx10.1-256", HasAVX10_1)
.Case("avx10.1-512", HasAVX10_1_512)
.Case("avx10.2-256", HasAVX10_2)
.Case("avx10.2-512", HasAVX10_2_512)
.Case("avx2", SSELevel >= AVX2)
.Case("avx512f", SSELevel >= AVX512F)
.Case("avx512cd", HasAVX512CD)
Expand Down
2 changes: 2 additions & 0 deletions clang/lib/Basic/Targets/X86.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasF16C = false;
bool HasAVX10_1 = false;
bool HasAVX10_1_512 = false;
bool HasAVX10_2 = false;
bool HasAVX10_2_512 = false;
bool HasEVEX512 = false;
bool HasAVX512CD = false;
bool HasAVX512VPOPCNTDQ = false;
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/Driver/ToolChains/Arch/X86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
assert(Name.starts_with("avx10.") && "Invalid AVX10 feature name.");
StringRef Version, Width;
std::tie(Version, Width) = Name.substr(6).split('-');
assert(Version == "1" && "Invalid AVX10 feature name.");
assert((Version == "1" || Version == "2") && "Invalid AVX10 feature name.");
assert((Width == "256" || Width == "512") && "Invalid AVX10 feature name.");
#endif

Expand Down
2 changes: 2 additions & 0 deletions clang/lib/Headers/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ set(x86_files
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
avx10_2_512niintrin.h
avx10_2niintrin.h
avx2intrin.h
avx512bf16intrin.h
avx512bitalgintrin.h
Expand Down
35 changes: 35 additions & 0 deletions clang/lib/Headers/avx10_2_512niintrin.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*===---- avx10_2_512niintrin.h - AVX10.2-512 new instruction intrinsics ---===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avx10_2_512niintrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX10_2_512NIINTRIN_H
#define __AVX10_2_512NIINTRIN_H

/* VMPSADBW */
#define _mm512_mpsadbw_epu8(A, B, imm) \
((__m512i)__builtin_ia32_mpsadbw512((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), (int)(imm)))

#define _mm512_mask_mpsadbw_epu8(W, U, A, B, imm) \
((__m512i)__builtin_ia32_selectw_512( \
(__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
(__v32hi)(__m512i)(W)))

#define _mm512_maskz_mpsadbw_epu8(U, A, B, imm) \
((__m512i)__builtin_ia32_selectw_512( \
(__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
(__v32hi)_mm512_setzero_si512()))

#endif /* __SSE2__ */
#endif /* __AVX10_2_512NIINTRIN_H */
83 changes: 83 additions & 0 deletions clang/lib/Headers/avx10_2niintrin.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/*===---- avx10_2niintrin.h - AVX10.2 new instruction intrinsics -----------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx10_2niintrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX10_2NIINTRIN_H
#define __AVX10_2NIINTRIN_H

/* VMPSADBW */
#define _mm_mask_mpsadbw_epu8(W, U, A, B, imm) \
((__m128i)__builtin_ia32_selectw_128( \
(__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
(__v8hi)(__m128i)(W)))

#define _mm_maskz_mpsadbw_epu8(U, A, B, imm) \
((__m128i)__builtin_ia32_selectw_128( \
(__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \
(__v8hi)_mm_setzero_si128()))

#define _mm256_mask_mpsadbw_epu8(W, U, A, B, imm) \
((__m256i)__builtin_ia32_selectw_256( \
(__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
(__v16hi)(__m256i)(W)))

#define _mm256_maskz_mpsadbw_epu8(U, A, B, imm) \
((__m256i)__builtin_ia32_selectw_256( \
(__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \
(__v16hi)_mm256_setzero_si256()))

/* YMM Rounding */
#define _mm256_add_round_pd(A, B, R) \
((__m256d)__builtin_ia32_vaddpd256_round((__v4df)(__m256d)(A), \
(__v4df)(__m256d)(B), (int)(R)))

#define _mm256_mask_add_round_pd(W, U, A, B, R) \
((__m256d)__builtin_ia32_selectpd_256( \
(__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \
(__v4df)(__m256d)(W)))

#define _mm256_maskz_add_round_pd(U, A, B, R) \
((__m256d)__builtin_ia32_selectpd_256( \
(__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \
(__v4df)_mm256_setzero_pd()))

#define _mm256_add_round_ph(A, B, R) \
((__m256h)__builtin_ia32_vaddph256_round((__v16hf)(__m256h)(A), \
(__v16hf)(__m256h)(B), (int)(R)))

#define _mm256_mask_add_round_ph(W, U, A, B, R) \
((__m256h)__builtin_ia32_selectph_256( \
(__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
(__v16hf)(__m256h)(W)))

#define _mm256_maskz_add_round_ph(U, A, B, R) \
((__m256h)__builtin_ia32_selectph_256( \
(__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \
(__v16hf)_mm256_setzero_ph()))

#define _mm256_add_round_ps(A, B, R) \
((__m256)__builtin_ia32_vaddps256_round((__v8sf)(__m256)(A), \
(__v8sf)(__m256)(B), (int)(R)))

#define _mm256_mask_add_round_ps(W, U, A, B, R) \
((__m256)__builtin_ia32_selectps_256( \
(__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
(__v8sf)(__m256)(W)))

#define _mm256_maskz_add_round_ps(U, A, B, R) \
((__m256)__builtin_ia32_selectps_256( \
(__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \
(__v8sf)_mm256_setzero_ps()))

#endif /* __AVX10_2NIINTRIN_H */
#endif /* __SSE2__ */
8 changes: 8 additions & 0 deletions clang/lib/Headers/immintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,14 @@ _storebe_i64(void * __P, long long __D) {
#include <avx512vlvp2intersectintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__)
#include <avx10_2niintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2_512__)
#include <avx10_2_512niintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
#include <enqcmdintrin.h>
#endif
Expand Down
3 changes: 3 additions & 0 deletions clang/lib/Sema/SemaX86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ bool SemaX86::CheckBuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_mulps512:
case X86::BI__builtin_ia32_subpd512:
case X86::BI__builtin_ia32_subps512:
case X86::BI__builtin_ia32_vaddpd256_round:
case X86::BI__builtin_ia32_vaddph256_round:
case X86::BI__builtin_ia32_vaddps256_round:
case X86::BI__builtin_ia32_cvtsi2sd64:
case X86::BI__builtin_ia32_cvtsi2ss32:
case X86::BI__builtin_ia32_cvtsi2ss64:
Expand Down
25 changes: 25 additions & 0 deletions clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2-512 -emit-llvm -o - | FileCheck %s
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i686 -target-feature +avx10.2-512 -emit-llvm -o - | FileCheck %s

#include <immintrin.h>

// VMPSADBW
__m512i test_mm512_mpsadbw_epu8(__m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mpsadbw_epu8
// CHECK: @llvm.x86.avx10.vmpsadbw.512
return _mm512_mpsadbw_epu8(__A, __B, 17);
}

__m512i test_mm512_mask_mpsadbw_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_mask_mpsadbw_epu8
// CHECK: @llvm.x86.avx10.vmpsadbw.512
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_mask_mpsadbw_epu8(__W, __U, __A, __B, 17);
}

__m512i test_mm512_maskz_mpsadbw_epu8(__mmask32 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: @test_mm512_maskz_mpsadbw_epu8
// CHECK: @llvm.x86.avx10.vmpsadbw.512
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_mpsadbw_epu8(__U, __A, __B, 17);
}
106 changes: 106 additions & 0 deletions clang/test/CodeGen/X86/avx10_2ni-builtins.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64 -target-feature +avx10.2-256 -emit-llvm -o - | FileCheck %s
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i686 -target-feature +avx10.2-256 -emit-llvm -o - | FileCheck %s

#include <immintrin.h>

// VMPSADBW
__m128i test_mm_mpsadbw_epu8(__m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mpsadbw_epu8
// CHECK: @llvm.x86.sse41.mpsadbw
return _mm_mpsadbw_epu8(__A, __B, 170);
}

__m128i test_mm_mask_mpsadbw_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_mask_mpsadbw_epu8
// CHECK: @llvm.x86.sse41.mpsadbw
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_mask_mpsadbw_epu8(__W, __U, __A, __B, 170);
}

__m128i test_mm_maskz_mpsadbw_epu8(__mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_maskz_mpsadbw_epu8
// CHECK: @llvm.x86.sse41.mpsadbw
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_maskz_mpsadbw_epu8(__U, __A, __B, 170);
}

__m256i test_mm256_mpsadbw_epu8(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mpsadbw_epu8
// CHECK: @llvm.x86.avx2.mpsadbw
return _mm256_mpsadbw_epu8(__A, __B, 170);
}

__m256i test_mm256_mask_mpsadbw_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_mask_mpsadbw_epu8
// CHECK: @llvm.x86.avx2.mpsadbw
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_mask_mpsadbw_epu8(__W, __U, __A, __B, 170);
}

__m256i test_mm256_maskz_mpsadbw_epu8(__mmask16 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_maskz_mpsadbw_epu8
// CHECK: @llvm.x86.avx2.mpsadbw
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_maskz_mpsadbw_epu8(__U, __A, __B, 170);
}

// YMM Rounding
__m256d test_mm256_add_round_pd(__m256d __A, __m256d __B) {
// CHECK-LABEL: @test_mm256_add_round_pd
// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 11)
return _mm256_add_round_pd(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}

__m256d test_mm256_mask_add_round_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
// CHECK-LABEL: @test_mm256_mask_add_round_pd
// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 10)
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_mask_add_round_pd(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
}

__m256d test_mm256_maskz_add_round_pd(__mmask8 __U, __m256d __A, __m256d __B) {
// CHECK-LABEL: @test_mm256_maskz_add_round_pd
// CHECK: @llvm.x86.avx10.vaddpd256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i32 9)
// CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
return _mm256_maskz_add_round_pd(__U, __A, __B, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}

__m256h test_mm256_add_round_ph(__m256h __A, __m256h __B) {
// CHECK-LABEL: @test_mm256_add_round_ph
// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 11)
return _mm256_add_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}

__m256h test_mm256_mask_add_round_ph(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
// CHECK-LABEL: @test_mm256_mask_add_round_ph
// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 10)
// CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
return _mm256_mask_add_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
}

__m256h test_mm256_maskz_add_round_ph(__mmask8 __U, __m256h __A, __m256h __B) {
// CHECK-LABEL: @test_mm256_maskz_add_round_ph
// CHECK: @llvm.x86.avx10.vaddph256(<16 x half> %{{.*}}, <16 x half> %{{.*}}, i32 9)
// CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}
return _mm256_maskz_add_round_ph(__U, __A, __B, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}

__m256 test_mm256_add_round_ps(__m256 __A, __m256 __B) {
// CHECK-LABEL: @test_mm256_add_round_ps
// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 11)
return _mm256_add_round_ps(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}

__m256 test_mm256_mask_add_round_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
// CHECK-LABEL: @test_mm256_mask_add_round_ps
// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 10)
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_add_round_ps(__W, __U, __A, __B, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
}

__m256 test_mm256_maskz_add_round_ps(__mmask8 __U, __m256 __A, __m256 __B) {
// CHECK-LABEL: @test_mm256_maskz_add_round_ps
// CHECK: @llvm.x86.avx10.vaddps256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i32 9)
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_add_round_ps(__U, __A, __B, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}
Loading
Loading