-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[X86][AVX10.2] Support AVX10.2-BF16 new instructions. #101603
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
You can test this locally with the following command:git-clang-format --diff b412ec5d3924c7570c2c96106f95a92403a4e09b a6c4fce4069fb81ab4a50d825fe6039d45827d63 --extensions inc,cpp,h,c -- clang/lib/Headers/avx10_2_512bf16intrin.h clang/lib/Headers/avx10_2bf16intrin.h clang/test/CodeGen/X86/avx10_2_512bf16-builtins.c clang/test/CodeGen/X86/avx10_2bf16-builtins.c clang/lib/Basic/Targets/X86.cpp clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Headers/immintrin.h clang/lib/Sema/SemaX86.cpp llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp llvm/lib/Target/X86/X86ISelLowering.cpp llvm/lib/Target/X86/X86InstrFMA3Info.cpp llvm/lib/Target/X86/X86IntrinsicsInfo.h llvm/test/TableGen/x86-fold-tables.inc View the diff from clang-format here.diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index c242b406f2..8320dfa5ab 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -3310,56 +3310,57 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
bool IsVCMP = PatchedName[0] == 'v';
unsigned CCIdx = IsVCMP ? 4 : 3;
unsigned suffixLength = PatchedName.ends_with("pbf16") ? 5 : 2;
- unsigned CC = StringSwitch<unsigned>(
- PatchedName.slice(CCIdx, PatchedName.size() - suffixLength))
- .Case("eq", 0x00)
- .Case("eq_oq", 0x00)
- .Case("lt", 0x01)
- .Case("lt_os", 0x01)
- .Case("le", 0x02)
- .Case("le_os", 0x02)
- .Case("unord", 0x03)
- .Case("unord_q", 0x03)
- .Case("neq", 0x04)
- .Case("neq_uq", 0x04)
- .Case("nlt", 0x05)
- .Case("nlt_us", 0x05)
- .Case("nle", 0x06)
- .Case("nle_us", 0x06)
- .Case("ord", 0x07)
- .Case("ord_q", 0x07)
- /* AVX only from here */
- .Case("eq_uq", 0x08)
- .Case("nge", 0x09)
- .Case("nge_us", 0x09)
- .Case("ngt", 0x0A)
- .Case("ngt_us", 0x0A)
- .Case("false", 0x0B)
- .Case("false_oq", 0x0B)
- .Case("neq_oq", 0x0C)
- .Case("ge", 0x0D)
- .Case("ge_os", 0x0D)
- .Case("gt", 0x0E)
- .Case("gt_os", 0x0E)
- .Case("true", 0x0F)
- .Case("true_uq", 0x0F)
- .Case("eq_os", 0x10)
- .Case("lt_oq", 0x11)
- .Case("le_oq", 0x12)
- .Case("unord_s", 0x13)
- .Case("neq_us", 0x14)
- .Case("nlt_uq", 0x15)
- .Case("nle_uq", 0x16)
- .Case("ord_s", 0x17)
- .Case("eq_us", 0x18)
- .Case("nge_uq", 0x19)
- .Case("ngt_uq", 0x1A)
- .Case("false_os", 0x1B)
- .Case("neq_os", 0x1C)
- .Case("ge_oq", 0x1D)
- .Case("gt_oq", 0x1E)
- .Case("true_us", 0x1F)
- .Default(~0U);
+ unsigned CC =
+ StringSwitch<unsigned>(
+ PatchedName.slice(CCIdx, PatchedName.size() - suffixLength))
+ .Case("eq", 0x00)
+ .Case("eq_oq", 0x00)
+ .Case("lt", 0x01)
+ .Case("lt_os", 0x01)
+ .Case("le", 0x02)
+ .Case("le_os", 0x02)
+ .Case("unord", 0x03)
+ .Case("unord_q", 0x03)
+ .Case("neq", 0x04)
+ .Case("neq_uq", 0x04)
+ .Case("nlt", 0x05)
+ .Case("nlt_us", 0x05)
+ .Case("nle", 0x06)
+ .Case("nle_us", 0x06)
+ .Case("ord", 0x07)
+ .Case("ord_q", 0x07)
+ /* AVX only from here */
+ .Case("eq_uq", 0x08)
+ .Case("nge", 0x09)
+ .Case("nge_us", 0x09)
+ .Case("ngt", 0x0A)
+ .Case("ngt_us", 0x0A)
+ .Case("false", 0x0B)
+ .Case("false_oq", 0x0B)
+ .Case("neq_oq", 0x0C)
+ .Case("ge", 0x0D)
+ .Case("ge_os", 0x0D)
+ .Case("gt", 0x0E)
+ .Case("gt_os", 0x0E)
+ .Case("true", 0x0F)
+ .Case("true_uq", 0x0F)
+ .Case("eq_os", 0x10)
+ .Case("lt_oq", 0x11)
+ .Case("le_oq", 0x12)
+ .Case("unord_s", 0x13)
+ .Case("neq_us", 0x14)
+ .Case("nlt_uq", 0x15)
+ .Case("nle_uq", 0x16)
+ .Case("ord_s", 0x17)
+ .Case("eq_us", 0x18)
+ .Case("nge_uq", 0x19)
+ .Case("ngt_uq", 0x1A)
+ .Case("false_os", 0x1B)
+ .Case("neq_os", 0x1C)
+ .Case("ge_oq", 0x1D)
+ .Case("gt_oq", 0x1E)
+ .Case("true_us", 0x1F)
+ .Default(~0U);
if (CC != ~0U && (IsVCMP || CC < 8) &&
(IsVCMP || PatchedName.back() != 'h')) {
if (PatchedName.ends_with("ss"))
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
index 8fcc1c10d9..624e53cf4b 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -167,15 +167,24 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI,
case X86::VCMPPHZrmbi: case X86::VCMPPHZrmbik:
case X86::VCMPPHZrrib: case X86::VCMPPHZrribk:
case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrrib_Intk:
- case X86::VCMPPBF16Z128rmi: case X86::VCMPPBF16Z128rri:
- case X86::VCMPPBF16Z256rmi: case X86::VCMPPBF16Z256rri:
- case X86::VCMPPBF16Zrmi: case X86::VCMPPBF16Zrri:
- case X86::VCMPPBF16Z128rmik: case X86::VCMPPBF16Z128rrik:
- case X86::VCMPPBF16Z256rmik: case X86::VCMPPBF16Z256rrik:
- case X86::VCMPPBF16Zrmik: case X86::VCMPPBF16Zrrik:
- case X86::VCMPPBF16Z128rmbi: case X86::VCMPPBF16Z128rmbik:
- case X86::VCMPPBF16Z256rmbi: case X86::VCMPPBF16Z256rmbik:
- case X86::VCMPPBF16Zrmbi: case X86::VCMPPBF16Zrmbik:
+ case X86::VCMPPBF16Z128rmi:
+ case X86::VCMPPBF16Z128rri:
+ case X86::VCMPPBF16Z256rmi:
+ case X86::VCMPPBF16Z256rri:
+ case X86::VCMPPBF16Zrmi:
+ case X86::VCMPPBF16Zrri:
+ case X86::VCMPPBF16Z128rmik:
+ case X86::VCMPPBF16Z128rrik:
+ case X86::VCMPPBF16Z256rmik:
+ case X86::VCMPPBF16Z256rrik:
+ case X86::VCMPPBF16Zrmik:
+ case X86::VCMPPBF16Zrrik:
+ case X86::VCMPPBF16Z128rmbi:
+ case X86::VCMPPBF16Z128rmbik:
+ case X86::VCMPPBF16Z256rmbi:
+ case X86::VCMPPBF16Z256rmbik:
+ case X86::VCMPPBF16Zrmbi:
+ case X86::VCMPPBF16Zrmbik:
if (Imm >= 0 && Imm <= 31) {
OS << '\t';
printCMPMnemonic(MI, /*IsVCMP*/true, OS);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index e7ba13215f..3e664c5aef 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -309,15 +309,24 @@ void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool IsVCmp,
case X86::VCMPSHZrmi_Intk: case X86::VCMPSHZrri_Intk:
OS << "sh\t";
break;
- case X86::VCMPPBF16Z128rmi: case X86::VCMPPBF16Z128rri:
- case X86::VCMPPBF16Z256rmi: case X86::VCMPPBF16Z256rri:
- case X86::VCMPPBF16Zrmi: case X86::VCMPPBF16Zrri:
- case X86::VCMPPBF16Z128rmik: case X86::VCMPPBF16Z128rrik:
- case X86::VCMPPBF16Z256rmik: case X86::VCMPPBF16Z256rrik:
- case X86::VCMPPBF16Zrmik: case X86::VCMPPBF16Zrrik:
- case X86::VCMPPBF16Z128rmbi: case X86::VCMPPBF16Z128rmbik:
- case X86::VCMPPBF16Z256rmbi: case X86::VCMPPBF16Z256rmbik:
- case X86::VCMPPBF16Zrmbi: case X86::VCMPPBF16Zrmbik:
+ case X86::VCMPPBF16Z128rmi:
+ case X86::VCMPPBF16Z128rri:
+ case X86::VCMPPBF16Z256rmi:
+ case X86::VCMPPBF16Z256rri:
+ case X86::VCMPPBF16Zrmi:
+ case X86::VCMPPBF16Zrri:
+ case X86::VCMPPBF16Z128rmik:
+ case X86::VCMPPBF16Z128rrik:
+ case X86::VCMPPBF16Z256rmik:
+ case X86::VCMPPBF16Z256rrik:
+ case X86::VCMPPBF16Zrmik:
+ case X86::VCMPPBF16Zrrik:
+ case X86::VCMPPBF16Z128rmbi:
+ case X86::VCMPPBF16Z128rmbik:
+ case X86::VCMPPBF16Z256rmbi:
+ case X86::VCMPPBF16Z256rmbik:
+ case X86::VCMPPBF16Zrmbi:
+ case X86::VCMPPBF16Zrmbik:
OS << "pbf16\t";
break;
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
index 39600ffcad..399660e3e5 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -146,15 +146,24 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS
case X86::VCMPPHZrmbi: case X86::VCMPPHZrmbik:
case X86::VCMPPHZrrib: case X86::VCMPPHZrribk:
case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrrib_Intk:
- case X86::VCMPPBF16Z128rmi: case X86::VCMPPBF16Z128rri:
- case X86::VCMPPBF16Z256rmi: case X86::VCMPPBF16Z256rri:
- case X86::VCMPPBF16Zrmi: case X86::VCMPPBF16Zrri:
- case X86::VCMPPBF16Z128rmik: case X86::VCMPPBF16Z128rrik:
- case X86::VCMPPBF16Z256rmik: case X86::VCMPPBF16Z256rrik:
- case X86::VCMPPBF16Zrmik: case X86::VCMPPBF16Zrrik:
- case X86::VCMPPBF16Z128rmbi: case X86::VCMPPBF16Z128rmbik:
- case X86::VCMPPBF16Z256rmbi: case X86::VCMPPBF16Z256rmbik:
- case X86::VCMPPBF16Zrmbi: case X86::VCMPPBF16Zrmbik:
+ case X86::VCMPPBF16Z128rmi:
+ case X86::VCMPPBF16Z128rri:
+ case X86::VCMPPBF16Z256rmi:
+ case X86::VCMPPBF16Z256rri:
+ case X86::VCMPPBF16Zrmi:
+ case X86::VCMPPBF16Zrri:
+ case X86::VCMPPBF16Z128rmik:
+ case X86::VCMPPBF16Z128rrik:
+ case X86::VCMPPBF16Z256rmik:
+ case X86::VCMPPBF16Z256rrik:
+ case X86::VCMPPBF16Zrmik:
+ case X86::VCMPPBF16Zrrik:
+ case X86::VCMPPBF16Z128rmbi:
+ case X86::VCMPPBF16Z128rmbik:
+ case X86::VCMPPBF16Z256rmbi:
+ case X86::VCMPPBF16Z256rmbik:
+ case X86::VCMPPBF16Zrmbi:
+ case X86::VCMPPBF16Zrmbik:
if (Imm >= 0 && Imm <= 31) {
OS << '\t';
printCMPMnemonic(MI, /*IsVCMP*/true, OS);
diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
index 94de164d5f..98cdb0632d 100644
--- a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
+++ b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -43,12 +43,12 @@ using namespace llvm;
FMA3GROUP(Name, Suf##m, Attrs) \
FMA3GROUP(Name, Suf##r, Attrs)
-#define FMA3GROUP_PACKED_DHS(Name, Attrs) \
- FMA3GROUP_PACKED_WIDTHS_ALL(Name, PD, Attrs) \
- FMA3GROUP_PACKED_WIDTHS_Z(Name, PH, Attrs) \
+#define FMA3GROUP_PACKED_DHS(Name, Attrs) \
+ FMA3GROUP_PACKED_WIDTHS_ALL(Name, PD, Attrs) \
+ FMA3GROUP_PACKED_WIDTHS_Z(Name, PH, Attrs) \
FMA3GROUP_PACKED_WIDTHS_ALL(Name, PS, Attrs)
-#define FMA3GROUP_PACKED_BF16(Name, Attrs) \
+#define FMA3GROUP_PACKED_BF16(Name, Attrs) \
FMA3GROUP_PACKED_WIDTHS_Z(Name, NEPBF16, Attrs)
#define FMA3GROUP_SCALAR_WIDTHS_Z(Name, Suf, Attrs) \
@@ -69,34 +69,30 @@ using namespace llvm;
FMA3GROUP_SCALAR_WIDTHS_Z(Name, SH, Attrs) \
FMA3GROUP_SCALAR_WIDTHS_ALL(Name, SS, Attrs)
-#define FMA3GROUP_FULL(Name, Attrs) \
- FMA3GROUP_PACKED_BF16(Name, Attrs) \
- FMA3GROUP_PACKED_DHS(Name, Attrs) \
+#define FMA3GROUP_FULL(Name, Attrs) \
+ FMA3GROUP_PACKED_BF16(Name, Attrs) \
+ FMA3GROUP_PACKED_DHS(Name, Attrs) \
FMA3GROUP_SCALAR(Name, Attrs)
static const X86InstrFMA3Group Groups[] = {
- FMA3GROUP_FULL(VFMADD, 0)
- FMA3GROUP_PACKED_DHS(VFMADDSUB, 0)
- FMA3GROUP_FULL(VFMSUB, 0)
- FMA3GROUP_PACKED_DHS(VFMSUBADD, 0)
- FMA3GROUP_FULL(VFNMADD, 0)
- FMA3GROUP_FULL(VFNMSUB, 0)
-};
+ FMA3GROUP_FULL(VFMADD, 0) FMA3GROUP_PACKED_DHS(VFMADDSUB, 0)
+ FMA3GROUP_FULL(VFMSUB, 0) FMA3GROUP_PACKED_DHS(VFMSUBADD, 0)
+ FMA3GROUP_FULL(VFNMADD, 0) FMA3GROUP_FULL(VFNMSUB, 0)};
#define FMA3GROUP_PACKED_AVX512_WIDTHS(Name, Type, Suf, Attrs) \
FMA3GROUP_MASKED(Name, Type##Z128##Suf, Attrs) \
FMA3GROUP_MASKED(Name, Type##Z256##Suf, Attrs) \
FMA3GROUP_MASKED(Name, Type##Z##Suf, Attrs)
-#define FMA3GROUP_PACKED_AVX512_ALL(Name, Suf, Attrs) \
- FMA3GROUP_PACKED_AVX512_WIDTHS(Name, NEPBF16, Suf, Attrs) \
- FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PD, Suf, Attrs) \
- FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PH, Suf, Attrs) \
+#define FMA3GROUP_PACKED_AVX512_ALL(Name, Suf, Attrs) \
+ FMA3GROUP_PACKED_AVX512_WIDTHS(Name, NEPBF16, Suf, Attrs) \
+ FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PD, Suf, Attrs) \
+ FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PH, Suf, Attrs) \
FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PS, Suf, Attrs)
-#define FMA3GROUP_PACKED_AVX512_DHS(Name, Suf, Attrs) \
- FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PD, Suf, Attrs) \
- FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PH, Suf, Attrs) \
+#define FMA3GROUP_PACKED_AVX512_DHS(Name, Suf, Attrs) \
+ FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PD, Suf, Attrs) \
+ FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PH, Suf, Attrs) \
FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PS, Suf, Attrs)
#define FMA3GROUP_PACKED_AVX512_ROUND(Name, Suf, Attrs) \
@@ -116,13 +112,12 @@ static const X86InstrFMA3Group Groups[] = {
FMA3GROUP_MASKED(Name, SSZ##Suf##_Int, Attrs)
static const X86InstrFMA3Group BroadcastGroups[] = {
- FMA3GROUP_PACKED_AVX512_ALL(VFMADD, mb, 0)
- FMA3GROUP_PACKED_AVX512_DHS(VFMADDSUB, mb, 0)
- FMA3GROUP_PACKED_AVX512_ALL(VFMSUB, mb, 0)
- FMA3GROUP_PACKED_AVX512_DHS(VFMSUBADD, mb, 0)
- FMA3GROUP_PACKED_AVX512_ALL(VFNMADD, mb, 0)
- FMA3GROUP_PACKED_AVX512_ALL(VFNMSUB, mb, 0)
-};
+ FMA3GROUP_PACKED_AVX512_ALL(VFMADD, mb, 0)
+ FMA3GROUP_PACKED_AVX512_DHS(VFMADDSUB, mb, 0)
+ FMA3GROUP_PACKED_AVX512_ALL(VFMSUB, mb, 0)
+ FMA3GROUP_PACKED_AVX512_DHS(VFMSUBADD, mb, 0)
+ FMA3GROUP_PACKED_AVX512_ALL(VFNMADD, mb, 0)
+ FMA3GROUP_PACKED_AVX512_ALL(VFNMSUB, mb, 0)};
static const X86InstrFMA3Group RoundGroups[] = {
FMA3GROUP_PACKED_AVX512_ROUND(VFMADD, rb, 0)
|
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-mc Author: Freddy Ye (FreddyLeaf) Changes — Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965 Patch is 1.24 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101603.diff 30 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index e4aa8661b9a806..48376ee0527980 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -2261,6 +2261,68 @@ TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_512_mask, "V32cV32xV32cUi", "nV:512:"
TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
+
+// AVX10.2 BF16
+TARGET_BUILTIN(__builtin_ia32_loadsbf16128_mask, "V8yV8yC*V8yUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_storesbf16128_mask, "vV8y*V8yUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vaddnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vaddnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vaddnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vdivnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vdivnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vdivnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vmaxpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vmaxpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vmaxpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vmulnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vmulnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vmulnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vsubnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vsubnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vsubnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcomsbf16eq, "iV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcomsbf16lt, "iV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcomsbf16neq, "iV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcomsbf16ge, "iV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcomsbf16gt, "iV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcomsbf16le, "iV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcmppbf16512_mask,"UiV32yV32yIiUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcmppbf16256_mask,"UsV16yV16yIiUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcmppbf16128_mask,"UcV8yV8yIiUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16128_mask, "UcV8yIiUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16256_mask, "UsV16yIiUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16512_mask, "UiV32yIiUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vscalefpbf16128_mask, "V8yV8yV8yV8yUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vscalefpbf16256_mask, "V16yV16yV16yV16yUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vscalefpbf16512_mask, "V32yV32yV32yV32yUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vrcppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrcppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrcppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vgetexppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vgetexppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vgetexppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vreducenepbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vreducenepbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vreducenepbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16, "V8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16256, "V16yV16y", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16512, "V32yV32y", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh512, "V32yV32yV32yV32y", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh256, "V16yV16yV16yV16y", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh128, "V8yV8yV8yV8y", "ncV:128:", "avx10.2-256")
+
#undef BUILTIN
#undef TARGET_BUILTIN
#undef TARGET_HEADER_BUILTIN
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index a9cbdb7b10dff8..62c382b67ad14a 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -306,6 +306,7 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAVX10_1_512 = true;
} else if (Feature == "+avx10.2-256") {
HasAVX10_2 = true;
+ HasFullBFloat16 = true;
} else if (Feature == "+avx10.2-512") {
HasAVX10_2_512 = true;
} else if (Feature == "+avx512cd") {
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 2a733e4d834cfa..94af4e5f723c9a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14728,6 +14728,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_storeups512_mask:
return EmitX86MaskedStore(*this, Ops, Align(1));
+ case X86::BI__builtin_ia32_storesbf16128_mask:
case X86::BI__builtin_ia32_storesh128_mask:
case X86::BI__builtin_ia32_storess128_mask:
case X86::BI__builtin_ia32_storesd128_mask:
@@ -14836,6 +14837,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_vfmaddph512_mask:
case X86::BI__builtin_ia32_vfmaddph512_maskz:
case X86::BI__builtin_ia32_vfmaddph512_mask3:
+ case X86::BI__builtin_ia32_vfmaddnepbh128:
+ case X86::BI__builtin_ia32_vfmaddnepbh256:
+ case X86::BI__builtin_ia32_vfmaddnepbh512:
case X86::BI__builtin_ia32_vfmaddps512_mask:
case X86::BI__builtin_ia32_vfmaddps512_maskz:
case X86::BI__builtin_ia32_vfmaddps512_mask3:
@@ -14920,6 +14924,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_loaddqudi512_mask:
return EmitX86MaskedLoad(*this, Ops, Align(1));
+ case X86::BI__builtin_ia32_loadsbf16128_mask:
case X86::BI__builtin_ia32_loadsh128_mask:
case X86::BI__builtin_ia32_loadss128_mask:
case X86::BI__builtin_ia32_loadsd128_mask:
@@ -16074,6 +16079,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_sqrtph256:
case X86::BI__builtin_ia32_sqrtph:
case X86::BI__builtin_ia32_sqrtph512:
+ case X86::BI__builtin_ia32_vsqrtnepbf16256:
+ case X86::BI__builtin_ia32_vsqrtnepbf16:
+ case X86::BI__builtin_ia32_vsqrtnepbf16512:
case X86::BI__builtin_ia32_sqrtps512:
case X86::BI__builtin_ia32_sqrtpd512: {
if (Ops.size() == 2) {
@@ -16293,6 +16301,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_fpclassps128_mask:
case X86::BI__builtin_ia32_fpclassps256_mask:
case X86::BI__builtin_ia32_fpclassps512_mask:
+ case X86::BI__builtin_ia32_vfpclasspbf16128_mask:
+ case X86::BI__builtin_ia32_vfpclasspbf16256_mask:
+ case X86::BI__builtin_ia32_vfpclasspbf16512_mask:
case X86::BI__builtin_ia32_fpclassph128_mask:
case X86::BI__builtin_ia32_fpclassph256_mask:
case X86::BI__builtin_ia32_fpclassph512_mask:
@@ -16307,6 +16318,15 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Intrinsic::ID ID;
switch (BuiltinID) {
default: llvm_unreachable("Unsupported intrinsic!");
+ case X86::BI__builtin_ia32_vfpclasspbf16128_mask:
+ ID = Intrinsic::x86_avx10_fpclass_nepbf16_128;
+ break;
+ case X86::BI__builtin_ia32_vfpclasspbf16256_mask:
+ ID = Intrinsic::x86_avx10_fpclass_nepbf16_256;
+ break;
+ case X86::BI__builtin_ia32_vfpclasspbf16512_mask:
+ ID = Intrinsic::x86_avx10_fpclass_nepbf16_512;
+ break;
case X86::BI__builtin_ia32_fpclassph128_mask:
ID = Intrinsic::x86_avx512fp16_fpclass_ph_128;
break;
@@ -16465,6 +16485,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_vcmppd256_round_mask:
case X86::BI__builtin_ia32_vcmpps256_round_mask:
case X86::BI__builtin_ia32_vcmpph256_round_mask:
+ case X86::BI__builtin_ia32_vcmppbf16512_mask:
+ case X86::BI__builtin_ia32_vcmppbf16256_mask:
+ case X86::BI__builtin_ia32_vcmppbf16128_mask:
IsMaskFCmp = true;
[[fallthrough]];
case X86::BI__builtin_ia32_cmpps:
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 5a62538792f301..90d431f8627965 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -147,11 +147,13 @@ set(x86_files
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
+ avx10_2_512bf16intrin.h
avx10_2_512convertintrin.h
avx10_2_512minmaxintrin.h
avx10_2_512niintrin.h
avx10_2_512satcvtintrin.h
avx10_2convertintrin.h
+ avx10_2bf16intrin.h
avx10_2minmaxintrin.h
avx10_2niintrin.h
avx10_2satcvtintrin.h
diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
new file mode 100644
index 00000000000000..158d5686c8f02f
--- /dev/null
+++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
@@ -0,0 +1,565 @@
+/*===----------- avx10_2_512bf16intrin.h - AVX10-BF16 intrinsics ---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx10_2_512bf16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifdef __SSE2__
+
+#ifndef __AVX10_2_512BF16INTRIN_H
+#define __AVX10_2_512BF16INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS512 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
+ __min_vector_width__(512)))
+
+static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) {
+ return __builtin_bit_cast(__m512bh, _mm512_setzero_ps());
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_undefined_pbh(void) {
+ return (__m512bh)__builtin_ia32_undef512();
+}
+
+static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set1_pbh(__bf16 bf) {
+ return (__m512bh)(__v32bf){bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
+ bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
+ bf, bf, bf, bf, bf, bf, bf, bf, bf, bf};
+}
+
+static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set_pbh(
+ __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6,
+ __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12,
+ __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, __bf16 bf17,
+ __bf16 bf18, __bf16 bf19, __bf16 bf20, __bf16 bf21, __bf16 bf22,
+ __bf16 bf23, __bf16 bf24, __bf16 bf25, __bf16 bf26, __bf16 bf27,
+ __bf16 bf28, __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) {
+ return (__m512bh)(__v32bf){bf32, bf31, bf30, bf29, bf28, bf27, bf26, bf25,
+ bf24, bf23, bf22, bf21, bf20, bf19, bf18, bf17,
+ bf16, bf15, bf14, bf13, bf12, bf11, bf10, bf9,
+ bf8, bf7, bf6, bf5, bf4, bf3, bf2, bf1};
+}
+
+#define _mm512_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10, \
+ bf11, bf12, bf13, bf14, bf15, bf16, bf17, bf18, bf19, \
+ bf20, bf21, bf22, bf23, bf24, bf25, bf26, bf27, bf28, \
+ bf29, bf30, bf31, bf32) \
+ _mm512_set_pbh((bf32), (bf31), (bf30), (bf29), (bf28), (bf27), (bf26), \
+ (bf25), (bf24), (bf23), (bf22), (bf21), (bf20), (bf19), \
+ (bf18), (bf17), (bf16), (bf15), (bf14), (bf13), (bf12), \
+ (bf11), (bf10), (bf9), (bf8), (bf7), (bf6), (bf5), (bf4), \
+ (bf3), (bf2), (bf1))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_castpbf16_ps(__m512bh __a) {
+ return (__m512)__a;
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_castpbf16_pd(__m512bh __a) {
+ return (__m512d)__a;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_castpbf16_si512(__m512bh __a) {
+ return (__m512i)__a;
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_castps_pbh(__m512 __a) {
+ return (__m512bh)__a;
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_castpd_pbh(__m512d __a) {
+ return (__m512bh)__a;
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_castsi512_pbh(__m512i __a) {
+ return (__m512bh)__a;
+}
+
+static __inline__ __m128bh __DEFAULT_FN_ATTRS512
+_mm512_castpbf16512_pbh128(__m512bh __a) {
+ return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+static __inline__ __m256bh __DEFAULT_FN_ATTRS512
+_mm512_castpbf16512_pbh256(__m512bh __a) {
+ return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_castpbf16128_pbh512(__m128bh __a) {
+ return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_castpbf16256_pbh512(__m256bh __a) {
+ return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_zextpbf16128_pbh512(__m128bh __a) {
+ return __builtin_shufflevector(
+ __a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_zextpbf16256_pbh512(__m256bh __a) {
+ return __builtin_shufflevector(__a, (__v16bf)_mm256_setzero_pbh(), 0, 1, 2, 3,
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+ 29, 30, 31);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_abs_pbh(__m512bh __A) {
+ return (__m512bh)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF),
+ (__m512i)__A);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_load_pbh(void const *__p) {
+ return *(const __m512bh *)__p;
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_loadu_pbh(void const *__p) {
+ struct __loadu_pbh {
+ __m512bh_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_pbh *)__p)->__v;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_pbh(void *__P,
+ __m512bh __A) {
+ *(__m512bh *)__P = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_pbh(void *__P,
+ __m512bh __A) {
+ struct __storeu_pbh {
+ __m512bh_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_pbh *)__P)->__v = __A;
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) {
+ return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, (__v32bf)__W,
+ (__v32bf)__A);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) {
+ return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
+ (__v32hi)__B);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_permutexvar_pbh(__m512i __A, __m512bh __B) {
+ return (__m512bh)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_addne_pbh(__m512bh __A, __m512bh __B) {
+ return (__m512bh)((__v32bf)__A + (__v32bf)__B);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_mask_addne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
+ return (__m512bh)__builtin_ia32_selectpbf_512(
+ (__mmask32)__U, (__v32bf)_mm512_addne_pbh(__A, __B), (__v32bf)__W);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_maskz_addne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
+ return (__m512bh)__builtin_ia32_selectpbf_512(
+ (__mmask32)__U, (__v32bf)_mm512_addne_pbh(__A, __B),
+ (__v32bf)_mm512_setzero_pbh());
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_subne_pbh(__m512bh __A, __m512bh __B) {
+ return (__m512bh)((__v32bf)__A - (__v32bf)__B);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_mask_subne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
+ return (__m512bh)__builtin_ia32_selectpbf_512(
+ (__mmask32)__U, (__v32bf)_mm512_subne_pbh(__A, __B), (__v32bf)__W);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_maskz_subne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
+ return (__m512bh)__builtin_ia32_selectpbf_512(
+ (__mmask32)__U, (__v32bf)_mm512_subne_pbh(__A, __B),
+ (__v32bf)_mm512_setzero_pbh());
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_mulne_pbh(__m512bh __A, __m512bh __B) {
+ return (__m512bh)((__v32bf)__A * (__v32bf)__B);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_mask_mulne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
+ return (__m512bh)__builtin_ia32_selectpbf_512(
+ (__mmask32)__U, (__v32bf)_mm512_mulne_pbh(__A, __B), (__v32bf)__W);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_maskz_mulne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
+ return (__m512bh)__builtin_ia32_sel...
[truncated]
|
Should we add folding in X86InstrFMA3Info.cpp? |
yes, addressed in b9e35f0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965