Skip to content

[X86][AVX10.2] Support AVX10.2-BF16 new instructions. #101603

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Sep 4, 2024

Conversation

FreddyLeaf
Copy link
Contributor

@FreddyLeaf FreddyLeaf commented Aug 2, 2024

Copy link

github-actions bot commented Aug 2, 2024

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff b412ec5d3924c7570c2c96106f95a92403a4e09b a6c4fce4069fb81ab4a50d825fe6039d45827d63 --extensions inc,cpp,h,c -- clang/lib/Headers/avx10_2_512bf16intrin.h clang/lib/Headers/avx10_2bf16intrin.h clang/test/CodeGen/X86/avx10_2_512bf16-builtins.c clang/test/CodeGen/X86/avx10_2bf16-builtins.c clang/lib/Basic/Targets/X86.cpp clang/lib/CodeGen/CGBuiltin.cpp clang/lib/Headers/immintrin.h clang/lib/Sema/SemaX86.cpp llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp llvm/lib/Target/X86/X86ISelLowering.cpp llvm/lib/Target/X86/X86InstrFMA3Info.cpp llvm/lib/Target/X86/X86IntrinsicsInfo.h llvm/test/TableGen/x86-fold-tables.inc
View the diff from clang-format here.
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index c242b406f2..8320dfa5ab 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -3310,56 +3310,57 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
     bool IsVCMP = PatchedName[0] == 'v';
     unsigned CCIdx = IsVCMP ? 4 : 3;
     unsigned suffixLength = PatchedName.ends_with("pbf16") ? 5 : 2;
-    unsigned CC = StringSwitch<unsigned>(
-      PatchedName.slice(CCIdx, PatchedName.size() - suffixLength))
-      .Case("eq",       0x00)
-      .Case("eq_oq",    0x00)
-      .Case("lt",       0x01)
-      .Case("lt_os",    0x01)
-      .Case("le",       0x02)
-      .Case("le_os",    0x02)
-      .Case("unord",    0x03)
-      .Case("unord_q",  0x03)
-      .Case("neq",      0x04)
-      .Case("neq_uq",   0x04)
-      .Case("nlt",      0x05)
-      .Case("nlt_us",   0x05)
-      .Case("nle",      0x06)
-      .Case("nle_us",   0x06)
-      .Case("ord",      0x07)
-      .Case("ord_q",    0x07)
-      /* AVX only from here */
-      .Case("eq_uq",    0x08)
-      .Case("nge",      0x09)
-      .Case("nge_us",   0x09)
-      .Case("ngt",      0x0A)
-      .Case("ngt_us",   0x0A)
-      .Case("false",    0x0B)
-      .Case("false_oq", 0x0B)
-      .Case("neq_oq",   0x0C)
-      .Case("ge",       0x0D)
-      .Case("ge_os",    0x0D)
-      .Case("gt",       0x0E)
-      .Case("gt_os",    0x0E)
-      .Case("true",     0x0F)
-      .Case("true_uq",  0x0F)
-      .Case("eq_os",    0x10)
-      .Case("lt_oq",    0x11)
-      .Case("le_oq",    0x12)
-      .Case("unord_s",  0x13)
-      .Case("neq_us",   0x14)
-      .Case("nlt_uq",   0x15)
-      .Case("nle_uq",   0x16)
-      .Case("ord_s",    0x17)
-      .Case("eq_us",    0x18)
-      .Case("nge_uq",   0x19)
-      .Case("ngt_uq",   0x1A)
-      .Case("false_os", 0x1B)
-      .Case("neq_os",   0x1C)
-      .Case("ge_oq",    0x1D)
-      .Case("gt_oq",    0x1E)
-      .Case("true_us",  0x1F)
-      .Default(~0U);
+    unsigned CC =
+        StringSwitch<unsigned>(
+            PatchedName.slice(CCIdx, PatchedName.size() - suffixLength))
+            .Case("eq", 0x00)
+            .Case("eq_oq", 0x00)
+            .Case("lt", 0x01)
+            .Case("lt_os", 0x01)
+            .Case("le", 0x02)
+            .Case("le_os", 0x02)
+            .Case("unord", 0x03)
+            .Case("unord_q", 0x03)
+            .Case("neq", 0x04)
+            .Case("neq_uq", 0x04)
+            .Case("nlt", 0x05)
+            .Case("nlt_us", 0x05)
+            .Case("nle", 0x06)
+            .Case("nle_us", 0x06)
+            .Case("ord", 0x07)
+            .Case("ord_q", 0x07)
+            /* AVX only from here */
+            .Case("eq_uq", 0x08)
+            .Case("nge", 0x09)
+            .Case("nge_us", 0x09)
+            .Case("ngt", 0x0A)
+            .Case("ngt_us", 0x0A)
+            .Case("false", 0x0B)
+            .Case("false_oq", 0x0B)
+            .Case("neq_oq", 0x0C)
+            .Case("ge", 0x0D)
+            .Case("ge_os", 0x0D)
+            .Case("gt", 0x0E)
+            .Case("gt_os", 0x0E)
+            .Case("true", 0x0F)
+            .Case("true_uq", 0x0F)
+            .Case("eq_os", 0x10)
+            .Case("lt_oq", 0x11)
+            .Case("le_oq", 0x12)
+            .Case("unord_s", 0x13)
+            .Case("neq_us", 0x14)
+            .Case("nlt_uq", 0x15)
+            .Case("nle_uq", 0x16)
+            .Case("ord_s", 0x17)
+            .Case("eq_us", 0x18)
+            .Case("nge_uq", 0x19)
+            .Case("ngt_uq", 0x1A)
+            .Case("false_os", 0x1B)
+            .Case("neq_os", 0x1C)
+            .Case("ge_oq", 0x1D)
+            .Case("gt_oq", 0x1E)
+            .Case("true_us", 0x1F)
+            .Default(~0U);
     if (CC != ~0U && (IsVCMP || CC < 8) &&
         (IsVCMP || PatchedName.back() != 'h')) {
       if (PatchedName.ends_with("ss"))
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
index 8fcc1c10d9..624e53cf4b 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -167,15 +167,24 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI,
   case X86::VCMPPHZrmbi:     case X86::VCMPPHZrmbik:
   case X86::VCMPPHZrrib:     case X86::VCMPPHZrribk:
   case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrrib_Intk:
-  case X86::VCMPPBF16Z128rmi:  case X86::VCMPPBF16Z128rri:
-  case X86::VCMPPBF16Z256rmi:  case X86::VCMPPBF16Z256rri:
-  case X86::VCMPPBF16Zrmi:     case X86::VCMPPBF16Zrri:
-  case X86::VCMPPBF16Z128rmik: case X86::VCMPPBF16Z128rrik:
-  case X86::VCMPPBF16Z256rmik: case X86::VCMPPBF16Z256rrik:
-  case X86::VCMPPBF16Zrmik:    case X86::VCMPPBF16Zrrik:
-  case X86::VCMPPBF16Z128rmbi: case X86::VCMPPBF16Z128rmbik:
-  case X86::VCMPPBF16Z256rmbi: case X86::VCMPPBF16Z256rmbik:
-  case X86::VCMPPBF16Zrmbi:    case X86::VCMPPBF16Zrmbik:
+  case X86::VCMPPBF16Z128rmi:
+  case X86::VCMPPBF16Z128rri:
+  case X86::VCMPPBF16Z256rmi:
+  case X86::VCMPPBF16Z256rri:
+  case X86::VCMPPBF16Zrmi:
+  case X86::VCMPPBF16Zrri:
+  case X86::VCMPPBF16Z128rmik:
+  case X86::VCMPPBF16Z128rrik:
+  case X86::VCMPPBF16Z256rmik:
+  case X86::VCMPPBF16Z256rrik:
+  case X86::VCMPPBF16Zrmik:
+  case X86::VCMPPBF16Zrrik:
+  case X86::VCMPPBF16Z128rmbi:
+  case X86::VCMPPBF16Z128rmbik:
+  case X86::VCMPPBF16Z256rmbi:
+  case X86::VCMPPBF16Z256rmbik:
+  case X86::VCMPPBF16Zrmbi:
+  case X86::VCMPPBF16Zrmbik:
     if (Imm >= 0 && Imm <= 31) {
       OS << '\t';
       printCMPMnemonic(MI, /*IsVCMP*/true, OS);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index e7ba13215f..3e664c5aef 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -309,15 +309,24 @@ void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool IsVCmp,
   case X86::VCMPSHZrmi_Intk: case X86::VCMPSHZrri_Intk:
     OS << "sh\t";
     break;
-  case X86::VCMPPBF16Z128rmi:  case X86::VCMPPBF16Z128rri:
-  case X86::VCMPPBF16Z256rmi:  case X86::VCMPPBF16Z256rri:
-  case X86::VCMPPBF16Zrmi:     case X86::VCMPPBF16Zrri:
-  case X86::VCMPPBF16Z128rmik: case X86::VCMPPBF16Z128rrik:
-  case X86::VCMPPBF16Z256rmik: case X86::VCMPPBF16Z256rrik:
-  case X86::VCMPPBF16Zrmik:    case X86::VCMPPBF16Zrrik:
-  case X86::VCMPPBF16Z128rmbi: case X86::VCMPPBF16Z128rmbik:
-  case X86::VCMPPBF16Z256rmbi: case X86::VCMPPBF16Z256rmbik:
-  case X86::VCMPPBF16Zrmbi:    case X86::VCMPPBF16Zrmbik:
+  case X86::VCMPPBF16Z128rmi:
+  case X86::VCMPPBF16Z128rri:
+  case X86::VCMPPBF16Z256rmi:
+  case X86::VCMPPBF16Z256rri:
+  case X86::VCMPPBF16Zrmi:
+  case X86::VCMPPBF16Zrri:
+  case X86::VCMPPBF16Z128rmik:
+  case X86::VCMPPBF16Z128rrik:
+  case X86::VCMPPBF16Z256rmik:
+  case X86::VCMPPBF16Z256rrik:
+  case X86::VCMPPBF16Zrmik:
+  case X86::VCMPPBF16Zrrik:
+  case X86::VCMPPBF16Z128rmbi:
+  case X86::VCMPPBF16Z128rmbik:
+  case X86::VCMPPBF16Z256rmbi:
+  case X86::VCMPPBF16Z256rmbik:
+  case X86::VCMPPBF16Zrmbi:
+  case X86::VCMPPBF16Zrmbik:
     OS << "pbf16\t";
     break;
   }
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
index 39600ffcad..399660e3e5 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -146,15 +146,24 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS
   case X86::VCMPPHZrmbi:     case X86::VCMPPHZrmbik:
   case X86::VCMPPHZrrib:     case X86::VCMPPHZrribk:
   case X86::VCMPSHZrrib_Int: case X86::VCMPSHZrrib_Intk:
-  case X86::VCMPPBF16Z128rmi:  case X86::VCMPPBF16Z128rri:
-  case X86::VCMPPBF16Z256rmi:  case X86::VCMPPBF16Z256rri:
-  case X86::VCMPPBF16Zrmi:     case X86::VCMPPBF16Zrri:
-  case X86::VCMPPBF16Z128rmik: case X86::VCMPPBF16Z128rrik:
-  case X86::VCMPPBF16Z256rmik: case X86::VCMPPBF16Z256rrik:
-  case X86::VCMPPBF16Zrmik:    case X86::VCMPPBF16Zrrik:
-  case X86::VCMPPBF16Z128rmbi: case X86::VCMPPBF16Z128rmbik:
-  case X86::VCMPPBF16Z256rmbi: case X86::VCMPPBF16Z256rmbik:
-  case X86::VCMPPBF16Zrmbi:    case X86::VCMPPBF16Zrmbik:
+  case X86::VCMPPBF16Z128rmi:
+  case X86::VCMPPBF16Z128rri:
+  case X86::VCMPPBF16Z256rmi:
+  case X86::VCMPPBF16Z256rri:
+  case X86::VCMPPBF16Zrmi:
+  case X86::VCMPPBF16Zrri:
+  case X86::VCMPPBF16Z128rmik:
+  case X86::VCMPPBF16Z128rrik:
+  case X86::VCMPPBF16Z256rmik:
+  case X86::VCMPPBF16Z256rrik:
+  case X86::VCMPPBF16Zrmik:
+  case X86::VCMPPBF16Zrrik:
+  case X86::VCMPPBF16Z128rmbi:
+  case X86::VCMPPBF16Z128rmbik:
+  case X86::VCMPPBF16Z256rmbi:
+  case X86::VCMPPBF16Z256rmbik:
+  case X86::VCMPPBF16Zrmbi:
+  case X86::VCMPPBF16Zrmbik:
     if (Imm >= 0 && Imm <= 31) {
       OS << '\t';
       printCMPMnemonic(MI, /*IsVCMP*/true, OS);
diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
index 94de164d5f..98cdb0632d 100644
--- a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
+++ b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -43,12 +43,12 @@ using namespace llvm;
   FMA3GROUP(Name, Suf##m, Attrs) \
   FMA3GROUP(Name, Suf##r, Attrs)
 
-#define FMA3GROUP_PACKED_DHS(Name, Attrs) \
-  FMA3GROUP_PACKED_WIDTHS_ALL(Name, PD, Attrs) \
-  FMA3GROUP_PACKED_WIDTHS_Z(Name, PH, Attrs) \
+#define FMA3GROUP_PACKED_DHS(Name, Attrs)                                      \
+  FMA3GROUP_PACKED_WIDTHS_ALL(Name, PD, Attrs)                                 \
+  FMA3GROUP_PACKED_WIDTHS_Z(Name, PH, Attrs)                                   \
   FMA3GROUP_PACKED_WIDTHS_ALL(Name, PS, Attrs)
 
-#define FMA3GROUP_PACKED_BF16(Name, Attrs) \
+#define FMA3GROUP_PACKED_BF16(Name, Attrs)                                     \
   FMA3GROUP_PACKED_WIDTHS_Z(Name, NEPBF16, Attrs)
 
 #define FMA3GROUP_SCALAR_WIDTHS_Z(Name, Suf, Attrs) \
@@ -69,34 +69,30 @@ using namespace llvm;
   FMA3GROUP_SCALAR_WIDTHS_Z(Name, SH, Attrs) \
   FMA3GROUP_SCALAR_WIDTHS_ALL(Name, SS, Attrs)
 
-#define FMA3GROUP_FULL(Name, Attrs) \
-  FMA3GROUP_PACKED_BF16(Name, Attrs) \
-  FMA3GROUP_PACKED_DHS(Name, Attrs) \
+#define FMA3GROUP_FULL(Name, Attrs)                                            \
+  FMA3GROUP_PACKED_BF16(Name, Attrs)                                           \
+  FMA3GROUP_PACKED_DHS(Name, Attrs)                                            \
   FMA3GROUP_SCALAR(Name, Attrs)
 
 static const X86InstrFMA3Group Groups[] = {
-  FMA3GROUP_FULL(VFMADD, 0)
-  FMA3GROUP_PACKED_DHS(VFMADDSUB, 0)
-  FMA3GROUP_FULL(VFMSUB, 0)
-  FMA3GROUP_PACKED_DHS(VFMSUBADD, 0)
-  FMA3GROUP_FULL(VFNMADD, 0)
-  FMA3GROUP_FULL(VFNMSUB, 0)
-};
+    FMA3GROUP_FULL(VFMADD, 0) FMA3GROUP_PACKED_DHS(VFMADDSUB, 0)
+        FMA3GROUP_FULL(VFMSUB, 0) FMA3GROUP_PACKED_DHS(VFMSUBADD, 0)
+            FMA3GROUP_FULL(VFNMADD, 0) FMA3GROUP_FULL(VFNMSUB, 0)};
 
 #define FMA3GROUP_PACKED_AVX512_WIDTHS(Name, Type, Suf, Attrs) \
   FMA3GROUP_MASKED(Name, Type##Z128##Suf, Attrs) \
   FMA3GROUP_MASKED(Name, Type##Z256##Suf, Attrs) \
   FMA3GROUP_MASKED(Name, Type##Z##Suf, Attrs)
 
-#define FMA3GROUP_PACKED_AVX512_ALL(Name, Suf, Attrs) \
-  FMA3GROUP_PACKED_AVX512_WIDTHS(Name, NEPBF16, Suf, Attrs) \
-  FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PD, Suf, Attrs) \
-  FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PH, Suf, Attrs) \
+#define FMA3GROUP_PACKED_AVX512_ALL(Name, Suf, Attrs)                          \
+  FMA3GROUP_PACKED_AVX512_WIDTHS(Name, NEPBF16, Suf, Attrs)                    \
+  FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PD, Suf, Attrs)                         \
+  FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PH, Suf, Attrs)                         \
   FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PS, Suf, Attrs)
 
-#define FMA3GROUP_PACKED_AVX512_DHS(Name, Suf, Attrs) \
-  FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PD, Suf, Attrs) \
-  FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PH, Suf, Attrs) \
+#define FMA3GROUP_PACKED_AVX512_DHS(Name, Suf, Attrs)                          \
+  FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PD, Suf, Attrs)                         \
+  FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PH, Suf, Attrs)                         \
   FMA3GROUP_PACKED_AVX512_WIDTHS(Name, PS, Suf, Attrs)
 
 #define FMA3GROUP_PACKED_AVX512_ROUND(Name, Suf, Attrs)                        \
@@ -116,13 +112,12 @@ static const X86InstrFMA3Group Groups[] = {
   FMA3GROUP_MASKED(Name, SSZ##Suf##_Int, Attrs)
 
 static const X86InstrFMA3Group BroadcastGroups[] = {
-  FMA3GROUP_PACKED_AVX512_ALL(VFMADD, mb, 0)
-  FMA3GROUP_PACKED_AVX512_DHS(VFMADDSUB, mb, 0)
-  FMA3GROUP_PACKED_AVX512_ALL(VFMSUB, mb, 0)
-  FMA3GROUP_PACKED_AVX512_DHS(VFMSUBADD, mb, 0)
-  FMA3GROUP_PACKED_AVX512_ALL(VFNMADD, mb, 0)
-  FMA3GROUP_PACKED_AVX512_ALL(VFNMSUB, mb, 0)
-};
+    FMA3GROUP_PACKED_AVX512_ALL(VFMADD, mb, 0)
+        FMA3GROUP_PACKED_AVX512_DHS(VFMADDSUB, mb, 0)
+            FMA3GROUP_PACKED_AVX512_ALL(VFMSUB, mb, 0)
+                FMA3GROUP_PACKED_AVX512_DHS(VFMSUBADD, mb, 0)
+                    FMA3GROUP_PACKED_AVX512_ALL(VFNMADD, mb, 0)
+                        FMA3GROUP_PACKED_AVX512_ALL(VFNMSUB, mb, 0)};
 
 static const X86InstrFMA3Group RoundGroups[] = {
   FMA3GROUP_PACKED_AVX512_ROUND(VFMADD, rb, 0)

@FreddyLeaf FreddyLeaf marked this pull request as ready for review August 27, 2024 02:32
@llvmbot llvmbot added clang Clang issues not falling into any other category backend:X86 clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:headers Headers provided by Clang, e.g. for intrinsics clang:codegen IR generation bugs: mangling, exceptions, etc. mc Machine (object) code llvm:ir labels Aug 27, 2024
@llvmbot
Copy link
Member

llvmbot commented Aug 27, 2024

@llvm/pr-subscribers-backend-x86
@llvm/pr-subscribers-clang-codegen
@llvm/pr-subscribers-llvm-ir

@llvm/pr-subscribers-mc

Author: Freddy Ye (FreddyLeaf)

Changes

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965


Patch is 1.24 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101603.diff

30 Files Affected:

  • (modified) clang/include/clang/Basic/BuiltinsX86.def (+62)
  • (modified) clang/lib/Basic/Targets/X86.cpp (+1)
  • (modified) clang/lib/CodeGen/CGBuiltin.cpp (+23)
  • (modified) clang/lib/Headers/CMakeLists.txt (+2)
  • (added) clang/lib/Headers/avx10_2_512bf16intrin.h (+565)
  • (added) clang/lib/Headers/avx10_2bf16intrin.h (+1088)
  • (modified) clang/lib/Headers/immintrin.h (+2)
  • (added) clang/test/CodeGen/X86/avx10_2_512bf16-builtins.c (+1054)
  • (added) clang/test/CodeGen/X86/avx10_2bf16-builtins.c (+2018)
  • (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+410)
  • (modified) llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp (+6-2)
  • (modified) llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp (+11-1)
  • (modified) llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp (+11)
  • (modified) llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp (+9)
  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+33-4)
  • (modified) llvm/lib/Target/X86/X86InstrAVX10.td (+310)
  • (modified) llvm/lib/Target/X86/X86InstrFragmentsSIMD.td (+10)
  • (modified) llvm/lib/Target/X86/X86InstrUtils.td (+3-3)
  • (modified) llvm/lib/Target/X86/X86IntrinsicsInfo.h (+54)
  • (added) llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll (+587)
  • (added) llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll (+296)
  • (added) llvm/test/CodeGen/X86/avx10_2bf16-arith.ll (+1168)
  • (added) llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll (+536)
  • (added) llvm/test/MC/Disassembler/X86/avx10.2-bf16-32.txt (+3015)
  • (added) llvm/test/MC/Disassembler/X86/avx10.2-bf16-64.txt (+3015)
  • (added) llvm/test/MC/X86/avx10.2-bf16-32-att.s (+3014)
  • (added) llvm/test/MC/X86/avx10.2-bf16-32-intel.s (+3014)
  • (added) llvm/test/MC/X86/avx10.2-bf16-64-att.s (+3014)
  • (added) llvm/test/MC/X86/avx10.2-bf16-64-intel.s (+3014)
  • (modified) llvm/test/TableGen/x86-fold-tables.inc (+494)
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index e4aa8661b9a806..48376ee0527980 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -2261,6 +2261,68 @@ TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_512_mask, "V32cV32xV32cUi", "nV:512:"
 TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
 TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
 TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
+
+// AVX10.2 BF16
+TARGET_BUILTIN(__builtin_ia32_loadsbf16128_mask, "V8yV8yC*V8yUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_storesbf16128_mask, "vV8y*V8yUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vaddnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vaddnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vaddnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vdivnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vdivnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vdivnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vmaxpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vmaxpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vmaxpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vminpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vminpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vmulnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vmulnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vmulnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vsubnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vsubnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vsubnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcomsbf16eq, "iV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcomsbf16lt, "iV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcomsbf16neq, "iV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcomsbf16ge, "iV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcomsbf16gt, "iV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcomsbf16le, "iV8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcmppbf16512_mask,"UiV32yV32yIiUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcmppbf16256_mask,"UsV16yV16yIiUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcmppbf16128_mask,"UcV8yV8yIiUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16128_mask, "UcV8yIiUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16256_mask, "UsV16yIiUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16512_mask, "UiV32yIiUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vscalefpbf16128_mask, "V8yV8yV8yV8yUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vscalefpbf16256_mask, "V16yV16yV16yV16yUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vscalefpbf16512_mask, "V32yV32yV32yV32yUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vrcppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrcppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrcppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vgetexppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vgetexppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vgetexppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vreducenepbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vreducenepbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vreducenepbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16, "V8yV8y", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16256, "V16yV16y", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16512, "V32yV32y", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh512, "V32yV32yV32yV32y", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh256, "V16yV16yV16yV16y", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh128, "V8yV8yV8yV8y", "ncV:128:", "avx10.2-256")
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
 #undef TARGET_HEADER_BUILTIN
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index a9cbdb7b10dff8..62c382b67ad14a 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -306,6 +306,7 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasAVX10_1_512 = true;
     } else if (Feature == "+avx10.2-256") {
       HasAVX10_2 = true;
+      HasFullBFloat16 = true;
     } else if (Feature == "+avx10.2-512") {
       HasAVX10_2_512 = true;
     } else if (Feature == "+avx512cd") {
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 2a733e4d834cfa..94af4e5f723c9a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14728,6 +14728,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_storeups512_mask:
     return EmitX86MaskedStore(*this, Ops, Align(1));
 
+  case X86::BI__builtin_ia32_storesbf16128_mask:
   case X86::BI__builtin_ia32_storesh128_mask:
   case X86::BI__builtin_ia32_storess128_mask:
   case X86::BI__builtin_ia32_storesd128_mask:
@@ -14836,6 +14837,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_vfmaddph512_mask:
   case X86::BI__builtin_ia32_vfmaddph512_maskz:
   case X86::BI__builtin_ia32_vfmaddph512_mask3:
+  case X86::BI__builtin_ia32_vfmaddnepbh128:
+  case X86::BI__builtin_ia32_vfmaddnepbh256:
+  case X86::BI__builtin_ia32_vfmaddnepbh512:
   case X86::BI__builtin_ia32_vfmaddps512_mask:
   case X86::BI__builtin_ia32_vfmaddps512_maskz:
   case X86::BI__builtin_ia32_vfmaddps512_mask3:
@@ -14920,6 +14924,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_loaddqudi512_mask:
     return EmitX86MaskedLoad(*this, Ops, Align(1));
 
+  case X86::BI__builtin_ia32_loadsbf16128_mask:
   case X86::BI__builtin_ia32_loadsh128_mask:
   case X86::BI__builtin_ia32_loadss128_mask:
   case X86::BI__builtin_ia32_loadsd128_mask:
@@ -16074,6 +16079,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_sqrtph256:
   case X86::BI__builtin_ia32_sqrtph:
   case X86::BI__builtin_ia32_sqrtph512:
+  case X86::BI__builtin_ia32_vsqrtnepbf16256:
+  case X86::BI__builtin_ia32_vsqrtnepbf16:
+  case X86::BI__builtin_ia32_vsqrtnepbf16512:
   case X86::BI__builtin_ia32_sqrtps512:
   case X86::BI__builtin_ia32_sqrtpd512: {
     if (Ops.size() == 2) {
@@ -16293,6 +16301,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_fpclassps128_mask:
   case X86::BI__builtin_ia32_fpclassps256_mask:
   case X86::BI__builtin_ia32_fpclassps512_mask:
+  case X86::BI__builtin_ia32_vfpclasspbf16128_mask:
+  case X86::BI__builtin_ia32_vfpclasspbf16256_mask:
+  case X86::BI__builtin_ia32_vfpclasspbf16512_mask:
   case X86::BI__builtin_ia32_fpclassph128_mask:
   case X86::BI__builtin_ia32_fpclassph256_mask:
   case X86::BI__builtin_ia32_fpclassph512_mask:
@@ -16307,6 +16318,15 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     Intrinsic::ID ID;
     switch (BuiltinID) {
     default: llvm_unreachable("Unsupported intrinsic!");
+    case X86::BI__builtin_ia32_vfpclasspbf16128_mask:
+      ID = Intrinsic::x86_avx10_fpclass_nepbf16_128;
+      break;
+    case X86::BI__builtin_ia32_vfpclasspbf16256_mask:
+      ID = Intrinsic::x86_avx10_fpclass_nepbf16_256;
+      break;
+    case X86::BI__builtin_ia32_vfpclasspbf16512_mask:
+      ID = Intrinsic::x86_avx10_fpclass_nepbf16_512;
+      break;
     case X86::BI__builtin_ia32_fpclassph128_mask:
       ID = Intrinsic::x86_avx512fp16_fpclass_ph_128;
       break;
@@ -16465,6 +16485,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_vcmppd256_round_mask:
   case X86::BI__builtin_ia32_vcmpps256_round_mask:
   case X86::BI__builtin_ia32_vcmpph256_round_mask:
+  case X86::BI__builtin_ia32_vcmppbf16512_mask:
+  case X86::BI__builtin_ia32_vcmppbf16256_mask:
+  case X86::BI__builtin_ia32_vcmppbf16128_mask:
     IsMaskFCmp = true;
     [[fallthrough]];
   case X86::BI__builtin_ia32_cmpps:
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 5a62538792f301..90d431f8627965 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -147,11 +147,13 @@ set(x86_files
   amxcomplexintrin.h
   amxfp16intrin.h
   amxintrin.h
+  avx10_2_512bf16intrin.h
   avx10_2_512convertintrin.h
   avx10_2_512minmaxintrin.h
   avx10_2_512niintrin.h
   avx10_2_512satcvtintrin.h
   avx10_2convertintrin.h
+  avx10_2bf16intrin.h
   avx10_2minmaxintrin.h
   avx10_2niintrin.h
   avx10_2satcvtintrin.h
diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
new file mode 100644
index 00000000000000..158d5686c8f02f
--- /dev/null
+++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
@@ -0,0 +1,565 @@
+/*===----------- avx10_2_512bf16intrin.h - AVX10-BF16 intrinsics ---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <avx10_2_512bf16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifdef __SSE2__
+
+#ifndef __AVX10_2_512BF16INTRIN_H
+#define __AVX10_2_512BF16INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS512                                                  \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"),    \
+                 __min_vector_width__(512)))
+
+static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) {
+  return __builtin_bit_cast(__m512bh, _mm512_setzero_ps());
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_undefined_pbh(void) {
+  return (__m512bh)__builtin_ia32_undef512();
+}
+
+static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set1_pbh(__bf16 bf) {
+  return (__m512bh)(__v32bf){bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
+                             bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
+                             bf, bf, bf, bf, bf, bf, bf, bf, bf, bf};
+}
+
+static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set_pbh(
+    __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6,
+    __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12,
+    __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, __bf16 bf17,
+    __bf16 bf18, __bf16 bf19, __bf16 bf20, __bf16 bf21, __bf16 bf22,
+    __bf16 bf23, __bf16 bf24, __bf16 bf25, __bf16 bf26, __bf16 bf27,
+    __bf16 bf28, __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) {
+  return (__m512bh)(__v32bf){bf32, bf31, bf30, bf29, bf28, bf27, bf26, bf25,
+                             bf24, bf23, bf22, bf21, bf20, bf19, bf18, bf17,
+                             bf16, bf15, bf14, bf13, bf12, bf11, bf10, bf9,
+                             bf8,  bf7,  bf6,  bf5,  bf4,  bf3,  bf2,  bf1};
+}
+
+#define _mm512_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10,     \
+                        bf11, bf12, bf13, bf14, bf15, bf16, bf17, bf18, bf19,  \
+                        bf20, bf21, bf22, bf23, bf24, bf25, bf26, bf27, bf28,  \
+                        bf29, bf30, bf31, bf32)                                \
+  _mm512_set_pbh((bf32), (bf31), (bf30), (bf29), (bf28), (bf27), (bf26),       \
+                 (bf25), (bf24), (bf23), (bf22), (bf21), (bf20), (bf19),       \
+                 (bf18), (bf17), (bf16), (bf15), (bf14), (bf13), (bf12),       \
+                 (bf11), (bf10), (bf9), (bf8), (bf7), (bf6), (bf5), (bf4),     \
+                 (bf3), (bf2), (bf1))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_castpbf16_ps(__m512bh __a) {
+  return (__m512)__a;
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_castpbf16_pd(__m512bh __a) {
+  return (__m512d)__a;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_castpbf16_si512(__m512bh __a) {
+  return (__m512i)__a;
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_castps_pbh(__m512 __a) {
+  return (__m512bh)__a;
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_castpd_pbh(__m512d __a) {
+  return (__m512bh)__a;
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_castsi512_pbh(__m512i __a) {
+  return (__m512bh)__a;
+}
+
+static __inline__ __m128bh __DEFAULT_FN_ATTRS512
+_mm512_castpbf16512_pbh128(__m512bh __a) {
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+static __inline__ __m256bh __DEFAULT_FN_ATTRS512
+_mm512_castpbf16512_pbh256(__m512bh __a) {
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+                                 12, 13, 14, 15);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_castpbf16128_pbh512(__m128bh __a) {
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
+                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                                 -1, -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_castpbf16256_pbh512(__m256bh __a) {
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+                                 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
+                                 -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_zextpbf16128_pbh512(__m128bh __a) {
+  return __builtin_shufflevector(
+      __a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_zextpbf16256_pbh512(__m256bh __a) {
+  return __builtin_shufflevector(__a, (__v16bf)_mm256_setzero_pbh(), 0, 1, 2, 3,
+                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+                                 29, 30, 31);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_abs_pbh(__m512bh __A) {
+  return (__m512bh)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF),
+                                    (__m512i)__A);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_load_pbh(void const *__p) {
+  return *(const __m512bh *)__p;
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_loadu_pbh(void const *__p) {
+  struct __loadu_pbh {
+    __m512bh_u __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((const struct __loadu_pbh *)__p)->__v;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_pbh(void *__P,
+                                                              __m512bh __A) {
+  *(__m512bh *)__P = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_pbh(void *__P,
+                                                               __m512bh __A) {
+  struct __storeu_pbh {
+    __m512bh_u __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_pbh *)__P)->__v = __A;
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) {
+  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, (__v32bf)__W,
+                                                (__v32bf)__A);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) {
+  return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
+                                                  (__v32hi)__B);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_permutexvar_pbh(__m512i __A, __m512bh __B) {
+  return (__m512bh)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_addne_pbh(__m512bh __A, __m512bh __B) {
+  return (__m512bh)((__v32bf)__A + (__v32bf)__B);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_mask_addne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
+  return (__m512bh)__builtin_ia32_selectpbf_512(
+      (__mmask32)__U, (__v32bf)_mm512_addne_pbh(__A, __B), (__v32bf)__W);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_maskz_addne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
+  return (__m512bh)__builtin_ia32_selectpbf_512(
+      (__mmask32)__U, (__v32bf)_mm512_addne_pbh(__A, __B),
+      (__v32bf)_mm512_setzero_pbh());
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_subne_pbh(__m512bh __A, __m512bh __B) {
+  return (__m512bh)((__v32bf)__A - (__v32bf)__B);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_mask_subne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
+  return (__m512bh)__builtin_ia32_selectpbf_512(
+      (__mmask32)__U, (__v32bf)_mm512_subne_pbh(__A, __B), (__v32bf)__W);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_maskz_subne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
+  return (__m512bh)__builtin_ia32_selectpbf_512(
+      (__mmask32)__U, (__v32bf)_mm512_subne_pbh(__A, __B),
+      (__v32bf)_mm512_setzero_pbh());
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_mulne_pbh(__m512bh __A, __m512bh __B) {
+  return (__m512bh)((__v32bf)__A * (__v32bf)__B);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_mask_mulne_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
+  return (__m512bh)__builtin_ia32_selectpbf_512(
+      (__mmask32)__U, (__v32bf)_mm512_mulne_pbh(__A, __B), (__v32bf)__W);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_maskz_mulne_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
+  return (__m512bh)__builtin_ia32_sel...
[truncated]

@phoebewang
Copy link
Contributor

Should we add folding in X86InstrFMA3Info.cpp?

@FreddyLeaf
Copy link
Contributor Author

Should we add folding in X86InstrFMA3Info.cpp?

yes, addressed in b9e35f0

Copy link
Contributor

@phoebewang phoebewang left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM.

@FreddyLeaf FreddyLeaf merged commit 83ad644 into llvm:main Sep 4, 2024
3 of 6 checks passed
@FreddyLeaf FreddyLeaf deleted the avx10-bf16 branch September 6, 2024 07:32
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:X86 clang:codegen IR generation bugs: mangling, exceptions, etc. clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:headers Headers provided by Clang, e.g. for intrinsics clang Clang issues not falling into any other category llvm:ir mc Machine (object) code
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants