Skip to content

Commit 83ad644

Browse files
authored
[X86][AVX10.2] Support AVX10.2-BF16 new instructions. (#101603)
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965
1 parent eaa95a1 commit 83ad644

34 files changed

+28058
-24
lines changed

clang/include/clang/Basic/BuiltinsX86.def

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2261,6 +2261,68 @@ TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_512_mask, "V32cV32xV32cUi", "nV:512:"
22612261
TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
22622262
TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
22632263
TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
2264+
2265+
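// AVX10.2 BF16 builtins. In this section the 128- and 256-bit forms are gated
// on "avx10.2-256" and the 512-bit forms on "avx10.2-512".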
2266+
TARGET_BUILTIN(__builtin_ia32_loadsbf16128_mask, "V8yV8yC*V8yUc", "nV:128:", "avx10.2-256")
2267+
TARGET_BUILTIN(__builtin_ia32_storesbf16128_mask, "vV8y*V8yUc", "nV:128:", "avx10.2-256")
2268+
TARGET_BUILTIN(__builtin_ia32_vaddnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
2269+
TARGET_BUILTIN(__builtin_ia32_vaddnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
2270+
TARGET_BUILTIN(__builtin_ia32_vaddnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
2271+
TARGET_BUILTIN(__builtin_ia32_vdivnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
2272+
TARGET_BUILTIN(__builtin_ia32_vdivnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
2273+
TARGET_BUILTIN(__builtin_ia32_vdivnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
2274+
TARGET_BUILTIN(__builtin_ia32_vmaxpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
2275+
TARGET_BUILTIN(__builtin_ia32_vmaxpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
2276+
TARGET_BUILTIN(__builtin_ia32_vmaxpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
2277+
TARGET_BUILTIN(__builtin_ia32_vminpbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
2278+
TARGET_BUILTIN(__builtin_ia32_vminpbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
2279+
TARGET_BUILTIN(__builtin_ia32_vminpbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
2280+
TARGET_BUILTIN(__builtin_ia32_vmulnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
2281+
TARGET_BUILTIN(__builtin_ia32_vmulnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
2282+
TARGET_BUILTIN(__builtin_ia32_vmulnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
2283+
TARGET_BUILTIN(__builtin_ia32_vsubnepbf16128, "V8yV8yV8y", "ncV:128:", "avx10.2-256")
2284+
TARGET_BUILTIN(__builtin_ia32_vsubnepbf16256, "V16yV16yV16y", "ncV:256:", "avx10.2-256")
2285+
TARGET_BUILTIN(__builtin_ia32_vsubnepbf16512, "V32yV32yV32y", "ncV:512:", "avx10.2-512")
2286+
TARGET_BUILTIN(__builtin_ia32_vcomsbf16eq, "iV8yV8y", "ncV:128:", "avx10.2-256")
2287+
TARGET_BUILTIN(__builtin_ia32_vcomsbf16lt, "iV8yV8y", "ncV:128:", "avx10.2-256")
2288+
TARGET_BUILTIN(__builtin_ia32_vcomsbf16neq, "iV8yV8y", "ncV:128:", "avx10.2-256")
2289+
TARGET_BUILTIN(__builtin_ia32_vcomsbf16ge, "iV8yV8y", "ncV:128:", "avx10.2-256")
2290+
TARGET_BUILTIN(__builtin_ia32_vcomsbf16gt, "iV8yV8y", "ncV:128:", "avx10.2-256")
2291+
TARGET_BUILTIN(__builtin_ia32_vcomsbf16le, "iV8yV8y", "ncV:128:", "avx10.2-256")
2292+
TARGET_BUILTIN(__builtin_ia32_vcmppbf16512_mask,"UiV32yV32yIiUi", "ncV:512:", "avx10.2-512")
2293+
TARGET_BUILTIN(__builtin_ia32_vcmppbf16256_mask,"UsV16yV16yIiUs", "ncV:256:", "avx10.2-256")
2294+
TARGET_BUILTIN(__builtin_ia32_vcmppbf16128_mask,"UcV8yV8yIiUc", "ncV:128:", "avx10.2-256")
2295+
TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16128_mask, "UcV8yIiUc", "ncV:128:", "avx10.2-256")
2296+
TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16256_mask, "UsV16yIiUs", "ncV:256:", "avx10.2-256")
2297+
TARGET_BUILTIN(__builtin_ia32_vfpclasspbf16512_mask, "UiV32yIiUi", "ncV:512:", "avx10.2-512")
2298+
TARGET_BUILTIN(__builtin_ia32_vscalefpbf16128_mask, "V8yV8yV8yV8yUc", "ncV:128:", "avx10.2-256")
2299+
TARGET_BUILTIN(__builtin_ia32_vscalefpbf16256_mask, "V16yV16yV16yV16yUs", "ncV:256:", "avx10.2-256")
2300+
TARGET_BUILTIN(__builtin_ia32_vscalefpbf16512_mask, "V32yV32yV32yV32yUi", "ncV:512:", "avx10.2-512")
2301+
TARGET_BUILTIN(__builtin_ia32_vrcppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
2302+
TARGET_BUILTIN(__builtin_ia32_vrcppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
2303+
TARGET_BUILTIN(__builtin_ia32_vrcppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
2304+
TARGET_BUILTIN(__builtin_ia32_vgetexppbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
2305+
TARGET_BUILTIN(__builtin_ia32_vgetexppbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
2306+
TARGET_BUILTIN(__builtin_ia32_vgetexppbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
2307+
TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16128_mask, "V8yV8yV8yUc", "ncV:128:", "avx10.2-256")
2308+
TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16256_mask, "V16yV16yV16yUs", "ncV:256:", "avx10.2-256")
2309+
TARGET_BUILTIN(__builtin_ia32_vrsqrtpbf16512_mask, "V32yV32yV32yUi", "ncV:512:", "avx10.2-512")
2310+
TARGET_BUILTIN(__builtin_ia32_vreducenepbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
2311+
TARGET_BUILTIN(__builtin_ia32_vreducenepbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
2312+
TARGET_BUILTIN(__builtin_ia32_vreducenepbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
2313+
TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
2314+
TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
2315+
TARGET_BUILTIN(__builtin_ia32_vrndscalenepbf16_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
2316+
TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16128_mask, "V8yV8yIiV8yUc", "ncV:128:", "avx10.2-256")
2317+
TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16256_mask, "V16yV16yIiV16yUs", "ncV:256:", "avx10.2-256")
2318+
TARGET_BUILTIN(__builtin_ia32_vgetmantpbf16512_mask, "V32yV32yIiV32yUi", "ncV:512:", "avx10.2-512")
2319+
TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16, "V8yV8y", "ncV:128:", "avx10.2-256")
2320+
TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16256, "V16yV16y", "ncV:256:", "avx10.2-256")
2321+
TARGET_BUILTIN(__builtin_ia32_vsqrtnepbf16512, "V32yV32y", "ncV:512:", "avx10.2-512")
2322+
TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh512, "V32yV32yV32yV32y", "ncV:512:", "avx10.2-512")
2323+
TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh256, "V16yV16yV16yV16y", "ncV:256:", "avx10.2-256")
2324+
TARGET_BUILTIN(__builtin_ia32_vfmaddnepbh128, "V8yV8yV8yV8y", "ncV:128:", "avx10.2-256")
2325+
22642326
#undef BUILTIN
22652327
#undef TARGET_BUILTIN
22662328
#undef TARGET_HEADER_BUILTIN

clang/lib/Basic/Targets/X86.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,7 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
306306
HasAVX10_1_512 = true;
307307
} else if (Feature == "+avx10.2-256") {
308308
HasAVX10_2 = true;
309+
HasFullBFloat16 = true;
309310
} else if (Feature == "+avx10.2-512") {
310311
HasAVX10_2_512 = true;
311312
} else if (Feature == "+avx512cd") {

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14728,6 +14728,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
1472814728
case X86::BI__builtin_ia32_storeups512_mask:
1472914729
return EmitX86MaskedStore(*this, Ops, Align(1));
1473014730

14731+
case X86::BI__builtin_ia32_storesbf16128_mask:
1473114732
case X86::BI__builtin_ia32_storesh128_mask:
1473214733
case X86::BI__builtin_ia32_storess128_mask:
1473314734
case X86::BI__builtin_ia32_storesd128_mask:
@@ -14836,6 +14837,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
1483614837
case X86::BI__builtin_ia32_vfmaddph512_mask:
1483714838
case X86::BI__builtin_ia32_vfmaddph512_maskz:
1483814839
case X86::BI__builtin_ia32_vfmaddph512_mask3:
14840+
case X86::BI__builtin_ia32_vfmaddnepbh128:
14841+
case X86::BI__builtin_ia32_vfmaddnepbh256:
14842+
case X86::BI__builtin_ia32_vfmaddnepbh512:
1483914843
case X86::BI__builtin_ia32_vfmaddps512_mask:
1484014844
case X86::BI__builtin_ia32_vfmaddps512_maskz:
1484114845
case X86::BI__builtin_ia32_vfmaddps512_mask3:
@@ -14920,6 +14924,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
1492014924
case X86::BI__builtin_ia32_loaddqudi512_mask:
1492114925
return EmitX86MaskedLoad(*this, Ops, Align(1));
1492214926

14927+
case X86::BI__builtin_ia32_loadsbf16128_mask:
1492314928
case X86::BI__builtin_ia32_loadsh128_mask:
1492414929
case X86::BI__builtin_ia32_loadss128_mask:
1492514930
case X86::BI__builtin_ia32_loadsd128_mask:
@@ -16074,6 +16079,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
1607416079
case X86::BI__builtin_ia32_sqrtph256:
1607516080
case X86::BI__builtin_ia32_sqrtph:
1607616081
case X86::BI__builtin_ia32_sqrtph512:
16082+
case X86::BI__builtin_ia32_vsqrtnepbf16256:
16083+
case X86::BI__builtin_ia32_vsqrtnepbf16:
16084+
case X86::BI__builtin_ia32_vsqrtnepbf16512:
1607716085
case X86::BI__builtin_ia32_sqrtps512:
1607816086
case X86::BI__builtin_ia32_sqrtpd512: {
1607916087
if (Ops.size() == 2) {
@@ -16293,6 +16301,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
1629316301
case X86::BI__builtin_ia32_fpclassps128_mask:
1629416302
case X86::BI__builtin_ia32_fpclassps256_mask:
1629516303
case X86::BI__builtin_ia32_fpclassps512_mask:
16304+
case X86::BI__builtin_ia32_vfpclasspbf16128_mask:
16305+
case X86::BI__builtin_ia32_vfpclasspbf16256_mask:
16306+
case X86::BI__builtin_ia32_vfpclasspbf16512_mask:
1629616307
case X86::BI__builtin_ia32_fpclassph128_mask:
1629716308
case X86::BI__builtin_ia32_fpclassph256_mask:
1629816309
case X86::BI__builtin_ia32_fpclassph512_mask:
@@ -16307,6 +16318,15 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
1630716318
Intrinsic::ID ID;
1630816319
switch (BuiltinID) {
1630916320
default: llvm_unreachable("Unsupported intrinsic!");
16321+
case X86::BI__builtin_ia32_vfpclasspbf16128_mask:
16322+
ID = Intrinsic::x86_avx10_fpclass_nepbf16_128;
16323+
break;
16324+
case X86::BI__builtin_ia32_vfpclasspbf16256_mask:
16325+
ID = Intrinsic::x86_avx10_fpclass_nepbf16_256;
16326+
break;
16327+
case X86::BI__builtin_ia32_vfpclasspbf16512_mask:
16328+
ID = Intrinsic::x86_avx10_fpclass_nepbf16_512;
16329+
break;
1631016330
case X86::BI__builtin_ia32_fpclassph128_mask:
1631116331
ID = Intrinsic::x86_avx512fp16_fpclass_ph_128;
1631216332
break;
@@ -16465,6 +16485,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
1646516485
case X86::BI__builtin_ia32_vcmppd256_round_mask:
1646616486
case X86::BI__builtin_ia32_vcmpps256_round_mask:
1646716487
case X86::BI__builtin_ia32_vcmpph256_round_mask:
16488+
case X86::BI__builtin_ia32_vcmppbf16512_mask:
16489+
case X86::BI__builtin_ia32_vcmppbf16256_mask:
16490+
case X86::BI__builtin_ia32_vcmppbf16128_mask:
1646816491
IsMaskFCmp = true;
1646916492
[[fallthrough]];
1647016493
case X86::BI__builtin_ia32_cmpps:

clang/lib/Headers/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,10 +147,12 @@ set(x86_files
147147
amxcomplexintrin.h
148148
amxfp16intrin.h
149149
amxintrin.h
150+
avx10_2_512bf16intrin.h
150151
avx10_2_512convertintrin.h
151152
avx10_2_512minmaxintrin.h
152153
avx10_2_512niintrin.h
153154
avx10_2_512satcvtintrin.h
155+
avx10_2bf16intrin.h
154156
avx10_2convertintrin.h
155157
avx10_2minmaxintrin.h
156158
avx10_2niintrin.h

0 commit comments

Comments
 (0)