-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[X86][AMX] Support AMX-AVX512 #114070
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86][AMX] Support AMX-AVX512 #114070
Conversation
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-clang-driver Author: Phoebe Wang (phoebewang) ChangesPatch is 81.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114070.diff 31 Files Affected:
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index ce046a305c89b6..d45bd1240dd173 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -611,6 +611,8 @@ X86 Support
* Supported MINMAX intrinsics of ``*_(mask(z)))_minmax(ne)_p[s|d|h|bh]`` and
``*_(mask(z)))_minmax_s[s|d|h]``.
+- Support ISA of ``AMX-AVX512``.
+
- All intrinsics in adcintrin.h can now be used in constant expressions.
- All intrinsics in adxintrin.h can now be used in constant expressions.
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 2c591edb2835cd..70644f3f6b6054 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -128,6 +128,12 @@ TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i",
TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16")
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh_internal, "V32xUsUsV256iUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl_internal, "V32xUsUsV256iUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tilemovrow_internal, "V16iUsUsV256iUi", "n", "amx-avx512")
// AMX
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
@@ -148,6 +154,13 @@ TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite")
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps, "V16fIUcUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h, "V32yIUcUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l, "V32yIUcUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh, "V32xIUcUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl, "V32xIUcUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tilemovrow, "V16iIUcUi", "n", "amx-avx512")
+
TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiv*SLLiSLLiIi", "n", "cmpccxadd")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 2ddb2f5312148e..fd200abebceb11 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6277,6 +6277,8 @@ def mno_80387 : Flag<["-"], "mno-80387">, Alias<mno_x87>;
def mno_fp_ret_in_387 : Flag<["-"], "mno-fp-ret-in-387">, Alias<mno_x87>;
def mmmx : Flag<["-"], "mmmx">, Group<m_x86_Features_Group>;
def mno_mmx : Flag<["-"], "mno-mmx">, Group<m_x86_Features_Group>;
+def mamx_avx512 : Flag<["-"], "mamx-avx512">, Group<m_x86_Features_Group>;
+def mno_amx_avx512 : Flag<["-"], "mno-amx-avx512">, Group<m_x86_Features_Group>;
def mamx_bf16 : Flag<["-"], "mamx-bf16">, Group<m_x86_Features_Group>;
def mno_amx_bf16 : Flag<["-"], "mno-amx-bf16">, Group<m_x86_Features_Group>;
def mamx_complex : Flag<["-"], "mamx-complex">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 5448bd841959f4..52cab65cbd9451 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -418,6 +418,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAMXTILE = true;
} else if (Feature == "+amx-complex") {
HasAMXCOMPLEX = true;
+ } else if (Feature == "+amx-avx512") {
+ HasAMXAVX512 = true;
} else if (Feature == "+cmpccxadd") {
HasCMPCCXADD = true;
} else if (Feature == "+raoint") {
@@ -935,6 +937,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AMX_FP16__");
if (HasAMXCOMPLEX)
Builder.defineMacro("__AMX_COMPLEX__");
+ if (HasAMXAVX512)
+ Builder.defineMacro("__AMX_AVX512__");
if (HasCMPCCXADD)
Builder.defineMacro("__CMPCCXADD__");
if (HasRAOINT)
@@ -1060,6 +1064,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
return llvm::StringSwitch<bool>(Name)
.Case("adx", true)
.Case("aes", true)
+ .Case("amx-avx512", true)
.Case("amx-bf16", true)
.Case("amx-complex", true)
.Case("amx-fp16", true)
@@ -1177,6 +1182,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
return llvm::StringSwitch<bool>(Feature)
.Case("adx", HasADX)
.Case("aes", HasAES)
+ .Case("amx-avx512", HasAMXAVX512)
.Case("amx-bf16", HasAMXBF16)
.Case("amx-complex", HasAMXCOMPLEX)
.Case("amx-fp16", HasAMXFP16)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index a99ae62984c7d5..ce7458ae99ad64 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -156,6 +156,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasAMXINT8 = false;
bool HasAMXBF16 = false;
bool HasAMXCOMPLEX = false;
+ bool HasAMXAVX512 = false;
bool HasSERIALIZE = false;
bool HasTSXLDTRK = false;
bool HasUSERMSR = false;
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index ff392e7122a448..88e8f282bd7ec2 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -146,6 +146,7 @@ set(x86_files
adcintrin.h
adxintrin.h
ammintrin.h
+ amxavx512intrin.h
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
diff --git a/clang/lib/Headers/amxavx512intrin.h b/clang/lib/Headers/amxavx512intrin.h
new file mode 100644
index 00000000000000..f819696f8086b7
--- /dev/null
+++ b/clang/lib/Headers/amxavx512intrin.h
@@ -0,0 +1,381 @@
+/*===--------------------- amxavx512intrin.h - AMXAVX512 --------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_AVX512INTRIN_H
+#define __AMX_AVX512INTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_AVX512 \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-avx512")))
+
+/// Moves a row from a tile register to a zmm destination register, converting
+/// the int32 source elements to fp32. The row of the tile is selected by an
+/// 32b GPR.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowd2ps(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWD2PS instruction.
+///
+/// \param tsrc
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param row
+/// The row of the source tile
+#define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row)
+
+/// Moves a row from a tile register to a zmm destination register, converting
+/// the fp32 source elements to bf16. It places the resulting bf16 elements
+/// in the high 16 bits within each dword. The row of the tile is selected
+/// by an 32b GPR.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowps2pbf16h(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.word[2*i+0] := 0
+/// dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWPS2PBF16H instruction.
+///
+/// \param tsrc
+/// The source tile. Max size is 1024 Bytes.
+/// \param row
+/// The the row of the source tile.
+#define _tile_cvtrowps2pbf16h(tsrc, row) \
+ __builtin_ia32_tcvtrowps2pbf16h(tsrc, row)
+
+/// Moves a row from a tile register to a zmm destination register, converting
+/// the fp32 source elements to bf16. It places the resulting bf16 elements
+/// in the low 16 bits within each dword. The row of the tile is selected
+/// by an 32b GPR.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowps2pbf16l(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.word[2*i+1] := 0
+/// dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWPS2PBF16L instruction.
+///
+/// \param tsrc
+/// The source tile. Max size is 1024 Bytes.
+/// \param row
+/// The the row of the source tile.
+#define _tile_cvtrowps2pbf16l(tsrc, row) \
+ __builtin_ia32_tcvtrowps2pbf16l(tsrc, row)
+
+/// Moves a row from a tile register to a zmm destination register, converting
+/// the fp32 source elements to fp16. It places the resulting fp16 elements
+/// in the high 16 bits within each dword. The row of the tile is selected
+/// by an 32b GPR.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowps2phh(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.word[2*i+0] := 0
+/// dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction.
+///
+/// \param tsrc
+/// The source tile. Max size is 1024 Bytes.
+/// \param row
+/// The the row of the source tile.
+#define _tile_cvtrowps2phh(tsrc, row) __builtin_ia32_tcvtrowps2phh(tsrc, row)
+
+/// Moves a row from a tile register to a zmm destination register, converting
+/// the fp32 source elements to fp16. It places the resulting fp16 elements
+/// in the low 16 bits within each dword. The row of the tile is selected
+/// by an 32b GPR.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowps2phl(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.word[2*i+1] := 0
+/// dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction.
+///
+/// \param tsrc
+/// The source tile. Max size is 1024 Bytes.
+/// \param row
+/// The the row of the source tile.
+#define _tile_cvtrowps2phl(tsrc, row) __builtin_ia32_tcvtrowps2phl(tsrc, row)
+
+/// Move one row of a tile data to a v16f32 data.
+/// The row of the tile is selected by a 32b GPR.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m512 _tile_movrow(__tile a, unsigned b);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
+///
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v16f32 data. Size is 64 Bytes.
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL>>3
+/// row_index := b&0xffff
+/// row_chunk := ((b>>16)&0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes-1)
+/// IF (row_chunk + i >= a.colsb)
+/// dst.byte[i] := 0
+/// ELSE
+/// dst.byte[i] := a.row[row_index].byte[row_chunk+i]
+/// ENDFOR
+/// \endcode
+#define _tile_movrow(a, b) __builtin_ia32_tilemovrow(a, b)
+
+/// This is internal intrinsic. C/C++ user should avoid calling it directly.
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal(
+ unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
+ return __builtin_ia32_tcvtrowd2ps_internal(m, n, src, u);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
+_tile_cvtrowps2pbf16h_internal(unsigned short m, unsigned short n,
+ _tile1024i src, unsigned u) {
+ return __builtin_ia32_tcvtrowps2pbf16h_internal(m, n, src, u);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
+_tile_cvtrowps2pbf16l_internal(unsigned short m, unsigned short n,
+ _tile1024i src, unsigned u) {
+ return __builtin_ia32_tcvtrowps2pbf16l_internal(m, n, src, u);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal(
+ unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
+ return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phl_internal(
+ unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
+ return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_AVX512 _tile_movrow_internal(
+ unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
+ return (__m512i)__builtin_ia32_tilemovrow_internal(m, n, src, u);
+}
+
+/// Move a row from a tile (src0) to a v16f32 dst, converting the int32 source
+/// elements to fp32. No SIMD exceptions are generated. Rounding is done as if
+/// MXCSR.RC=RNE. Embedded rounding is not supported.
+/// The row and chunk elements of tile is fetched from 32bit src1.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCVTROWD2PS </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v16f32 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) {
+ return _tile_cvtrowd2ps_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
+/// elements to bf16 at high 16-bits of each dword.
+/// The row and chunk elements of tile is fetched from 32bit src1.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCVTROWPS2PBF16H </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v32bf16 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512bh __tile_cvtrowps2pbf16h(__tile1024i src0, unsigned src1) {
+ return _tile_cvtrowps2pbf16h_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
+/// elements to bf16 at low 16-bits of each dword.
+/// The row and chunk elements of tile is fetched from 32bit src1.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCVTROWPS2PBF16L </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v32bf16 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512bh __tile_cvtrowps2pbf16l(__tile1024i src0, unsigned src1) {
+ return _tile_cvtrowps2pbf16l_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
+/// elements to fp16 at high 16-bits of each dword.
+/// The row and chunk elements of tile is fetched from 32bit src1.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCVTROWPS2PHH </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v32fp16 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512h __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) {
+ return _tile_cvtrowps2phh_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
+/// elements to fp16 at low 16-bits of each dword.
+/// The row and chunk elements of tile is fetched from 32bit src1.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCVTROWPS2PHL </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v32fp16 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512h __tile_cvtrowps2phl(__tile1024i src0, unsigned src1) {
+ return _tile_cvtrowps2phl_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+/// Move one row of a tile data to a v16f32 data.
+/// The row of the tile is selected by a 32b GPR.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v16i32 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512i __tile_movrow(__tile1024i src0, unsigned src1) {
+ return (__m512i)_tile_movrow_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+#endif // __x86_64__
+#endif // __AMX_AVX512INTRIN_H
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 3fbabffa98df20..84e56238fcf2dc 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -638,6 +638,10 @@ _storebe_i64(void * __P, long long __D) {
#include <amxcomplexintrin.h>
#endif
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_AVX512__)
+#include <amxavx512intrin.h>
+#endif
+
#if !defined(__SCE__) || __has_feature(modules) || \
defined(__AVX512VP2INTERSECT__)
#include <avx512vp2intersectintrin.h>
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index 6a4d78f0ca9084..fba901473e6e18 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -631,6 +631,12 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCa...
[truncated]
|
@llvm/pr-subscribers-backend-x86 Author: Phoebe Wang (phoebewang) ChangesPatch is 81.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114070.diff 31 Files Affected:
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index ce046a305c89b6..d45bd1240dd173 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -611,6 +611,8 @@ X86 Support
* Supported MINMAX intrinsics of ``*_(mask(z)))_minmax(ne)_p[s|d|h|bh]`` and
``*_(mask(z)))_minmax_s[s|d|h]``.
+- Support ISA of ``AMX-AVX512``.
+
- All intrinsics in adcintrin.h can now be used in constant expressions.
- All intrinsics in adxintrin.h can now be used in constant expressions.
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 2c591edb2835cd..70644f3f6b6054 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -128,6 +128,12 @@ TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i",
TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16")
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh_internal, "V32xUsUsV256iUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl_internal, "V32xUsUsV256iUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tilemovrow_internal, "V16iUsUsV256iUi", "n", "amx-avx512")
// AMX
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
@@ -148,6 +154,13 @@ TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite")
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps, "V16fIUcUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h, "V32yIUcUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l, "V32yIUcUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh, "V32xIUcUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl, "V32xIUcUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tilemovrow, "V16iIUcUi", "n", "amx-avx512")
+
TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiv*SLLiSLLiIi", "n", "cmpccxadd")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 2ddb2f5312148e..fd200abebceb11 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6277,6 +6277,8 @@ def mno_80387 : Flag<["-"], "mno-80387">, Alias<mno_x87>;
def mno_fp_ret_in_387 : Flag<["-"], "mno-fp-ret-in-387">, Alias<mno_x87>;
def mmmx : Flag<["-"], "mmmx">, Group<m_x86_Features_Group>;
def mno_mmx : Flag<["-"], "mno-mmx">, Group<m_x86_Features_Group>;
+def mamx_avx512 : Flag<["-"], "mamx-avx512">, Group<m_x86_Features_Group>;
+def mno_amx_avx512 : Flag<["-"], "mno-amx-avx512">, Group<m_x86_Features_Group>;
def mamx_bf16 : Flag<["-"], "mamx-bf16">, Group<m_x86_Features_Group>;
def mno_amx_bf16 : Flag<["-"], "mno-amx-bf16">, Group<m_x86_Features_Group>;
def mamx_complex : Flag<["-"], "mamx-complex">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 5448bd841959f4..52cab65cbd9451 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -418,6 +418,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAMXTILE = true;
} else if (Feature == "+amx-complex") {
HasAMXCOMPLEX = true;
+ } else if (Feature == "+amx-avx512") {
+ HasAMXAVX512 = true;
} else if (Feature == "+cmpccxadd") {
HasCMPCCXADD = true;
} else if (Feature == "+raoint") {
@@ -935,6 +937,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AMX_FP16__");
if (HasAMXCOMPLEX)
Builder.defineMacro("__AMX_COMPLEX__");
+ if (HasAMXAVX512)
+ Builder.defineMacro("__AMX_AVX512__");
if (HasCMPCCXADD)
Builder.defineMacro("__CMPCCXADD__");
if (HasRAOINT)
@@ -1060,6 +1064,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
return llvm::StringSwitch<bool>(Name)
.Case("adx", true)
.Case("aes", true)
+ .Case("amx-avx512", true)
.Case("amx-bf16", true)
.Case("amx-complex", true)
.Case("amx-fp16", true)
@@ -1177,6 +1182,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
return llvm::StringSwitch<bool>(Feature)
.Case("adx", HasADX)
.Case("aes", HasAES)
+ .Case("amx-avx512", HasAMXAVX512)
.Case("amx-bf16", HasAMXBF16)
.Case("amx-complex", HasAMXCOMPLEX)
.Case("amx-fp16", HasAMXFP16)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index a99ae62984c7d5..ce7458ae99ad64 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -156,6 +156,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasAMXINT8 = false;
bool HasAMXBF16 = false;
bool HasAMXCOMPLEX = false;
+ bool HasAMXAVX512 = false;
bool HasSERIALIZE = false;
bool HasTSXLDTRK = false;
bool HasUSERMSR = false;
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index ff392e7122a448..88e8f282bd7ec2 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -146,6 +146,7 @@ set(x86_files
adcintrin.h
adxintrin.h
ammintrin.h
+ amxavx512intrin.h
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
diff --git a/clang/lib/Headers/amxavx512intrin.h b/clang/lib/Headers/amxavx512intrin.h
new file mode 100644
index 00000000000000..f819696f8086b7
--- /dev/null
+++ b/clang/lib/Headers/amxavx512intrin.h
@@ -0,0 +1,381 @@
+/*===--------------------- amxavx512intrin.h - AMXAVX512 --------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_AVX512INTRIN_H
+#define __AMX_AVX512INTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_AVX512 \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-avx512")))
+
+/// Moves a row from a tile register to a zmm destination register, converting
+/// the int32 source elements to fp32. The row of the tile is selected by an
+/// 32b GPR.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowd2ps(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWD2PS instruction.
+///
+/// \param tsrc
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param row
+/// The row of the source tile
+#define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row)
+
+/// Moves a row from a tile register to a zmm destination register, converting
+/// the fp32 source elements to bf16. It places the resulting bf16 elements
+/// in the high 16 bits within each dword. The row of the tile is selected
+/// by an 32b GPR.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowps2pbf16h(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.word[2*i+0] := 0
+/// dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWPS2PBF16H instruction.
+///
+/// \param tsrc
+/// The source tile. Max size is 1024 Bytes.
+/// \param row
+/// The the row of the source tile.
+#define _tile_cvtrowps2pbf16h(tsrc, row) \
+ __builtin_ia32_tcvtrowps2pbf16h(tsrc, row)
+
+/// Moves a row from a tile register to a zmm destination register, converting
+/// the fp32 source elements to bf16. It places the resulting bf16 elements
+/// in the low 16 bits within each dword. The row of the tile is selected
+/// by an 32b GPR.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowps2pbf16l(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.word[2*i+1] := 0
+/// dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWPS2PBF16L instruction.
+///
+/// \param tsrc
+/// The source tile. Max size is 1024 Bytes.
+/// \param row
+/// The the row of the source tile.
+#define _tile_cvtrowps2pbf16l(tsrc, row) \
+ __builtin_ia32_tcvtrowps2pbf16l(tsrc, row)
+
+/// Moves a row from a tile register to a zmm destination register, converting
+/// the fp32 source elements to fp16. It places the resulting fp16 elements
+/// in the high 16 bits within each dword. The row of the tile is selected
+/// by an 32b GPR.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowps2phh(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.word[2*i+0] := 0
+/// dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction.
+///
+/// \param tsrc
+/// The source tile. Max size is 1024 Bytes.
+/// \param row
+/// The the row of the source tile.
+#define _tile_cvtrowps2phh(tsrc, row) __builtin_ia32_tcvtrowps2phh(tsrc, row)
+
+/// Moves a row from a tile register to a zmm destination register, converting
+/// the fp32 source elements to fp16. It places the resulting fp16 elements
+/// in the low 16 bits within each dword. The row of the tile is selected
+/// by an 32b GPR.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowps2phl(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.word[2*i+1] := 0
+/// dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction.
+///
+/// \param tsrc
+/// The source tile. Max size is 1024 Bytes.
+/// \param row
+/// The the row of the source tile.
+#define _tile_cvtrowps2phl(tsrc, row) __builtin_ia32_tcvtrowps2phl(tsrc, row)
+
+/// Move one row of a tile data to a v16f32 data.
+/// The row of the tile is selected by a 32b GPR.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m512 _tile_movrow(__tile a, unsigned b);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
+///
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v16f32 data. Size is 64 Bytes.
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL>>3
+/// row_index := b&0xffff
+/// row_chunk := ((b>>16)&0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes-1)
+/// IF (row_chunk + i >= a.colsb)
+/// dst.byte[i] := 0
+/// ELSE
+/// dst.byte[i] := a.row[row_index].byte[row_chunk+i]
+/// ENDFOR
+/// \endcode
+#define _tile_movrow(a, b) __builtin_ia32_tilemovrow(a, b)
+
+/// This is internal intrinsic. C/C++ user should avoid calling it directly.
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal(
+ unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
+ return __builtin_ia32_tcvtrowd2ps_internal(m, n, src, u);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
+_tile_cvtrowps2pbf16h_internal(unsigned short m, unsigned short n,
+ _tile1024i src, unsigned u) {
+ return __builtin_ia32_tcvtrowps2pbf16h_internal(m, n, src, u);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
+_tile_cvtrowps2pbf16l_internal(unsigned short m, unsigned short n,
+ _tile1024i src, unsigned u) {
+ return __builtin_ia32_tcvtrowps2pbf16l_internal(m, n, src, u);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal(
+ unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
+ return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phl_internal(
+ unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
+ return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_AVX512 _tile_movrow_internal(
+ unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
+ return (__m512i)__builtin_ia32_tilemovrow_internal(m, n, src, u);
+}
+
+/// Move a row from a tile (src0) to a v16f32 dst, converting the int32 source
+/// elements to fp32. No SIMD exceptions are generated. Rounding is done as if
+/// MXCSR.RC=RNE. Embedded rounding is not supported.
+/// The row and chunk elements of tile is fetched from 32bit src1.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCVTROWD2PS </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v16f32 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) {
+ return _tile_cvtrowd2ps_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
+/// elements to bf16 at high 16-bits of each dword.
+/// The row and chunk elements of tile is fetched from 32bit src1.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCVTROWPS2PBF16H </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v32bf16 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512bh __tile_cvtrowps2pbf16h(__tile1024i src0, unsigned src1) {
+ return _tile_cvtrowps2pbf16h_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
+/// elements to bf16 at low 16-bits of each dword.
+/// The row and chunk elements of tile is fetched from 32bit src1.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCVTROWPS2PBF16L </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v32bf16 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512bh __tile_cvtrowps2pbf16l(__tile1024i src0, unsigned src1) {
+ return _tile_cvtrowps2pbf16l_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
+/// elements to fp16 at high 16-bits of each dword.
+/// The row and chunk elements of tile is fetched from 32bit src1.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCVTROWPS2PHH </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v32fp16 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512h __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) {
+ return _tile_cvtrowps2phh_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
+/// elements to fp16 at low 16-bits of each dword.
+/// The row and chunk elements of tile is fetched from 32bit src1.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCVTROWPS2PHL </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v32fp16 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512h __tile_cvtrowps2phl(__tile1024i src0, unsigned src1) {
+ return _tile_cvtrowps2phl_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+/// Move one row of a tile data to a v16f32 data.
+/// The row of the tile is selected by a 32b GPR.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v16i32 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512i __tile_movrow(__tile1024i src0, unsigned src1) {
+ return (__m512i)_tile_movrow_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+#endif // __x86_64__
+#endif // __AMX_AVX512INTRIN_H
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 3fbabffa98df20..84e56238fcf2dc 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -638,6 +638,10 @@ _storebe_i64(void * __P, long long __D) {
#include <amxcomplexintrin.h>
#endif
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_AVX512__)
+#include <amxavx512intrin.h>
+#endif
+
#if !defined(__SCE__) || __has_feature(modules) || \
defined(__AVX512VP2INTERSECT__)
#include <avx512vp2intersectintrin.h>
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index 6a4d78f0ca9084..fba901473e6e18 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -631,6 +631,12 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCa...
[truncated]
|
clang/lib/Headers/amxavx512intrin.h
Outdated
__attribute__((__always_inline__, __nodebug__, __target__("amx-avx512"))) | ||
|
||
/// Moves a row from a tile register to a zmm destination register, converting | ||
/// the int32 source elements to fp32. The row of the tile is selected by an |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
an -> a.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
clang/lib/Headers/amxavx512intrin.h
Outdated
/// This intrinsic corresponds to the \c TCVTROWD2PS instruction. | ||
/// | ||
/// \param tsrc | ||
/// The 1st source tile. Max size is 1024 Bytes. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove "1st" since there is only one source tile. The following should be updated as this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
clang/lib/Headers/amxavx512intrin.h
Outdated
/// Moves a row from a tile register to a zmm destination register, converting | ||
/// the fp32 source elements to bf16. It places the resulting bf16 elements | ||
/// in the high 16 bits within each dword. The row of the tile is selected | ||
/// by an 32b GPR. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
an -> a. The following "an 32b" should be updated to "a 32b" too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
Opc = X86::TILEMOVROWrri; | ||
break; | ||
default: | ||
llvm_unreachable("Impossible Opcode!"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Better to change "Impossible" to "Invalid".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
You can test this locally with the following command:git-clang-format --diff 37ce18951fded6be1de319b05b968918cb45c00b ffcb6a7452a8ef41e62e47ce71747559eca1c83a --extensions c,cpp,h -- clang/lib/Headers/amxavx512intrin.h clang/test/CodeGen/X86/amx_avx512_api.c clang/test/CodeGen/X86/amxavx512-builtins.c clang/lib/Basic/Targets/X86.cpp clang/lib/Basic/Targets/X86.h clang/lib/Headers/immintrin.h clang/lib/Sema/SemaX86.cpp clang/test/CodeGen/attr-target-x86.c clang/test/Driver/x86-target-features.c clang/test/Preprocessor/x86_target_features.c llvm/lib/Target/X86/X86ExpandPseudo.cpp llvm/lib/Target/X86/X86ISelLowering.cpp llvm/lib/Target/X86/X86LowerAMXType.cpp llvm/lib/Target/X86/X86PreTileConfig.cpp llvm/lib/TargetParser/Host.cpp llvm/lib/TargetParser/X86TargetParser.cpp View the diff from clang-format here.diff --git a/clang/lib/Headers/amxavx512intrin.h b/clang/lib/Headers/amxavx512intrin.h
index 945edea543..a01c6eff93 100644
--- a/clang/lib/Headers/amxavx512intrin.h
+++ b/clang/lib/Headers/amxavx512intrin.h
@@ -37,7 +37,8 @@
/// IF i + row_chunk / 4 >= tsrc.colsb / 4
/// dst.dword[i] := 0
/// ELSE
-/// dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
+/// dst.f32[i] :=
+/// CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
/// FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
@@ -73,7 +74,8 @@
/// dst.dword[i] := 0
/// ELSE
/// dst.word[2*i+0] := 0
-/// dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
+/// dst.bf16[2*i+1] :=
+/// CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
/// FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
@@ -110,7 +112,8 @@
/// dst.dword[i] := 0
/// ELSE
/// dst.word[2*i+1] := 0
-/// dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
+/// dst.bf16[2*i+0] :=
+/// CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
/// FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
@@ -147,7 +150,8 @@
/// dst.dword[i] := 0
/// ELSE
/// dst.word[2*i+0] := 0
-/// dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
+/// dst.fp16[2*i+1] :=
+/// CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
/// FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
@@ -183,7 +187,8 @@
/// dst.dword[i] := 0
/// ELSE
/// dst.word[2*i+1] := 0
-/// dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
+/// dst.fp16[2*i+0] :=
+/// CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
/// FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
|
@@ -133,6 +133,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", | |||
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") | |||
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") | |||
TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose") | |||
TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is "avx10.2-512" feature needed to be added for the intrinsics here and there?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch! Done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it necessary to add avx10.2-512 feature for these internal APIs? With that, we may detect errors for new APIs if there is no AVX10.2-512 target feature.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, sorry I missed that parts.
@@ -133,6 +133,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", | |||
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") | |||
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose") | |||
TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose") | |||
TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it necessary to add avx10.2-512 feature for these internal APIs? With that, we may detect errors for new APIs if there is no AVX10.2-512 target feature.
clang/lib/Headers/amxavx512intrin.h
Outdated
#ifdef __x86_64__ | ||
|
||
#define __DEFAULT_FN_ATTRS_AVX512 \ | ||
__attribute__((__always_inline__, __nodebug__, __target__("amx-avx512"))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If AVX10.2-512 feature dependency is needed for internal APIs, we may create another attribute with AVX10.2-512 and add it to internal APIs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, we need them all. Good catch!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM except the last place probably missing avx10.2-512 dependency.
llvm/lib/Target/X86/X86InstrAMX.td
Outdated
@@ -369,3 +369,150 @@ let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { | |||
} | |||
} | |||
} // HasAMXTILE, HasAMXTRANSPOSE | |||
|
|||
multiclass m_tcvtrowd2ps { | |||
let Predicates = [HasAMXAVX512, In64BitMode] in { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should add HasAVX10_2_512 in line 374, 390 and 454?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/52/builds/3564 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/42/builds/1812 Here is the relevant piece of the build log for the reference
|
FYI this is causing Chrome X86 MacOS builds to fail due to |
As I mentioned in https://crbug.com/378111077#comment3, the issue is that we pull in avx512bf16intrin.h because |
This is causing similar errors for us as well. |
Figured out a repro -
Test was on an x64 linux system. |
Reverts #114070 Reason: Causes `immintrin.h` to fail to compile if `-msse` and `-mno-sse2` are passed to clang: #114070 (comment)
Thanks all! Fixed in #115581 |
Reverts llvm#114070 Reason: Causes `immintrin.h` to fail to compile if `-msse` and `-mno-sse2` are passed to clang: llvm#114070 (comment)
No description provided.