-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[X86][AMX] Support AMX-TRANSPOSE #113532
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86][AMX] Support AMX-TRANSPOSE #113532
Conversation
@llvm/pr-subscribers-clang-driver @llvm/pr-subscribers-clang-codegen Author: Phoebe Wang (phoebewang) ChangesRef.: https://cdrdv2.intel.com/v1/dl/getContent/671368 Patch is 184.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113532.diff 57 Files Affected:
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index ce046a305c89b6..dc58f98af55cc9 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -623,6 +623,8 @@ X86 Support
- All intrinsics in tbmintrin.h can now be used in constant expressions.
+- Support ISA of ``AMX-TRANSPOSE``.
+
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 2c591edb2835cd..4e95a8a73d550a 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -128,6 +128,11 @@ TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i",
TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16")
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
// AMX
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
@@ -148,6 +153,12 @@ TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite")
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0, "vIUcvC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n","amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose")
+
TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiv*SLLiSLLiIi", "n", "cmpccxadd")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 2ddb2f5312148e..c55f2b86f4cb1f 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6287,6 +6287,8 @@ def mamx_int8 : Flag<["-"], "mamx-int8">, Group<m_x86_Features_Group>;
def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group<m_x86_Features_Group>;
def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
+def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
+def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>;
def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group<m_x86_Features_Group>;
def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 5448bd841959f4..fe5b600e6777fb 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -418,6 +418,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAMXTILE = true;
} else if (Feature == "+amx-complex") {
HasAMXCOMPLEX = true;
+ } else if (Feature == "+amx-transpose") {
+ HasAMXTRANSPOSE = true;
} else if (Feature == "+cmpccxadd") {
HasCMPCCXADD = true;
} else if (Feature == "+raoint") {
@@ -935,6 +937,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AMX_FP16__");
if (HasAMXCOMPLEX)
Builder.defineMacro("__AMX_COMPLEX__");
+ if (HasAMXTRANSPOSE)
+ Builder.defineMacro("__AMX_TRANSPOSE__");
if (HasCMPCCXADD)
Builder.defineMacro("__CMPCCXADD__");
if (HasRAOINT)
@@ -1065,6 +1069,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("amx-fp16", true)
.Case("amx-int8", true)
.Case("amx-tile", true)
+ .Case("amx-transpose", true)
.Case("avx", true)
.Case("avx10.1-256", true)
.Case("avx10.1-512", true)
@@ -1182,6 +1187,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("amx-fp16", HasAMXFP16)
.Case("amx-int8", HasAMXINT8)
.Case("amx-tile", HasAMXTILE)
+ .Case("amx-transpose", HasAMXTRANSPOSE)
.Case("avx", SSELevel >= AVX)
.Case("avx10.1-256", HasAVX10_1)
.Case("avx10.1-512", HasAVX10_1_512)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index a99ae62984c7d5..3e1fb41082950c 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -156,6 +156,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasAMXINT8 = false;
bool HasAMXBF16 = false;
bool HasAMXCOMPLEX = false;
+ bool HasAMXTRANSPOSE = false;
bool HasSERIALIZE = false;
bool HasTSXLDTRK = false;
bool HasUSERMSR = false;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 3f28b7f26c36fe..67d28ccec0f373 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16920,6 +16920,58 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
// instruction, but it will create a memset that won't be optimized away.
return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
}
+ // Corresponding to intrisics which will return 2 tiles (tile0_tile1).
+ case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
+ case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
+ case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
+ case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: {
+ Intrinsic::ID IID;
+ switch (BuiltinID) {
+ default:
+ llvm_unreachable("Unsupported intrinsic!");
+ case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
+ IID = Intrinsic::x86_t2rpntlvwz0_internal;
+ break;
+ case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
+ IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
+ break;
+ case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
+ IID = Intrinsic::x86_t2rpntlvwz1_internal;
+ break;
+ case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
+ IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
+ break;
+ }
+
+ // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
+ Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
+ {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]});
+
+ auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>();
+ assert(PtrTy && "arg3 must be of pointer type");
+ QualType PtreeTy = PtrTy->getPointeeType();
+ llvm::Type *TyPtee = ConvertType(PtreeTy);
+
+ // Bitcast amx type (x86_amx) to vector type (256 x i32)
+ // Then store tile0 into DstPtr0
+ Value *T0 = Builder.CreateExtractValue(Call, 0);
+ Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
+ {TyPtee}, {T0});
+ Builder.CreateDefaultAlignedStore(VecT0, Ops[3]);
+
+ // Then store tile1 into DstPtr1
+ Value *T1 = Builder.CreateExtractValue(Call, 1);
+ Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
+ {TyPtee}, {T1});
+ Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]);
+
+ // Note: Here we escape directly use x86_tilestored64_internal to store
+ // the results due to it can't make sure the Mem writen scope. This may
+ // cause shapes reloads after first amx intrinsic, which current amx reg-
+ // ister allocation has no ability to handle it.
+
+ return Store;
+ }
case X86::BI__ud2:
// llvm.trap makes a ud2a instruction on x86.
return EmitTrapCall(Intrinsic::trap);
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index ff392e7122a448..708525198324bb 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -149,6 +149,7 @@ set(x86_files
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
+ amxtransposeintrin.h
avx10_2_512bf16intrin.h
avx10_2_512convertintrin.h
avx10_2_512minmaxintrin.h
diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
index baa56f5b28e8e5..f07a5689011853 100644
--- a/clang/lib/Headers/amxintrin.h
+++ b/clang/lib/Headers/amxintrin.h
@@ -232,6 +232,8 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// bytes. Since there is no 2D type in llvm IR, we use vector type to
/// represent 2D tile and the fixed size is maximum amx tile register size.
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
+typedef int _tile1024i_1024a
+ __attribute__((__vector_size__(1024), __aligned__(1024)));
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h
new file mode 100644
index 00000000000000..d5dc68f4152848
--- /dev/null
+++ b/clang/lib/Headers/amxtransposeintrin.h
@@ -0,0 +1,248 @@
+/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * ===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_TRANSPOSEINTRIN_H
+#define __AMX_TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_TRANSPOSE \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))
+
+#define _tile_2rpntlvwz0(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
+#define _tile_2rpntlvwz0t1(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
+#define _tile_2rpntlvwz1(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
+#define _tile_2rpntlvwz1t1(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
+
+/// Transpose 32-bit elements from \a src and write the result to \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void __tile_transposed(__tile dst, __tile src);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src
+/// The 1st source tile. Max size is 1024 Bytes.
+///
+/// \code{.operation}
+///
+/// FOR i := 0 TO (dst.rows-1)
+/// tmp[511:0] := 0
+/// FOR j := 0 TO (dst.colsb/4-1)
+/// tmp.dword[j] := src.row[j].dword[i]
+/// ENDFOR
+/// dst.row[i] := tmp
+/// ENDFOR
+///
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src)
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ // Use __tile1024i_1024a* to escape the alignment check in
+ // clang/test/Headers/x86-intrinsics-headers-clean.cpp
+ __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
+ (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz0t1_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
+ (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz1t1_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+// This is internal intrinsic. C/C++ user should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
+_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
+ return __builtin_ia32_ttransposed_internal(m, n, src);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+/// Provides a hint to the implementation that the data will likely not be
+/// reused in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will be not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely not be
+/// reused in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will be not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely not be
+/// reused in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Transpose 32-bit elements from src and write the result to dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic cor...
[truncated]
|
3f9b733
to
6abc376
Compare
6abc376
to
1b42b13
Compare
@@ -34,9 +34,31 @@ class ShapeT { | |||
if (MRI) | |||
deduceImm(MRI); | |||
} | |||
// When ShapeT has mult shapes, we only use Shapes (never use Row and Col) | |||
// and ImmShapes. Due to the most case is only one shape (just simply use | |||
// Shape.Row or Shape.Col), so here we don't merge Row and Col into vertor |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
vertor -> vector?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
MBB.erase(MBBI); | ||
return true; | ||
} | ||
// Smilar with TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill, no |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Smilar -> Similar
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
if (Reg.isVirtual() && | ||
MRI->getRegClass(Reg)->getID() == X86::TILERegClassID) | ||
return true; | ||
if (Reg.isVirtual()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can this piece of code be replaced with calling getTileDefNum?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The function is in a different file. I don't find a proper header to store it and we should not call it across files.
/// \headerfile <immintrin.h> | ||
/// | ||
/// \code | ||
/// void __tile_transposed(__tile dst, __tile src); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove extra underline: __tile_transposed -> _tile_transposed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
/// \param dst | ||
/// The destination tile. Max size is 1024 Bytes. | ||
/// \param src | ||
/// The 1st source tile. Max size is 1024 Bytes. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed "1st" from description since there is only 1 source tile.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
/// \param dst | ||
/// The destination tile. Max size is 1024 Bytes. | ||
/// \param src | ||
/// The 1st source tile. Max size is 1024 Bytes. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove "1st" as above.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
clang/lib/CodeGen/CGBuiltin.cpp
Outdated
Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]); | ||
|
||
// Note: Here we escape directly use x86_tilestored64_internal to store | ||
// the results due to it can't make sure the Mem writen scope. This may |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
writen -> written
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
ShapeT() | ||
: Row(nullptr), Col(nullptr), RowImm(InvalidImmShape), | ||
ColImm(InvalidImmShape) {} | ||
// TODO: We need to extern cmp operator for muti-shapes if |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
muti->multi
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
@@ -34,9 +34,31 @@ class ShapeT { | |||
if (MRI) | |||
deduceImm(MRI); | |||
} | |||
// When ShapeT has mult shapes, we only use Shapes (never use Row and Col) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
mult -> multiple
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
@@ -623,6 +623,37 @@ struct X86Operand final : public MCParsedAsmOperand { | |||
Inst.addOperand(MCOperand::createReg(Reg)); | |||
} | |||
|
|||
bool isTILEPair() const { | |||
return Kind == Register && | |||
X86MCRegisterClasses[X86::TILERegClassID].contains(getReg()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should X86::TILERegClassID be X86::TILEPAIRRegClassID?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
X86::TILERegClassID
is correct. We use single tile register to represent tile pair in the assembly.
cast<Instruction>(RealCol)->moveAfter(cast<Instruction>(V)); | ||
} else { | ||
// When it is not a const value and it is a function argument, we create | ||
// Row at the entry bb. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Row -> Column
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Row is correct.
|
||
assert(Tile->getType()->isX86_AMXTy() && "Not Tile Operand!"); | ||
|
||
// TODO: Specially handle the mult-use case. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
mult->multi. The same below.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
da7baea
to
73fce52
Compare
73fce52
to
5e76227
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Looks like this causes a significant compile-time regression, but only for ReleaseLTO-g: https://llvm-compile-time-tracker.com/compare.php?from=1e19f0f9d92b5e9c43d53893e387341835d3d96b&to=c72a751dabff4260dcc309e48008941d51b31d21&stat=instructions:u I wouldn't really expect this change to only affect that configuration -- I guess that means that optimized debuginfo processing has become a lot slower? Any idea what is going on here? |
Thanks for the report, should be solved by #114642 |
Another try to fix compile regression by llvm#113532
Another try to fix compile regression by llvm#113532
3rd try to fix compile regression by llvm#113532
3rd try to fix compile regression by llvm#113532
3rd try to fix compile regression by llvm#113532
Found during review llvm#115151
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368