[X86][AMX] Support AMX-TRANSPOSE #113532

phoebewang · 2024-10-24T06:55:31Z

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

llvmbot · 2024-10-24T06:56:09Z

@llvm/pr-subscribers-clang-driver
@llvm/pr-subscribers-llvm-support
@llvm/pr-subscribers-clang

@llvm/pr-subscribers-clang-codegen

Author: Phoebe Wang (phoebewang)

Changes

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

Patch is 184.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113532.diff

57 Files Affected:

(modified) clang/docs/ReleaseNotes.rst (+2)
(modified) clang/include/clang/Basic/BuiltinsX86_64.def (+11)
(modified) clang/include/clang/Driver/Options.td (+2)
(modified) clang/lib/Basic/Targets/X86.cpp (+6)
(modified) clang/lib/Basic/Targets/X86.h (+1)
(modified) clang/lib/CodeGen/CGBuiltin.cpp (+52)
(modified) clang/lib/Headers/CMakeLists.txt (+1)
(modified) clang/lib/Headers/amxintrin.h (+2)
(added) clang/lib/Headers/amxtransposeintrin.h (+248)
(modified) clang/lib/Headers/immintrin.h (+4)
(modified) clang/lib/Sema/SemaX86.cpp (+6)
(added) clang/test/CodeGen/X86/amx_transpose.c (+36)
(added) clang/test/CodeGen/X86/amx_transpose_api.c (+66)
(added) clang/test/CodeGen/X86/amx_transpose_errors.c (+31)
(modified) clang/test/Driver/x86-target-features.c (+7)
(modified) clang/test/Preprocessor/x86_target_features.c (+12)
(modified) llvm/include/llvm/CodeGen/TileShapeInfo.h (+80-7)
(modified) llvm/include/llvm/IR/IntrinsicsX86.td (+37)
(modified) llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h (+1)
(modified) llvm/include/llvm/TargetParser/X86TargetParser.def (+1)
(modified) llvm/lib/Target/X86/AsmParser/X86Operand.h (+31)
(modified) llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp (+5)
(modified) llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h (+7)
(modified) llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp (+19)
(modified) llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h (+1)
(modified) llvm/lib/Target/X86/X86.td (+3)
(modified) llvm/lib/Target/X86/X86ExpandPseudo.cpp (+125)
(modified) llvm/lib/Target/X86/X86FastPreTileConfig.cpp (+38-15)
(modified) llvm/lib/Target/X86/X86FastTileConfig.cpp (+28-12)
(modified) llvm/lib/Target/X86/X86ISelDAGToDAG.cpp (+70)
(modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+94)
(modified) llvm/lib/Target/X86/X86InstrAMX.td (+63)
(modified) llvm/lib/Target/X86/X86InstrInfo.cpp (+10-2)
(modified) llvm/lib/Target/X86/X86InstrOperands.td (+7)
(modified) llvm/lib/Target/X86/X86InstrPredicates.td (+1)
(modified) llvm/lib/Target/X86/X86LowerAMXType.cpp (+194-52)
(modified) llvm/lib/Target/X86/X86PreTileConfig.cpp (+33-12)
(modified) llvm/lib/Target/X86/X86RegisterInfo.cpp (+56-4)
(modified) llvm/lib/Target/X86/X86RegisterInfo.td (+9)
(modified) llvm/lib/Target/X86/X86TileConfig.cpp (+71-11)
(modified) llvm/lib/TargetParser/Host.cpp (+4)
(modified) llvm/lib/TargetParser/X86TargetParser.cpp (+1)
(added) llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll (+136)
(added) llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir (+165)
(added) llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir (+153)
(added) llvm/test/CodeGen/X86/amx_tile_pair_copy.mir (+97)
(added) llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll (+86)
(added) llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll (+60)
(added) llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir (+134)
(added) llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir (+113)
(added) llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll (+150)
(modified) llvm/test/CodeGen/X86/ipra-reg-usage.ll (+2-2)
(added) llvm/test/MC/Disassembler/X86/amx-transpose-att.s (+57)
(added) llvm/test/MC/Disassembler/X86/amx-transpose-att.txt (+58)
(added) llvm/test/MC/Disassembler/X86/amx-transpose-intel.s (+57)
(modified) llvm/unittests/CodeGen/InstrRefLDVTest.cpp (+3-3)
(modified) llvm/utils/TableGen/X86RecognizableInstr.cpp (+4)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index ce046a305c89b6..dc58f98af55cc9 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -623,6 +623,8 @@ X86 Support
 
 - All intrinsics in tbmintrin.h can now be used in constant expressions.
 
+- Support ISA of ``AMX-TRANSPOSE``.
+
 Arm and AArch64 Support
 ^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 2c591edb2835cd..4e95a8a73d550a 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -128,6 +128,11 @@ TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i",
 TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16")
 TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
 TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
 // AMX
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
@@ -148,6 +153,12 @@ TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite")
 TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex")
 TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex")
 
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0, "vIUcvC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n","amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose")
+
 TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
 TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
 TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiv*SLLiSLLiIi", "n", "cmpccxadd")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 2ddb2f5312148e..c55f2b86f4cb1f 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6287,6 +6287,8 @@ def mamx_int8 : Flag<["-"], "mamx-int8">, Group<m_x86_Features_Group>;
 def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group<m_x86_Features_Group>;
 def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
 def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
+def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
+def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>;
 def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
 def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group<m_x86_Features_Group>;
 def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 5448bd841959f4..fe5b600e6777fb 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -418,6 +418,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasAMXTILE = true;
     } else if (Feature == "+amx-complex") {
       HasAMXCOMPLEX = true;
+    } else if (Feature == "+amx-transpose") {
+      HasAMXTRANSPOSE = true;
     } else if (Feature == "+cmpccxadd") {
       HasCMPCCXADD = true;
     } else if (Feature == "+raoint") {
@@ -935,6 +937,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__AMX_FP16__");
   if (HasAMXCOMPLEX)
     Builder.defineMacro("__AMX_COMPLEX__");
+  if (HasAMXTRANSPOSE)
+    Builder.defineMacro("__AMX_TRANSPOSE__");
   if (HasCMPCCXADD)
     Builder.defineMacro("__CMPCCXADD__");
   if (HasRAOINT)
@@ -1065,6 +1069,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
       .Case("amx-fp16", true)
       .Case("amx-int8", true)
       .Case("amx-tile", true)
+      .Case("amx-transpose", true)
       .Case("avx", true)
       .Case("avx10.1-256", true)
       .Case("avx10.1-512", true)
@@ -1182,6 +1187,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
       .Case("amx-fp16", HasAMXFP16)
       .Case("amx-int8", HasAMXINT8)
       .Case("amx-tile", HasAMXTILE)
+      .Case("amx-transpose", HasAMXTRANSPOSE)
       .Case("avx", SSELevel >= AVX)
       .Case("avx10.1-256", HasAVX10_1)
       .Case("avx10.1-512", HasAVX10_1_512)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index a99ae62984c7d5..3e1fb41082950c 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -156,6 +156,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
   bool HasAMXINT8 = false;
   bool HasAMXBF16 = false;
   bool HasAMXCOMPLEX = false;
+  bool HasAMXTRANSPOSE = false;
   bool HasSERIALIZE = false;
   bool HasTSXLDTRK = false;
   bool HasUSERMSR = false;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 3f28b7f26c36fe..67d28ccec0f373 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16920,6 +16920,58 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     // instruction, but it will create a memset that won't be optimized away.
     return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
   }
+  // Corresponding to intrinsics which will return 2 tiles (tile0_tile1).
+  case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
+  case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
+  case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
+  case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: {
+    Intrinsic::ID IID;
+    switch (BuiltinID) {
+    default:
+      llvm_unreachable("Unsupported intrinsic!");
+    case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
+      IID = Intrinsic::x86_t2rpntlvwz0_internal;
+      break;
+    case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
+      IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
+      break;
+    case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
+      IID = Intrinsic::x86_t2rpntlvwz1_internal;
+      break;
+    case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
+      IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
+      break;
+    }
+
+    // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
+    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
+                                     {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]});
+
+    auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>();
+    assert(PtrTy && "arg3 must be of pointer type");
+    QualType PtreeTy = PtrTy->getPointeeType();
+    llvm::Type *TyPtee = ConvertType(PtreeTy);
+
+    // Bitcast amx type (x86_amx) to vector type (256 x i32)
+    // Then store tile0 into DstPtr0
+    Value *T0 = Builder.CreateExtractValue(Call, 0);
+    Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
+                                           {TyPtee}, {T0});
+    Builder.CreateDefaultAlignedStore(VecT0, Ops[3]);
+
+    // Then store tile1 into DstPtr1
+    Value *T1 = Builder.CreateExtractValue(Call, 1);
+    Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
+                                           {TyPtee}, {T1});
+    Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]);
+
+    // Note: Here we avoid using x86_tilestored64_internal directly to store
+    // the results because it can't guarantee the scope of the memory written.
+    // That may cause shape reloads after the first amx intrinsic, which the
+    // current amx register allocation is unable to handle.
+
+    return Store;
+  }
   case X86::BI__ud2:
     // llvm.trap makes a ud2a instruction on x86.
     return EmitTrapCall(Intrinsic::trap);
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index ff392e7122a448..708525198324bb 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -149,6 +149,7 @@ set(x86_files
   amxcomplexintrin.h
   amxfp16intrin.h
   amxintrin.h
+  amxtransposeintrin.h
   avx10_2_512bf16intrin.h
   avx10_2_512convertintrin.h
   avx10_2_512minmaxintrin.h
diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
index baa56f5b28e8e5..f07a5689011853 100644
--- a/clang/lib/Headers/amxintrin.h
+++ b/clang/lib/Headers/amxintrin.h
@@ -232,6 +232,8 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
 /// bytes. Since there is no 2D type in llvm IR, we use vector type to
 /// represent 2D tile and the fixed size is maximum amx tile register size.
 typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
+typedef int _tile1024i_1024a
+    __attribute__((__vector_size__(1024), __aligned__(1024)));
 
 /// This is internal intrinsic. C/C++ user should avoid calling it directly.
 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h
new file mode 100644
index 00000000000000..d5dc68f4152848
--- /dev/null
+++ b/clang/lib/Headers/amxtransposeintrin.h
@@ -0,0 +1,248 @@
+/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * ===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_TRANSPOSEINTRIN_H
+#define __AMX_TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_TRANSPOSE                                           \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))
+
+#define _tile_2rpntlvwz0(tdst, base, stride)                                   \
+  __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
+#define _tile_2rpntlvwz0t1(tdst, base, stride)                                 \
+  __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
+#define _tile_2rpntlvwz1(tdst, base, stride)                                   \
+  __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
+#define _tile_2rpntlvwz1t1(tdst, base, stride)                                 \
+  __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
+
+/// Transpose 32-bit elements from \a src and write the result to \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_transposed(__tile dst, __tile src);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
+///
+/// \param dst
+/// 	The destination tile. Max size is 1024 Bytes.
+/// \param src
+/// 	The source tile. Max size is 1024 Bytes.
+///
+/// \code{.operation}
+///
+/// FOR i := 0 TO (dst.rows-1)
+/// 	tmp[511:0] := 0
+/// 	FOR j := 0 TO (dst.colsb/4-1)
+/// 		tmp.dword[j] := src.row[j].dword[i]
+/// 	ENDFOR
+/// 	dst.row[i] := tmp
+/// ENDFOR
+///
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src)
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal(
+    unsigned short row, unsigned short col0, unsigned short col1,
+    _tile1024i *dst0, _tile1024i *dst1, const void *base,
+    __SIZE_TYPE__ stride) {
+  // Use _tile1024i_1024a* to escape the alignment check in
+  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
+  __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
+                                      (_tile1024i_1024a *)dst1, base,
+                                      (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal(
+    unsigned short row, unsigned short col0, unsigned short col1,
+    _tile1024i *dst0, _tile1024i *dst1, const void *base,
+    __SIZE_TYPE__ stride) {
+  __builtin_ia32_t2rpntlvwz0t1_internal(
+      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+      (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal(
+    unsigned short row, unsigned short col0, unsigned short col1,
+    _tile1024i *dst0, _tile1024i *dst1, const void *base,
+    __SIZE_TYPE__ stride) {
+  __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
+                                      (_tile1024i_1024a *)dst1, base,
+                                      (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal(
+    unsigned short row, unsigned short col0, unsigned short col1,
+    _tile1024i *dst0, _tile1024i *dst1, const void *base,
+    __SIZE_TYPE__ stride) {
+  __builtin_ia32_t2rpntlvwz1t1_internal(
+      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+      (__SIZE_TYPE__)(stride));
+}
+
+// This is internal intrinsic. C/C++ user should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
+_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
+  return __builtin_ia32_ttransposed_internal(m, n, src);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+/// Provides a hint to the implementation that the data will likely not be
+/// reused in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction.
+///
+/// \param dst0
+///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1,
+                              const void *base, __SIZE_TYPE__ stride) {
+  _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+                            &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction.
+///
+/// \param dst0
+///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1,
+                                const void *base, __SIZE_TYPE__ stride) {
+  _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+                              &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely not be
+/// reused in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
+///
+/// \param dst0
+///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1,
+                              const void *base, __SIZE_TYPE__ stride) {
+  _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+                            &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely not be
+/// reused in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction.
+///
+/// \param dst0
+///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
+                                const void *base, __SIZE_TYPE__ stride) {
+  _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+                              &dst1->tile, base, stride);
+}
+
+/// Transpose 32-bit elements from src and write the result to dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic cor...
[truncated]

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

fzou1 · 2024-10-28T11:26:54Z

llvm/include/llvm/CodeGen/TileShapeInfo.h

@@ -34,9 +34,31 @@ class ShapeT {
    if (MRI)
      deduceImm(MRI);
  }
+  // When ShapeT has mult shapes, we only use Shapes (never use Row and Col)
+  // and ImmShapes. Due to the most case is only one shape (just simply use
+  // Shape.Row or Shape.Col), so here we don't merge Row and Col into vertor


vertor -> vector?

Done, thanks!

fzou1 · 2024-10-28T11:30:14Z

llvm/lib/Target/X86/X86ExpandPseudo.cpp

+    MBB.erase(MBBI);
+    return true;
+  }
+  // Smilar with TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill, no


Smilar -> Similar

Done, thanks!

fzou1 · 2024-10-28T11:40:58Z

llvm/lib/Target/X86/X86FastTileConfig.cpp

-    if (Reg.isVirtual() &&
-        MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
-      return true;
+    if (Reg.isVirtual()) {


Can this piece of code be replaced with calling getTileDefNum?

The function is in a different file. I don't find a proper header to store it and we should not call it across files.

fzou1 · 2024-10-31T12:00:19Z

clang/lib/Headers/amxtransposeintrin.h

+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void __tile_transposed(__tile dst, __tile src);


Remove extra underline: __tile_transposed -> _tile_transposed.

fzou1 · 2024-10-31T12:00:55Z

clang/lib/Headers/amxtransposeintrin.h

+/// \param dst
+/// 	The destination tile. Max size is 1024 Bytes.
+/// \param src
+/// 	The 1st source tile. Max size is 1024 Bytes.


Removed "1st" from description since there is only 1 source tile.

fzou1 · 2024-10-31T12:08:13Z

clang/lib/Headers/amxtransposeintrin.h

+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src
+///    The 1st source tile. Max size is 1024 Bytes.


Remove "1st" as above.

fzou1 · 2024-10-31T12:10:31Z

clang/lib/CodeGen/CGBuiltin.cpp

+    Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]);
+
+    // Note: Here we escape directly use x86_tilestored64_internal to store
+    // the results due to it can't make sure the Mem writen scope. This may


writen -> written

fzou1 · 2024-10-31T12:19:17Z

llvm/include/llvm/CodeGen/TileShapeInfo.h

  ShapeT()
      : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
        ColImm(InvalidImmShape) {}
+  // TODO: We need to extern cmp operator for muti-shapes if


muti->multi

fzou1 · 2024-10-31T12:19:32Z

llvm/include/llvm/CodeGen/TileShapeInfo.h

@@ -34,9 +34,31 @@ class ShapeT {
    if (MRI)
      deduceImm(MRI);
  }
+  // When ShapeT has mult shapes, we only use Shapes (never use Row and Col)


mult -> multiple

fzou1 · 2024-10-31T13:54:39Z

llvm/lib/Target/X86/AsmParser/X86Operand.h

@@ -623,6 +623,37 @@ struct X86Operand final : public MCParsedAsmOperand {
    Inst.addOperand(MCOperand::createReg(Reg));
  }

+  bool isTILEPair() const {
+    return Kind == Register &&
+           X86MCRegisterClasses[X86::TILERegClassID].contains(getReg());


Should X86::TILERegClassID be X86::TILEPAIRRegClassID?

X86::TILERegClassID is correct. We use single tile register to represent tile pair in the assembly.

fzou1 · 2024-10-31T14:11:25Z

llvm/lib/Target/X86/X86LowerAMXType.cpp

+    cast<Instruction>(RealCol)->moveAfter(cast<Instruction>(V));
+  } else {
+    // When it is not a const value and it is a function argument, we create
+    // Row at the entry bb.


Row -> Column

Row is correct.

fzou1 · 2024-10-31T15:27:54Z

llvm/lib/Target/X86/X86LowerAMXType.cpp

+
+  assert(Tile->getType()->isX86_AMXTy() && "Not Tile Operand!");
+
+  // TODO: Specially handle the mult-use case.


mult->multi. The same below.

fzou1

LGTM

nikic · 2024-11-01T20:55:34Z

Looks like this causes a significant compile-time regression, but only for ReleaseLTO-g: https://llvm-compile-time-tracker.com/compare.php?from=1e19f0f9d92b5e9c43d53893e387341835d3d96b&to=c72a751dabff4260dcc309e48008941d51b31d21&stat=instructions:u

I wouldn't really expect this change to only affect that configuration -- I guess that means that optimized debuginfo processing has become a lot slower? Any idea what is going on here?

phoebewang · 2024-11-02T05:27:45Z

> Looks like this causes a significant compile-time regression, but only for ReleaseLTO-g: https://llvm-compile-time-tracker.com/compare.php?from=1e19f0f9d92b5e9c43d53893e387341835d3d96b&to=c72a751dabff4260dcc309e48008941d51b31d21&stat=instructions:u
>
> I wouldn't really expect this change to only affect that configuration -- I guess that means that optimized debuginfo processing has become a lot slower? Any idea what is going on here?

Thanks for the report, should be solved by #114642

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

Another try to fix compile regression by llvm#113532

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

3rd try to fix compile regression by llvm#113532

Found during review #115151

Found during review llvm#115151

phoebewang requested review from RKSimon, xiangzh1 and FreddyLeaf October 24, 2024 06:55

llvm deleted a comment from llvmbot Oct 24, 2024

phoebewang force-pushed the AMX-TRANSPOSE branch from 3f9b733 to 6abc376 Compare October 24, 2024 09:01

[X86][AMX] Support AMX-TRANSPOSE

1b42b13

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

phoebewang force-pushed the AMX-TRANSPOSE branch from 6abc376 to 1b42b13 Compare October 25, 2024 10:45

fzou1 reviewed Oct 28, 2024

View reviewed changes

Address review comment

f822950

fzou1 reviewed Oct 31, 2024

View reviewed changes

Merge remote-tracking branch 'origin/main' into AMX-TRANSPOSE

86f82f6

phoebewang force-pushed the AMX-TRANSPOSE branch 2 times, most recently from da7baea to 73fce52 Compare November 1, 2024 07:15

Address review comments

5e76227

phoebewang force-pushed the AMX-TRANSPOSE branch from 73fce52 to 5e76227 Compare November 1, 2024 07:44

fzou1 approved these changes Nov 1, 2024

View reviewed changes

phoebewang merged commit c72a751 into llvm:main Nov 1, 2024
9 checks passed

phoebewang deleted the AMX-TRANSPOSE branch November 1, 2024 08:45

smallp-o-p pushed a commit to smallp-o-p/llvm-project that referenced this pull request Nov 3, 2024

[X86][AMX] Support AMX-TRANSPOSE (llvm#113532)

66dcc11

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

phoebewang added a commit to phoebewang/llvm-project that referenced this pull request Nov 3, 2024

[X86][AMX] Reserve all pair registers when AMXTRANSPOSE is not ready

fc714a9

Another try to fix compile regression by llvm#113532

phoebewang mentioned this pull request Nov 3, 2024

[X86][AMX] Reserve all pair registers when AMXTRANSPOSE is not ready #114706

Closed

nikic pushed a commit to nikic/llvm-project that referenced this pull request Nov 3, 2024

Reserve all pair registers when AMXTRANSPOSE is not ready

c426f98

Another try to fix compile regression by llvm#113532

NoumanAmir657 pushed a commit to NoumanAmir657/llvm-project that referenced this pull request Nov 4, 2024

[X86][AMX] Support AMX-TRANSPOSE (llvm#113532)

f5120a2

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

phoebewang added a commit to phoebewang/llvm-project that referenced this pull request Nov 5, 2024

[LiveDebugValues] Use getNumSupportedRegs to reduce compile time, NFCI

976aac2

3rd try to fix compile regression by llvm#113532

phoebewang mentioned this pull request Nov 5, 2024

[LiveDebugValues] Use getNumSupportedRegs to reduce compile time, NFCI #114944

Draft

phoebewang added a commit to phoebewang/llvm-project that referenced this pull request Nov 5, 2024

[LiveDebugValues] Use getNumSupportedRegs to reduce compile time, NFCI

4388802

3rd try to fix compile regression by llvm#113532

nikic pushed a commit to nikic/llvm-project that referenced this pull request Nov 5, 2024

Use getNumSupportedRegs to reduce compile time, NFCI

a3ffd3d

3rd try to fix compile regression by llvm#113532

phoebewang added a commit to phoebewang/llvm-project that referenced this pull request Nov 10, 2024

Fix mistake in llvm#113532

7d8faae

phoebewang added a commit that referenced this pull request Nov 10, 2024

Fix mistakes in #113532 (#115631)

08af115

Found during review #115151

Groverkss pushed a commit to iree-org/llvm-project that referenced this pull request Nov 15, 2024

Fix mistakes in llvm#113532 (llvm#115631)

9bdbd16

Found during review llvm#115151


		assert(Tile->getType()->isX86_AMXTy() && "Not Tile Operand!");

		// TODO: Specially handle the mult-use case.

[X86][AMX] Support AMX-TRANSPOSE #113532

[X86][AMX] Support AMX-TRANSPOSE #113532

Uh oh!

Conversation

phoebewang commented Oct 24, 2024

Uh oh!

llvmbot commented Oct 24, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

phoebewang Nov 1, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

fzou1 left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

nikic commented Nov 1, 2024

Uh oh!

phoebewang commented Nov 2, 2024

Uh oh!

Uh oh!

llvmbot commented Oct 24, 2024 •

edited

Loading

phoebewang Nov 1, 2024 •

edited

Loading