Skip to content

[X86][APX] Support APX + AMX-MOVRS/AMX-TRANSPOSE #123267

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 17, 2025

Conversation

phoebewang
Copy link
Contributor

@llvmbot llvmbot added backend:X86 mc Machine (object) code labels Jan 17, 2025
@llvmbot
Copy link
Member

llvmbot commented Jan 17, 2025

@llvm/pr-subscribers-mc

@llvm/pr-subscribers-backend-x86

Author: Phoebe Wang (phoebewang)

Changes

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/784266


Patch is 55.77 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123267.diff

13 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ExpandPseudo.cpp (+10-10)
  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+13-11)
  • (modified) llvm/lib/Target/X86/X86InstrAMX.td (+47-5)
  • (modified) llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll (+89)
  • (modified) llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll (+30)
  • (modified) llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll (+146)
  • (modified) llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt (+96)
  • (modified) llvm/test/MC/Disassembler/X86/amx-transpose-att.txt (+48)
  • (modified) llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s (+89-1)
  • (modified) llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s (+96)
  • (modified) llvm/test/MC/X86/amx-transpose-att.s (+48)
  • (modified) llvm/test/MC/X86/amx-transpose-intel.s (+48)
  • (modified) llvm/test/TableGen/x86-instr-mapping.inc (+10)
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index fc8a0eaed140d0..7fbba7f05e0a5e 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -578,10 +578,10 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     unsigned Opc;
     switch (Opcode) {
     case X86::PTILELOADDRSV:
-      Opc = X86::TILELOADDRS;
+      Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
       break;
     case X86::PTILELOADDRST1V:
-      Opc = X86::TILELOADDRST1;
+      Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
       break;
     case X86::PTILELOADDV:
       Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
@@ -737,28 +737,28 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     unsigned Opc;
     switch (Opcode) {
     case X86::PT2RPNTLVWZ0V:
-      Opc = X86::T2RPNTLVWZ0;
+      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
       break;
     case X86::PT2RPNTLVWZ0T1V:
-      Opc = X86::T2RPNTLVWZ0T1;
+      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
       break;
     case X86::PT2RPNTLVWZ1V:
-      Opc = X86::T2RPNTLVWZ1;
+      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
       break;
     case X86::PT2RPNTLVWZ1T1V:
-      Opc = X86::T2RPNTLVWZ1T1;
+      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
       break;
     case X86::PT2RPNTLVWZ0RSV:
-      Opc = X86::T2RPNTLVWZ0RS;
+      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
       break;
     case X86::PT2RPNTLVWZ0RST1V:
-      Opc = X86::T2RPNTLVWZ0RST1;
+      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
       break;
     case X86::PT2RPNTLVWZ1RSV:
-      Opc = X86::T2RPNTLVWZ1RS;
+      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
       break;
     case X86::PT2RPNTLVWZ1RST1V:
-      Opc = X86::T2RPNTLVWZ1RST1;
+      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
       break;
     default:
       llvm_unreachable("Impossible Opcode!");
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 90e3e15b1fb46c..6d69665c17565a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37800,14 +37800,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     case X86::PTILESTORED:
       Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
       break;
-#undef GET_EGPR_IF_ENABLED
     case X86::PTILELOADDRS:
-      Opc = X86::TILELOADDRS;
+      Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
       break;
     case X86::PTILELOADDRST1:
-      Opc = X86::TILELOADDRST1;
+      Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
       break;
     }
+#undef GET_EGPR_IF_ENABLED
 
     MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
     unsigned CurOp = 0;
@@ -37838,34 +37838,36 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case X86::PT2RPNTLVWZ1RST1: {
     const DebugLoc &DL = MI.getDebugLoc();
     unsigned Opc;
+#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
     switch (MI.getOpcode()) {
     default:
       llvm_unreachable("Unexpected instruction!");
     case X86::PT2RPNTLVWZ0:
-      Opc = X86::T2RPNTLVWZ0;
+      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
       break;
     case X86::PT2RPNTLVWZ0T1:
-      Opc = X86::T2RPNTLVWZ0T1;
+      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
       break;
     case X86::PT2RPNTLVWZ1:
-      Opc = X86::T2RPNTLVWZ1;
+      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
       break;
     case X86::PT2RPNTLVWZ1T1:
-      Opc = X86::T2RPNTLVWZ1T1;
+      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
       break;
     case X86::PT2RPNTLVWZ0RS:
-      Opc = X86::T2RPNTLVWZ0RS;
+      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
       break;
     case X86::PT2RPNTLVWZ0RST1:
-      Opc = X86::T2RPNTLVWZ0RST1;
+      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
       break;
     case X86::PT2RPNTLVWZ1RS:
-      Opc = X86::T2RPNTLVWZ1RS;
+      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
       break;
     case X86::PT2RPNTLVWZ1RST1:
-      Opc = X86::T2RPNTLVWZ1RST1;
+      Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
       break;
     }
+#undef GET_EGPR_IF_ENABLED
     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
     MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
 
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index a055ba91d3e171..b5d99f52f15c23 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -349,22 +349,22 @@ let Predicates = [HasAMXTRANSPOSE, In64BitMode] in {
   let SchedRW = [WriteSystem] in {
     def T2RPNTLVWZ0 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst),
                         (ins sibmem:$src), "t2rpntlvwz0\t{$src, $dst|$dst, $src}",
-                        []>, VEX, WIG, T8,PS;
+                        []>, VEX, T8, PS;
 
     def T2RPNTLVWZ0T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst),
                           (ins sibmem:$src), "t2rpntlvwz0t1\t{$src, $dst|$dst, $src}",
-                          []>, VEX, T8,PS;
+                          []>, VEX, T8, PS;
 
     def T2RPNTLVWZ1 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst),
                         (ins sibmem:$src), "t2rpntlvwz1\t{$src, $dst|$dst, $src}",
-                        []>, VEX, T8,PD;
+                        []>, VEX, T8, PD;
 
     def T2RPNTLVWZ1T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst),
                           (ins sibmem:$src), "t2rpntlvwz1t1\t{$src, $dst|$dst, $src}",
-                          []>, VEX, T8,PD;
+                          []>, VEX, T8, PD;
 
     def TTRANSPOSED : I<0x5f, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src),
-                        "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8,XS;
+                        "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8, XS;
     let isPseudo = true in {
       def PT2RPNTLVWZ0V : PseudoI<(outs TILEPair:$dst),
                                   (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
@@ -554,6 +554,48 @@ let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in {
   }
 } // HasAMXMOVRS, In64BitMode
 
+let Predicates = [HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in {
+  def T2RPNTLVWZ0_EVEX   : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst),
+                             (ins sibmem:$src), "t2rpntlvwz0\t{$src, $dst|$dst, $src}",
+                             []>, EVEX, NoCD8, T8, PS;
+
+  def T2RPNTLVWZ0T1_EVEX : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst),
+                             (ins sibmem:$src), "t2rpntlvwz0t1\t{$src, $dst|$dst, $src}",
+                             []>, EVEX, NoCD8, T8, PS;
+
+  def T2RPNTLVWZ1_EVEX   : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst),
+                             (ins sibmem:$src), "t2rpntlvwz1\t{$src, $dst|$dst, $src}",
+                             []>, EVEX, NoCD8, T8, PD;
+
+  def T2RPNTLVWZ1T1_EVEX : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst),
+                             (ins sibmem:$src), "t2rpntlvwz1t1\t{$src, $dst|$dst, $src}",
+                             []>, EVEX, NoCD8, T8, PD;
+} // HasAMXTRANSPOSE, HasEGPR, In64BitMode
+
+let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in {
+  def T2RPNTLVWZ0RS_EVEX   : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst),
+                               (ins sibmem:$src1), "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}",
+                               []>, EVEX, NoCD8, T_MAP5;
+  def T2RPNTLVWZ0RST1_EVEX : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst),
+                               (ins sibmem:$src1), "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}",
+                               []>, EVEX, NoCD8, T_MAP5;
+  def T2RPNTLVWZ1RS_EVEX   : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst),
+                               (ins sibmem:$src1), "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}",
+                               []>, EVEX, NoCD8, T_MAP5, PD;
+  def T2RPNTLVWZ1RST1_EVEX : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst),
+                               (ins sibmem:$src1), "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}",
+                               []>, EVEX, NoCD8, T_MAP5, PD;
+} // HasAMXMOVRS, HasAMXTRANSPOSE, HasEGPR, In64BitMode
+
+let Predicates = [HasAMXMOVRS, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in {
+  def TILELOADDRS_EVEX   : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst),
+                             (ins sibmem:$src1), "tileloaddrs\t{$src1, $dst|$dst, $src1}",
+                             []>, EVEX, NoCD8, T8, XD;
+  def TILELOADDRST1_EVEX : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst),
+                             (ins sibmem:$src1), "tileloaddrst1\t{$src1, $dst|$dst, $src1}",
+                             []>, EVEX, NoCD8, T8, PD;
+} // HasAMXMOVRS, HasEGPR, In64BitMode
+
 multiclass m_tcvtrowd2ps {
   let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
     let SchedRW = [WriteSystem] in {
diff --git a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll
index da212a1850964e..67688326c17500 100755
--- a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll
+++ b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs | FileCheck %s
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=APXF
 
 define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) {
 ; CHECK-LABEL: test_amx_internal:
@@ -35,6 +36,44 @@ define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) {
 ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
 ; CHECK-NEXT:    tilerelease
 ; CHECK-NEXT:    retq
+;
+; APXF-LABEL: test_amx_internal:
+; APXF:       # %bb.0: # %entry
+; APXF-NEXT:    pushq %rbp # encoding: [0x55]
+; APXF-NEXT:    .cfi_def_cfa_offset 16
+; APXF-NEXT:    .cfi_offset %rbp, -16
+; APXF-NEXT:    movq %rsp, %rbp # encoding: [0x48,0x89,0xe5]
+; APXF-NEXT:    .cfi_def_cfa_register %rbp
+; APXF-NEXT:    andq $-1024, %rsp # encoding: [0x48,0x81,0xe4,0x00,0xfc,0xff,0xff]
+; APXF-NEXT:    # imm = 0xFC00
+; APXF-NEXT:    subq $3072, %rsp # encoding: [0x48,0x81,0xec,0x00,0x0c,0x00,0x00]
+; APXF-NEXT:    # imm = 0xC00
+; APXF-NEXT:    xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
+; APXF-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xc0,0x03,0x00,0x00]
+; APXF-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xd0,0x03,0x00,0x00]
+; APXF-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xe0,0x03,0x00,0x00]
+; APXF-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xf0,0x03,0x00,0x00]
+; APXF-NEXT:    movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xc0,0x03,0x00,0x00,0x01]
+; APXF-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; APXF-NEXT:    # encoding: [0x48,0x89,0x8c,0x24,0xb8,0x03,0x00,0x00]
+; APXF-NEXT:    movl %esi, %eax # encoding: [0x89,0xf0]
+; APXF-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; APXF-NEXT:    # encoding: [0x48,0x8b,0xb4,0x24,0xb8,0x03,0x00,0x00]
+; APXF-NEXT:    movw %ax, %cx # encoding: [0x66,0x89,0xc1]
+; APXF-NEXT:    movw %di, %ax # encoding: [0x66,0x89,0xf8]
+; APXF-NEXT:    # implicit-def: $al
+; APXF-NEXT:    movb %al, {{[0-9]+}}(%rsp) # encoding: [0x88,0x84,0x24,0xf0,0x03,0x00,0x00]
+; APXF-NEXT:    movw %cx, {{[0-9]+}}(%rsp) # encoding: [0x66,0x89,0x8c,0x24,0xd0,0x03,0x00,0x00]
+; APXF-NEXT:    ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0xc0,0x03,0x00,0x00]
+; APXF-NEXT:    tileloaddrs (%rdx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4a,0x04,0x32]
+; APXF-NEXT:    movl $64, %esi # encoding: [0xbe,0x40,0x00,0x00,0x00]
+; APXF-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx # encoding: [0x48,0x8d,0x94,0x24,0x00,0x04,0x00,0x00]
+; APXF-NEXT:    tilestored %tmm0, (%rdx,%rsi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x32]
+; APXF-NEXT:    movq %rbp, %rsp # encoding: [0x48,0x89,0xec]
+; APXF-NEXT:    popq %rbp # encoding: [0x5d]
+; APXF-NEXT:    .cfi_def_cfa %rsp, 8
+; APXF-NEXT:    tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
+; APXF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t1 = call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %m, i16 %n, ptr %buf, i64 %s)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
@@ -48,6 +87,12 @@ define void @test_amx_old(i16 %m, i16 %n, ptr %buf) {
 ; CHECK-NEXT:    movl $32, %eax
 ; CHECK-NEXT:    tileloaddrs (%rdx,%rax), %tmm2
 ; CHECK-NEXT:    retq
+;
+; APXF-LABEL: test_amx_old:
+; APXF:       # %bb.0: # %entry
+; APXF-NEXT:    movl $32, %eax # encoding: [0xb8,0x20,0x00,0x00,0x00]
+; APXF-NEXT:    tileloaddrs (%rdx,%rax), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4a,0x14,0x02]
+; APXF-NEXT:    retq # encoding: [0xc3]
 entry:
   call void @llvm.x86.tileloaddrs64(i8 2, ptr %buf, i64 32)
   ret void
@@ -88,6 +133,44 @@ define void @test_amx_t1_internal(i16 %m, i16 %n, ptr %buf, i64 %s) {
 ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
 ; CHECK-NEXT:    tilerelease
 ; CHECK-NEXT:    retq
+;
+; APXF-LABEL: test_amx_t1_internal:
+; APXF:       # %bb.0: # %entry
+; APXF-NEXT:    pushq %rbp # encoding: [0x55]
+; APXF-NEXT:    .cfi_def_cfa_offset 16
+; APXF-NEXT:    .cfi_offset %rbp, -16
+; APXF-NEXT:    movq %rsp, %rbp # encoding: [0x48,0x89,0xe5]
+; APXF-NEXT:    .cfi_def_cfa_register %rbp
+; APXF-NEXT:    andq $-1024, %rsp # encoding: [0x48,0x81,0xe4,0x00,0xfc,0xff,0xff]
+; APXF-NEXT:    # imm = 0xFC00
+; APXF-NEXT:    subq $3072, %rsp # encoding: [0x48,0x81,0xec,0x00,0x0c,0x00,0x00]
+; APXF-NEXT:    # imm = 0xC00
+; APXF-NEXT:    xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
+; APXF-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xc0,0x03,0x00,0x00]
+; APXF-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xd0,0x03,0x00,0x00]
+; APXF-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xe0,0x03,0x00,0x00]
+; APXF-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xf0,0x03,0x00,0x00]
+; APXF-NEXT:    movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xc0,0x03,0x00,0x00,0x01]
+; APXF-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; APXF-NEXT:    # encoding: [0x48,0x89,0x8c,0x24,0xb8,0x03,0x00,0x00]
+; APXF-NEXT:    movl %esi, %eax # encoding: [0x89,0xf0]
+; APXF-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; APXF-NEXT:    # encoding: [0x48,0x8b,0xb4,0x24,0xb8,0x03,0x00,0x00]
+; APXF-NEXT:    movw %ax, %cx # encoding: [0x66,0x89,0xc1]
+; APXF-NEXT:    movw %di, %ax # encoding: [0x66,0x89,0xf8]
+; APXF-NEXT:    # implicit-def: $al
+; APXF-NEXT:    movb %al, {{[0-9]+}}(%rsp) # encoding: [0x88,0x84,0x24,0xf0,0x03,0x00,0x00]
+; APXF-NEXT:    movw %cx, {{[0-9]+}}(%rsp) # encoding: [0x66,0x89,0x8c,0x24,0xd0,0x03,0x00,0x00]
+; APXF-NEXT:    ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0xc0,0x03,0x00,0x00]
+; APXF-NEXT:    tileloaddrst1 (%rdx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x4a,0x04,0x32]
+; APXF-NEXT:    movl $64, %esi # encoding: [0xbe,0x40,0x00,0x00,0x00]
+; APXF-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx # encoding: [0x48,0x8d,0x94,0x24,0x00,0x04,0x00,0x00]
+; APXF-NEXT:    tilestored %tmm0, (%rdx,%rsi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x32]
+; APXF-NEXT:    movq %rbp, %rsp # encoding: [0x48,0x89,0xec]
+; APXF-NEXT:    popq %rbp # encoding: [0x5d]
+; APXF-NEXT:    .cfi_def_cfa %rsp, 8
+; APXF-NEXT:    tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
+; APXF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t1 = call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %m, i16 %n, ptr %buf, i64 %s)
   %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
@@ -101,6 +184,12 @@ define void @test_amx_t1_old(i16 %m, i16 %n, ptr %buf) {
 ; CHECK-NEXT:    movl $32, %eax
 ; CHECK-NEXT:    tileloaddrst1 (%rdx,%rax), %tmm2
 ; CHECK-NEXT:    retq
+;
+; APXF-LABEL: test_amx_t1_old:
+; APXF:       # %bb.0: # %entry
+; APXF-NEXT:    movl $32, %eax # encoding: [0xb8,0x20,0x00,0x00,0x00]
+; APXF-NEXT:    tileloaddrst1 (%rdx,%rax), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x4a,0x14,0x02]
+; APXF-NEXT:    retq # encoding: [0xc3]
 entry:
   call void @llvm.x86.tileloaddrst164(i8 2, ptr %buf, i64 32)
   ret void
diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
index 146b69773eb186..0d5b85f2bb1088 100755
--- a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
+++ b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0
 ; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2
+; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=APXF
 
 define void @test_amx(i64 %stride, i8* %addr1) #0 {
 ; CHECK-LABEL: test_amx:
@@ -10,6 +11,14 @@ define void @test_amx(i64 %stride, i8* %addr1) #0 {
 ; CHECK-NEXT:    t2rpntlvwz1rs (%rsi,%rdi), %tmm0
 ; CHECK-NEXT:    t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2
 ; CHECK-NEXT:    retq
+;
+; APXF-LABEL: test_amx:
+; APXF:       # %bb.0:
+; APXF-NEXT:    t2rpntlvwz0rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x04,0x3e]
+; APXF-NEXT:    t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x3e]
+; APXF-NEXT:    t2rpntlvwz1rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x04,0x3e]
+; APXF-NEXT:    t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x3e]
+; APXF-NEXT:    retq # encoding: [0xc3]
   call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride)
   call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride)
   call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride)
@@ -80,6 +89,27 @@ define void @test_amx2(i8* %base, i64 %stride) #0 {
 ; O2-NEXT:    t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4
 ; O2-NEXT:    tilerelease
 ; O2-NEXT:    retq
+;
+; APXF-LABEL: test_amx2:
+; APXF:       # %bb.0:
+; APXF-NEXT:    xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
+; APXF-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xc0]
+; APXF-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xd0]
+; APXF-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xe0]
+; APXF-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xf0]
+; APXF-NEXT:    movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01]
+; APXF-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08]
+; APXF-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00]
+; APXF-NEXT:    movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x08]
+; APXF-NEXT:    movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x08,0x00]
+; APXF-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0]
+; APXF-NEXT:    movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00]
+; APXF-NEXT:    t2rpntlvwz0rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x24,0x37]
+; APXF-NEXT:    t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x24,0x37]
+; APXF-NEXT:    t2rpntlvwz1rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x24,0x37]
+; APXF-NEXT:    t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x24,0x37]
+; APXF-NEXT:    tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
+; APXF-NEXT:    retq # encoding: [0xc3]
   call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %...
[truncated]

@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs | FileCheck %s
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=APXF
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

prefix=EGPR?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

@@ -554,6 +554,48 @@ let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in {
}
} // HasAMXMOVRS, In64BitMode

let Predicates = [HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in {
def T2RPNTLVWZ0_EVEX : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Share code?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Copy link
Contributor

@KanRobert KanRobert left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@phoebewang phoebewang merged commit fbb9d49 into llvm:main Jan 17, 2025
8 checks passed
@phoebewang phoebewang deleted the AMX-TRANSPOSE branch January 17, 2025 09:51
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:X86 mc Machine (object) code
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants