Skip to content

[X86][APX] Support lowering for APX promoted AMX-TILE instructions #78689

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits on Jan 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions llvm/lib/Target/X86/X86ExpandPseudo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -556,16 +556,18 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case TargetOpcode::ICALL_BRANCH_FUNNEL:
ExpandICallBranchFunnel(&MBB, MBBI);
return true;
#define GET_EGPR_IF_ENABLED(OPC) (STI->hasEGPR() ? OPC##_EVEX : OPC)
case X86::PLDTILECFGV: {
MI.setDesc(TII->get(X86::LDTILECFG));
MI.setDesc(TII->get(GET_EGPR_IF_ENABLED(X86::LDTILECFG)));
return true;
}
case X86::PTILELOADDV:
case X86::PTILELOADDT1V: {
for (unsigned i = 2; i > 0; --i)
MI.removeOperand(i);
unsigned Opc =
Opcode == X86::PTILELOADDV ? X86::TILELOADD : X86::TILELOADDT1;
unsigned Opc = Opcode == X86::PTILELOADDV
? GET_EGPR_IF_ENABLED(X86::TILELOADD)
: GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
MI.setDesc(TII->get(Opc));
return true;
}
Expand Down Expand Up @@ -599,9 +601,10 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case X86::PTILESTOREDV: {
for (int i = 1; i >= 0; --i)
MI.removeOperand(i);
MI.setDesc(TII->get(X86::TILESTORED));
MI.setDesc(TII->get(GET_EGPR_IF_ENABLED(X86::TILESTORED)));
return true;
}
#undef GET_EGPR_IF_ENABLED
case X86::PTILEZEROV: {
for (int i = 2; i > 0; --i) // Remove row, col
MI.removeOperand(i);
Expand Down
18 changes: 13 additions & 5 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36374,14 +36374,22 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
unsigned Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::PTILELOADD: Opc = X86::TILELOADD; break;
case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
case X86::PTILESTORED: Opc = X86::TILESTORED; break;
#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
case X86::PTILELOADD:
Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
break;
case X86::PTILELOADDT1:
Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
break;
case X86::PTILESTORED:
Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
break;
#undef GET_EGPR_IF_ENABLED
}

MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
unsigned CurOp = 0;
if (Opc != X86::TILESTORED)
if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
RegState::Define);

Expand All @@ -36391,7 +36399,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MIB.add(MI.getOperand(CurOp++)); // displacement
MIB.add(MI.getOperand(CurOp++)); // segment

if (Opc == X86::TILESTORED)
if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
RegState::Undef);

Expand Down
13 changes: 10 additions & 3 deletions llvm/lib/Target/X86/X86InstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4382,7 +4382,10 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
case 1024:
assert(X86::TILERegClass.hasSubClassEq(RC) && "Unknown 1024-byte regclass");
assert(STI.hasAMXTILE() && "Using 8*1024-bit register requires AMX-TILE");
return Load ? X86::TILELOADD : X86::TILESTORED;
#define GET_EGPR_IF_ENABLED(OPC) (STI.hasEGPR() ? OPC##_EVEX : OPC)
return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD)
: GET_EGPR_IF_ENABLED(X86::TILESTORED);
#undef GET_EGPR_IF_ENABLED
}
}

Expand Down Expand Up @@ -4575,6 +4578,8 @@ static bool isAMXOpcode(unsigned Opc) {
return false;
case X86::TILELOADD:
case X86::TILESTORED:
case X86::TILELOADD_EVEX:
case X86::TILESTORED_EVEX:
return true;
}
}
Expand All @@ -4586,7 +4591,8 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
switch (Opc) {
default:
llvm_unreachable("Unexpected special opcode!");
case X86::TILESTORED: {
case X86::TILESTORED:
case X86::TILESTORED_EVEX: {
// tilestored %tmm, (%sp, %idx)
MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
Expand All @@ -4599,7 +4605,8 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
MO.setIsKill(true);
break;
}
case X86::TILELOADD: {
case X86::TILELOADD:
case X86::TILELOADD_EVEX: {
// tileloadd (%sp, %idx), %tmm
MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
Expand Down
6 changes: 4 additions & 2 deletions llvm/lib/Target/X86/X86LowerTileCopy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,15 +107,17 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
// mov 64 %rax
BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), GR64Cand).addImm(64);
// tilestored %tmm, (%sp, %idx)
unsigned Opc = X86::TILESTORED;
#define GET_EGPR_IF_ENABLED(OPC) (ST.hasEGPR() ? OPC##_EVEX : OPC)
unsigned Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
MachineInstr *NewMI =
addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc)), TileSS)
.addReg(SrcReg, getKillRegState(SrcMO.isKill()));
MachineOperand &MO = NewMI->getOperand(2);
MO.setReg(GR64Cand);
MO.setIsKill(true);
// tileloadd (%sp, %idx), %tmm
Opc = X86::TILELOADD;
Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
#undef GET_EGPR_IF_ENABLED
NewMI = addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc), DstReg),
TileSS);
// restore %rax
Expand Down
128 changes: 128 additions & 0 deletions llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f,+egpr --show-mc-encoding -verify-machineinstrs | FileCheck %s --check-prefix=EGPR

define dso_local void @test1(ptr%buf) nounwind {
; CHECK-LABEL: test1:
Expand Down Expand Up @@ -63,6 +64,79 @@ define dso_local void @test1(ptr%buf) nounwind {
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
;
; EGPR-LABEL: test1:
; EGPR: # %bb.0: # %entry
; EGPR-NEXT: pushq %rbp # encoding: [0x55]
; EGPR-NEXT: pushq %r15 # encoding: [0x41,0x57]
; EGPR-NEXT: pushq %r14 # encoding: [0x41,0x56]
; EGPR-NEXT: pushq %rbx # encoding: [0x53]
; EGPR-NEXT: subq $4056, %rsp # encoding: [0x48,0x81,0xec,0xd8,0x0f,0x00,0x00]
; EGPR-NEXT: # imm = 0xFD8
; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0x0f]
; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xc0,0x03,0x00,0x00,0x01]
; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xf0,0x03,0x00,0x00,0x08]
; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0xd0,0x03,0x00,0x00,0x08,0x00]
; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xf1,0x03,0x00,0x00,0x08]
; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0xd2,0x03,0x00,0x00,0x08,0x00]
; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xf2,0x03,0x00,0x00,0x08]
; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0xd4,0x03,0x00,0x00,0x08,0x00]
; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xf3,0x03,0x00,0x00,0x08]
; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0xd6,0x03,0x00,0x00,0x08,0x00]
; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0xc0,0x03,0x00,0x00]
; EGPR-NEXT: movl $64, %eax # encoding: [0xb8,0x40,0x00,0x00,0x00]
; EGPR-NEXT: movw $8, %bp # encoding: [0x66,0xbd,0x08,0x00]
; EGPR-NEXT: tileloadd (%rdi,%rax), %tmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x1c,0x07]
; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
; EGPR-NEXT: testb %al, %al # encoding: [0x84,0xc0]
; EGPR-NEXT: jne .LBB0_3 # encoding: [0x75,A]
; EGPR-NEXT: # fixup A - offset: 1, value: .LBB0_3-1, kind: FK_PCRel_1
; EGPR-NEXT: # %bb.1: # %loop.header.preheader
; EGPR-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb]
; EGPR-NEXT: xorl %r14d, %r14d # encoding: [0x45,0x31,0xf6]
; EGPR-NEXT: movl $32, %r15d # encoding: [0x41,0xbf,0x20,0x00,0x00,0x00]
; EGPR-NEXT: .p2align 4, 0x90
; EGPR-NEXT: .LBB0_2: # %loop.header
; EGPR-NEXT: # =>This Inner Loop Header: Depth=1
; EGPR-NEXT: movabsq $64, %rax # encoding: [0x48,0xb8,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
; EGPR-NEXT: tilestored %tmm3, 3024(%rsp,%rax) # 1024-byte Folded Spill
; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x9c,0x04,0xd0,0x0b,0x00,0x00]
; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; EGPR-NEXT: callq foo # encoding: [0xe8,A,A,A,A]
; EGPR-NEXT: # fixup A - offset: 1, value: foo-4, kind: reloc_branch_4byte_pcrel
; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0xc0,0x03,0x00,0x00]
; EGPR-NEXT: movabsq $64, %rax # encoding: [0x48,0xb8,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
; EGPR-NEXT: tileloadd 3024(%rsp,%rax), %tmm3 # 1024-byte Folded Reload
; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x04,0xd0,0x0b,0x00,0x00]
; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x04,0x3b]
; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm1 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x0c,0x3b]
; EGPR-NEXT: # implicit-def: $rax
; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; EGPR-NEXT: # encoding: [0x48,0x89,0x84,0x24,0xb8,0x03,0x00,0x00]
; EGPR-NEXT: movabsq $64, %rax # encoding: [0x48,0xb8,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
; EGPR-NEXT: tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x9c,0x04,0x00,0x04,0x00,0x00]
; EGPR-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x94,0x24,0x00,0x04,0x00,0x00]
; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; EGPR-NEXT: # encoding: [0x48,0x8b,0x84,0x24,0xb8,0x03,0x00,0x00]
; EGPR-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x5e,0xd0]
; EGPR-NEXT: tilestored %tmm2, (%rbx,%r15) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x14,0x3b]
; EGPR-NEXT: incl %r14d # encoding: [0x41,0xff,0xc6]
; EGPR-NEXT: cmpw $100, %r14w # encoding: [0x66,0x41,0x83,0xfe,0x64]
; EGPR-NEXT: jl .LBB0_2 # encoding: [0x7c,A]
; EGPR-NEXT: # fixup A - offset: 1, value: .LBB0_2-1, kind: FK_PCRel_1
; EGPR-NEXT: .LBB0_3: # %exit
; EGPR-NEXT: addq $4056, %rsp # encoding: [0x48,0x81,0xc4,0xd8,0x0f,0x00,0x00]
; EGPR-NEXT: # imm = 0xFD8
; EGPR-NEXT: popq %rbx # encoding: [0x5b]
; EGPR-NEXT: popq %r14 # encoding: [0x41,0x5e]
; EGPR-NEXT: popq %r15 # encoding: [0x41,0x5f]
; EGPR-NEXT: popq %rbp # encoding: [0x5d]
; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; EGPR-NEXT: retq # encoding: [0xc3]
entry:
%t1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %buf, i64 64)
br i1 undef, label %loop.header, label %exit
Expand Down Expand Up @@ -139,6 +213,60 @@ define dso_local void @test2(ptr%buf) nounwind {
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
;
; EGPR-LABEL: test2:
; EGPR: # %bb.0: # %entry
; EGPR-NEXT: pushq %rbp # encoding: [0x55]
; EGPR-NEXT: pushq %r15 # encoding: [0x41,0x57]
; EGPR-NEXT: pushq %r14 # encoding: [0x41,0x56]
; EGPR-NEXT: pushq %rbx # encoding: [0x53]
; EGPR-NEXT: subq $72, %rsp # encoding: [0x48,0x83,0xec,0x48]
; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x84,0x24,0x08,0x00,0x00,0x00]
; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x08,0x01]
; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x38,0x08]
; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x18,0x08,0x00]
; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x39,0x08]
; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x1a,0x08,0x00]
; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x3a,0x08]
; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x1c,0x08,0x00]
; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x08]
; EGPR-NEXT: movw $8, %bp # encoding: [0x66,0xbd,0x08,0x00]
; EGPR-NEXT: tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0]
; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
; EGPR-NEXT: testb %al, %al # encoding: [0x84,0xc0]
; EGPR-NEXT: jne .LBB1_3 # encoding: [0x75,A]
; EGPR-NEXT: # fixup A - offset: 1, value: .LBB1_3-1, kind: FK_PCRel_1
; EGPR-NEXT: # %bb.1: # %loop.header.preheader
; EGPR-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb]
; EGPR-NEXT: xorl %r14d, %r14d # encoding: [0x45,0x31,0xf6]
; EGPR-NEXT: movl $32, %r15d # encoding: [0x41,0xbf,0x20,0x00,0x00,0x00]
; EGPR-NEXT: .p2align 4, 0x90
; EGPR-NEXT: .LBB1_2: # %loop.header
; EGPR-NEXT: # =>This Inner Loop Header: Depth=1
; EGPR-NEXT: tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0]
; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; EGPR-NEXT: callq foo # encoding: [0xe8,A,A,A,A]
; EGPR-NEXT: # fixup A - offset: 1, value: foo-4, kind: reloc_branch_4byte_pcrel
; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x08]
; EGPR-NEXT: tilezero %tmm2 # encoding: [0xc4,0xe2,0x7b,0x49,0xd0]
; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x04,0x3b]
; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm1 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x0c,0x3b]
; EGPR-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x5e,0xd0]
; EGPR-NEXT: tilestored %tmm2, (%rbx,%r15) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x14,0x3b]
; EGPR-NEXT: incl %r14d # encoding: [0x41,0xff,0xc6]
; EGPR-NEXT: cmpw $100, %r14w # encoding: [0x66,0x41,0x83,0xfe,0x64]
; EGPR-NEXT: jl .LBB1_2 # encoding: [0x7c,A]
; EGPR-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1
; EGPR-NEXT: .LBB1_3: # %exit
; EGPR-NEXT: addq $72, %rsp # encoding: [0x48,0x83,0xc4,0x48]
; EGPR-NEXT: popq %rbx # encoding: [0x5b]
; EGPR-NEXT: popq %r14 # encoding: [0x41,0x5e]
; EGPR-NEXT: popq %r15 # encoding: [0x41,0x5f]
; EGPR-NEXT: popq %rbp # encoding: [0x5d]
; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; EGPR-NEXT: retq # encoding: [0xc3]
entry:
%t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
br i1 undef, label %loop.header, label %exit
Expand Down
Loading