Skip to content

Commit 8aa33f1

Browse files
authored
[X86][AMX] Check also AMX register live out for copy lowering (#93692)
Another bug fix for #83628.
1 parent 875e911 commit 8aa33f1

File tree

2 files changed

+44
-12
lines changed

2 files changed

+44
-12
lines changed

llvm/lib/Target/X86/X86LowerTileCopy.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,18 +81,26 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
8181
bool Changed = false;
8282

8383
for (MachineBasicBlock &MBB : MF) {
84-
// There won't be a tile copy if no tile register live in.
84+
// There won't be a tile copy if no tile register is live in or live out.
8585
bool HasTileCopy = false;
8686
for (const auto &LI : MBB.liveins()) {
8787
if (TILERegs.test(LI.PhysReg)) {
8888
HasTileCopy = true;
8989
break;
9090
}
9191
}
92-
if (!HasTileCopy)
93-
continue;
9492
LiveRegUnits UsedRegs(*TRI);
9593
UsedRegs.addLiveOuts(MBB);
94+
if (!HasTileCopy) {
95+
for (auto RegT : TILERegs.set_bits()) {
96+
if (UsedRegs.available(RegT)) {
97+
HasTileCopy = true;
98+
break;
99+
}
100+
}
101+
}
102+
if (!HasTileCopy)
103+
continue;
96104
for (MachineInstr &MI : llvm::make_early_inc_range(reverse(MBB))) {
97105
UsedRegs.stepBackward(MI);
98106
if (!MI.isCopy())

llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -52,26 +52,18 @@ declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_
5252
declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
5353
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
5454

55-
define void @PR90954(ptr %0, ptr %1, i32 %2) {
55+
define void @PR90954(ptr %0, ptr %1, i32 %2) nounwind {
5656
; CHECK-LABEL: PR90954:
5757
; CHECK: # %bb.0:
5858
; CHECK-NEXT: pushq %rbp
59-
; CHECK-NEXT: .cfi_def_cfa_offset 16
60-
; CHECK-NEXT: .cfi_offset %rbp, -16
6159
; CHECK-NEXT: movq %rsp, %rbp
62-
; CHECK-NEXT: .cfi_def_cfa_register %rbp
6360
; CHECK-NEXT: pushq %r15
6461
; CHECK-NEXT: pushq %r14
6562
; CHECK-NEXT: pushq %r13
6663
; CHECK-NEXT: pushq %r12
6764
; CHECK-NEXT: pushq %rbx
6865
; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
6966
; CHECK-NEXT: subq $5120, %rsp # imm = 0x1400
70-
; CHECK-NEXT: .cfi_offset %rbx, -56
71-
; CHECK-NEXT: .cfi_offset %r12, -48
72-
; CHECK-NEXT: .cfi_offset %r13, -40
73-
; CHECK-NEXT: .cfi_offset %r14, -32
74-
; CHECK-NEXT: .cfi_offset %r15, -24
7567
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
7668
; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
7769
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
@@ -202,5 +194,37 @@ define void @PR90954(ptr %0, ptr %1, i32 %2) {
202194
br label %6
203195
}
204196

197+
define void @multi_use() nounwind {
198+
; CHECK-LABEL: multi_use:
199+
; CHECK: # %bb.0:
200+
; CHECK-NEXT: pushq %rbp
201+
; CHECK-NEXT: subq $2928, %rsp # imm = 0xB70
202+
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
203+
; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
204+
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
205+
; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp)
206+
; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp)
207+
; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp)
208+
; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp)
209+
; CHECK-NEXT: movw $64, %ax
210+
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
211+
; CHECK-NEXT: movw $16, %cx
212+
; CHECK-NEXT: tilezero %tmm0
213+
; CHECK-NEXT: movabsq $64, %rbp
214+
; CHECK-NEXT: tilestored %tmm0, 896(%rsp,%rbp) # 1024-byte Folded Spill
215+
; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm1 # 1024-byte Folded Reload
216+
; CHECK-NEXT: tdpbf16ps %tmm0, %tmm0, %tmm1
217+
; CHECK-NEXT: tdpbf16ps %tmm0, %tmm0, %tmm0
218+
; CHECK-NEXT: addq $2928, %rsp # imm = 0xB70
219+
; CHECK-NEXT: popq %rbp
220+
; CHECK-NEXT: tilerelease
221+
; CHECK-NEXT: vzeroupper
222+
; CHECK-NEXT: retq
223+
%1 = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
224+
%2 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %1, x86_amx %1, x86_amx %1)
225+
%3 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %1, x86_amx %1, x86_amx %1)
226+
ret void
227+
}
228+
205229
declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>)
206230
declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx)

0 commit comments

Comments
 (0)