Skip to content

Commit 5afa0fa

Browse files
[X86] Prevent APX NDD compression when it creates a partial write (#132051)
APX NDD instructions may be compressed when the result is also a source. For 8/16b instructions, this may create partial register write hazards if a previous super-register def is within the partial reg update clearance, or incorrect code if the super-register is not dead. This change prevents compression when the super-register is marked as an implicit define, which the virtual rewriter already adds in the case where a subregister is defined but the super-register is not dead. The BreakFalseDeps interface is also updated to add implicit super-register defs for NDD instructions that would incur partial-write stalls if compressed to legacy ops.
1 parent b858ba0 commit 5afa0fa

File tree

4 files changed

+220
-9
lines changed

4 files changed

+220
-9
lines changed

llvm/lib/Target/X86/X86CompressEVEX.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,23 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
237237
return 0;
238238
return I->NewOpc;
239239
};
240+
241+
// Redundant NDD ops cannot be safely compressed if either:
242+
// - the legacy op would introduce a partial write that BreakFalseDeps
243+
// identified as a potential stall, or
244+
// - the op is writing to a subregister of a live register, i.e. the
245+
// full (zeroed) result is used.
246+
// Both cases are indicated by an implicit def of the superregister.
247+
if (IsRedundantNDD) {
248+
Register Dst = MI.getOperand(0).getReg();
249+
if (Dst &&
250+
(X86::GR16RegClass.contains(Dst) || X86::GR8RegClass.contains(Dst))) {
251+
Register Super = getX86SubSuperRegister(Dst, 64);
252+
if (MI.definesRegister(Super, /*TRI=*/nullptr))
253+
IsRedundantNDD = false;
254+
}
255+
}
256+
240257
// NonNF -> NF only if it's not a compressible NDD instruction and eflags is
241258
// dead.
242259
unsigned NewOpc = IsRedundantNDD

llvm/lib/Target/X86/X86InstrInfo.cpp

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6793,19 +6793,37 @@ static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget,
67936793
unsigned X86InstrInfo::getPartialRegUpdateClearance(
67946794
const MachineInstr &MI, unsigned OpNum,
67956795
const TargetRegisterInfo *TRI) const {
6796-
if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget))
6796+
6797+
if (OpNum != 0)
6798+
return 0;
6799+
6800+
// NDD ops with 8/16b results may appear to be partial register
6801+
// updates after register allocation.
6802+
bool HasNDDPartialWrite = false;
6803+
if (X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
6804+
Register Reg = MI.getOperand(0).getReg();
6805+
if (!Reg.isVirtual())
6806+
HasNDDPartialWrite =
6807+
X86::GR8RegClass.contains(Reg) || X86::GR16RegClass.contains(Reg);
6808+
}
6809+
6810+
if (!(HasNDDPartialWrite || hasPartialRegUpdate(MI.getOpcode(), Subtarget)))
67976811
return 0;
67986812

6799-
// If MI is marked as reading Reg, the partial register update is wanted.
6813+
// Check if the result register is also used as a source.
6814+
// For non-NDD ops, this means a partial update is wanted, hence we return 0.
6815+
// For NDD ops, this means it is possible to compress the instruction
6816+
// to a legacy form in CompressEVEX, which would create an unwanted partial
6817+
// update, so we return the clearance.
68006818
const MachineOperand &MO = MI.getOperand(0);
68016819
Register Reg = MO.getReg();
6802-
if (Reg.isVirtual()) {
6803-
if (MO.readsReg() || MI.readsVirtualRegister(Reg))
6804-
return 0;
6805-
} else {
6806-
if (MI.readsRegister(Reg, TRI))
6807-
return 0;
6808-
}
6820+
bool ReadsReg = false;
6821+
if (Reg.isVirtual())
6822+
ReadsReg = (MO.readsReg() || MI.readsVirtualRegister(Reg));
6823+
else
6824+
ReadsReg = MI.readsRegister(Reg, TRI);
6825+
if (ReadsReg != HasNDDPartialWrite)
6826+
return 0;
68096827

68106828
// If any instructions in the clearance range are reading Reg, insert a
68116829
// dependency breaking instruction, which is inexpensive and is likely to
@@ -7229,6 +7247,17 @@ void X86InstrInfo::breakPartialRegDependency(
72297247
.addReg(Reg, RegState::Undef)
72307248
.addReg(Reg, RegState::Undef);
72317249
MI.addRegisterKilled(Reg, TRI, true);
7250+
} else if ((X86::GR16RegClass.contains(Reg) ||
7251+
X86::GR8RegClass.contains(Reg)) &&
7252+
X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
7253+
// This case is only expected for NDD ops which appear to be partial
7254+
// writes, but are not due to the zeroing of the upper part. Here
7255+
// we add an implicit def of the superegister, which prevents
7256+
// CompressEVEX from converting this to a legacy form.
7257+
Register SuperReg = getX86SubSuperRegister(Reg, 64);
7258+
MachineInstrBuilder BuildMI(*MI.getParent()->getParent(), &MI);
7259+
if (!MI.definesRegister(SuperReg, /*TRI=*/nullptr))
7260+
BuildMI.addReg(SuperReg, RegState::ImplicitDefine);
72327261
}
72337262
}
72347263

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -show-mc-encoding -verify-machineinstrs -o - | FileCheck --check-prefixes=CHECK,RCDEFAULT %s
3+
# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -partial-reg-update-clearance=1 -verify-machineinstrs -show-mc-encoding -o - | FileCheck --check-prefixes=CHECK,RC1 %s
4+
#
5+
# Check that BreakFalseDeps detects cases where an ND instruction would cause a partial register write
6+
# if compressed to a legacy op. MIR has been modified to force different register assignments.
7+
#
8+
# For partial_write, the ADD16rr_ND is compressible, but will become a partial write after compression.
9+
# Compression is inhibited if the eax definition is within the partial-reg-update-clearance threshold.
10+
#
11+
# For no_partial_write, the ADD16rr_ND is incompressible hence it cannot become a partial write.
12+
# This case checks that an implicit-def of eax is not added by breakPartialRegDependency.
13+
#
14+
--- |
15+
define signext i16 @partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
16+
; RCDEFAULT-LABEL: partial_write:
17+
; RCDEFAULT: # %bb.0: # %entry
18+
; RCDEFAULT-NEXT: addl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf2]
19+
; RCDEFAULT-NEXT: movl %eax, (%rdi) # encoding: [0x89,0x07]
20+
; RCDEFAULT-NEXT: addw %cx, %ax, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xc8]
21+
; RCDEFAULT-NEXT: retq # encoding: [0xc3]
22+
;
23+
; RC1-LABEL: partial_write:
24+
; RC1: # %bb.0: # %entry
25+
; RC1-NEXT: addl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf2]
26+
; RC1-NEXT: movl %eax, (%rdi) # encoding: [0x89,0x07]
27+
; RC1-NEXT: addw %cx, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xc8]
28+
; RC1-NEXT: retq # encoding: [0xc3]
29+
entry:
30+
%add = add nsw i32 %b, %a
31+
store i32 %add, ptr %p, align 4
32+
%add1 = trunc i32 %add to i16
33+
%add2 = add i16 %add1, %x
34+
ret i16 %add2
35+
}
36+
37+
define signext i16 @no_partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
38+
; CHECK-LABEL: no_partial_write:
39+
; CHECK: # %bb.0: # %entry
40+
; CHECK-NEXT: addl %esi, %edx # EVEX TO LEGACY Compression encoding: [0x01,0xf2]
41+
; CHECK-NEXT: movl %edx, (%rdi) # encoding: [0x89,0x17]
42+
; CHECK-NEXT: addw %cx, %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xca]
43+
; CHECK-NEXT: retq # encoding: [0xc3]
44+
entry:
45+
%add = add nsw i32 %b, %a
46+
store i32 %add, ptr %p, align 4
47+
%add1 = trunc i32 %add to i16
48+
%add2 = add i16 %add1, %x
49+
ret i16 %add2
50+
}
51+
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx16,+cx8,+egpr,+fxsr,+mmx,+ndd,+sse,+sse2,+x87" "tune-cpu"="generic" }
52+
...
53+
---
54+
name: partial_write
55+
tracksRegLiveness: true
56+
noVRegs: true
57+
noPhis: true
58+
isSSA: false
59+
body: |
60+
bb.0.entry:
61+
liveins: $ecx, $edx, $esi, $rdi
62+
renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
63+
MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p)
64+
renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax
65+
RET64 $ax
66+
...
67+
---
68+
name: no_partial_write
69+
tracksRegLiveness: true
70+
noVRegs: true
71+
noPhis: true
72+
isSSA: false
73+
body: |
74+
bb.0.entry:
75+
liveins: $ecx, $edx, $esi, $rdi
76+
77+
renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
78+
MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p)
79+
renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
80+
RET64 $ax
81+
...
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -run-pass=break-false-deps -verify-machineinstrs -o - | FileCheck --check-prefixes=CHECK,RCDEFAULT %s
3+
# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -run-pass=break-false-deps -partial-reg-update-clearance=1 -verify-machineinstrs -o - | FileCheck --check-prefixes=CHECK,RC1 %s
4+
#
5+
# Check that BreakFalseDeps detects cases where an ND instruction would cause a partial register write
6+
# if compressed to a legacy op. MIR has been modified to force different register assignments.
7+
#
8+
# For partial_write, the ADD16rr_ND is compressible, but will become a partial write after compression.
9+
# Compression is inhibited if the eax definition is within the partial-reg-update-clearance threshold.
10+
#
11+
# For no_partial_write, the ADD16rr_ND is incompressible hence it cannot become a partial write.
12+
# This case checks that an implicit-def of eax is not added by breakPartialRegDependency.
13+
#
14+
--- |
15+
define signext i16 @partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
16+
entry:
17+
%add = add nsw i32 %b, %a
18+
store i32 %add, ptr %p, align 4
19+
%add1 = trunc i32 %add to i16
20+
%add2 = add i16 %add1, %x
21+
ret i16 %add2
22+
}
23+
24+
define signext i16 @no_partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
25+
entry:
26+
%add = add nsw i32 %b, %a
27+
store i32 %add, ptr %p, align 4
28+
%add1 = trunc i32 %add to i16
29+
%add2 = add i16 %add1, %x
30+
ret i16 %add2
31+
}
32+
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx16,+cx8,+egpr,+fxsr,+mmx,+ndd,+sse,+sse2,+x87" "tune-cpu"="generic" }
33+
...
34+
---
35+
name: partial_write
36+
tracksRegLiveness: true
37+
noVRegs: true
38+
noPhis: true
39+
isSSA: false
40+
body: |
41+
bb.0.entry:
42+
liveins: $ecx, $edx, $esi, $rdi
43+
; RCDEFAULT-LABEL: name: partial_write
44+
; RCDEFAULT: liveins: $ecx, $edx, $esi, $rdi
45+
; RCDEFAULT-NEXT: {{ $}}
46+
; RCDEFAULT-NEXT: renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
47+
; RCDEFAULT-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p)
48+
; RCDEFAULT-NEXT: renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax, implicit-def $rax
49+
; RCDEFAULT-NEXT: RET64 $ax
50+
;
51+
; RC1-LABEL: name: partial_write
52+
; RC1: liveins: $ecx, $edx, $esi, $rdi
53+
; RC1-NEXT: {{ $}}
54+
; RC1-NEXT: renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
55+
; RC1-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p)
56+
; RC1-NEXT: renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax
57+
; RC1-NEXT: RET64 $ax
58+
renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
59+
MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p)
60+
renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax
61+
RET64 $ax
62+
...
63+
---
64+
name: no_partial_write
65+
tracksRegLiveness: true
66+
noVRegs: true
67+
noPhis: true
68+
isSSA: false
69+
body: |
70+
bb.0.entry:
71+
liveins: $ecx, $edx, $esi, $rdi
72+
73+
; CHECK-LABEL: name: no_partial_write
74+
; CHECK: liveins: $ecx, $edx, $esi, $rdi
75+
; CHECK-NEXT: {{ $}}
76+
; CHECK-NEXT: renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
77+
; CHECK-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p)
78+
; CHECK-NEXT: renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
79+
; CHECK-NEXT: RET64 $ax
80+
renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
81+
MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p)
82+
renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
83+
RET64 $ax
84+
...

0 commit comments

Comments
 (0)