-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[X86] Prevent APX NDD compression when it creates a partial write #132051
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
phoebewang
merged 4 commits into
llvm:main
from
daniel-zabawa:review/x86-ndd-partial-write
Mar 21, 2025
Merged
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
ae57ca5
[X86] Prevent APX NDD compression when it creates a partial write
daniel-zabawa 1a64994
add verify-machineinstrs to tests, remove unneeded checks
daniel-zabawa 53d490a
remove vreg case from NDD check, reword comments and remove chaff fro…
daniel-zabawa f485dc2
cleanup of tests
daniel-zabawa File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | ||
# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -show-mc-encoding -verify-machineinstrs -o - | FileCheck --check-prefixes=CHECK,RCDEFAULT %s | ||
# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -partial-reg-update-clearance=1 -verify-machineinstrs -show-mc-encoding -o - | FileCheck --check-prefixes=CHECK,RC1 %s | ||
# | ||
# Check that BreakFalseDeps detects cases where an ND instruction would cause a partial register write | ||
# if compressed to a legacy op. MIR has been modified to force different register assignments. | ||
# | ||
# For partial_write, the ADD16rr_ND is compressible, but will become a partial write after compression. | ||
# Compression is inhibited if the eax definition is within the partial-reg-update-clearance threshold. | ||
# | ||
# For no_partial_write, the ADD16rr_ND is incompressible hence it cannot become a partial write. | ||
# This case checks that an implicit-def of eax is not added by breakPartialRegDependency. | ||
# | ||
--- | | ||
define signext i16 @partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 { | ||
; RCDEFAULT-LABEL: partial_write: | ||
; RCDEFAULT: # %bb.0: # %entry | ||
; RCDEFAULT-NEXT: addl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf2] | ||
; RCDEFAULT-NEXT: movl %eax, (%rdi) # encoding: [0x89,0x07] | ||
; RCDEFAULT-NEXT: addw %cx, %ax, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xc8] | ||
; RCDEFAULT-NEXT: retq # encoding: [0xc3] | ||
; | ||
; RC1-LABEL: partial_write: | ||
; RC1: # %bb.0: # %entry | ||
; RC1-NEXT: addl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf2] | ||
; RC1-NEXT: movl %eax, (%rdi) # encoding: [0x89,0x07] | ||
; RC1-NEXT: addw %cx, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xc8] | ||
; RC1-NEXT: retq # encoding: [0xc3] | ||
entry: | ||
%add = add nsw i32 %b, %a | ||
store i32 %add, ptr %p, align 4 | ||
%add1 = trunc i32 %add to i16 | ||
%add2 = add i16 %add1, %x | ||
ret i16 %add2 | ||
} | ||
|
||
define signext i16 @no_partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 { | ||
; CHECK-LABEL: no_partial_write: | ||
; CHECK: # %bb.0: # %entry | ||
; CHECK-NEXT: addl %esi, %edx # EVEX TO LEGACY Compression encoding: [0x01,0xf2] | ||
; CHECK-NEXT: movl %edx, (%rdi) # encoding: [0x89,0x17] | ||
; CHECK-NEXT: addw %cx, %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xca] | ||
; CHECK-NEXT: retq # encoding: [0xc3] | ||
entry: | ||
%add = add nsw i32 %b, %a | ||
store i32 %add, ptr %p, align 4 | ||
%add1 = trunc i32 %add to i16 | ||
%add2 = add i16 %add1, %x | ||
ret i16 %add2 | ||
} | ||
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx16,+cx8,+egpr,+fxsr,+mmx,+ndd,+sse,+sse2,+x87" "tune-cpu"="generic" } | ||
... | ||
--- | ||
name: partial_write | ||
tracksRegLiveness: true | ||
noVRegs: true | ||
noPhis: true | ||
isSSA: false | ||
body: | | ||
bb.0.entry: | ||
liveins: $ecx, $edx, $esi, $rdi | ||
renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags | ||
MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p) | ||
renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax | ||
RET64 $ax | ||
... | ||
--- | ||
name: no_partial_write | ||
tracksRegLiveness: true | ||
noVRegs: true | ||
noPhis: true | ||
isSSA: false | ||
body: | | ||
bb.0.entry: | ||
liveins: $ecx, $edx, $esi, $rdi | ||
|
||
renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags | ||
MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p) | ||
renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx | ||
RET64 $ax | ||
... |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 | ||
# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -run-pass=break-false-deps -verify-machineinstrs -o - | FileCheck --check-prefixes=CHECK,RCDEFAULT %s | ||
# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -run-pass=break-false-deps -partial-reg-update-clearance=1 -verify-machineinstrs -o - | FileCheck --check-prefixes=CHECK,RC1 %s | ||
# | ||
# Check that BreakFalseDeps detects cases where an ND instruction would cause a partial register write | ||
# if compressed to a legacy op. MIR has been modified to force different register assignments. | ||
# | ||
# For partial_write, the ADD16rr_ND is compressible, but will become a partial write after compression. | ||
# Compression is inhibited if the eax definition is within the partial-reg-update-clearance threshold. | ||
# | ||
# For no_partial_write, the ADD16rr_ND is incompressible hence it cannot become a partial write. | ||
# This case checks that an implicit-def of eax is not added by breakPartialRegDependency. | ||
# | ||
--- | | ||
define signext i16 @partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 { | ||
entry: | ||
%add = add nsw i32 %b, %a | ||
store i32 %add, ptr %p, align 4 | ||
%add1 = trunc i32 %add to i16 | ||
%add2 = add i16 %add1, %x | ||
ret i16 %add2 | ||
} | ||
|
||
define signext i16 @no_partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 { | ||
entry: | ||
%add = add nsw i32 %b, %a | ||
store i32 %add, ptr %p, align 4 | ||
%add1 = trunc i32 %add to i16 | ||
%add2 = add i16 %add1, %x | ||
ret i16 %add2 | ||
} | ||
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx16,+cx8,+egpr,+fxsr,+mmx,+ndd,+sse,+sse2,+x87" "tune-cpu"="generic" } | ||
... | ||
--- | ||
name: partial_write | ||
tracksRegLiveness: true | ||
noVRegs: true | ||
noPhis: true | ||
isSSA: false | ||
body: | | ||
bb.0.entry: | ||
liveins: $ecx, $edx, $esi, $rdi | ||
; RCDEFAULT-LABEL: name: partial_write | ||
; RCDEFAULT: liveins: $ecx, $edx, $esi, $rdi | ||
; RCDEFAULT-NEXT: {{ $}} | ||
; RCDEFAULT-NEXT: renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags | ||
; RCDEFAULT-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p) | ||
; RCDEFAULT-NEXT: renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax, implicit-def $rax | ||
; RCDEFAULT-NEXT: RET64 $ax | ||
; | ||
; RC1-LABEL: name: partial_write | ||
; RC1: liveins: $ecx, $edx, $esi, $rdi | ||
; RC1-NEXT: {{ $}} | ||
; RC1-NEXT: renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags | ||
; RC1-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p) | ||
; RC1-NEXT: renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax | ||
; RC1-NEXT: RET64 $ax | ||
renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags | ||
MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p) | ||
renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax | ||
RET64 $ax | ||
... | ||
--- | ||
name: no_partial_write | ||
tracksRegLiveness: true | ||
noVRegs: true | ||
noPhis: true | ||
isSSA: false | ||
body: | | ||
bb.0.entry: | ||
liveins: $ecx, $edx, $esi, $rdi | ||
|
||
; CHECK-LABEL: name: no_partial_write | ||
; CHECK: liveins: $ecx, $edx, $esi, $rdi | ||
; CHECK-NEXT: {{ $}} | ||
; CHECK-NEXT: renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags | ||
; CHECK-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p) | ||
; CHECK-NEXT: renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx | ||
; CHECK-NEXT: RET64 $ax | ||
renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags | ||
MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p) | ||
renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx | ||
RET64 $ax | ||
... |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Did you check it with large applications like benchmark? I'm not sure if we can arbitrarily adding operand to a MI. Maybe add
-verify-machineinstrs
in the tests to check first.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can add verification to the tests. I don't believe it's arbitrary - an NDD op always writes to the full (physical) register, so this accurately reflects the semantics post-regalloc. The idea is to indicate in MIR that the zero-upper behavior is intentional and needed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is not about the register relationship. It is the machine instrcution operands match its defination. We use
Defs/Uses
for implict operands as well, as a result, the number of operands of a machine instrcution is constant vaule during its life time.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I checked Geekbench 6 and coremark-pro again. I was using Geekbench during development because it showed several instances of 8/16b ops generated with the ndd feature enabled.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks!