[AArch64][GISel] Always fold G_SHL into addressing mode where possible, unless the subtarget has addr-lsl-slow-14 #96603
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-aarch64

Author: Him188 (Him188)

Changes: This patch fixes a 15% GISel regression in the TSVC kernel s482. It also brings the regression in s291 from 20% down to 10%.

Full diff: https://github.com/llvm/llvm-project/pull/96603.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 0357a7206c478..efbef473a9e5a 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -6717,6 +6717,20 @@ AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
return select12BitValueWithLeftShift(Immed);
}
+/// Returns true if the def of MI is only used by memory operations.
+/// If the def is G_SHL, we also check indirect usages through G_PTR_ADD.
+static bool onlyUsedInMemoryOps(MachineInstr &MI, const MachineRegisterInfo &MRI) {
+ const Register DefReg = MI.getOperand(0).getReg();
+ return all_of(MRI.use_nodbg_instructions(DefReg),
+ [&](MachineInstr &Use) {
+ if (MI.getOpcode() == AArch64::G_SHL &&
+ Use.getOpcode() == AArch64::G_PTR_ADD &&
+ onlyUsedInMemoryOps(Use, MRI))
+ return true;
+ return Use.mayLoadOrStore();
+ });
+}
+
/// Return true if it is worth folding MI into an extended register. That is,
/// if it's safe to pull it into the addressing mode of a load or store as a
/// shift.
@@ -6734,8 +6748,7 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
// We have a fastpath, so folding a shift in and potentially computing it
// many times may be beneficial. Check if this is only used in memory ops.
// If it is, then we should fold.
- return all_of(MRI.use_nodbg_instructions(DefReg),
- [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
+ return onlyUsedInMemoryOps(MI, MRI);
}
static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/addressing-modes-multiple.mir b/llvm/test/CodeGen/AArch64/GlobalISel/addressing-modes-multiple.mir
new file mode 100644
index 0000000000000..cfcdcf84be9dd
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/addressing-modes-multiple.mir
@@ -0,0 +1,29 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64-linux-gnu -O3 -run-pass=instruction-select -verify-machineinstrs %s -global-isel-abort=1 -o - | FileCheck %s
+
+---
+name: shl_multiple_ptr_add
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ liveins: $x0, $x1, $x2
+ ; CHECK-LABEL: name: shl_multiple_ptr_add
+ ; CHECK: liveins: $x0, $x1, $x2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+ ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64))
+ ; CHECK-NEXT: STRXroX [[LDRXroX]], [[COPY1]], [[COPY]], 0, 1 :: (store (s64))
+ %0:gpr(s64) = COPY $x0
+ %1:gpr(s64) = G_CONSTANT i64 3
+ %2:gpr(s64) = G_SHL %0, %1(s64) ; %2 used by multiple G_PTR_ADD
+ %3:gpr(p0) = COPY $x1
+ %4:gpr(p0) = G_PTR_ADD %3, %2
+ %5:gpr(s64) = G_LOAD %4(p0) :: (load (s64))
+ %ptr:gpr(p0) = G_PTR_ADD %3, %2
+ G_STORE %5, %ptr :: (store (s64))
+...
See #86894 for how this should work. Some addressing modes are effectively free, so I don't believe it should matter if there are other uses. There are some target features that control what is considered free (such as addr-lsl-slow-14).
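For background, here is a minimal sketch of what "free" means in this discussion, assuming the addr-lsl-slow-14 semantics referenced in the PR title (shift amounts 1 and 4 are slow on affected cores). The struct and helper names below are illustrative stand-ins, not the upstream API:

```cpp
#include <cstdint>

// Illustrative stand-in for the subtarget query; the real code asks
// AArch64Subtarget whether the addr-lsl-slow-14 feature is set.
struct Subtarget {
  bool HasAddrLSLSlow14 = false;
};

// A folded address shift ("[x1, x2, lsl #N]") costs nothing extra on most
// cores. On subtargets with addr-lsl-slow-14, shift amounts 1 and 4 are
// assumed slower than a separate shift instruction, so folding is not free.
static bool isAddrShiftFree(const Subtarget &STI, uint64_t ShiftAmt) {
  if (STI.HasAddrLSLSlow14)
    return ShiftAmt != 1 && ShiftAmt != 4;
  return true;
}
```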
@davemgreen Does the updated solution solve your concerns? I will polish it if we are on the right track.
I will have to check the details, but yeah, I think this looks decent. It would be good to have a .ll test file that tests various cases with GISel and SDAG run lines, to check that they are folding at the same times.
if (const auto Worth = isWorthFoldingIntoAddrMode(*OffsetInst, MRI);
    Worth.has_value())
The .has_value check is redundant
There are more oddities, but let's first wait for convergence on the solution.
done
@davemgreen I have included one .ll test, part of which is copied from SDAG.
/// beneficial or not.
///
/// Returns:
///   - true if fodling MI would be beneficial.
folding
fixed
if (IsAddrOperand) {
  // If we are already sure that folding MI is good or bad, return the result.
  if (const auto Worth = isWorthFoldingIntoAddrMode(MI, MRI))
    return Worth.value();
optionals are odd, see https://en.cppreference.com/w/cpp/utility/optional.
return *Worth;
I think optional is fine here, as we need to represent a state where we are not sure whether folding is beneficial.
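A minimal sketch of the tri-state pattern being defended here, with a plain struct standing in for MachineInstr; all names below are illustrative, not the exact upstream signatures:

```cpp
#include <optional>

struct Instr {
  bool IsShl = false;       // models MI.getOpcode() == AArch64::G_SHL
  bool ShiftIsSlow = false; // models the addr-lsl-slow-14 case
};

// true     -> folding is known to be beneficial
// false    -> folding is known to be harmful
// nullopt  -> undecided; the caller falls back to the generic heuristic
static std::optional<bool> isWorthFoldingIntoAddrMode(const Instr &MI) {
  if (MI.IsShl)
    return !MI.ShiftIsSlow;
  return std::nullopt;
}

static bool isWorthFolding(const Instr &MI, bool IsAddrOperand) {
  if (IsAddrOperand)
    if (std::optional<bool> Worth = isWorthFoldingIntoAddrMode(MI))
      return *Worth; // decided either way; note *Worth, as suggested above
  // ... otherwise fall through to the "only used in memory ops" check ...
  return false;
}
```

The nullopt arm is what makes std::optional<bool> preferable to a plain bool here: it lets the shift-specific rule abstain without forcing a decision.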
// If we are also sure about whether folding is beneficial or not,
// return the result.
if (const auto Worth = isWorthFoldingIntoAddrMode(*OffsetInst, MRI))
  return Worth.value();
return *Worth;
done
You can pre-commit the tests (they don't need a separate PR) so that this change shows the code differences more clearly.
Force-pushed from 1cea7af to c6ec76e.
@@ -12,17 +12,16 @@ declare void @foo()
define i16 @halfword(ptr %ctx, i32 %xor72) nounwind {
This test can be moved up a folder, if it's not just gisel. I would add the SDAG lines first too.
Can you remove the calls? It should still be possible to have different uses if it stores a different value.
This was actually copied from (part of) AArch64/aarch64-fold-lslfast.ll. Let's add GISEL RUN lines to the original file and delete the new file.
The calls are originally there, so I would prefer keeping them. Instead, I added a new case multi_use_half_word to show multiple uses with stores. The original case with calls is renamed from multi_use_non_memory to multi_use_non_memory_call.
Force-pushed from c6ec76e to 93edeea.
Thanks. I think this looks OK to me.
std::optional<bool> AArch64InstructionSelector::isWorthFoldingIntoAddrMode(
    MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  if (MI.getOpcode() == AArch64::G_SHL) {
    // Address operands with shifts are free, except for running on sub targets
"sub targets" -> subtargets
@@ -1,43 +1,74 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-slow-14 | FileCheck %s --check-prefixes=CHECK,CHECK0
; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK3
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-slow-14 -global-isel=1 -global-isel-abort=1 | FileCheck %s --check-prefixes=CHECK,CHECK0,CHECK0-GISEL
Can you put the SDAG tests first, for consistency with the other tests we have.
Force-pushed from 93edeea to 655cb6a.
Thanks. LGTM
Would be good to understand why we still have differences to SDAG in the tests at some point.
[AArch64][GISel] Always fold G_SHL into addressing mode where possible, unless the subtarget has addr-lsl-slow-14
Force-pushed from 655cb6a to ee8a1af.
Thanks. NFC test cases merged separately. Will merge this after successful CI.
[AArch64][GISel] Always fold G_SHL into addressing mode where possible, unless the subtarget has addr-lsl-slow-14 (#96603)

Summary: Before this patch, we folded G_SHL into the addressing-mode lsl only when there was exactly one use, all uses were memory ops, or we were optimizing for size. However, lsl is free on all AArch64 targets except those with FeatureAddrLSLSlow14. This patch uses that fact and always folds G_SHL into lsl for memory ops, with an exception for subtargets that have FeatureAddrLSLSlow14. It also fixes a 15% GISel regression in TSVC kernel s482 and brings the regression in s291 from 20% down to 10%.

Differential Revision: https://phabricator.intern.facebook.com/D60250816
Before this patch, we folded G_SHL into the addressing-mode lsl only when there was exactly one use, all uses were memory ops, or we were optimizing for size. However, lsl is free on all AArch64 targets except those with FeatureAddrLSLSlow14.

This patch uses that fact and always folds G_SHL into lsl for memory ops, with an exception for subtargets that have FeatureAddrLSLSlow14.

This patch also fixes a 15% GISel regression in TSVC kernel s482, and brings the regression in s291 from 20% to 10%.
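For illustration, a hand-written example of the codegen this enables (not taken from the patch's tests; the expected assembly in the comments is a sketch, and exact registers will differ):

```cpp
#include <cstdint>

// With this patch, GISel can select both memory ops below using the
// shifted-register addressing mode, roughly:
//     ldr x9, [x0, x1, lsl #3]
//     str x9, [x0, x1, lsl #3]
// instead of first computing "lsl x8, x1, #3" and indexing with x8.
uint64_t load_store(uint64_t *base, uint64_t idx) {
  uint64_t v = base[idx]; // G_SHL (idx << 3) feeding a G_PTR_ADD + G_LOAD
  base[idx] = v + 1;      // the same G_SHL reused by a second G_PTR_ADD
  return v;
}
```

This mirrors the shl_multiple_ptr_add MIR test above: one G_SHL with several G_PTR_ADD users, all of which end in memory operations, so folding is now considered worthwhile on subtargets without addr-lsl-slow-14.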