Skip to content

Enable SelectOpt for Apple CPUs #10134

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Feb 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -399,9 +399,13 @@ class TargetTransformInfoImplBase {
bool enableSelectOptimize() const { return true; }

bool shouldTreatInstructionLikeSelect(const Instruction *I) {
// A select with two constant operands will usually be better left as a
// select.
using namespace llvm::PatternMatch;
if (match(I, m_Select(m_Value(), m_Constant(), m_Constant())))
return false;
// If the select is a logical-and/logical-or then it is better treated as a
// and/or by the backend.
using namespace llvm::PatternMatch;
return isa<SelectInst>(I) &&
!match(I, m_CombineOr(m_LogicalAnd(m_Value(), m_Value()),
m_LogicalOr(m_Value(), m_Value())));
Expand Down
519 changes: 303 additions & 216 deletions llvm/lib/CodeGen/SelectOptimize.cpp

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions llvm/lib/Target/AArch64/AArch64Processors.td
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,7 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureArithmeticBccFusion,
FeatureArithmeticCbzFusion,
FeatureDisableLatencySchedHeuristic,
FeatureEnableSelectOptimize,
FeatureFuseAddress,
FeatureFuseAES,
FeatureFuseArithmeticLogic,
Expand All @@ -354,6 +355,7 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
FeatureArithmeticBccFusion,
FeatureArithmeticCbzFusion,
FeatureDisableLatencySchedHeuristic,
FeatureEnableSelectOptimize,
FeatureFuseAddress,
FeatureFuseAES,
FeatureFuseArithmeticLogic,
Expand All @@ -370,6 +372,7 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureArithmeticBccFusion,
FeatureArithmeticCbzFusion,
FeatureDisableLatencySchedHeuristic,
FeatureEnableSelectOptimize,
FeatureFuseAddress,
FeatureFuseAdrpAdd,
FeatureFuseAES,
Expand All @@ -387,6 +390,7 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureArithmeticBccFusion,
FeatureArithmeticCbzFusion,
FeatureDisableLatencySchedHeuristic,
FeatureEnableSelectOptimize,
FeatureFuseAddress,
FeatureFuseAdrpAdd,
FeatureFuseAES,
Expand All @@ -404,6 +408,7 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureArithmeticBccFusion,
FeatureArithmeticCbzFusion,
FeatureDisableLatencySchedHeuristic,
FeatureEnableSelectOptimize,
FeatureFuseAddress,
FeatureFuseAES,
FeatureFuseArithmeticLogic,
Expand Down
37 changes: 29 additions & 8 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4572,14 +4572,35 @@ AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
}

/// Decide whether \p I should be handled like a select by select optimization
/// on AArch64.
///
/// When EnableOrLikeSelectOpt is set, some binary operators are accepted in
/// addition to whatever the base implementation accepts:
///  - On the Apple CPU families listed in the switch below, only an add/sub
///    whose first operand also has a getelementptr user is accepted; every
///    other instruction is rejected without consulting the base
///    implementation.
///  - On all other CPUs, an `or` that is immediately followed by an
///    unconditional branch, and any add/sub, are accepted.
/// Everything else defers to BaseT::shouldTreatInstructionLikeSelect.
///
/// \param I the instruction to classify.
/// \returns true if \p I should be treated like a select.
bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
  // The per-CPU policy must be evaluated before any generic `or` handling;
  // a generic check placed ahead of the switch would accept `or` instructions
  // on the Apple CPUs as well and bypass the restriction below.
  if (EnableOrLikeSelectOpt) {
    switch (ST->getProcFamily()) {
    // NOTE(review): AppleA17 also has FeatureEnableSelectOptimize in its
    // tuning list but is not handled here, so it takes the generic path below
    // - confirm whether that is intentional.
    case AArch64Subtarget::AppleA14:
    case AArch64Subtarget::AppleA15:
    case AArch64Subtarget::AppleA16:
    case AArch64Subtarget::AppleM4:
      // Only treat Adds feeding pointers as select-like: the first operand of
      // the add/sub must also be used by a getelementptr.
      if (I->getOpcode() == Instruction::Add ||
          I->getOpcode() == Instruction::Sub)
        return any_of(I->getOperand(0)->users(),
                      [](User *U) { return isa<GetElementPtrInst>(U); });
      return false;
    default:
      break;
    }

    // For the binary operators (e.g. or) we need to be more careful than
    // selects, here we only transform them if they are already at a natural
    // break point in the code - the end of a block with an unconditional
    // terminator.
    if (I->getOpcode() == Instruction::Or &&
        isa<BranchInst>(I->getNextNode()) &&
        cast<BranchInst>(I->getNextNode())->isUnconditional())
      return true;

    if (I->getOpcode() == Instruction::Add ||
        I->getOpcode() == Instruction::Sub)
      return true;
  }
  return BaseT::shouldTreatInstructionLikeSelect(I);
}

Expand Down
74 changes: 74 additions & 0 deletions llvm/test/CodeGen/AArch64/selectopt-apple-defaults.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=arm64-apple-macosx -O3 -early-ifcvt-limit=0 -select-opti-loop-cycle-gain-threshold=2 -select-opti-loop-gradient-gain-threshold=10 -mcpu=apple-a7 -o - %s | FileCheck --check-prefix=DISABLED %s
; RUN: llc -mtriple=arm64-apple-macosx -O3 -early-ifcvt-limit=0 -select-opti-loop-cycle-gain-threshold=2 -select-opti-loop-gradient-gain-threshold=10 -mcpu=apple-m1 -o - %s | FileCheck --check-prefix=ENABLED %s
; RUN: llc -mtriple=arm64-apple-macosx -O3 -early-ifcvt-limit=0 -select-opti-loop-cycle-gain-threshold=2 -select-opti-loop-gradient-gain-threshold=10 -mcpu=apple-m2 -o - %s | FileCheck --check-prefix=ENABLED %s
; RUN: llc -mtriple=arm64-apple-macosx -O3 -early-ifcvt-limit=0 -select-opti-loop-cycle-gain-threshold=2 -select-opti-loop-gradient-gain-threshold=10 -mcpu=apple-m3 -o - %s | FileCheck --check-prefix=ENABLED %s
; RUN: llc -mtriple=arm64-apple-macosx -O3 -early-ifcvt-limit=0 -select-opti-loop-cycle-gain-threshold=2 -select-opti-loop-gradient-gain-threshold=10 -mcpu=apple-m4 -o - %s | FileCheck --check-prefix=ENABLED %s

define void @test_select_opt(ptr %dst, ptr %src, i64 %j.start, i64 %p, i64 %i.start) {
; DISABLED-LABEL: test_select_opt:
; DISABLED: ; %bb.0: ; %entry
; DISABLED-NEXT: add x8, x2, #1
; DISABLED-NEXT: LBB0_1: ; %loop
; DISABLED-NEXT: ; =>This Inner Loop Header: Depth=1
; DISABLED-NEXT: ldr x9, [x1, x4, lsl #3]
; DISABLED-NEXT: ldr x10, [x1, x2, lsl #3]
; DISABLED-NEXT: cmp x9, x10
; DISABLED-NEXT: cset w9, lo
; DISABLED-NEXT: cinc x2, x2, lo
; DISABLED-NEXT: sub x9, x4, x9
; DISABLED-NEXT: str x2, [x0, x9, lsl #3]
; DISABLED-NEXT: mov x4, x2
; DISABLED-NEXT: subs x8, x8, #1
; DISABLED-NEXT: b.ne LBB0_1
; DISABLED-NEXT: ; %bb.2: ; %exit
; DISABLED-NEXT: ret
;
; ENABLED-LABEL: test_select_opt:
; ENABLED: ; %bb.0: ; %entry
; ENABLED-NEXT: add x8, x2, #1
; ENABLED-NEXT: b LBB0_2
; ENABLED-NEXT: LBB0_1: ; %select.end
; ENABLED-NEXT: ; in Loop: Header=BB0_2 Depth=1
; ENABLED-NEXT: str x2, [x0, x4, lsl #3]
; ENABLED-NEXT: mov x4, x2
; ENABLED-NEXT: subs x8, x8, #1
; ENABLED-NEXT: b.eq LBB0_4
; ENABLED-NEXT: LBB0_2: ; %loop
; ENABLED-NEXT: ; =>This Inner Loop Header: Depth=1
; ENABLED-NEXT: ldr x9, [x1, x4, lsl #3]
; ENABLED-NEXT: ldr x10, [x1, x2, lsl #3]
; ENABLED-NEXT: cmp x9, x10
; ENABLED-NEXT: b.hs LBB0_1
; ENABLED-NEXT: ; %bb.3: ; %select.true.sink
; ENABLED-NEXT: ; in Loop: Header=BB0_2 Depth=1
; ENABLED-NEXT: add x2, x2, #1
; ENABLED-NEXT: sub x4, x4, #1
; ENABLED-NEXT: b LBB0_1
; ENABLED-NEXT: LBB0_4: ; %exit
; ENABLED-NEXT: ret
;
; Single-loop test: two indices are advanced by amounts derived from the same
; compare. In the expected output for the first RUN line the updates stay
; branch-free (cset/cinc); for the remaining RUN lines they are sunk into a
; conditionally executed block (%select.true.sink). The %p argument is unused.
entry:
br label %loop

loop:
; %iv counts iterations; %j and %i are the two indices into %src. Note that on
; the back edge %i receives %j.next, not %i.next.
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%j = phi i64 [ %j.start, %entry ], [ %j.next, %loop ]
%i = phi i64 [ %i.start, %entry ], [ %j.next, %loop ]
%gep.i = getelementptr inbounds ptr, ptr %src, i64 %i
%l.i = load ptr, ptr %gep.i, align 8
%gep.j = getelementptr inbounds ptr, ptr %src, i64 %j
%l.j = load ptr, ptr %gep.j, align 8
; One unsigned pointer compare drives both index updates.
%cmp3 = icmp ult ptr %l.i, %l.j
; zext of the i1 gives 0 or 1; sext of the same i1 gives 0 or -1.
%dec = zext i1 %cmp3 to i64
%dec.i = sext i1 %cmp3 to i64
; The first operand of each add (%j, %i) is also used as a getelementptr index
; above.
%j.next = add nsw i64 %j, %dec
%i.next = add nsw i64 %i, %dec.i
%gep.dst = getelementptr inbounds ptr, ptr %dst, i64 %i.next
store i64 %j.next, ptr %gep.dst, align 8
; Exit once the pre-increment %iv equals %j.start.
%iv.next = add i64 %iv, 1
%ec = icmp eq i64 %iv, %j.start
br i1 %ec, label %exit, label %loop

exit:
ret void
}
Loading