Skip to content

Commit 64e2812

Browse files
authored
Merge pull request #10134 from fhahn/selectopt-as-stable
Enable SelectOpt for Apple CPUs rdar://135131828
2 parents aec968d + 6d7c09c commit 64e2812

File tree

8 files changed

+1395
-246
lines changed

8 files changed

+1395
-246
lines changed

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,9 +399,13 @@ class TargetTransformInfoImplBase {
399399
bool enableSelectOptimize() const { return true; }
400400

401401
bool shouldTreatInstructionLikeSelect(const Instruction *I) {
402+
// A select with two constant operands will usually be better left as a
403+
// select.
404+
using namespace llvm::PatternMatch;
405+
if (match(I, m_Select(m_Value(), m_Constant(), m_Constant())))
406+
return false;
402407
// If the select is a logical-and/logical-or then it is better treated as a
403408
// and/or by the backend.
404-
using namespace llvm::PatternMatch;
405409
return isa<SelectInst>(I) &&
406410
!match(I, m_CombineOr(m_LogicalAnd(m_Value(), m_Value()),
407411
m_LogicalOr(m_Value(), m_Value())));

llvm/lib/CodeGen/SelectOptimize.cpp

Lines changed: 303 additions & 216 deletions
Large diffs are not rendered by default.

llvm/lib/Target/AArch64/AArch64Processors.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,7 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
338338
FeatureArithmeticBccFusion,
339339
FeatureArithmeticCbzFusion,
340340
FeatureDisableLatencySchedHeuristic,
341+
FeatureEnableSelectOptimize,
341342
FeatureFuseAddress,
342343
FeatureFuseAES,
343344
FeatureFuseArithmeticLogic,
@@ -354,6 +355,7 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
354355
FeatureArithmeticBccFusion,
355356
FeatureArithmeticCbzFusion,
356357
FeatureDisableLatencySchedHeuristic,
358+
FeatureEnableSelectOptimize,
357359
FeatureFuseAddress,
358360
FeatureFuseAES,
359361
FeatureFuseArithmeticLogic,
@@ -370,6 +372,7 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
370372
FeatureArithmeticBccFusion,
371373
FeatureArithmeticCbzFusion,
372374
FeatureDisableLatencySchedHeuristic,
375+
FeatureEnableSelectOptimize,
373376
FeatureFuseAddress,
374377
FeatureFuseAdrpAdd,
375378
FeatureFuseAES,
@@ -387,6 +390,7 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
387390
FeatureArithmeticBccFusion,
388391
FeatureArithmeticCbzFusion,
389392
FeatureDisableLatencySchedHeuristic,
393+
FeatureEnableSelectOptimize,
390394
FeatureFuseAddress,
391395
FeatureFuseAdrpAdd,
392396
FeatureFuseAES,
@@ -404,6 +408,7 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
404408
FeatureArithmeticBccFusion,
405409
FeatureArithmeticCbzFusion,
406410
FeatureDisableLatencySchedHeuristic,
411+
FeatureEnableSelectOptimize,
407412
FeatureFuseAddress,
408413
FeatureFuseAES,
409414
FeatureFuseArithmeticLogic,

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4572,14 +4572,35 @@ AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
45724572
}
45734573

45744574
// Decides whether instruction I should be handled like a select.
// The notes below describe the added ('+') side of this hunk: when
// EnableOrLikeSelectOpt is set, a per-processor-family rule is applied to
// Or/Add/Sub instructions first; otherwise, or when no rule returns, the
// decision is delegated to BaseT::shouldTreatInstructionLikeSelect.
bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
4575-
// For the binary operators (e.g. or) we need to be more careful than
4576-
// selects, here we only transform them if they are already at a natural
4577-
// break point in the code - the end of a block with an unconditional
4578-
// terminator.
4579-
if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
4580-
isa<BranchInst>(I->getNextNode()) &&
4581-
cast<BranchInst>(I->getNextNode())->isUnconditional())
4582-
return true;
4575+
if (EnableOrLikeSelectOpt) {
4576+
switch (ST->getProcFamily()) {
4577+
case AArch64Subtarget::AppleA14:
4578+
case AArch64Subtarget::AppleA15:
4579+
case AArch64Subtarget::AppleA16:
4580+
case AArch64Subtarget::AppleM4:
// NOTE(review): TuneAppleA17 also gains FeatureEnableSelectOptimize in the
// AArch64Processors.td hunks of this change, but AppleA17 is not listed
// here, so it takes the default path below (Or at block end, and every
// Add/Sub) - confirm whether that omission is intended.
4581+
// Only treat Adds feeding pointers as select-like.
// As written, this accepts both Add and Sub, and the test is on the
// instruction's first operand: it must have at least one GetElementPtrInst
// among its users (the users of I itself are not inspected).
4582+
if (I->getOpcode() == Instruction::Add ||
4583+
I->getOpcode() == Instruction::Sub)
4584+
return any_of(I->getOperand(0)->users(),
4585+
[](User *U) { return isa<GetElementPtrInst>(U); });
// Everything else on these families returns false here without consulting
// BaseT, which includes SelectInst and Or.
// NOTE(review): confirm that bypassing the base-class select check is
// intended for these CPUs.
4586+
return false;
4587+
default:
4588+
break;
4589+
}
4590+
4591+
// For the binary operators (e.g. or) we need to be more careful than
4592+
// selects, here we only transform them if they are already at a natural
4593+
// break point in the code - the end of a block with an unconditional
4594+
// terminator.
4595+
if (I->getOpcode() == Instruction::Or &&
4596+
isa<BranchInst>(I->getNextNode()) &&
4597+
cast<BranchInst>(I->getNextNode())->isUnconditional())
4598+
return true;
4599+
4600+
// On the remaining processor families every Add and Sub is reported as
// select-like, with no operand or position check.
if (I->getOpcode() == Instruction::Add ||
4601+
I->getOpcode() == Instruction::Sub)
4602+
return true;
4603+
}
// Flag disabled, or no rule above returned: use the generic policy.
45834604
return BaseT::shouldTreatInstructionLikeSelect(I);
45844605
}
45854606

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=arm64-apple-macosx -O3 -early-ifcvt-limit=0 -select-opti-loop-cycle-gain-threshold=2 -select-opti-loop-gradient-gain-threshold=10 -mcpu=apple-a7 -o - %s | FileCheck --check-prefix=DISABLED %s
3+
; RUN: llc -mtriple=arm64-apple-macosx -O3 -early-ifcvt-limit=0 -select-opti-loop-cycle-gain-threshold=2 -select-opti-loop-gradient-gain-threshold=10 -mcpu=apple-m1 -o - %s | FileCheck --check-prefix=ENABLED %s
4+
; RUN: llc -mtriple=arm64-apple-macosx -O3 -early-ifcvt-limit=0 -select-opti-loop-cycle-gain-threshold=2 -select-opti-loop-gradient-gain-threshold=10 -mcpu=apple-m2 -o - %s | FileCheck --check-prefix=ENABLED %s
5+
; RUN: llc -mtriple=arm64-apple-macosx -O3 -early-ifcvt-limit=0 -select-opti-loop-cycle-gain-threshold=2 -select-opti-loop-gradient-gain-threshold=10 -mcpu=apple-m3 -o - %s | FileCheck --check-prefix=ENABLED %s
6+
; RUN: llc -mtriple=arm64-apple-macosx -O3 -early-ifcvt-limit=0 -select-opti-loop-cycle-gain-threshold=2 -select-opti-loop-gradient-gain-threshold=10 -mcpu=apple-m4 -o - %s | FileCheck --check-prefix=ENABLED %s
7+
8+
; Loop whose two index updates (%j.next, %i.next) are adds of a zext/sext of
; the same compare (%cmp3), with the add inputs (%j, %i) also used as GEP
; indices. Per the RUN lines, -mcpu=apple-a7 is checked under DISABLED
; (branchless cset/cinc/sub sequence) and apple-m1/m2/m3/m4 under ENABLED
; (the updates sit in a conditionally executed %select.true.sink block).
; The argument %p is unused.
define void @test_select_opt(ptr %dst, ptr %src, i64 %j.start, i64 %p, i64 %i.start) {
9+
; DISABLED-LABEL: test_select_opt:
10+
; DISABLED: ; %bb.0: ; %entry
11+
; DISABLED-NEXT: add x8, x2, #1
12+
; DISABLED-NEXT: LBB0_1: ; %loop
13+
; DISABLED-NEXT: ; =>This Inner Loop Header: Depth=1
14+
; DISABLED-NEXT: ldr x9, [x1, x4, lsl #3]
15+
; DISABLED-NEXT: ldr x10, [x1, x2, lsl #3]
16+
; DISABLED-NEXT: cmp x9, x10
17+
; DISABLED-NEXT: cset w9, lo
18+
; DISABLED-NEXT: cinc x2, x2, lo
19+
; DISABLED-NEXT: sub x9, x4, x9
20+
; DISABLED-NEXT: str x2, [x0, x9, lsl #3]
21+
; DISABLED-NEXT: mov x4, x2
22+
; DISABLED-NEXT: subs x8, x8, #1
23+
; DISABLED-NEXT: b.ne LBB0_1
24+
; DISABLED-NEXT: ; %bb.2: ; %exit
25+
; DISABLED-NEXT: ret
26+
;
27+
; ENABLED-LABEL: test_select_opt:
28+
; ENABLED: ; %bb.0: ; %entry
29+
; ENABLED-NEXT: add x8, x2, #1
30+
; ENABLED-NEXT: b LBB0_2
31+
; ENABLED-NEXT: LBB0_1: ; %select.end
32+
; ENABLED-NEXT: ; in Loop: Header=BB0_2 Depth=1
33+
; ENABLED-NEXT: str x2, [x0, x4, lsl #3]
34+
; ENABLED-NEXT: mov x4, x2
35+
; ENABLED-NEXT: subs x8, x8, #1
36+
; ENABLED-NEXT: b.eq LBB0_4
37+
; ENABLED-NEXT: LBB0_2: ; %loop
38+
; ENABLED-NEXT: ; =>This Inner Loop Header: Depth=1
39+
; ENABLED-NEXT: ldr x9, [x1, x4, lsl #3]
40+
; ENABLED-NEXT: ldr x10, [x1, x2, lsl #3]
41+
; ENABLED-NEXT: cmp x9, x10
42+
; ENABLED-NEXT: b.hs LBB0_1
43+
; ENABLED-NEXT: ; %bb.3: ; %select.true.sink
44+
; ENABLED-NEXT: ; in Loop: Header=BB0_2 Depth=1
45+
; ENABLED-NEXT: add x2, x2, #1
46+
; ENABLED-NEXT: sub x4, x4, #1
47+
; ENABLED-NEXT: b LBB0_1
48+
; ENABLED-NEXT: LBB0_4: ; %exit
49+
; ENABLED-NEXT: ret
50+
entry:
51+
br label %loop
52+
53+
loop:
54+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
55+
%j = phi i64 [ %j.start, %entry ], [ %j.next, %loop ]
56+
%i = phi i64 [ %i.start, %entry ], [ %j.next, %loop ]
57+
%gep.i = getelementptr inbounds ptr, ptr %src, i64 %i
58+
%l.i = load ptr, ptr %gep.i, align 8
59+
%gep.j = getelementptr inbounds ptr, ptr %src, i64 %j
60+
%l.j = load ptr, ptr %gep.j, align 8
61+
%cmp3 = icmp ult ptr %l.i, %l.j
62+
; %dec is 0 or 1 (zext) and %dec.i is 0 or -1 (sext) of the same i1, so the
; two adds below conditionally increment %j and decrement %i.
; Presumably these adds are the instructions the pass is expected to treat
; as select-like - verify against SelectOptimize.cpp.
%dec = zext i1 %cmp3 to i64
63+
%dec.i = sext i1 %cmp3 to i64
64+
%j.next = add nsw i64 %j, %dec
65+
%i.next = add nsw i64 %i, %dec.i
66+
%gep.dst = getelementptr inbounds ptr, ptr %dst, i64 %i.next
67+
store i64 %j.next, ptr %gep.dst, align 8
68+
%iv.next = add i64 %iv, 1
69+
%ec = icmp eq i64 %iv, %j.start
70+
br i1 %ec, label %exit, label %loop
71+
72+
exit:
73+
ret void
74+
}

0 commit comments

Comments
 (0)