
Commit 320097d

[AArch64] Sink vscale calls into loops for better isel
For more recent SVE-capable CPUs it is beneficial to use the inc* instructions to increment a value by vscale (potentially shifted or multiplied), even in short loops. This patch tells CodeGenPrepare to sink appropriate vscale calls into the blocks where they are used, so that isel can match them.
1 parent 5c3beb7 commit 320097d
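
To illustrate the effect, here is a minimal, self-contained IR sketch; the function and value names are invented for this illustration and are not taken from the commit (the in-tree tests below exercise the real patterns). The loop step 4 x vscale is loop-invariant, so the IR naturally computes it once in the entry block, out of reach of instruction selection when it lowers the in-loop add:

target triple = "aarch64-unknown-linux-gnu"

declare i64 @llvm.vscale.i64()

define i64 @count_up(i64 %n) #0 {
entry:
  %vs = tail call i64 @llvm.vscale.i64()
  %step = shl nuw nsw i64 %vs, 2   ; 4 x vscale = element count of one <vscale x 4 x i32>
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; With this patch, CodeGenPrepare sinks the vscale call and the shift down
  ; here next to their user, so isel sees add-of-(shl vscale, 2) in a single
  ; block and can select one incw for the induction update.
  %iv.next = add nuw i64 %iv, %step
  %done = icmp uge i64 %iv.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret i64 %iv.next
}

attributes #0 = { "target-features"="+sve2" }

Running such input through opt -codegenprepare and then llc is what the two new tests below check, at the IR and assembly level respectively.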

File tree

3 files changed, +245 -3 lines changed


llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 37 additions & 3 deletions
@@ -14520,6 +14520,19 @@ static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
   return true;
 }
 
+/// We want to sink following cases:
+/// (add|sub) A, ((mul|shl) vscale, imm); (add|sub) A, vscale
+static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
+  if (match(Op, m_VScale()))
+    return true;
+  if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
+      match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
+    Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
+    return true;
+  }
+  return false;
+}
+
 /// Check if sinking \p I's operands to I's basic block is profitable, because
 /// the operands can be folded into a target instruction, e.g.
 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -14636,12 +14649,29 @@ bool AArch64TargetLowering::shouldSinkOperands(
     }
   }
 
-  if (!I->getType()->isVectorTy())
-    return false;
-
   switch (I->getOpcode()) {
   case Instruction::Sub:
   case Instruction::Add: {
+    // If the subtarget wants to make use of sve inc* instructions, then sink
+    // vscale intrinsic (along with any shifts or multiplies) so that the
+    // appropriate folds can be made.
+    if (Subtarget->useScalarIncVL()) {
+      bool Sink = false;
+      if (shouldSinkVScale(I->getOperand(0), Ops)) {
+        Ops.push_back(&I->getOperandUse(0));
+        Sink = true;
+      }
+
+      if (shouldSinkVScale(I->getOperand(1), Ops)) {
+        Ops.push_back(&I->getOperandUse(1));
+        Sink = true;
+      }
+
+      if (Sink)
+        return true;
+    }
+    if (!I->getType()->isVectorTy())
+      return false;
     if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
       return false;
 
@@ -14660,6 +14690,8 @@ bool AArch64TargetLowering::shouldSinkOperands(
     return true;
   }
   case Instruction::Or: {
+    if (!I->getType()->isVectorTy())
+      return false;
     // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
     // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
     if (Subtarget->hasNEON()) {
@@ -14697,6 +14729,8 @@ bool AArch64TargetLowering::shouldSinkOperands(
     return false;
   }
   case Instruction::Mul: {
+    if (!I->getType()->isVectorTy())
+      return false;
     int NumZExts = 0, NumSExts = 0;
     for (auto &Op : I->operands()) {
       // Make sure we are not already sinking this operand
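
The new hook only fires for the shapes spelled out in shouldSinkVScale's comment, and only when the subtarget reports useScalarIncVL(). A small illustrative IR fragment (names invented for this sketch, not taken from the commit) showing which operand forms are recognised; the vscale computations sit in a separate block, since sinking is only interesting for values defined outside the user's block:

define void @shapes(i64 %x, i64 %y) "target-features"="+sve2" {
entry:
  %vs = call i64 @llvm.vscale.i64()
  %byshl = shl i64 %vs, 2
  %bymul = mul i64 %vs, 12
  %byvar = mul i64 %vs, %y
  br label %body

body:
  %a = add i64 %x, %vs      ; matched: (add A, vscale)
  %b = add i64 %x, %byshl   ; matched: (add A, (shl vscale, imm)); the shl and its vscale operand are both queued
  %c = sub i64 %x, %bymul   ; matched: (sub A, (mul vscale, imm))
  %d = add i64 %x, %byvar   ; not matched: the multiplier is not a constant
  ret void
}

declare i64 @llvm.vscale.i64()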
Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: inc_add:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    mov w9, w1
+; CHECK-NEXT:  .LBB0_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x2, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x3, x8, lsl #2]
+; CHECK-NEXT:    fmul z0.s, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x4, x8, lsl #2]
+; CHECK-NEXT:    incw x8
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %wide.trip.count = zext i32 %N to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds float, ptr %in1, i64 %index
+  %wide.load = load <vscale x 4 x float>, ptr %2, align 4
+  %3 = getelementptr inbounds float, ptr %in2, i64 %index
+  %wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %5 = getelementptr inbounds float, ptr %out, i64 %index
+  store <vscale x 4 x float> %4, ptr %5, align 4
+  %index.next = add nuw i64 %index, %1
+  %6 = icmp eq i64 %index.next, %wide.trip.count
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: dec_sub:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    rdvl x9, #-1
+; CHECK-NEXT:    mov w8, w1
+; CHECK-NEXT:    add x11, x9, #4
+; CHECK-NEXT:    add x9, x2, x11
+; CHECK-NEXT:    add x10, x3, x11
+; CHECK-NEXT:    add x11, x4, x11
+; CHECK-NEXT:  .LBB1_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x10, x8, lsl #2]
+; CHECK-NEXT:    fmul z0.s, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x11, x8, lsl #2]
+; CHECK-NEXT:    decw x8
+; CHECK-NEXT:    cbnz x8, .LBB1_1
+; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %0 = zext i32 %N to i64
+  %1 = tail call i64 @llvm.vscale.i64()
+  %2 = shl nuw nsw i64 %1, 2
+  %3 = sub nsw i64 1, %2
+  %invariant.gep = getelementptr float, ptr %in1, i64 %3
+  %invariant.gep20 = getelementptr float, ptr %in2, i64 %3
+  %invariant.gep22 = getelementptr float, ptr %out, i64 %3
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %offset.idx = sub i64 %0, %index
+  %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
+  %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
+  %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
+  %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
+  store <vscale x 4 x float> %4, ptr %gep23, align 4
+  %index.next = add nuw i64 %index, %2
+  %5 = icmp eq i64 %index.next, %0
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
+
+attributes #0 = { "target-features"="+sve2" }
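
Both tests step by 4 x vscale (the element count of a <vscale x 4 x float> register), which is what maps onto the incw/decw checked above. Since an SVE register is 128 x vscale bits wide, other constant multiples of vscale correspond to the other element-count increments (2 x vscale to incd, 8 x vscale to inch, 16 x vscale to incb). A hedged sketch, not part of the committed tests, of a step the matcher also recognises and that would be expected to select incd once the vscale call is sunk:

target triple = "aarch64-unknown-linux-gnu"

declare i64 @llvm.vscale.i64()

define i64 @step_by_two(i64 %base) #0 {
entry:
  %vs = tail call i64 @llvm.vscale.i64()
  %step = shl nuw nsw i64 %vs, 1   ; 2 x vscale = element count of one <vscale x 2 x i64>
  br label %use

use:
  ; After sinking, isel sees the add and the shifted vscale together and can
  ; fold them into a single incd of %base (hedged expectation, not checked in-tree).
  %next = add nuw i64 %base, %step
  ret i64 %next
}

attributes #0 = { "target-features"="+sve2" }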
Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -codegenprepare -S -o - %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @inc_add
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[IN2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[INDEX]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP2]], ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %wide.trip.count = zext i32 %N to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds float, ptr %in1, i64 %index
+  %wide.load = load <vscale x 4 x float>, ptr %2, align 4
+  %3 = getelementptr inbounds float, ptr %in2, i64 %index
+  %wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %5 = getelementptr inbounds float, ptr %out, i64 %index
+  store <vscale x 4 x float> %4, ptr %5, align 4
+  %index.next = add nuw i64 %index, %1
+  %6 = icmp eq i64 %index.next, %wide.trip.count
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @dec_sub
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw i64 1, [[TMP2]]
+; CHECK-NEXT:    [[INVARIANT_GEP:%.*]] = getelementptr float, ptr [[IN1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[INVARIANT_GEP20:%.*]] = getelementptr float, ptr [[IN2]], i64 [[TMP3]]
+; CHECK-NEXT:    [[INVARIANT_GEP22:%.*]] = getelementptr float, ptr [[OUT]], i64 [[TMP3]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[TMP0]], [[INDEX]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[GEP21:%.*]] = getelementptr float, ptr [[INVARIANT_GEP20]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[GEP21]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT:    [[GEP23:%.*]] = getelementptr float, ptr [[INVARIANT_GEP22]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP4]], ptr [[GEP23]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = zext i32 %N to i64
+  %1 = tail call i64 @llvm.vscale.i64()
+  %2 = shl nuw nsw i64 %1, 2
+  %3 = sub nsw i64 1, %2
+  %invariant.gep = getelementptr float, ptr %in1, i64 %3
+  %invariant.gep20 = getelementptr float, ptr %in2, i64 %3
+  %invariant.gep22 = getelementptr float, ptr %out, i64 %3
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %offset.idx = sub i64 %0, %index
+  %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
+  %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
+  %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
+  %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
+  store <vscale x 4 x float> %4, ptr %gep23, align 4
+  %index.next = add nuw i64 %index, %2
+  %5 = icmp eq i64 %index.next, %0
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
+
+attributes #0 = { "target-features"="+sve2" }
