Skip to content

Commit b18161d

Browse files
committed
[AArch64] Handle vector with two different values
If a vector has two different values and it can be split into two sub-vectors of the same length, generate two DUPs and a CONCAT_VECTORS/VECTOR_SHUFFLE. For example, t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23, t24, t24, t24, t24, t24, t24, t24, t24 ==> t26: v8i8 = AArch64ISD::DUP t23 t28: v8i8 = AArch64ISD::DUP t24 t29: v16i8 = concat_vectors t26, t28 Differential Revision: https://reviews.llvm.org/D148347
1 parent ea228bd commit b18161d

File tree

2 files changed

+121
-74
lines changed

2 files changed

+121
-74
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12353,6 +12353,9 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
1235312353
unsigned NumUndefLanes = 0;
1235412354
SDValue Value;
1235512355
SDValue ConstantValue;
12356+
SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
12357+
unsigned ConsecutiveValCount = 0;
12358+
SDValue PrevVal;
1235612359
for (unsigned i = 0; i < NumElts; ++i) {
1235712360
SDValue V = Op.getOperand(i);
1235812361
if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
@@ -12380,6 +12383,24 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
1238012383
usesOnlyOneValue = false;
1238112384
++NumDifferentLanes;
1238212385
}
12386+
12387+
if (PrevVal != V) {
12388+
ConsecutiveValCount = 0;
12389+
PrevVal = V;
12390+
}
12391+
12392+
// Record each distinct value and its last consecutive count. For example,
12393+
//
12394+
// t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
12395+
// t24, t24, t24, t24, t24, t24, t24, t24
12396+
// t23 = consecutive count 8
12397+
// t24 = consecutive count 8
12398+
// ------------------------------------------------------------------
12399+
// t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
12400+
// t24, t24, t24, t24, t24, t24, t24, t24
12401+
// t23 = consecutive count 5
12402+
// t24 = consecutive count 9
12403+
DifferentValueMap[V] = ++ConsecutiveValCount;
1238312404
}
1238412405

1238512406
if (!Value.getNode()) {
@@ -12585,6 +12606,82 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
1258512606
return NewVector;
1258612607
}
1258712608

12609+
// If vector consists of two different values, try to generate two DUPs and
12610+
// (CONCAT_VECTORS or VECTOR_SHUFFLE).
12611+
if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
12612+
SmallVector<SDValue, 2> Vals;
12613+
// Check whether the consecutive count of each value is half the number of
12614+
// vector elements. If so, we can use CONCAT_VECTORS. For example,
12615+
//
12616+
// canUseVECTOR_CONCAT = true;
12617+
// t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
12618+
// t24, t24, t24, t24, t24, t24, t24, t24
12619+
//
12620+
// canUseVECTOR_CONCAT = false;
12621+
// t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
12622+
// t24, t24, t24, t24, t24, t24, t24, t24
12623+
bool canUseVECTOR_CONCAT = true;
12624+
for (auto Pair : DifferentValueMap) {
12625+
// Check that each distinct value has the same length, which is NumElts / 2.
12626+
if (Pair.second != NumElts / 2)
12627+
canUseVECTOR_CONCAT = false;
12628+
Vals.push_back(Pair.first);
12629+
}
12630+
12631+
// If canUseVECTOR_CONCAT is true, we can generate two DUPs and
12632+
// CONCAT_VECTORS. For example,
12633+
//
12634+
// t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
12635+
// t24, t24, t24, t24, t24, t24, t24, t24
12636+
// ==>
12637+
// t26: v8i8 = AArch64ISD::DUP t23
12638+
// t28: v8i8 = AArch64ISD::DUP t24
12639+
// t29: v16i8 = concat_vectors t26, t28
12640+
if (canUseVECTOR_CONCAT) {
12641+
EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
12642+
if (isTypeLegal(SubVT) && SubVT.isVector() &&
12643+
SubVT.getVectorNumElements() >= 2) {
12644+
SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
12645+
SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
12646+
SDValue DUP1 =
12647+
LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
12648+
SDValue DUP2 =
12649+
LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
12650+
SDValue CONCAT_VECTORS =
12651+
DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
12652+
return CONCAT_VECTORS;
12653+
}
12654+
}
12655+
12656+
// Let's try to generate two DUPs and VECTOR_SHUFFLE. For example,
12657+
//
12658+
// t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
12659+
// ==>
12660+
// t28: v8i8 = AArch64ISD::DUP t25
12661+
// t30: v8i8 = AArch64ISD::DUP t26
12662+
// t31: v8i8 = vector_shuffle<0,0,0,0,8,8,8,8> t28, t30
12663+
if (NumElts >= 8) {
12664+
SmallVector<int, 16> MaskVec;
12665+
// Build the mask for VECTOR_SHUFFLE.
12666+
SDValue FirstLaneVal = Op.getOperand(0);
12667+
for (unsigned i = 0; i < NumElts; ++i) {
12668+
SDValue Val = Op.getOperand(i);
12669+
if (FirstLaneVal == Val)
12670+
MaskVec.push_back(0);
12671+
else
12672+
MaskVec.push_back(NumElts);
12673+
}
12674+
12675+
SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
12676+
SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
12677+
SDValue DUP1 = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops1), DAG);
12678+
SDValue DUP2 = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops2), DAG);
12679+
SDValue VECTOR_SHUFFLE =
12680+
DAG.getVectorShuffle(VT, dl, DUP1, DUP2, MaskVec);
12681+
return VECTOR_SHUFFLE;
12682+
}
12683+
}
12684+
1258812685
// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
1258912686
// know the default expansion would otherwise fall back on something even
1259012687
// worse. For a vector with one or two non-undef values, that's

llvm/test/CodeGen/AArch64/build-vector-two-dup.ll

Lines changed: 24 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -4,24 +4,9 @@
44
define <16 x i8> @test1(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
55
; CHECK-LABEL: test1:
66
; CHECK: // %bb.0: // %entry
7-
; CHECK-NEXT: ldrb w8, [x0]
8-
; CHECK-NEXT: fmov s0, w8
9-
; CHECK-NEXT: mov v0.b[1], w8
10-
; CHECK-NEXT: mov v0.b[2], w8
11-
; CHECK-NEXT: mov v0.b[3], w8
12-
; CHECK-NEXT: mov v0.b[4], w8
13-
; CHECK-NEXT: mov v0.b[5], w8
14-
; CHECK-NEXT: mov v0.b[6], w8
15-
; CHECK-NEXT: mov v0.b[7], w8
16-
; CHECK-NEXT: ldrb w8, [x1]
17-
; CHECK-NEXT: mov v0.b[8], w8
18-
; CHECK-NEXT: mov v0.b[9], w8
19-
; CHECK-NEXT: mov v0.b[10], w8
20-
; CHECK-NEXT: mov v0.b[11], w8
21-
; CHECK-NEXT: mov v0.b[12], w8
22-
; CHECK-NEXT: mov v0.b[13], w8
23-
; CHECK-NEXT: mov v0.b[14], w8
24-
; CHECK-NEXT: mov v0.b[15], w8
7+
; CHECK-NEXT: ld1r { v1.8b }, [x1]
8+
; CHECK-NEXT: ld1r { v0.8b }, [x0]
9+
; CHECK-NEXT: mov v0.d[1], v1.d[0]
2510
; CHECK-NEXT: ret
2611
entry:
2712
%0 = load i8, ptr %a, align 1
@@ -75,24 +60,9 @@ entry:
7560
define <16 x i8> @test4(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
7661
; CHECK-LABEL: test4:
7762
; CHECK: // %bb.0: // %entry
78-
; CHECK-NEXT: ldrb w8, [x1]
79-
; CHECK-NEXT: fmov s0, w8
80-
; CHECK-NEXT: mov v0.b[1], w8
81-
; CHECK-NEXT: mov v0.b[2], w8
82-
; CHECK-NEXT: mov v0.b[3], w8
83-
; CHECK-NEXT: mov v0.b[4], w8
84-
; CHECK-NEXT: mov v0.b[5], w8
85-
; CHECK-NEXT: mov v0.b[6], w8
86-
; CHECK-NEXT: mov v0.b[7], w8
87-
; CHECK-NEXT: ldrb w8, [x0]
88-
; CHECK-NEXT: mov v0.b[8], w8
89-
; CHECK-NEXT: mov v0.b[9], w8
90-
; CHECK-NEXT: mov v0.b[10], w8
91-
; CHECK-NEXT: mov v0.b[11], w8
92-
; CHECK-NEXT: mov v0.b[12], w8
93-
; CHECK-NEXT: mov v0.b[13], w8
94-
; CHECK-NEXT: mov v0.b[14], w8
95-
; CHECK-NEXT: mov v0.b[15], w8
63+
; CHECK-NEXT: ld1r { v1.8b }, [x0]
64+
; CHECK-NEXT: ld1r { v0.8b }, [x1]
65+
; CHECK-NEXT: mov v0.d[1], v1.d[0]
9666
; CHECK-NEXT: ret
9767
entry:
9868
%0 = load i8, ptr %a, align 1
@@ -128,17 +98,12 @@ entry:
12898
define <8 x i8> @test6(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
12999
; CHECK-LABEL: test6:
130100
; CHECK: // %bb.0: // %entry
131-
; CHECK-NEXT: ldrb w8, [x0]
132-
; CHECK-NEXT: fmov s0, w8
133-
; CHECK-NEXT: mov v0.b[1], w8
134-
; CHECK-NEXT: mov v0.b[2], w8
135-
; CHECK-NEXT: mov v0.b[3], w8
136-
; CHECK-NEXT: ldrb w8, [x1]
137-
; CHECK-NEXT: mov v0.b[4], w8
138-
; CHECK-NEXT: mov v0.b[5], w8
139-
; CHECK-NEXT: mov v0.b[6], w8
140-
; CHECK-NEXT: mov v0.b[7], w8
141-
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
101+
; CHECK-NEXT: ld1r { v0.8b }, [x1]
102+
; CHECK-NEXT: adrp x8, .LCPI5_0
103+
; CHECK-NEXT: ld1r { v1.8b }, [x0]
104+
; CHECK-NEXT: mov v1.d[1], v0.d[0]
105+
; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI5_0]
106+
; CHECK-NEXT: tbl v0.8b, { v1.16b }, v0.8b
142107
; CHECK-NEXT: ret
143108
entry:
144109
%0 = load i8, ptr %a, align 1
@@ -154,17 +119,12 @@ entry:
154119
define <8 x i8> @test7(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
155120
; CHECK-LABEL: test7:
156121
; CHECK: // %bb.0: // %entry
157-
; CHECK-NEXT: ldrb w8, [x1]
158-
; CHECK-NEXT: fmov s0, w8
159-
; CHECK-NEXT: mov v0.b[1], w8
160-
; CHECK-NEXT: mov v0.b[2], w8
161-
; CHECK-NEXT: mov v0.b[3], w8
162-
; CHECK-NEXT: ldrb w8, [x0]
163-
; CHECK-NEXT: mov v0.b[4], w8
164-
; CHECK-NEXT: mov v0.b[5], w8
165-
; CHECK-NEXT: mov v0.b[6], w8
166-
; CHECK-NEXT: mov v0.b[7], w8
167-
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
122+
; CHECK-NEXT: ld1r { v0.8b }, [x0]
123+
; CHECK-NEXT: adrp x8, .LCPI6_0
124+
; CHECK-NEXT: ld1r { v1.8b }, [x1]
125+
; CHECK-NEXT: mov v1.d[1], v0.d[0]
126+
; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI6_0]
127+
; CHECK-NEXT: tbl v0.8b, { v1.16b }, v0.8b
168128
; CHECK-NEXT: ret
169129
entry:
170130
%0 = load i8, ptr %a, align 1
@@ -180,16 +140,9 @@ entry:
180140
define <8 x i16> @test8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
181141
; CHECK-LABEL: test8:
182142
; CHECK: // %bb.0: // %entry
183-
; CHECK-NEXT: ldrh w8, [x0]
184-
; CHECK-NEXT: fmov s0, w8
185-
; CHECK-NEXT: mov v0.h[1], w8
186-
; CHECK-NEXT: mov v0.h[2], w8
187-
; CHECK-NEXT: mov v0.h[3], w8
188-
; CHECK-NEXT: ldrh w8, [x1]
189-
; CHECK-NEXT: mov v0.h[4], w8
190-
; CHECK-NEXT: mov v0.h[5], w8
191-
; CHECK-NEXT: mov v0.h[6], w8
192-
; CHECK-NEXT: mov v0.h[7], w8
143+
; CHECK-NEXT: ld1r { v1.4h }, [x1]
144+
; CHECK-NEXT: ld1r { v0.4h }, [x0]
145+
; CHECK-NEXT: mov v0.d[1], v1.d[0]
193146
; CHECK-NEXT: ret
194147
entry:
195148
%0 = load i16, ptr %a, align 1
@@ -205,12 +158,9 @@ entry:
205158
define <4 x i32> @test9(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
206159
; CHECK-LABEL: test9:
207160
; CHECK: // %bb.0: // %entry
208-
; CHECK-NEXT: ldr w8, [x0]
209-
; CHECK-NEXT: fmov s0, w8
210-
; CHECK-NEXT: mov v0.s[1], w8
211-
; CHECK-NEXT: ldr w8, [x1]
212-
; CHECK-NEXT: mov v0.s[2], w8
213-
; CHECK-NEXT: mov v0.s[3], w8
161+
; CHECK-NEXT: ld1r { v1.2s }, [x1]
162+
; CHECK-NEXT: ld1r { v0.2s }, [x0]
163+
; CHECK-NEXT: mov v0.d[1], v1.d[0]
214164
; CHECK-NEXT: ret
215165
entry:
216166
%0 = load i32, ptr %a, align 1

0 commit comments

Comments
 (0)