Skip to content

Commit 643c383

Browse files
authored
[AArch64] Remove EXT instr before UZP when extracting elements from vector (llvm#91328)
The assembly generated for extracting the odd/even elements from a vector contained an extra EXT instruction. This was due to the way LLVM constructs DAGs when vector_shuffling from a larger type to a smaller one. This patch optimises the DAG in these situations, allowing the correct assembly to be emitted.
1 parent 2cbfe4a commit 643c383

File tree

7 files changed

+87
-71
lines changed

7 files changed

+87
-71
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21448,6 +21448,29 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
2144821448
SDValue Op1 = N->getOperand(1);
2144921449
EVT ResVT = N->getValueType(0);
2145021450

21451+
// uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
21452+
if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
21453+
Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
21454+
Op0.getOperand(0) == Op1.getOperand(0)) {
21455+
21456+
SDValue SourceVec = Op0.getOperand(0);
21457+
uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
21458+
uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
21459+
uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
21460+
if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
21461+
EVT OpVT = Op0.getOperand(1).getValueType();
21462+
EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
21463+
SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
21464+
DAG.getUNDEF(WidenedResVT));
21465+
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
21466+
DAG.getConstant(0, DL, OpVT));
21467+
}
21468+
}
21469+
21470+
// Following optimizations only work with uzp1.
21471+
if (N->getOpcode() == AArch64ISD::UZP2)
21472+
return SDValue();
21473+
2145121474
// uzp1(x, undef) -> concat(truncate(x), undef)
2145221475
if (Op1.getOpcode() == ISD::UNDEF) {
2145321476
EVT BCVT = MVT::Other, HalfVT = MVT::Other;
@@ -24665,6 +24688,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
2466524688
case AArch64ISD::UUNPKHI:
2466624689
return performUnpackCombine(N, DAG, Subtarget);
2466724690
case AArch64ISD::UZP1:
24691+
case AArch64ISD::UZP2:
2466824692
return performUzpCombine(N, DAG, Subtarget);
2466924693
case AArch64ISD::SETCC_MERGE_ZERO:
2467024694
return performSetccMergeZeroCombine(N, DCI);

llvm/test/CodeGen/AArch64/aarch64-vuzp.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,18 @@
33
declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>)
44

55
; CHECK-LABEL: fun1:
6-
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
6+
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
77
define i32 @fun1() {
88
entry:
99
%vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> <i8 0, i8 16, i8 19, i8 4, i8 -65, i8 -65, i8 -71, i8 -71, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> undef)
10-
%vuzp.i212.1 = shufflevector <16 x i8> %vtbl1.i.1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
11-
%scevgep = getelementptr <8 x i8>, ptr undef, i64 1
12-
store <8 x i8> %vuzp.i212.1, ptr %scevgep, align 1
10+
%vuzp.i212.1 = shufflevector <16 x i8> %vtbl1.i.1, <16 x i8> %vtbl1.i.1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
11+
%scevgep = getelementptr <16 x i8>, ptr undef, i64 1
12+
store <16 x i8> %vuzp.i212.1, ptr %scevgep, align 1
1313
ret i32 undef
1414
}
1515

1616
; CHECK-LABEL: fun2:
17-
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
17+
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
1818
define i32 @fun2() {
1919
entry:
2020
%vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> <i8 0, i8 16, i8 19, i8 4, i8 -65, i8 -65, i8 -71, i8 -71, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> undef)

llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,18 @@ target triple = "aarch64"
77
define <vscale x 4 x half> @complex_add_v4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
88
; CHECK-LABEL: complex_add_v4f16:
99
; CHECK: // %bb.0: // %entry
10-
; CHECK-NEXT: uunpkhi z2.d, z0.s
10+
; CHECK-NEXT: uzp1 z2.s, z0.s, z0.s
11+
; CHECK-NEXT: uzp2 z0.s, z0.s, z0.s
12+
; CHECK-NEXT: ptrue p0.d
13+
; CHECK-NEXT: uzp2 z3.s, z1.s, z0.s
14+
; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
15+
; CHECK-NEXT: uunpklo z2.d, z2.s
1116
; CHECK-NEXT: uunpklo z0.d, z0.s
12-
; CHECK-NEXT: uunpkhi z3.d, z1.s
17+
; CHECK-NEXT: uunpklo z3.d, z3.s
1318
; CHECK-NEXT: uunpklo z1.d, z1.s
14-
; CHECK-NEXT: ptrue p0.d
15-
; CHECK-NEXT: uzp1 z4.d, z0.d, z2.d
16-
; CHECK-NEXT: uzp2 z0.d, z0.d, z2.d
17-
; CHECK-NEXT: uzp2 z2.d, z1.d, z3.d
18-
; CHECK-NEXT: uzp1 z1.d, z1.d, z3.d
1919
; CHECK-NEXT: fsubr z0.h, p0/m, z0.h, z1.h
20-
; CHECK-NEXT: movprfx z1, z2
21-
; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z4.h
20+
; CHECK-NEXT: movprfx z1, z3
21+
; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z2.h
2222
; CHECK-NEXT: zip2 z2.d, z0.d, z1.d
2323
; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
2424
; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s

llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7,23 +7,22 @@ target triple = "aarch64"
77
define <vscale x 4 x half> @complex_mul_v4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
88
; CHECK-LABEL: complex_mul_v4f16:
99
; CHECK: // %bb.0: // %entry
10-
; CHECK-NEXT: uunpkhi z2.d, z0.s
10+
; CHECK-NEXT: uzp2 z2.s, z0.s, z0.s
11+
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
12+
; CHECK-NEXT: ptrue p0.d
13+
; CHECK-NEXT: uzp2 z3.s, z1.s, z0.s
1114
; CHECK-NEXT: uunpklo z0.d, z0.s
12-
; CHECK-NEXT: uunpkhi z3.d, z1.s
15+
; CHECK-NEXT: uunpklo z2.d, z2.s
16+
; CHECK-NEXT: uunpklo z3.d, z3.s
17+
; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
1318
; CHECK-NEXT: uunpklo z1.d, z1.s
14-
; CHECK-NEXT: ptrue p0.d
15-
; CHECK-NEXT: uzp2 z4.d, z0.d, z2.d
16-
; CHECK-NEXT: uzp1 z0.d, z0.d, z2.d
17-
; CHECK-NEXT: uzp2 z2.d, z1.d, z3.d
18-
; CHECK-NEXT: uzp1 z1.d, z1.d, z3.d
19-
; CHECK-NEXT: movprfx z5, z2
20-
; CHECK-NEXT: fmul z5.h, p0/m, z5.h, z0.h
21-
; CHECK-NEXT: fmul z2.h, p0/m, z2.h, z4.h
22-
; CHECK-NEXT: movprfx z3, z5
23-
; CHECK-NEXT: fmla z3.h, p0/m, z1.h, z4.h
24-
; CHECK-NEXT: fnmsb z0.h, p0/m, z1.h, z2.h
25-
; CHECK-NEXT: zip2 z1.d, z0.d, z3.d
26-
; CHECK-NEXT: zip1 z0.d, z0.d, z3.d
19+
; CHECK-NEXT: movprfx z4, z3
20+
; CHECK-NEXT: fmul z4.h, p0/m, z4.h, z0.h
21+
; CHECK-NEXT: fmul z3.h, p0/m, z3.h, z2.h
22+
; CHECK-NEXT: fmad z2.h, p0/m, z1.h, z4.h
23+
; CHECK-NEXT: fnmsb z0.h, p0/m, z1.h, z3.h
24+
; CHECK-NEXT: zip2 z1.d, z0.d, z2.d
25+
; CHECK-NEXT: zip1 z0.d, z0.d, z2.d
2726
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
2827
; CHECK-NEXT: ret
2928
entry:

llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -30,21 +30,13 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
3030
}
3131

3232
define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec) {
33-
; CHECK-SD-LABEL: vector_deinterleave_v4f16_v8f16:
34-
; CHECK-SD: // %bb.0:
35-
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
36-
; CHECK-SD-NEXT: uzp1 v2.4h, v0.4h, v1.4h
37-
; CHECK-SD-NEXT: uzp2 v1.4h, v0.4h, v1.4h
38-
; CHECK-SD-NEXT: fmov d0, d2
39-
; CHECK-SD-NEXT: ret
40-
;
41-
; CHECK-GI-LABEL: vector_deinterleave_v4f16_v8f16:
42-
; CHECK-GI: // %bb.0:
43-
; CHECK-GI-NEXT: uzp1 v2.8h, v0.8h, v0.8h
44-
; CHECK-GI-NEXT: uzp2 v1.8h, v0.8h, v0.8h
45-
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1
46-
; CHECK-GI-NEXT: fmov d0, d2
47-
; CHECK-GI-NEXT: ret
33+
; CHECK-LABEL: vector_deinterleave_v4f16_v8f16:
34+
; CHECK: // %bb.0:
35+
; CHECK-NEXT: uzp1 v2.8h, v0.8h, v0.8h
36+
; CHECK-NEXT: uzp2 v1.8h, v0.8h, v0.8h
37+
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
38+
; CHECK-NEXT: fmov d0, d2
39+
; CHECK-NEXT: ret
4840
%retval = call {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half> %vec)
4941
ret {<4 x half>, <4 x half>} %retval
5042
}

llvm/test/CodeGen/AArch64/neon-perm.ll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4092,9 +4092,9 @@ entry:
40924092
define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) {
40934093
; CHECK-SD-LABEL: test_uzp:
40944094
; CHECK-SD: // %bb.0:
4095-
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
4096-
; CHECK-SD-NEXT: uzp1 v2.8b, v0.8b, v1.8b
4097-
; CHECK-SD-NEXT: uzp2 v1.8b, v0.8b, v1.8b
4095+
; CHECK-SD-NEXT: xtn v2.8b, v0.8h
4096+
; CHECK-SD-NEXT: uzp2 v1.16b, v0.16b, v0.16b
4097+
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
40984098
; CHECK-SD-NEXT: fmov d0, d2
40994099
; CHECK-SD-NEXT: ret
41004100
;
@@ -4106,6 +4106,7 @@ define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) {
41064106
; CHECK-GI-NEXT: fmov d0, d2
41074107
; CHECK-GI-NEXT: ret
41084108

4109+
41094110
%vuzp.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
41104111
%vuzp1.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
41114112
%.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0

llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv4f16(<vscale x 4 x half> %vec) {
55
; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
66
; CHECK: // %bb.0:
7-
; CHECK-NEXT: uunpkhi z1.d, z0.s
8-
; CHECK-NEXT: uunpklo z2.d, z0.s
9-
; CHECK-NEXT: uzp1 z0.d, z2.d, z1.d
10-
; CHECK-NEXT: uzp2 z1.d, z2.d, z1.d
7+
; CHECK-NEXT: uzp1 z1.s, z0.s, z0.s
8+
; CHECK-NEXT: uzp2 z2.s, z0.s, z0.s
9+
; CHECK-NEXT: uunpklo z0.d, z1.s
10+
; CHECK-NEXT: uunpklo z1.d, z2.s
1111
; CHECK-NEXT: ret
1212
%retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
1313
ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
@@ -16,10 +16,10 @@ define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_n
1616
define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv8f16(<vscale x 8 x half> %vec) {
1717
; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv8f16:
1818
; CHECK: // %bb.0:
19-
; CHECK-NEXT: uunpkhi z1.s, z0.h
20-
; CHECK-NEXT: uunpklo z2.s, z0.h
21-
; CHECK-NEXT: uzp1 z0.s, z2.s, z1.s
22-
; CHECK-NEXT: uzp2 z1.s, z2.s, z1.s
19+
; CHECK-NEXT: uzp1 z1.h, z0.h, z0.h
20+
; CHECK-NEXT: uzp2 z2.h, z0.h, z0.h
21+
; CHECK-NEXT: uunpklo z0.s, z1.h
22+
; CHECK-NEXT: uunpklo z1.s, z2.h
2323
; CHECK-NEXT: ret
2424
%retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
2525
ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval
@@ -39,10 +39,10 @@ define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_n
3939
define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv4f32(<vscale x 4 x float> %vec) {
4040
; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv4f32:
4141
; CHECK: // %bb.0:
42-
; CHECK-NEXT: uunpkhi z1.d, z0.s
43-
; CHECK-NEXT: uunpklo z2.d, z0.s
44-
; CHECK-NEXT: uzp1 z0.d, z2.d, z1.d
45-
; CHECK-NEXT: uzp2 z1.d, z2.d, z1.d
42+
; CHECK-NEXT: uzp1 z1.s, z0.s, z0.s
43+
; CHECK-NEXT: uzp2 z2.s, z0.s, z0.s
44+
; CHECK-NEXT: uunpklo z0.d, z1.s
45+
; CHECK-NEXT: uunpklo z1.d, z2.s
4646
; CHECK-NEXT: ret
4747
%retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
4848
ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval
@@ -131,10 +131,10 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv
131131
define {<vscale x 8 x i1>, <vscale x 8 x i1>} @vector_deinterleave_nxv8i1_nxv16i1(<vscale x 16 x i1> %vec) {
132132
; CHECK-LABEL: vector_deinterleave_nxv8i1_nxv16i1:
133133
; CHECK: // %bb.0:
134-
; CHECK-NEXT: punpkhi p1.h, p0.b
135-
; CHECK-NEXT: punpklo p2.h, p0.b
136-
; CHECK-NEXT: uzp1 p0.h, p2.h, p1.h
137-
; CHECK-NEXT: uzp2 p1.h, p2.h, p1.h
134+
; CHECK-NEXT: uzp1 p1.b, p0.b, p0.b
135+
; CHECK-NEXT: uzp2 p2.b, p0.b, p0.b
136+
; CHECK-NEXT: punpklo p0.h, p1.b
137+
; CHECK-NEXT: punpklo p1.h, p2.b
138138
; CHECK-NEXT: ret
139139
%retval = call {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.vector.deinterleave2.nxv16i1(<vscale x 16 x i1> %vec)
140140
ret {<vscale x 8 x i1>, <vscale x 8 x i1>} %retval
@@ -143,10 +143,10 @@ define {<vscale x 8 x i1>, <vscale x 8 x i1>} @vector_deinterleave_nxv8i1_nxv16i
143143
define {<vscale x 4 x i1>, <vscale x 4 x i1>} @vector_deinterleave_nxv4i1_nxv8i1(<vscale x 8 x i1> %vec) {
144144
; CHECK-LABEL: vector_deinterleave_nxv4i1_nxv8i1:
145145
; CHECK: // %bb.0:
146-
; CHECK-NEXT: punpkhi p1.h, p0.b
147-
; CHECK-NEXT: punpklo p2.h, p0.b
148-
; CHECK-NEXT: uzp1 p0.s, p2.s, p1.s
149-
; CHECK-NEXT: uzp2 p1.s, p2.s, p1.s
146+
; CHECK-NEXT: uzp1 p1.h, p0.h, p0.h
147+
; CHECK-NEXT: uzp2 p2.h, p0.h, p0.h
148+
; CHECK-NEXT: punpklo p0.h, p1.b
149+
; CHECK-NEXT: punpklo p1.h, p2.b
150150
; CHECK-NEXT: ret
151151
%retval = call {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.vector.deinterleave2.nxv8i1(<vscale x 8 x i1> %vec)
152152
ret {<vscale x 4 x i1>, <vscale x 4 x i1>} %retval
@@ -155,10 +155,10 @@ define {<vscale x 4 x i1>, <vscale x 4 x i1>} @vector_deinterleave_nxv4i1_nxv8i1
155155
define {<vscale x 2 x i1>, <vscale x 2 x i1>} @vector_deinterleave_nxv2i1_nxv4i1(<vscale x 4 x i1> %vec) {
156156
; CHECK-LABEL: vector_deinterleave_nxv2i1_nxv4i1:
157157
; CHECK: // %bb.0:
158-
; CHECK-NEXT: punpkhi p1.h, p0.b
159-
; CHECK-NEXT: punpklo p2.h, p0.b
160-
; CHECK-NEXT: uzp1 p0.d, p2.d, p1.d
161-
; CHECK-NEXT: uzp2 p1.d, p2.d, p1.d
158+
; CHECK-NEXT: uzp1 p1.s, p0.s, p0.s
159+
; CHECK-NEXT: uzp2 p2.s, p0.s, p0.s
160+
; CHECK-NEXT: punpklo p0.h, p1.b
161+
; CHECK-NEXT: punpklo p1.h, p2.b
162162
; CHECK-NEXT: ret
163163
%retval = call {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.vector.deinterleave2.nxv4i1(<vscale x 4 x i1> %vec)
164164
ret {<vscale x 2 x i1>, <vscale x 2 x i1>} %retval

0 commit comments

Comments
 (0)