Skip to content

Commit ea3e220

Browse files
authored
[DAGComb] Do not turn insert_elt into shuffle for single elt vectors. (#1287)
Currently combineInsertEltToShuffle turns insert_vector_elt into a vector_shuffle, even if the inserted element is a vector with a single element. In this case, it should be unlikely that the additional shuffle would be more efficient than an insert_vector_elt. Additionally, this fixes an infinite cycle in DAGCombine, where combineInsertEltToShuffle turns an insert_vector_elt into a shuffle, which gets turned back into an insert_vector_elt/extract_vector_elt by a custom AArch64 lowering (in visitVECTOR_SHUFFLE). Such insert_vector_elt and extract_vector_elt combinations can be lowered efficiently using mov on AArch64. There are 2 test changes in arm64-neon-copy.ll: we now use one or two mov instructions instead of a single zip1. The reason that we need a second mov in ins1f2 is that we have to move the result to the result register, which is not really related to the DAGCombine fold I think. But in any case, on most uarchs, mov should be cheaper than zip1. On a Cortex-A75 for example, zip1 is twice as expensive as mov (https://developer.arm.com/docs/101398/latest/arm-cortex-a75-software-optimization-guide-v20). Reviewers: spatel, efriedma, dmgreen, RKSimon Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D80710 (Cherry-picked from d20a3d3)
1 parent 77ecbf2 commit ea3e220

File tree

3 files changed

+52
-1
lines changed

3 files changed

+52
-1
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16826,6 +16826,10 @@ SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
1682616826
EVT SubVecVT = SubVec.getValueType();
1682716827
EVT VT = DestVec.getValueType();
1682816828
unsigned NumSrcElts = SubVecVT.getVectorNumElements();
16829+
// If the source only has a single vector element, the cost of creating a shuffle that adds
16830+
// it to a vector is likely to exceed the cost of an insert_vector_elt.
16831+
if (NumSrcElts == 1)
16832+
return SDValue();
1682916833
unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
1683016834
unsigned NumMaskVals = ExtendRatio * NumSrcElts;
1683116835

llvm/test/CodeGen/AArch64/arm64-neon-copy.ll

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,19 @@ define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) {
200200
; CHECK-LABEL: ins1f2:
201201
; CHECK: // %bb.0:
202202
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
203-
; CHECK-NEXT: zip1 v0.2d, v1.2d, v0.2d
203+
; CHECK-NEXT: mov v1.d[1], v0.d[0]
204+
; CHECK-NEXT: mov v0.16b, v1.16b
205+
; CHECK-NEXT: ret
206+
%tmp3 = extractelement <1 x double> %tmp1, i32 0
207+
%tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
208+
ret <2 x double> %tmp4
209+
}
210+
211+
define <2 x double> @ins1f2_args_flipped(<2 x double> %tmp2, <1 x double> %tmp1) {
212+
; CHECK-LABEL: ins1f2_args_flipped:
213+
; CHECK: // %bb.0:
214+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
215+
; CHECK-NEXT: mov v0.d[1], v1.d[0]
204216
; CHECK-NEXT: ret
205217
%tmp3 = extractelement <1 x double> %tmp1, i32 0
206218
%tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc %s -o - | FileCheck %s
3+
4+
target triple = "arm64-apple-ios13.4.0"
5+
6+
; Make sure we do not get stuck in a cycle in DAGCombiner.
7+
8+
define void @test(i1 %c, <1 x double>* %ptr) {
9+
; CHECK-LABEL: test:
10+
; CHECK: ; %bb.0: ; %entry
11+
; CHECK-NEXT: movi d0, #0000000000000000
12+
; CHECK-NEXT: tbz w0, #0, LBB0_2
13+
; CHECK-NEXT: ; %bb.1: ; %bb1
14+
; CHECK-NEXT: ldr d0, [x1]
15+
; CHECK-NEXT: LBB0_2: ; %bb2
16+
; CHECK-NEXT: ldr q1, [x8]
17+
; CHECK-NEXT: mov.d v1[0], v0[0]
18+
; CHECK-NEXT: str q1, [x8]
19+
; CHECK-NEXT: ret
20+
entry:
21+
br i1 %c, label %bb1, label %bb2
22+
23+
bb1:
24+
%lv1 = load <1 x double>, <1 x double>* %ptr, align 16
25+
br label %bb2
26+
27+
bb2:
28+
%p = phi <1 x double> [ %lv1, %bb1 ], [ zeroinitializer, %entry ]
29+
%vecext19 = extractelement <1 x double> %p, i32 0
30+
%arrayidx21 = getelementptr inbounds [4 x <4 x double>], [4 x <4 x double>]* undef, i64 0, i64 3
31+
%lv2 = load <4 x double>, <4 x double>* %arrayidx21, align 16
32+
%vecins22 = insertelement <4 x double> %lv2, double %vecext19, i32 2
33+
store <4 x double> %vecins22, <4 x double>* %arrayidx21, align 16
34+
ret void
35+
}

0 commit comments

Comments
 (0)