Skip to content
This repository was archived by the owner on Feb 5, 2019. It is now read-only.

Commit 70b6337

Browse files
committed
ARM64: implement cunning optimisation from AArch64
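A vector extract followed by a dup can become a single lane-indexed DUP instruction even if the element types don't match: the lane index just has to be rescaled (for example, duplicating the truncated lane 3 of a <4 x i32> into a <4 x i16> is simply "dup.4h v0, v0[6]"). AArch64 handled this in ISelLowering, but a few reasonably simple patterns can take care of it in TableGen, so that's where I've put it.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@206573 91177308-0d34-0410-b5e6-96231b3b80d8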
1 parent e7ec66e commit 70b6337

File tree

2 files changed

+56
-2
lines changed

2 files changed

+56
-2
lines changed

lib/Target/ARM64/ARM64InstrInfo.td

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3026,6 +3026,59 @@ def : Pat<(v4f32 (ARM64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
30263026
def : Pat<(v2f64 (ARM64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
30273027
(DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
30283028

3029+
// If there's an (ARM64dup (vector_extract ...) ...), we can use a duplane
3030+
// instruction even if the types don't match: we just have to remap the lane
3031+
// carefully. N.B. this trick only applies to truncations, because the remapped narrow lane holds just the low bits of the wider source element.
3032+
def VecIndex_x2 : SDNodeXForm<imm, [{
3033+
return CurDAG->getTargetConstant(2 * N->getZExtValue(), MVT::i64);
3034+
}]>;
3035+
def VecIndex_x4 : SDNodeXForm<imm, [{
3036+
return CurDAG->getTargetConstant(4 * N->getZExtValue(), MVT::i64);
3037+
}]>;
3038+
def VecIndex_x8 : SDNodeXForm<imm, [{
3039+
return CurDAG->getTargetConstant(8 * N->getZExtValue(), MVT::i64);
3040+
}]>;
3041+
3042+
multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
3043+
ValueType Src128VT, ValueType ScalVT,
3044+
Instruction DUP, SDNodeXForm IdxXFORM> {
3045+
def : Pat<(ResVT (ARM64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
3046+
imm:$idx)))),
3047+
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;
3048+
3049+
def : Pat<(ResVT (ARM64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
3050+
imm:$idx)))),
3051+
(DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
3052+
}
3053+
3054+
defm : DUPWithTruncPats<v8i8, v4i16, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
3055+
defm : DUPWithTruncPats<v8i8, v2i32, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
3056+
defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
3057+
3058+
defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
3059+
defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
3060+
defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
3061+
3062+
multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
3063+
SDNodeXForm IdxXFORM> {
3064+
def : Pat<(ResVT (ARM64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn),
3065+
imm:$idx))))),
3066+
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;
3067+
3068+
def : Pat<(ResVT (ARM64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn),
3069+
imm:$idx))))),
3070+
(DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
3071+
}
3072+
3073+
defm : DUPWithTrunci64Pats<v8i8, DUPv8i8lane, VecIndex_x8>;
3074+
defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>;
3075+
defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>;
3076+
3077+
defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
3078+
defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
3079+
defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
3080+
3081+
// SMOV and UMOV definitions, with some extra patterns for convenience
30293082
defm SMOV : SMov;
30303083
defm UMOV : UMov;
30313084

test/CodeGen/ARM64/dup.ll

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -297,10 +297,11 @@ define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone {
297297
; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16>
298298
; BUILD_VECTOR will have an i32 as its source). In that case, the operation is
299299
; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed.
300+
;
301+
; *However*, it is a "dup vD.4h, vN.h[2*idx]", since each i32 source lane spans two i16 lanes.
300302
define <4 x i16> @test_build_illegal(<4 x i32> %in) {
301303
; CHECK-LABEL: test_build_illegal:
302-
; CHECK: umov.s [[WTMP:w[0-9]+]], v0[3]
303-
; CHECK: dup.4h v0, [[WTMP]]
304+
; CHECK: dup.4h v0, v0[6]
304305
%val = extractelement <4 x i32> %in, i32 3
305306
%smallval = trunc i32 %val to i16
306307
%vec = insertelement <4x i16> undef, i16 %smallval, i32 3

0 commit comments

Comments
 (0)