
[AArch64] Prefer using DUP instead of INS where possible #138549

Status: Merged (1 commit, May 15, 2025)
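This patch changes AArch64 instruction selection so that copying a single vector lane into an otherwise undefined register uses the scalar DUP form (disassembled as "mov d0, v1.d[1]" and similar) instead of an INS into lane 0 of an IMPLICIT_DEF ("mov v0.d[0], v1.d[1]"). The scalar DUP writes the whole destination register, so it does not depend on the previous contents of that register. A minimal sketch of the kind of code affected (a reduced example written for this summary, not a test from the PR; exact register numbers depend on allocation):

  define double @extract_lane1(<2 x i64> %v) {
    ; extract the high lane and reinterpret it as a double
    %e = extractelement <2 x i64> %v, i64 1
    %f = bitcast i64 %e to double
    ret double %f
  }

  ; before this patch (INSvi64lane into an implicit-def), roughly:
  ;   mov v0.d[0], v0.d[1]
  ; after this patch (DUPi64), roughly:
  ;   mov d0, v0.d[1]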
40 changes: 28 additions & 12 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -7358,7 +7358,8 @@ def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))),

// Patterns for i8/i16 -> v2i32/v4i16 lane moves via insert and extract that go via i32.
multiclass Neon_INS_elt_ext_pattern<ValueType VT128, ValueType VT64, ValueType OutVT,
Instruction INS, SDNodeXForm VecIndexMult> {
Instruction INS, Instruction DUP, SubRegIndex DUPSub,
SDNodeXForm VecIndexMult> {
// VT64->OutVT
def : Pat<(OutVT (vector_insert (OutVT V64:$src),
(i32 (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))),
@@ -7369,8 +7370,10 @@ multiclass Neon_INS_elt_ext_pattern<ValueType VT128, ValueType VT64, ValueType O
dsub)>;
def : Pat<(OutVT (scalar_to_vector (i32 (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))))),
(EXTRACT_SUBREG
(INS (IMPLICIT_DEF), 0,
(INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immn),
(VT128 (SUBREG_TO_REG
(i64 0),
(DUP (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immn),
DUPSub)),
dsub)>;

// VT128->OutVT
@@ -7383,25 +7386,38 @@ multiclass Neon_INS_elt_ext_pattern<ValueType VT128, ValueType VT64, ValueType O
dsub)>;
def : Pat<(OutVT (scalar_to_vector (i32 (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))))),
(EXTRACT_SUBREG
(INS (IMPLICIT_DEF), 0, V128:$Rn, imm:$Immn),
(VT128 (SUBREG_TO_REG
(i64 0),
(DUP V128:$Rn, imm:$Immn),
DUPSub)),
dsub)>;
}

defm : Neon_INS_elt_ext_pattern<v16i8, v8i8, v4i16, INSvi8lane, VecIndex_x2>;
defm : Neon_INS_elt_ext_pattern<v16i8, v8i8, v2i32, INSvi8lane, VecIndex_x4>;
defm : Neon_INS_elt_ext_pattern<v8i16, v4i16, v2i32, INSvi16lane, VecIndex_x2>;
defm : Neon_INS_elt_ext_pattern<v16i8, v8i8, v4i16, INSvi8lane, DUPi8, bsub, VecIndex_x2>;
defm : Neon_INS_elt_ext_pattern<v16i8, v8i8, v2i32, INSvi8lane, DUPi8, bsub, VecIndex_x4>;
defm : Neon_INS_elt_ext_pattern<v8i16, v4i16, v2i32, INSvi16lane, DUPi16, hsub, VecIndex_x2>;

// bitcast of an extract
// f32 bitcast(vector_extract(v4i32 src, lane)) -> EXTRACT_SUBREG(INSvi32lane(-, 0, src, lane))
def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, imm:$Immd)))),
(EXTRACT_SUBREG (INSvi32lane (IMPLICIT_DEF), 0, V128:$src, imm:$Immd), ssub)>;
// f32 bitcast(vector_extract(v4i32 src, 0)) -> EXTRACT_SUBREG(src)
def : Pat<(f32 (bitconvert (i32 (vector_extract v16i8:$src, (i64 0))))),
(EXTRACT_SUBREG V128:$src, bsub)>;
def : Pat<(f32 (bitconvert (i32 (vector_extract v8i16:$src, (i64 0))))),
(EXTRACT_SUBREG V128:$src, hsub)>;
def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, (i64 0))))),
(EXTRACT_SUBREG V128:$src, ssub)>;
def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, imm:$Immd)))),
(EXTRACT_SUBREG (INSvi64lane (IMPLICIT_DEF), 0, V128:$src, imm:$Immd), dsub)>;
def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, (i64 0))))),
(EXTRACT_SUBREG V128:$src, dsub)>;

// f32 bitcast(vector_extract(v4i32 src, lane)) -> DUPi32(src, lane)
def : Pat<(f32 (bitconvert (i32 (vector_extract v16i8:$src, imm:$Immd)))),
(EXTRACT_SUBREG (v16i8 (SUBREG_TO_REG (i64 0), (DUPi8 V128:$src, imm:$Immd), bsub)), ssub)>;
def : Pat<(f32 (bitconvert (i32 (vector_extract v8i16:$src, imm:$Immd)))),
(EXTRACT_SUBREG (v8i16 (SUBREG_TO_REG (i64 0), (DUPi16 V128:$src, imm:$Immd), hsub)), ssub)>;
def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, imm:$Immd)))),
(DUPi32 V128:$src, imm:$Immd)>;
def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, imm:$Immd)))),
(DUPi64 V128:$src, imm:$Immd)>;

// Floating point vector extractions are codegen'd as either a sequence of
// subregister extractions, or a MOV (aka DUP here) if
// the lane number is anything other than zero.
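The new bitcast-of-extract patterns above each boil down to a single DUP. A hedged sketch of the v4i32 case (a hand-written example, not from the PR; registers depend on allocation):

  define float @f32_from_lane2(<4 x i32> %v) {
    ; reinterpret lane 2 as a float
    %e = extractelement <4 x i32> %v, i64 2
    %f = bitcast i32 %e to float
    ret float %f
  }

  ; previously, roughly: mov v0.s[0], v0.s[2]   (INSvi32lane into an implicit-def)
  ; now, roughly:        mov s0, v0.s[2]        (DUPi32)

The v16i8 and v8i16 variants still wrap the DUP in a SUBREG_TO_REG to produce the f32 result from the b/h scalar, but the lane copy itself becomes a DUP, which is what shows up as the "mov b2, v0.b[0]"-style changes in the test updates below.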
10 changes: 2 additions & 8 deletions llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3459,16 +3459,10 @@ let Predicates = [HasSVE_or_SME] in {
// Alternative case where insertelement is just scalar_to_vector rather than vector_insert.
def : Pat<(v1f64 (scalar_to_vector
(f64 (vector_extract nxv2f64:$vec, VectorIndexD:$index)))),
(EXTRACT_SUBREG
(INSvi64lane (IMPLICIT_DEF), (i64 0),
(EXTRACT_SUBREG nxv2f64:$vec, zsub), VectorIndexD:$index),
dsub)>;
(DUPi64 (EXTRACT_SUBREG nxv2f64:$vec, zsub), VectorIndexD:$index)>;
def : Pat<(v1i64 (scalar_to_vector
(i64 (vector_extract nxv2i64:$vec, VectorIndexD:$index)))),
(EXTRACT_SUBREG
(INSvi64lane (IMPLICIT_DEF), (i64 0),
(EXTRACT_SUBREG nxv2i64:$vec, zsub), VectorIndexD:$index),
dsub)>;
(DUPi64 (EXTRACT_SUBREG nxv2i64:$vec, zsub), VectorIndexD:$index)>;
} // End HasNEON

let Predicates = [HasNEON] in {
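For the SVE patterns above, scalar_to_vector of an extracted element now selects a single DUPi64 on the low 128 bits (zsub) of the Z register instead of an INS plus subregister extract. A minimal sketch (hypothetical test assuming +sve is enabled; registers depend on allocation):

  define <1 x double> @sve_lane_to_v1f64(<vscale x 2 x double> %v) {
    ; take lane 1 of the scalable vector and return it as a <1 x double>
    %e = extractelement <vscale x 2 x double> %v, i64 1
    %r = insertelement <1 x double> poison, double %e, i64 0
    ret <1 x double> %r
  }

  ; previously, roughly: mov v0.d[0], v0.d[1]
  ; now, roughly:        mov d0, v0.d[1]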
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/arm64-arith-saturating.ll
@@ -193,7 +193,7 @@ define i32 @vqmovnd_u(<2 x i64> %b) nounwind readnone {
define i32 @uqxtn_ext(<4 x i32> noundef %a, <4 x i32> noundef %b, i32 %c, float %d, <2 x i64> %e) {
; CHECK-LABEL: uqxtn_ext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov v0.d[0], v3.d[1]
; CHECK-NEXT: mov d0, v3.d[1]
; CHECK-NEXT: uqxtn s0, d0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
@@ -219,7 +219,7 @@ entry:
define <4 x i32> @sqxtun_insext(<4 x i32> noundef %a, <2 x i64> %e) {
; CHECK-LABEL: sqxtun_insext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov v1.d[0], v1.d[1]
; CHECK-NEXT: mov d1, v1.d[1]
; CHECK-NEXT: sqxtun s1, d1
; CHECK-NEXT: mov v0.s[3], v1.s[0]
; CHECK-NEXT: ret
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AArch64/bitcast-extend.ll
@@ -70,8 +70,8 @@ define <4 x i64> @z_i32_v4i64(i32 %x) {
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov s0, w0
; CHECK-SD-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-SD-NEXT: mov v2.b[0], v0.b[0]
; CHECK-SD-NEXT: mov v3.b[0], v0.b[2]
; CHECK-SD-NEXT: mov b2, v0.b[0]
; CHECK-SD-NEXT: mov b3, v0.b[2]
; CHECK-SD-NEXT: mov v2.b[4], v0.b[1]
; CHECK-SD-NEXT: mov v3.b[4], v0.b[3]
; CHECK-SD-NEXT: ushll v0.2d, v2.2s, #0
@@ -172,8 +172,8 @@ define <4 x i64> @s_i32_v4i64(i32 %x) {
; CHECK-SD-LABEL: s_i32_v4i64:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov s0, w0
; CHECK-SD-NEXT: mov v1.b[0], v0.b[0]
; CHECK-SD-NEXT: mov v2.b[0], v0.b[2]
; CHECK-SD-NEXT: mov b1, v0.b[0]
; CHECK-SD-NEXT: mov b2, v0.b[2]
; CHECK-SD-NEXT: mov v1.b[4], v0.b[1]
; CHECK-SD-NEXT: mov v2.b[4], v0.b[3]
; CHECK-SD-NEXT: ushll v0.2d, v1.2s, #0
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
@@ -5,7 +5,7 @@
define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECKLE-LABEL: test_reconstructshuffle:
; CHECKLE: // %bb.0:
; CHECKLE-NEXT: mov v2.b[0], v0.b[3]
; CHECKLE-NEXT: mov b2, v0.b[3]
; CHECKLE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECKLE-NEXT: mov v2.b[2], v0.b[2]
; CHECKLE-NEXT: mov v2.b[4], v0.b[1]
@@ -21,7 +21,7 @@ define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECKBE-NEXT: rev64 v1.16b, v1.16b
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: mov v2.b[0], v0.b[3]
; CHECKBE-NEXT: mov b2, v0.b[3]
; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: mov v2.b[2], v0.b[2]
; CHECKBE-NEXT: mov v2.b[4], v0.b[1]
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll
@@ -347,9 +347,8 @@ define half @get_lane_64(<4 x half> %a) #0 {
; CHECK-LABEL: get_lane_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: umov w8, v0.h[2]
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: mov h0, v0.h[2]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0
; CHECK-NEXT: ret
entry:
%0 = bitcast <4 x half> %a to <4 x i16>
@@ -362,9 +361,8 @@ entry:
define half @get_lane_128(<8 x half> %a) #0 {
; CHECK-LABEL: get_lane_128:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: umov w8, v0.h[2]
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: mov h0, v0.h[2]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0
; CHECK-NEXT: ret
entry:
%0 = bitcast <8 x half> %a to <8 x i16>
112 changes: 56 additions & 56 deletions llvm/test/CodeGen/AArch64/itofp.ll
@@ -3443,10 +3443,10 @@ define <8 x double> @stofp_v8i8_v8f64(<8 x i8> %a) {
; CHECK-SD-LABEL: stofp_v8i8_v8f64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: mov v1.b[0], v0.b[0]
; CHECK-SD-NEXT: mov v2.b[0], v0.b[2]
; CHECK-SD-NEXT: mov v3.b[0], v0.b[4]
; CHECK-SD-NEXT: mov v4.b[0], v0.b[6]
; CHECK-SD-NEXT: mov b1, v0.b[0]
; CHECK-SD-NEXT: mov b2, v0.b[2]
; CHECK-SD-NEXT: mov b3, v0.b[4]
; CHECK-SD-NEXT: mov b4, v0.b[6]
; CHECK-SD-NEXT: mov v1.b[4], v0.b[1]
; CHECK-SD-NEXT: mov v2.b[4], v0.b[3]
; CHECK-SD-NEXT: mov v3.b[4], v0.b[5]
@@ -3492,10 +3492,10 @@ define <8 x double> @utofp_v8i8_v8f64(<8 x i8> %a) {
; CHECK-SD-LABEL: utofp_v8i8_v8f64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: mov v2.b[0], v0.b[0]
; CHECK-SD-NEXT: mov v3.b[0], v0.b[2]
; CHECK-SD-NEXT: mov v4.b[0], v0.b[4]
; CHECK-SD-NEXT: mov v5.b[0], v0.b[6]
; CHECK-SD-NEXT: mov b2, v0.b[0]
; CHECK-SD-NEXT: mov b3, v0.b[2]
; CHECK-SD-NEXT: mov b4, v0.b[4]
; CHECK-SD-NEXT: mov b5, v0.b[6]
; CHECK-SD-NEXT: movi d1, #0x0000ff000000ff
; CHECK-SD-NEXT: mov v2.b[4], v0.b[1]
; CHECK-SD-NEXT: mov v3.b[4], v0.b[3]
@@ -3538,14 +3538,14 @@ define <16 x double> @stofp_v16i8_v16f64(<16 x i8> %a) {
; CHECK-SD-LABEL: stofp_v16i8_v16f64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: mov v2.b[0], v0.b[0]
; CHECK-SD-NEXT: mov v3.b[0], v0.b[2]
; CHECK-SD-NEXT: mov v4.b[0], v0.b[4]
; CHECK-SD-NEXT: mov v5.b[0], v0.b[6]
; CHECK-SD-NEXT: mov v6.b[0], v1.b[0]
; CHECK-SD-NEXT: mov v7.b[0], v1.b[2]
; CHECK-SD-NEXT: mov v16.b[0], v1.b[4]
; CHECK-SD-NEXT: mov v17.b[0], v1.b[6]
; CHECK-SD-NEXT: mov b2, v0.b[0]
; CHECK-SD-NEXT: mov b3, v0.b[2]
; CHECK-SD-NEXT: mov b4, v0.b[4]
; CHECK-SD-NEXT: mov b5, v0.b[6]
; CHECK-SD-NEXT: mov b6, v1.b[0]
; CHECK-SD-NEXT: mov b7, v1.b[2]
; CHECK-SD-NEXT: mov b16, v1.b[4]
; CHECK-SD-NEXT: mov b17, v1.b[6]
; CHECK-SD-NEXT: mov v2.b[4], v0.b[1]
; CHECK-SD-NEXT: mov v3.b[4], v0.b[3]
; CHECK-SD-NEXT: mov v4.b[4], v0.b[5]
@@ -3622,15 +3622,15 @@ define <16 x double> @utofp_v16i8_v16f64(<16 x i8> %a) {
; CHECK-SD-LABEL: utofp_v16i8_v16f64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: mov v3.b[0], v0.b[0]
; CHECK-SD-NEXT: mov v4.b[0], v0.b[2]
; CHECK-SD-NEXT: mov v5.b[0], v0.b[4]
; CHECK-SD-NEXT: mov v6.b[0], v0.b[6]
; CHECK-SD-NEXT: mov b3, v0.b[0]
; CHECK-SD-NEXT: mov b4, v0.b[2]
; CHECK-SD-NEXT: mov b5, v0.b[4]
; CHECK-SD-NEXT: mov b6, v0.b[6]
; CHECK-SD-NEXT: movi d1, #0x0000ff000000ff
; CHECK-SD-NEXT: mov v7.b[0], v2.b[0]
; CHECK-SD-NEXT: mov v16.b[0], v2.b[2]
; CHECK-SD-NEXT: mov v17.b[0], v2.b[4]
; CHECK-SD-NEXT: mov v18.b[0], v2.b[6]
; CHECK-SD-NEXT: mov b7, v2.b[0]
; CHECK-SD-NEXT: mov b16, v2.b[2]
; CHECK-SD-NEXT: mov b17, v2.b[4]
; CHECK-SD-NEXT: mov b18, v2.b[6]
; CHECK-SD-NEXT: mov v3.b[4], v0.b[1]
; CHECK-SD-NEXT: mov v4.b[4], v0.b[3]
; CHECK-SD-NEXT: mov v5.b[4], v0.b[5]
@@ -3699,35 +3699,35 @@ define <32 x double> @stofp_v32i8_v32f64(<32 x i8> %a) {
; CHECK-SD-LABEL: stofp_v32i8_v32f64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: mov v5.b[0], v1.b[6]
; CHECK-SD-NEXT: mov v17.b[0], v1.b[4]
; CHECK-SD-NEXT: mov v20.b[0], v1.b[2]
; CHECK-SD-NEXT: mov v21.b[0], v1.b[0]
; CHECK-SD-NEXT: mov v18.b[0], v0.b[0]
; CHECK-SD-NEXT: mov v19.b[0], v0.b[6]
; CHECK-SD-NEXT: mov v22.b[0], v0.b[4]
; CHECK-SD-NEXT: mov b5, v1.b[6]
; CHECK-SD-NEXT: mov b17, v1.b[4]
; CHECK-SD-NEXT: mov b20, v1.b[2]
; CHECK-SD-NEXT: mov b21, v1.b[0]
; CHECK-SD-NEXT: mov b18, v0.b[0]
; CHECK-SD-NEXT: mov b19, v0.b[6]
; CHECK-SD-NEXT: mov b22, v0.b[4]
; CHECK-SD-NEXT: ext v16.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT: mov v2.b[0], v3.b[0]
; CHECK-SD-NEXT: mov v4.b[0], v3.b[2]
; CHECK-SD-NEXT: mov v6.b[0], v3.b[4]
; CHECK-SD-NEXT: mov v7.b[0], v3.b[6]
; CHECK-SD-NEXT: mov b2, v3.b[0]
; CHECK-SD-NEXT: mov b4, v3.b[2]
; CHECK-SD-NEXT: mov b6, v3.b[4]
; CHECK-SD-NEXT: mov b7, v3.b[6]
; CHECK-SD-NEXT: mov v5.b[4], v1.b[7]
; CHECK-SD-NEXT: mov v17.b[4], v1.b[5]
; CHECK-SD-NEXT: mov v20.b[4], v1.b[3]
; CHECK-SD-NEXT: mov v21.b[4], v1.b[1]
; CHECK-SD-NEXT: mov v19.b[4], v0.b[7]
; CHECK-SD-NEXT: mov v22.b[4], v0.b[5]
; CHECK-SD-NEXT: mov v18.b[4], v0.b[1]
; CHECK-SD-NEXT: mov v23.b[0], v16.b[0]
; CHECK-SD-NEXT: mov b23, v16.b[0]
; CHECK-SD-NEXT: mov v2.b[4], v3.b[1]
; CHECK-SD-NEXT: mov v4.b[4], v3.b[3]
; CHECK-SD-NEXT: mov v6.b[4], v3.b[5]
; CHECK-SD-NEXT: mov v7.b[4], v3.b[7]
; CHECK-SD-NEXT: mov v3.b[0], v0.b[2]
; CHECK-SD-NEXT: mov b3, v0.b[2]
; CHECK-SD-NEXT: shl v5.2s, v5.2s, #24
; CHECK-SD-NEXT: shl v17.2s, v17.2s, #24
; CHECK-SD-NEXT: shl v20.2s, v20.2s, #24
; CHECK-SD-NEXT: mov v24.b[0], v16.b[4]
; CHECK-SD-NEXT: mov b24, v16.b[4]
; CHECK-SD-NEXT: mov v23.b[4], v16.b[1]
; CHECK-SD-NEXT: shl v18.2s, v18.2s, #24
; CHECK-SD-NEXT: shl v19.2s, v19.2s, #24
@@ -3739,10 +3739,10 @@ define <32 x double> @stofp_v32i8_v32f64(<32 x i8> %a) {
; CHECK-SD-NEXT: shl v0.2s, v21.2s, #24
; CHECK-SD-NEXT: shl v4.2s, v6.2s, #24
; CHECK-SD-NEXT: shl v6.2s, v7.2s, #24
; CHECK-SD-NEXT: mov v7.b[0], v16.b[2]
; CHECK-SD-NEXT: mov b7, v16.b[2]
; CHECK-SD-NEXT: sshll v5.2d, v5.2s, #0
; CHECK-SD-NEXT: sshr v20.2s, v20.2s, #24
; CHECK-SD-NEXT: mov v21.b[0], v16.b[6]
; CHECK-SD-NEXT: mov b21, v16.b[6]
; CHECK-SD-NEXT: sshll v17.2d, v17.2s, #0
; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-SD-NEXT: shl v22.2s, v22.2s, #24
@@ -3869,25 +3869,25 @@ entry:
define <32 x double> @utofp_v32i8_v32f64(<32 x i8> %a) {
; CHECK-SD-LABEL: utofp_v32i8_v32f64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov v6.b[0], v1.b[6]
; CHECK-SD-NEXT: mov v7.b[0], v1.b[4]
; CHECK-SD-NEXT: mov b6, v1.b[6]
; CHECK-SD-NEXT: mov b7, v1.b[4]
; CHECK-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT: mov v16.b[0], v1.b[2]
; CHECK-SD-NEXT: mov v17.b[0], v1.b[0]
; CHECK-SD-NEXT: mov v19.b[0], v0.b[6]
; CHECK-SD-NEXT: mov v20.b[0], v0.b[4]
; CHECK-SD-NEXT: mov b16, v1.b[2]
; CHECK-SD-NEXT: mov b17, v1.b[0]
; CHECK-SD-NEXT: mov b19, v0.b[6]
; CHECK-SD-NEXT: mov b20, v0.b[4]
; CHECK-SD-NEXT: movi d5, #0x0000ff000000ff
; CHECK-SD-NEXT: mov v24.b[0], v0.b[2]
; CHECK-SD-NEXT: mov v25.b[0], v0.b[0]
; CHECK-SD-NEXT: mov b24, v0.b[2]
; CHECK-SD-NEXT: mov b25, v0.b[0]
; CHECK-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: mov v6.b[4], v1.b[7]
; CHECK-SD-NEXT: mov v7.b[4], v1.b[5]
; CHECK-SD-NEXT: mov v18.b[0], v3.b[0]
; CHECK-SD-NEXT: mov v21.b[0], v3.b[2]
; CHECK-SD-NEXT: mov v23.b[0], v3.b[4]
; CHECK-SD-NEXT: mov b18, v3.b[0]
; CHECK-SD-NEXT: mov b21, v3.b[2]
; CHECK-SD-NEXT: mov b23, v3.b[4]
; CHECK-SD-NEXT: mov v16.b[4], v1.b[3]
; CHECK-SD-NEXT: mov v17.b[4], v1.b[1]
; CHECK-SD-NEXT: mov v1.b[0], v3.b[6]
; CHECK-SD-NEXT: mov b1, v3.b[6]
; CHECK-SD-NEXT: mov v19.b[4], v0.b[7]
; CHECK-SD-NEXT: mov v20.b[4], v0.b[5]
; CHECK-SD-NEXT: mov v24.b[4], v0.b[3]
@@ -3905,15 +3905,15 @@ define <32 x double> @utofp_v32i8_v32f64(<32 x i8> %a) {
; CHECK-SD-NEXT: ushll v7.2d, v7.2s, #0
; CHECK-SD-NEXT: and v20.8b, v20.8b, v5.8b
; CHECK-SD-NEXT: ushll v16.2d, v16.2s, #0
; CHECK-SD-NEXT: mov v4.b[0], v2.b[0]
; CHECK-SD-NEXT: mov v22.b[0], v2.b[2]
; CHECK-SD-NEXT: mov b4, v2.b[0]
; CHECK-SD-NEXT: mov b22, v2.b[2]
; CHECK-SD-NEXT: ushll v17.2d, v17.2s, #0
; CHECK-SD-NEXT: ushll v0.2d, v3.2s, #0
; CHECK-SD-NEXT: mov v19.b[0], v2.b[4]
; CHECK-SD-NEXT: mov b19, v2.b[4]
; CHECK-SD-NEXT: ucvtf v6.2d, v6.2d
; CHECK-SD-NEXT: ucvtf v3.2d, v7.2d
; CHECK-SD-NEXT: ushll v20.2d, v20.2s, #0
; CHECK-SD-NEXT: mov v7.b[0], v2.b[6]
; CHECK-SD-NEXT: mov b7, v2.b[6]
; CHECK-SD-NEXT: ucvtf v16.2d, v16.2d
; CHECK-SD-NEXT: and v24.8b, v24.8b, v5.8b
; CHECK-SD-NEXT: ucvtf v17.2d, v17.2d
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/neon-bitcast.ll
@@ -555,7 +555,7 @@ define <2 x i8> @bitcast_i16_to_v2i8(i16 %word) {
; CHECK-LE-LABEL: bitcast_i16_to_v2i8:
; CHECK-LE: // %bb.0:
; CHECK-LE-NEXT: fmov s1, w0
; CHECK-LE-NEXT: mov v0.b[0], v1.b[0]
; CHECK-LE-NEXT: mov b0, v1.b[0]
; CHECK-LE-NEXT: mov v0.b[4], v1.b[1]
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
@@ -564,7 +564,7 @@ define <2 x i8> @bitcast_i16_to_v2i8(i16 %word) {
; CHECK-BE: // %bb.0:
; CHECK-BE-NEXT: fmov s0, w0
; CHECK-BE-NEXT: rev16 v0.16b, v0.16b
; CHECK-BE-NEXT: mov v1.b[0], v0.b[0]
; CHECK-BE-NEXT: mov b1, v0.b[0]
; CHECK-BE-NEXT: mov v1.b[4], v0.b[1]
; CHECK-BE-NEXT: rev64 v0.2s, v1.2s
; CHECK-BE-NEXT: ret