Skip to content

[AArch64][GlobalISel] Avoid generating inserts for undefs when selecting G_BUILD_VECTOR #84452

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Mar 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 26 additions & 7 deletions llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5934,13 +5934,16 @@ bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,

// Keep track of the last MI we inserted. Later on, we might be able to save
// a copy using it.
MachineInstr *PrevMI = nullptr;
MachineInstr *PrevMI = ScalarToVec;
for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
// Note that if we don't do a subregister copy, we can end up making an
// extra register.
PrevMI = &*emitLaneInsert(std::nullopt, DstVec, I.getOperand(i).getReg(),
i - 1, RB, MIB);
DstVec = PrevMI->getOperand(0).getReg();
Register OpReg = I.getOperand(i).getReg();
// Do not emit lane inserts for undef (G_IMPLICIT_DEF) elements.
if (!getOpcodeDef<GImplicitDef>(OpReg, MRI)) {
PrevMI = &*emitLaneInsert(std::nullopt, DstVec, OpReg, i - 1, RB, MIB);
DstVec = PrevMI->getOperand(0).getReg();
}
}

// If DstTy's size in bits is less than 128, then emit a subregister copy
Expand Down Expand Up @@ -5973,11 +5976,27 @@ bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
RegOp.setReg(Reg);
RBI.constrainGenericRegister(DstReg, *RC, MRI);
} else {
// We don't need a subregister copy. Save a copy by re-using the
// destination register on the final insert.
assert(PrevMI && "PrevMI was null?");
// At this point either every element after the first one is undef, or at
// least one element after the first one is not undef. In the first case
// PrevMI is still ScalarToVec, so we need to constrain the output register
// ourselves: we may have generated an INSERT_SUBREG, which is a generic
// operation for which the output regclass cannot be automatically chosen.
//
// In the second case there is no need to do this, as the final insert may
// be an instruction like INSvi32gpr where the regclass can be automatically
// chosen.
//
// In both cases, we save a copy by re-using the destination register on the
// final insert.
PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);

Register DstReg = PrevMI->getOperand(0).getReg();
if (PrevMI == ScalarToVec && DstReg.isVirtual()) {
const TargetRegisterClass *RC =
getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
RBI.constrainGenericRegister(DstReg, *RC, MRI);
}
}

I.eraseFromParent();
Expand Down
6 changes: 1 addition & 5 deletions llvm/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir
Original file line number Diff line number Diff line change
Expand Up @@ -266,12 +266,8 @@ body: |
; CHECK-LABEL: name: undef_elts_different_regbanks
; CHECK: liveins: $w0
; CHECK: %val:gpr32all = COPY $w0
; CHECK: %undef:gpr32 = IMPLICIT_DEF
; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], %val, %subreg.ssub
; CHECK: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 1, %undef
; CHECK: [[INSvi32gpr1:%[0-9]+]]:fpr128 = INSvi32gpr [[INSvi32gpr]], 2, %undef
; CHECK: %bv:fpr128 = INSvi32gpr [[INSvi32gpr1]], 3, %undef
; CHECK: %bv:fpr128 = INSERT_SUBREG [[DEF]], %val, %subreg.ssub
; CHECK: $q0 = COPY %bv
; CHECK: RET_ReallyLR implicit $q0
%val:gpr(s32) = COPY $w0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,18 @@ body: |
; CHECK: liveins: $d0
; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
; CHECK: [[DEF:%[0-9]+]]:gpr32 = IMPLICIT_DEF
; CHECK: [[DEF1:%[0-9]+]]:gpr32 = IMPLICIT_DEF
; CHECK: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], [[DEF]], %subreg.ssub
; CHECK: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 1, [[DEF1]]
; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY [[INSvi32gpr]].dsub
; CHECK: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[DEF]], %subreg.ssub
; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY [[INSERT_SUBREG]].dsub
; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
; CHECK: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
; CHECK: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], [[COPY]], %subreg.dsub
; CHECK: [[DEF3:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF3]], [[COPY]], %subreg.dsub
; CHECK: [[DEF4:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK: [[INSERT_SUBREG2:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF4]], [[COPY1]], %subreg.dsub
; CHECK: [[INSERT_SUBREG2:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF3]], [[COPY1]], %subreg.dsub
; CHECK: [[INSvi64lane:%[0-9]+]]:fpr128 = INSvi64lane [[INSERT_SUBREG1]], 1, [[INSERT_SUBREG2]], 0
; CHECK: [[DEF5:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK: [[INSERT_SUBREG3:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF5]], [[LDRDui]], %subreg.dsub
; CHECK: [[DEF4:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK: [[INSERT_SUBREG3:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF4]], [[LDRDui]], %subreg.dsub
; CHECK: [[TBLv16i8One:%[0-9]+]]:fpr128 = TBLv16i8One [[INSvi64lane]], [[INSERT_SUBREG3]]
; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY [[TBLv16i8One]].dsub
; CHECK: $d0 = COPY [[COPY2]]
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
; CHECK-GI-NEXT: and w8, w8, w10
; CHECK-GI-NEXT: orr w8, w9, w8
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%neg = xor <1 x i32> %C, <i32 -1>
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
; CHECK-GI-NEXT: bic w8, w10, w8
; CHECK-GI-NEXT: orr w8, w9, w8
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%and = and <1 x i32> %C, %B
Expand Down
6 changes: 0 additions & 6 deletions llvm/test/CodeGen/AArch64/abs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,6 @@ define <1 x i32> @abs_v1i32(<1 x i32> %a){
; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: eor w8, w8, w9
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
Expand Down Expand Up @@ -308,11 +307,6 @@ define <3 x i8> @abs_v3i8(<3 x i8> %a){
; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
; CHECK-GI-NEXT: fmov s1, w2
; CHECK-GI-NEXT: mov v0.b[2], v1.b[0]
; CHECK-GI-NEXT: mov v0.b[3], v0.b[0]
; CHECK-GI-NEXT: mov v0.b[4], v0.b[0]
; CHECK-GI-NEXT: mov v0.b[5], v0.b[0]
; CHECK-GI-NEXT: mov v0.b[6], v0.b[0]
; CHECK-GI-NEXT: mov v0.b[7], v0.b[0]
; CHECK-GI-NEXT: abs v0.8b, v0.8b
; CHECK-GI-NEXT: umov w0, v0.b[0]
; CHECK-GI-NEXT: umov w1, v0.b[1]
Expand Down
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/AArch64/arm64-dup.ll
Original file line number Diff line number Diff line change
Expand Up @@ -373,11 +373,9 @@ define <4 x i16> @test_build_illegal(<4 x i32> %in) {
;
; CHECK-GI-LABEL: test_build_illegal:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov.h v1[1], v0[0]
; CHECK-GI-NEXT: mov s0, v0[3]
; CHECK-GI-NEXT: mov.h v1[2], v0[0]
; CHECK-GI-NEXT: mov.h v1[3], v0[0]
; CHECK-GI-NEXT: fmov d0, d1
; CHECK-GI-NEXT: mov.h v0[3], v0[0]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%val = extractelement <4 x i32> %in, i32 3
%smallval = trunc i32 %val to i16
Expand Down
45 changes: 8 additions & 37 deletions llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1346,41 +1346,26 @@ define <2 x i32> @scalar_to_vector.v2i32(i32 %a) {
; CHECK-GI-LABEL: scalar_to_vector.v2i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%b = insertelement <2 x i32> undef, i32 %a, i32 0
ret <2 x i32> %b
}

define <4 x i32> @scalar_to_vector.v4i32(i32 %a) {
; CHECK-SD-LABEL: scalar_to_vector.v4i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov s0, w0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: scalar_to_vector.v4i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w8
; CHECK-GI-NEXT: mov v0.s[3], w8
; CHECK-GI-NEXT: ret
; CHECK-LABEL: scalar_to_vector.v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ret
%b = insertelement <4 x i32> undef, i32 %a, i32 0
ret <4 x i32> %b
}

define <2 x i64> @scalar_to_vector.v2i64(i64 %a) {
; CHECK-SD-LABEL: scalar_to_vector.v2i64:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov d0, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: scalar_to_vector.v2i64:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov d0, x0
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: ret
; CHECK-LABEL: scalar_to_vector.v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ret
%b = insertelement <2 x i64> undef, i64 %a, i32 0
ret <2 x i64> %b
}
Expand Down Expand Up @@ -1900,14 +1885,6 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 {
; CHECK-GI-NEXT: mov v0.b[5], v6.b[0]
; CHECK-GI-NEXT: mov v0.b[6], v7.b[0]
; CHECK-GI-NEXT: mov v0.b[7], v16.b[0]
; CHECK-GI-NEXT: mov v0.b[8], v0.b[0]
; CHECK-GI-NEXT: mov v0.b[9], v0.b[0]
; CHECK-GI-NEXT: mov v0.b[10], v0.b[0]
; CHECK-GI-NEXT: mov v0.b[11], v0.b[0]
; CHECK-GI-NEXT: mov v0.b[12], v0.b[0]
; CHECK-GI-NEXT: mov v0.b[13], v0.b[0]
; CHECK-GI-NEXT: mov v0.b[14], v0.b[0]
; CHECK-GI-NEXT: mov v0.b[15], v0.b[0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: ret
entry:
Expand Down Expand Up @@ -2123,10 +2100,6 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 {
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI131_0]
; CHECK-GI-NEXT: mov v0.h[2], v3.h[0]
; CHECK-GI-NEXT: mov v0.h[3], v4.h[0]
; CHECK-GI-NEXT: mov v0.h[4], v0.h[0]
; CHECK-GI-NEXT: mov v0.h[5], v0.h[0]
; CHECK-GI-NEXT: mov v0.h[6], v0.h[0]
; CHECK-GI-NEXT: mov v0.h[7], v0.h[0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: ret
entry:
Expand Down Expand Up @@ -2266,8 +2239,6 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 {
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov v0.s[1], v2.s[0]
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI135_0]
; CHECK-GI-NEXT: mov v0.s[2], v0.s[0]
; CHECK-GI-NEXT: mov v0.s[3], v0.s[0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: ret
entry:
Expand Down
2 changes: 0 additions & 2 deletions llvm/test/CodeGen/AArch64/bitcast.ll
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ define <4 x i16> @foo1(<2 x i32> %a) {
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov w8, #58712 // =0xe558
; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: zip1 v0.2s, v1.2s, v0.2s
; CHECK-GI-NEXT: rev32 v0.4h, v0.4h
; CHECK-GI-NEXT: ret
Expand All @@ -42,7 +41,6 @@ define <4 x i16> @foo2(<2 x i32> %a) {
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov w8, #712 // =0x2c8
; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: mov v1.s[1], w8
; CHECK-GI-NEXT: zip1 v0.2s, v1.2s, v0.2s
; CHECK-GI-NEXT: rev32 v0.4h, v0.4h
; CHECK-GI-NEXT: ret
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AArch64/bswap.ll
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,6 @@ define <1 x i32> @bswap_v1i32(<1 x i32> %a){
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: rev w8, w8
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
entry:
Expand Down
26 changes: 12 additions & 14 deletions llvm/test/CodeGen/AArch64/fabs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -160,21 +160,20 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
;
; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[4]
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v1.h[1], v2.h[0]
; CHECK-GI-NOFP16-NEXT: fabs v2.4s, v3.4s
; CHECK-GI-NOFP16-NEXT: mov v1.h[2], v0.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v2.4s
; CHECK-GI-NOFP16-NEXT: mov v1.h[3], v0.h[0]
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4]
; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[6]
; CHECK-GI-NOFP16-NEXT: fabs v1.4s, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v4.h[0]
; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3]
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[0]
; CHECK-GI-NOFP16-NEXT: fabs v1.4s, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NOFP16-NEXT: fabs v1.4s, v2.4s
; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v3.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v0.h[3], v4.h[0]
Expand All @@ -183,7 +182,6 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
; CHECK-GI-NOFP16-NEXT: mov v0.h[7], v0.h[0]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fabs_v7f16:
Expand Down
62 changes: 28 additions & 34 deletions llvm/test/CodeGen/AArch64/faddsub.ll
Original file line number Diff line number Diff line change
Expand Up @@ -186,26 +186,24 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-GI-NOFP16-LABEL: fadd_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4]
; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov h4, v1.h[4]
; CHECK-GI-NOFP16-NEXT: mov h5, v1.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtl v6.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v7.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[4]
; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov h6, v1.h[4]
; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[5]
; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6]
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0]
; CHECK-GI-NOFP16-NEXT: fadd v2.4s, v2.4s, v3.4s
; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0]
; CHECK-GI-NOFP16-NEXT: fadd v3.4s, v6.4s, v7.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[0]
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v3.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[0]
; CHECK-GI-NOFP16-NEXT: mov v4.h[3], v0.h[0]
; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v2.4s
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v3.h[0]
; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[0]
; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h
; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v2.4s, v3.4s
Expand All @@ -217,7 +215,6 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
; CHECK-GI-NOFP16-NEXT: mov v0.h[7], v0.h[0]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fadd_v7f16:
Expand Down Expand Up @@ -538,26 +535,24 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
;
; CHECK-GI-NOFP16-LABEL: fsub_v7f16:
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4]
; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov h4, v1.h[4]
; CHECK-GI-NOFP16-NEXT: mov h5, v1.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtl v6.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v7.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[4]
; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[5]
; CHECK-GI-NOFP16-NEXT: mov h6, v1.h[4]
; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[5]
; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6]
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0]
; CHECK-GI-NOFP16-NEXT: fsub v2.4s, v2.4s, v3.4s
; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0]
; CHECK-GI-NOFP16-NEXT: fsub v3.4s, v6.4s, v7.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[0]
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v1.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v3.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[3], v0.h[0]
; CHECK-GI-NOFP16-NEXT: mov v4.h[3], v0.h[0]
; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v2.4s
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v3.h[0]
; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v1.h[0]
; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[3]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v6.4h
; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NOFP16-NEXT: fsub v1.4s, v2.4s, v3.4s
Expand All @@ -569,7 +564,6 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[2]
; CHECK-GI-NOFP16-NEXT: mov v0.h[5], v2.h[0]
; CHECK-GI-NOFP16-NEXT: mov v0.h[6], v1.h[0]
; CHECK-GI-NOFP16-NEXT: mov v0.h[7], v0.h[0]
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fsub_v7f16:
Expand Down
Loading