Commit d320d8a

Author: git apple-llvm automerger (committed)
Merge commit '41d8149ee972' from llvm.org/main into apple/main
2 parents 853f18e + 41d8149 commit d320d8a

5 files changed, +162 -261 lines changed


llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 38 additions & 17 deletions
@@ -17217,7 +17217,7 @@ static SDValue PerformBITCASTCombine(SDNode *N,
 }
 
 // Some combines for the MVETrunc truncations legalizer helper. Also lowers the
-// node into a buildvector after legalizeOps.
+// node into stack operations after legalizeOps.
 SDValue ARMTargetLowering::PerformMVETruncCombine(
     SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -17265,7 +17265,14 @@ SDValue ARMTargetLowering::PerformMVETruncCombine(
     }
   }
 
-  auto LowerToBuildVec = [&]() {
+  // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
+  // truncate to a buildvector to allow the generic optimisations to kick in.
+  if (all_of(N->ops(), [](SDValue Op) {
+        return Op.getOpcode() == ISD::BUILD_VECTOR ||
+               Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
+               (Op.getOpcode() == ISD::BITCAST &&
+                Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
+      })) {
     SmallVector<SDValue, 8> Extracts;
     for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
       SDValue O = N->getOperand(Op);
@@ -17276,26 +17283,40 @@ SDValue ARMTargetLowering::PerformMVETruncCombine(
       }
     }
     return DAG.getBuildVector(VT, DL, Extracts);
-  };
-
-  // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
-  // truncate to a buildvector to allow the generic optimisations to kick in.
-  if (all_of(N->ops(), [](SDValue Op) {
-        return Op.getOpcode() == ISD::BUILD_VECTOR ||
-               Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
-               (Op.getOpcode() == ISD::BITCAST &&
-                Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
-      }))
-    return LowerToBuildVec();
+  }
 
   // If we are late in the legalization process and nothing has optimised
-  // the trunc to anything better lower it to a series of extracts and a
-  // buildvector.
+  // the trunc to anything better, lower it to a stack store and reload,
+  // performing the truncation whilst keeping the lanes in the correct order:
+  // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
-  SDValue BuildVec = LowerToBuildVec();
-  return LowerBUILD_VECTOR(BuildVec, DCI.DAG, Subtarget);
+  SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
+  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  int NumIns = N->getNumOperands();
+  assert((NumIns == 2 || NumIns == 4) &&
+         "Expected 2 or 4 inputs to an MVETrunc");
+  EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+  if (N->getNumOperands() == 4)
+    StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
+
+  SmallVector<SDValue> Chains;
+  for (int I = 0; I < NumIns; I++) {
+    SDValue Ptr = DAG.getNode(
+        ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
+        DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
+    MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
+        DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
+    SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
+                                   Ptr, MPI, StoreVT, Align(4));
+    Chains.push_back(Ch);
+  }
+
+  SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  MachinePointerInfo MPI =
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
+  return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
 }
 
 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
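
The new lowering replaces the buildvector-of-extracts fallback with truncating stores of each input into a 16-byte stack temporary followed by a single full-width reload, as the VSTRH.32/VLDRW.32 comment above describes. Below is a minimal standalone sketch (plain C++, not LLVM SelectionDAG code; all names and values are illustrative assumptions) of why that works for the two-input case: truncating each 32-bit lane to 16 bits and storing the two inputs at byte offsets 0 and 8 of the slot leaves the narrow lanes in the right order for one wide reload.

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // Two "v4i32" inputs to be truncated into one "v8i16", lanes a[0..3] then b[0..3].
  uint32_t a[4] = {0x11110001u, 0x22220002u, 0x33330003u, 0x44440004u};
  uint32_t b[4] = {0x55550005u, 0x66660006u, 0x77770007u, 0x88880008u};

  // 16-byte stack slot, mirroring DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4)).
  uint8_t slot[16];

  // Truncating stores: keep the low 16 bits of each lane, input 0 at offset 0 and
  // input 1 at offset 8 (the I * 16 / NumIns offsets for NumIns == 2), in the spirit
  // of VSTRH.32 a, [sp]; VSTRH.32 b, [sp, #8].
  for (int i = 0; i < 4; i++) {
    uint16_t lo = static_cast<uint16_t>(a[i]);
    std::memcpy(slot + 0 + 2 * i, &lo, sizeof(lo));
  }
  for (int i = 0; i < 4; i++) {
    uint16_t lo = static_cast<uint16_t>(b[i]);
    std::memcpy(slot + 8 + 2 * i, &lo, sizeof(lo));
  }

  // Full-width reload, in the spirit of VLDRW.32 [sp]: the truncated lanes come back
  // in the original order.
  uint16_t result[8];
  std::memcpy(result, slot, sizeof(result));
  for (int i = 0; i < 8; i++)
    std::printf("%#x ", result[i]); // prints 0x1 0x2 ... 0x8
  std::printf("\n");
  return 0;
}

The regenerated tests below show exactly this shape in the emitted assembly: a 16-byte .pad, truncating vector stores at the computed offsets, and one vldrw.u32 reload.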

llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll

Lines changed: 71 additions & 88 deletions
@@ -275,105 +275,88 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @ext_add_ashr_trunc_i8i32(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: ext_add_ashr_trunc_i8i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vmov.u8 r0, q1[14]
-; CHECK-NEXT: vmov.u8 r1, q1[12]
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u8 r0, q1[15]
-; CHECK-NEXT: vmov.u8 r1, q1[13]
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: vmov.u8 r1, q1[14]
+; CHECK-NEXT: vmov.u8 r2, q1[12]
+; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q1[15]
+; CHECK-NEXT: vmov.u8 r2, q1[13]
 ; CHECK-NEXT: vmov.i32 q2, #0xff
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT: vmov.u8 r0, q0[14]
-; CHECK-NEXT: vmov.u8 r1, q0[12]
+; CHECK-NEXT: vmov q3[3], q3[1], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q0[14]
+; CHECK-NEXT: vmov.u8 r2, q0[12]
 ; CHECK-NEXT: vand q3, q3, q2
-; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT: vmov.u8 r0, q0[15]
-; CHECK-NEXT: vmov.u8 r1, q0[13]
-; CHECK-NEXT: vmov.u8 r4, q1[6]
-; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
-; CHECK-NEXT: vmov.u8 r0, q1[2]
+; CHECK-NEXT: vmov q4[2], q4[0], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q0[15]
+; CHECK-NEXT: vmov.u8 r2, q0[13]
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vmov q4[3], q4[1], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q1[10]
 ; CHECK-NEXT: vmovlb.s8 q4, q4
-; CHECK-NEXT: vmov.u8 r1, q1[0]
+; CHECK-NEXT: vmov.u8 r2, q1[8]
 ; CHECK-NEXT: vmovlb.s16 q4, q4
-; CHECK-NEXT: vmov.u8 r5, q1[4]
 ; CHECK-NEXT: vadd.i32 q3, q4, q3
 ; CHECK-NEXT: vshr.u32 q3, q3, #1
-; CHECK-NEXT: vmov lr, r12, d7
-; CHECK-NEXT: vmov r3, r2, d6
-; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
-; CHECK-NEXT: vmov.u8 r0, q1[3]
-; CHECK-NEXT: vmov.u8 r1, q1[1]
-; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
-; CHECK-NEXT: vmov.u8 r0, q0[2]
-; CHECK-NEXT: vmov.u8 r1, q0[0]
+; CHECK-NEXT: vstrb.32 q3, [r0, #12]
+; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q1[11]
+; CHECK-NEXT: vmov.u8 r2, q1[9]
+; CHECK-NEXT: vmov q3[3], q3[1], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q0[10]
+; CHECK-NEXT: vmov.u8 r2, q0[8]
 ; CHECK-NEXT: vand q3, q3, q2
-; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
-; CHECK-NEXT: vmov.u8 r0, q0[3]
-; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
+; CHECK-NEXT: vmov q4[2], q4[0], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q0[11]
+; CHECK-NEXT: vmov.u8 r2, q0[9]
+; CHECK-NEXT: vmov q4[3], q4[1], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q1[6]
 ; CHECK-NEXT: vmovlb.s8 q4, q4
+; CHECK-NEXT: vmov.u8 r2, q1[4]
 ; CHECK-NEXT: vmovlb.s16 q4, q4
 ; CHECK-NEXT: vadd.i32 q3, q4, q3
-; CHECK-NEXT: vmov q4[2], q4[0], r5, r4
-; CHECK-NEXT: vmov.u8 r4, q1[7]
-; CHECK-NEXT: vmov.u8 r5, q1[5]
-; CHECK-NEXT: vmov q4[3], q4[1], r5, r4
-; CHECK-NEXT: vmov.u8 r4, q0[6]
-; CHECK-NEXT: vmov.u8 r5, q0[4]
 ; CHECK-NEXT: vshr.u32 q3, q3, #1
-; CHECK-NEXT: vmov q5[2], q5[0], r5, r4
-; CHECK-NEXT: vmov.u8 r4, q0[7]
-; CHECK-NEXT: vmov.u8 r5, q0[5]
-; CHECK-NEXT: vand q4, q4, q2
-; CHECK-NEXT: vmov q5[3], q5[1], r5, r4
-; CHECK-NEXT: vmov.u8 r4, q0[10]
-; CHECK-NEXT: vmovlb.s8 q5, q5
-; CHECK-NEXT: vmov.u8 r5, q0[8]
-; CHECK-NEXT: vmovlb.s16 q5, q5
-; CHECK-NEXT: vmov r1, r0, d6
-; CHECK-NEXT: vadd.i32 q4, q5, q4
-; CHECK-NEXT: vmov q5[2], q5[0], r5, r4
-; CHECK-NEXT: vmov.u8 r4, q0[11]
-; CHECK-NEXT: vmov.u8 r5, q0[9]
-; CHECK-NEXT: vmov q5[3], q5[1], r5, r4
-; CHECK-NEXT: vmov.8 q0[0], r1
-; CHECK-NEXT: vmov.u8 r4, q1[10]
-; CHECK-NEXT: vmov.u8 r5, q1[8]
-; CHECK-NEXT: vmov q6[2], q6[0], r5, r4
-; CHECK-NEXT: vmov.8 q0[1], r0
-; CHECK-NEXT: vmov r0, r1, d7
-; CHECK-NEXT: vmov.u8 r4, q1[11]
-; CHECK-NEXT: vmov.u8 r5, q1[9]
-; CHECK-NEXT: vmov.8 q0[2], r0
-; CHECK-NEXT: vmov q6[3], q6[1], r5, r4
-; CHECK-NEXT: vshr.u32 q4, q4, #1
-; CHECK-NEXT: vmov.8 q0[3], r1
-; CHECK-NEXT: vmov r0, r1, d8
-; CHECK-NEXT: vand q1, q6, q2
-; CHECK-NEXT: vmovlb.s8 q2, q5
-; CHECK-NEXT: vmov.8 q0[4], r0
-; CHECK-NEXT: vmovlb.s16 q2, q2
-; CHECK-NEXT: vadd.i32 q1, q2, q1
-; CHECK-NEXT: vmov r4, r5, d9
-; CHECK-NEXT: vmov.8 q0[5], r1
-; CHECK-NEXT: vshr.u32 q1, q1, #1
-; CHECK-NEXT: vmov.8 q0[6], r4
-; CHECK-NEXT: vmov r0, r1, d3
-; CHECK-NEXT: vmov.8 q0[7], r5
-; CHECK-NEXT: vmov r4, r5, d2
-; CHECK-NEXT: vmov.8 q0[8], r4
-; CHECK-NEXT: vmov.8 q0[9], r5
-; CHECK-NEXT: vmov.8 q0[10], r0
-; CHECK-NEXT: vmov.8 q0[11], r1
-; CHECK-NEXT: vmov.8 q0[12], r3
-; CHECK-NEXT: vmov.8 q0[13], r2
-; CHECK-NEXT: vmov.8 q0[14], lr
-; CHECK-NEXT: vmov.8 q0[15], r12
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vstrb.32 q3, [r0, #8]
+; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q1[7]
+; CHECK-NEXT: vmov.u8 r2, q1[5]
+; CHECK-NEXT: vmov q3[3], q3[1], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q0[6]
+; CHECK-NEXT: vmov.u8 r2, q0[4]
+; CHECK-NEXT: vand q3, q3, q2
+; CHECK-NEXT: vmov q4[2], q4[0], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q0[7]
+; CHECK-NEXT: vmov.u8 r2, q0[5]
+; CHECK-NEXT: vmov q4[3], q4[1], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q1[2]
+; CHECK-NEXT: vmovlb.s8 q4, q4
+; CHECK-NEXT: vmov.u8 r2, q1[0]
+; CHECK-NEXT: vmovlb.s16 q4, q4
+; CHECK-NEXT: vadd.i32 q3, q4, q3
+; CHECK-NEXT: vshr.u32 q3, q3, #1
+; CHECK-NEXT: vstrb.32 q3, [r0, #4]
+; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q1[3]
+; CHECK-NEXT: vmov.u8 r2, q1[1]
+; CHECK-NEXT: vmov q3[3], q3[1], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q0[2]
+; CHECK-NEXT: vmov.u8 r2, q0[0]
+; CHECK-NEXT: vand q1, q3, q2
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT: vmov.u8 r1, q0[3]
+; CHECK-NEXT: vmov.u8 r2, q0[1]
+; CHECK-NEXT: vmov q2[3], q2[1], r2, r1
+; CHECK-NEXT: vmovlb.s8 q0, q2
+; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vshr.u32 q0, q0, #1
+; CHECK-NEXT: vstrb.32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: bx lr
 entry:
 %sa = sext <16 x i8> %a to <16 x i32>
 %sb = zext <16 x i8> %b to <16 x i32>
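
The rewritten CHECK lines above store the four truncated v4i32 chunks with vstrb.32 at stack offsets #12, #8, #4 and #0 before the single vldrw.u32 reload. A small sketch of that offset computation (plain C++, illustrative only), matching the I * 16 / NumIns expression in the new lowering:

#include <cstdio>

// Byte offset of input I within the 16-byte stack temporary, as computed by the
// new MVETrunc lowering's I * 16 / NumIns expression.
static int chunkOffset(int NumIns, int I) { return I * 16 / NumIns; }

int main() {
  const int cases[] = {2, 4}; // the two input counts the lowering asserts on
  for (int NumIns : cases)
    for (int I = 0; I < NumIns; I++)
      std::printf("NumIns=%d input %d -> stack offset #%d\n", NumIns, I,
                  chunkOffset(NumIns, I));
  // NumIns=4 gives offsets 0, 4, 8, 12 (the vstrb.32 offsets in the test above);
  // NumIns=2 gives offsets 0 and 8 (the vstrh.32 offsets in the scatter tests below).
  return 0;
}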

llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll

Lines changed: 10 additions & 17 deletions
@@ -362,23 +362,16 @@ entry:
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vmov r4, r5, d0
-; CHECK-NEXT: vmov.16 q2[0], r4
-; CHECK-NEXT: vmov lr, r12, d3
-; CHECK-NEXT: vmov r3, r2, d2
-; CHECK-NEXT: vldrb.u16 q1, [r1]
-; CHECK-NEXT: vmov r1, r4, d1
-; CHECK-NEXT: vmov.16 q2[1], r5
-; CHECK-NEXT: vmov.16 q2[2], r1
-; CHECK-NEXT: vmov.16 q2[3], r4
-; CHECK-NEXT: vmov.16 q2[4], r3
-; CHECK-NEXT: vmov.16 q2[5], r2
-; CHECK-NEXT: vmov.16 q2[6], lr
-; CHECK-NEXT: vmov.16 q2[7], r12
-; CHECK-NEXT: vstrh.16 q2, [r0, q1]
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: mov r2, sp
+; CHECK-NEXT: vstrh.32 q1, [r2, #8]
+; CHECK-NEXT: vstrh.32 q0, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vstrh.16 q1, [r0, q0]
+; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: bx lr
 entry:
 %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
 %offs.zext = zext <8 x i8> %offs to <8 x i32>

llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll

Lines changed: 20 additions & 65 deletions
@@ -374,38 +374,18 @@ entry:
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <16 x i8>* %offptr, <16 x i32> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vmov r4, r5, d0
-; CHECK-NEXT: vmov.8 q4[0], r4
-; CHECK-NEXT: vmov lr, r12, d7
-; CHECK-NEXT: vmov r3, r2, d6
-; CHECK-NEXT: vldrb.u8 q3, [r1]
-; CHECK-NEXT: vmov r1, r4, d1
-; CHECK-NEXT: vmov.8 q4[1], r5
-; CHECK-NEXT: vmov.8 q4[2], r1
-; CHECK-NEXT: vmov r1, r5, d2
-; CHECK-NEXT: vmov.8 q4[3], r4
-; CHECK-NEXT: vmov.8 q4[4], r1
-; CHECK-NEXT: vmov r1, r4, d3
-; CHECK-NEXT: vmov.8 q4[5], r5
-; CHECK-NEXT: vmov.8 q4[6], r1
-; CHECK-NEXT: vmov r1, r5, d4
-; CHECK-NEXT: vmov.8 q4[7], r4
-; CHECK-NEXT: vmov.8 q4[8], r1
-; CHECK-NEXT: vmov r1, r4, d5
-; CHECK-NEXT: vmov.8 q4[9], r5
-; CHECK-NEXT: vmov.8 q4[10], r1
-; CHECK-NEXT: vmov.8 q4[11], r4
-; CHECK-NEXT: vmov.8 q4[12], r3
-; CHECK-NEXT: vmov.8 q4[13], r2
-; CHECK-NEXT: vmov.8 q4[14], lr
-; CHECK-NEXT: vmov.8 q4[15], r12
-; CHECK-NEXT: vstrb.8 q4, [r0, q3]
-; CHECK-NEXT: vpop {d8, d9}
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: mov r2, sp
+; CHECK-NEXT: vstrb.32 q3, [r2, #12]
+; CHECK-NEXT: vstrb.32 q2, [r2, #8]
+; CHECK-NEXT: vstrb.32 q1, [r2, #4]
+; CHECK-NEXT: vstrb.32 q0, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vstrb.8 q1, [r0, q0]
+; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: bx lr
 entry:
 %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
 %offs.zext = zext <16 x i8> %offs to <16 x i32>
@@ -418,40 +398,15 @@ entry:
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <16 x i8>* %offptr, <16 x i16> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r3, q0[0]
-; CHECK-NEXT: vmov.u16 r2, q1[7]
-; CHECK-NEXT: vmov.8 q2[0], r3
-; CHECK-NEXT: vmov.u16 r3, q0[1]
-; CHECK-NEXT: vmov.8 q2[1], r3
-; CHECK-NEXT: vmov.u16 r3, q0[2]
-; CHECK-NEXT: vmov.8 q2[2], r3
-; CHECK-NEXT: vmov.u16 r3, q0[3]
-; CHECK-NEXT: vmov.8 q2[3], r3
-; CHECK-NEXT: vmov.u16 r3, q0[4]
-; CHECK-NEXT: vmov.8 q2[4], r3
-; CHECK-NEXT: vmov.u16 r3, q0[5]
-; CHECK-NEXT: vmov.8 q2[5], r3
-; CHECK-NEXT: vmov.u16 r3, q0[6]
-; CHECK-NEXT: vmov.8 q2[6], r3
-; CHECK-NEXT: vmov.u16 r3, q0[7]
-; CHECK-NEXT: vmov.8 q2[7], r3
-; CHECK-NEXT: vmov.u16 r3, q1[0]
-; CHECK-NEXT: vmov.8 q2[8], r3
-; CHECK-NEXT: vmov.u16 r3, q1[1]
-; CHECK-NEXT: vmov.8 q2[9], r3
-; CHECK-NEXT: vmov.u16 r3, q1[2]
-; CHECK-NEXT: vmov.8 q2[10], r3
-; CHECK-NEXT: vmov.u16 r3, q1[3]
-; CHECK-NEXT: vmov.8 q2[11], r3
-; CHECK-NEXT: vmov.u16 r3, q1[4]
-; CHECK-NEXT: vmov.8 q2[12], r3
-; CHECK-NEXT: vmov.u16 r3, q1[5]
-; CHECK-NEXT: vmov.8 q2[13], r3
-; CHECK-NEXT: vmov.u16 r3, q1[6]
-; CHECK-NEXT: vmov.8 q2[14], r3
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: mov r2, sp
+; CHECK-NEXT: vstrb.16 q1, [r2, #8]
+; CHECK-NEXT: vstrb.16 q0, [r2]
 ; CHECK-NEXT: vldrb.u8 q0, [r1]
-; CHECK-NEXT: vmov.8 q2[15], r2
-; CHECK-NEXT: vstrb.8 q2, [r0, q0]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vstrb.8 q1, [r0, q0]
+; CHECK-NEXT: add sp, #16
 ; CHECK-NEXT: bx lr
 entry:
 %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
