Skip to content

Commit 4373463

Browse files
authored
[GlobalISel] Widen vector loads from aligned ptrs (#144309)
If the pointer is aligned to more than the size of the vector, we can widen the load up to the next power-of-2 size, as SelectionDAG (SDAG) already does. Codegen for some of the v3 tests is currently worse; those regressions should be addressed in separate issues.
1 parent 1fec092 commit 4373463

File tree

11 files changed

+598
-554
lines changed

11 files changed

+598
-554
lines changed

llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4072,6 +4072,21 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
40724072
if (MemTy != DstTy)
40734073
return UnableToLegalize;
40744074

4075+
Align Alignment = LoadMI.getAlign();
4076+
// Given an alignment larger than the size of the memory, we can increase
4077+
// the size of the load without needing to scalarize it.
4078+
if (Alignment.value() * 8 > MemSizeInBits &&
4079+
isPowerOf2_64(DstTy.getScalarSizeInBits())) {
4080+
LLT MoreTy = LLT::fixed_vector(NextPowerOf2(DstTy.getNumElements()),
4081+
DstTy.getElementType());
4082+
MachineMemOperand *NewMMO = MF.getMachineMemOperand(&MMO, 0, MoreTy);
4083+
auto NewLoad = MIRBuilder.buildLoad(MoreTy, PtrReg, *NewMMO);
4084+
MIRBuilder.buildDeleteTrailingVectorElements(LoadMI.getReg(0),
4085+
NewLoad.getReg(0));
4086+
LoadMI.eraseFromParent();
4087+
return Legalized;
4088+
}
4089+
40754090
// TODO: We can do better than scalarizing the vector and at least split it
40764091
// in half.
40774092
return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=aarch64 -run-pass=legalizer -global-isel -o - %s | FileCheck %s
3+
4+
--- |
5+
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
6+
target triple = "aarch64"
7+
8+
define <3 x i16> @range_v3i16(ptr %a_ptr, ptr %b_ptr) {
9+
%a = load <3 x i16>, ptr %a_ptr, align 8, !range !0, !noundef !1
10+
%b = load <3 x i16>, ptr %b_ptr, align 8, !range !2, !noundef !1
11+
%result = add <3 x i16> %a, %b
12+
ret <3 x i16> %result
13+
}
14+
15+
!0 = !{i16 16, i16 17}
16+
!1 = !{}
17+
!2 = !{i16 32, i16 33}
18+
...
19+
---
20+
name: range_v3i16
21+
body: |
22+
bb.1 (%ir-block.0):
23+
liveins: $x0, $x1
24+
; Make sure we drop the range metadata when widening an aligned load.
25+
26+
; CHECK-LABEL: name: range_v3i16
27+
; CHECK: liveins: $x0, $x1
28+
; CHECK-NEXT: {{ $}}
29+
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
30+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1
31+
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>) from %ir.a_ptr)
32+
; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY1]](p0) :: (load (<4 x s16>) from %ir.b_ptr)
33+
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<4 x s16>) = G_ADD [[LOAD]], [[LOAD1]]
34+
; CHECK-NEXT: $d0 = COPY [[ADD]](<4 x s16>)
35+
; CHECK-NEXT: RET_ReallyLR implicit $d0
36+
%0:_(p0) = COPY $x0
37+
%1:_(p0) = COPY $x1
38+
%2:_(<3 x s16>) = G_LOAD %0(p0) :: (load (<3 x s16>) from %ir.a_ptr, align 8, !range !0)
39+
%3:_(<3 x s16>) = G_LOAD %1(p0) :: (load (<3 x s16>) from %ir.b_ptr, align 8, !range !2)
40+
%4:_(<3 x s16>) = G_ADD %2, %3
41+
%5:_(s16), %6:_(s16), %7:_(s16) = G_UNMERGE_VALUES %4(<3 x s16>)
42+
%8:_(s16) = G_IMPLICIT_DEF
43+
%9:_(<4 x s16>) = G_BUILD_VECTOR %5(s16), %6(s16), %7(s16), %8(s16)
44+
$d0 = COPY %9(<4 x s16>)
45+
RET_ReallyLR implicit $d0
46+
...

llvm/test/CodeGen/AArch64/add.ll

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -110,16 +110,20 @@ define void @v3i8(ptr %p1, ptr %p2) {
110110
;
111111
; CHECK-GI-LABEL: v3i8:
112112
; CHECK-GI: // %bb.0: // %entry
113-
; CHECK-GI-NEXT: ldrb w8, [x0]
114-
; CHECK-GI-NEXT: ldrb w9, [x1]
115-
; CHECK-GI-NEXT: ldrb w10, [x0, #1]
116-
; CHECK-GI-NEXT: ldrb w11, [x1, #1]
113+
; CHECK-GI-NEXT: ldr w8, [x0]
114+
; CHECK-GI-NEXT: ldr w9, [x1]
117115
; CHECK-GI-NEXT: fmov s0, w8
118116
; CHECK-GI-NEXT: fmov s1, w9
119-
; CHECK-GI-NEXT: ldrb w8, [x0, #2]
120-
; CHECK-GI-NEXT: ldrb w9, [x1, #2]
121-
; CHECK-GI-NEXT: mov v0.h[1], w10
122-
; CHECK-GI-NEXT: mov v1.h[1], w11
117+
; CHECK-GI-NEXT: mov b2, v0.b[1]
118+
; CHECK-GI-NEXT: mov b3, v1.b[1]
119+
; CHECK-GI-NEXT: mov b4, v0.b[2]
120+
; CHECK-GI-NEXT: mov b5, v1.b[2]
121+
; CHECK-GI-NEXT: fmov w8, s2
122+
; CHECK-GI-NEXT: fmov w9, s3
123+
; CHECK-GI-NEXT: mov v0.h[1], w8
124+
; CHECK-GI-NEXT: mov v1.h[1], w9
125+
; CHECK-GI-NEXT: fmov w8, s4
126+
; CHECK-GI-NEXT: fmov w9, s5
123127
; CHECK-GI-NEXT: mov v0.h[2], w8
124128
; CHECK-GI-NEXT: mov v1.h[2], w9
125129
; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h
@@ -270,16 +274,10 @@ define void @v3i16(ptr %p1, ptr %p2) {
270274
;
271275
; CHECK-GI-LABEL: v3i16:
272276
; CHECK-GI: // %bb.0: // %entry
273-
; CHECK-GI-NEXT: ldr h0, [x0]
274-
; CHECK-GI-NEXT: ldr h1, [x1]
277+
; CHECK-GI-NEXT: ldr d0, [x0]
278+
; CHECK-GI-NEXT: ldr d1, [x1]
275279
; CHECK-GI-NEXT: add x8, x0, #2
276-
; CHECK-GI-NEXT: add x9, x1, #2
277-
; CHECK-GI-NEXT: add x10, x1, #4
278-
; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
279-
; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
280280
; CHECK-GI-NEXT: add x9, x0, #4
281-
; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
282-
; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
283281
; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h
284282
; CHECK-GI-NEXT: str h0, [x0]
285283
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]

llvm/test/CodeGen/AArch64/andorxor.ll

Lines changed: 42 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -302,16 +302,20 @@ define void @and_v3i8(ptr %p1, ptr %p2) {
302302
;
303303
; CHECK-GI-LABEL: and_v3i8:
304304
; CHECK-GI: // %bb.0: // %entry
305-
; CHECK-GI-NEXT: ldrb w8, [x0]
306-
; CHECK-GI-NEXT: ldrb w9, [x1]
307-
; CHECK-GI-NEXT: ldrb w10, [x0, #1]
308-
; CHECK-GI-NEXT: ldrb w11, [x1, #1]
305+
; CHECK-GI-NEXT: ldr w8, [x0]
306+
; CHECK-GI-NEXT: ldr w9, [x1]
309307
; CHECK-GI-NEXT: fmov s0, w8
310308
; CHECK-GI-NEXT: fmov s1, w9
311-
; CHECK-GI-NEXT: ldrb w8, [x0, #2]
312-
; CHECK-GI-NEXT: ldrb w9, [x1, #2]
313-
; CHECK-GI-NEXT: mov v0.h[1], w10
314-
; CHECK-GI-NEXT: mov v1.h[1], w11
309+
; CHECK-GI-NEXT: mov b2, v0.b[1]
310+
; CHECK-GI-NEXT: mov b3, v1.b[1]
311+
; CHECK-GI-NEXT: mov b4, v0.b[2]
312+
; CHECK-GI-NEXT: mov b5, v1.b[2]
313+
; CHECK-GI-NEXT: fmov w8, s2
314+
; CHECK-GI-NEXT: fmov w9, s3
315+
; CHECK-GI-NEXT: mov v0.h[1], w8
316+
; CHECK-GI-NEXT: mov v1.h[1], w9
317+
; CHECK-GI-NEXT: fmov w8, s4
318+
; CHECK-GI-NEXT: fmov w9, s5
315319
; CHECK-GI-NEXT: mov v0.h[2], w8
316320
; CHECK-GI-NEXT: mov v1.h[2], w9
317321
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
@@ -350,16 +354,20 @@ define void @or_v3i8(ptr %p1, ptr %p2) {
350354
;
351355
; CHECK-GI-LABEL: or_v3i8:
352356
; CHECK-GI: // %bb.0: // %entry
353-
; CHECK-GI-NEXT: ldrb w8, [x0]
354-
; CHECK-GI-NEXT: ldrb w9, [x1]
355-
; CHECK-GI-NEXT: ldrb w10, [x0, #1]
356-
; CHECK-GI-NEXT: ldrb w11, [x1, #1]
357+
; CHECK-GI-NEXT: ldr w8, [x0]
358+
; CHECK-GI-NEXT: ldr w9, [x1]
357359
; CHECK-GI-NEXT: fmov s0, w8
358360
; CHECK-GI-NEXT: fmov s1, w9
359-
; CHECK-GI-NEXT: ldrb w8, [x0, #2]
360-
; CHECK-GI-NEXT: ldrb w9, [x1, #2]
361-
; CHECK-GI-NEXT: mov v0.h[1], w10
362-
; CHECK-GI-NEXT: mov v1.h[1], w11
361+
; CHECK-GI-NEXT: mov b2, v0.b[1]
362+
; CHECK-GI-NEXT: mov b3, v1.b[1]
363+
; CHECK-GI-NEXT: mov b4, v0.b[2]
364+
; CHECK-GI-NEXT: mov b5, v1.b[2]
365+
; CHECK-GI-NEXT: fmov w8, s2
366+
; CHECK-GI-NEXT: fmov w9, s3
367+
; CHECK-GI-NEXT: mov v0.h[1], w8
368+
; CHECK-GI-NEXT: mov v1.h[1], w9
369+
; CHECK-GI-NEXT: fmov w8, s4
370+
; CHECK-GI-NEXT: fmov w9, s5
363371
; CHECK-GI-NEXT: mov v0.h[2], w8
364372
; CHECK-GI-NEXT: mov v1.h[2], w9
365373
; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
@@ -398,16 +406,20 @@ define void @xor_v3i8(ptr %p1, ptr %p2) {
398406
;
399407
; CHECK-GI-LABEL: xor_v3i8:
400408
; CHECK-GI: // %bb.0: // %entry
401-
; CHECK-GI-NEXT: ldrb w8, [x0]
402-
; CHECK-GI-NEXT: ldrb w9, [x1]
403-
; CHECK-GI-NEXT: ldrb w10, [x0, #1]
404-
; CHECK-GI-NEXT: ldrb w11, [x1, #1]
409+
; CHECK-GI-NEXT: ldr w8, [x0]
410+
; CHECK-GI-NEXT: ldr w9, [x1]
405411
; CHECK-GI-NEXT: fmov s0, w8
406412
; CHECK-GI-NEXT: fmov s1, w9
407-
; CHECK-GI-NEXT: ldrb w8, [x0, #2]
408-
; CHECK-GI-NEXT: ldrb w9, [x1, #2]
409-
; CHECK-GI-NEXT: mov v0.h[1], w10
410-
; CHECK-GI-NEXT: mov v1.h[1], w11
413+
; CHECK-GI-NEXT: mov b2, v0.b[1]
414+
; CHECK-GI-NEXT: mov b3, v1.b[1]
415+
; CHECK-GI-NEXT: mov b4, v0.b[2]
416+
; CHECK-GI-NEXT: mov b5, v1.b[2]
417+
; CHECK-GI-NEXT: fmov w8, s2
418+
; CHECK-GI-NEXT: fmov w9, s3
419+
; CHECK-GI-NEXT: mov v0.h[1], w8
420+
; CHECK-GI-NEXT: mov v1.h[1], w9
421+
; CHECK-GI-NEXT: fmov w8, s4
422+
; CHECK-GI-NEXT: fmov w9, s5
411423
; CHECK-GI-NEXT: mov v0.h[2], w8
412424
; CHECK-GI-NEXT: mov v1.h[2], w9
413425
; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b
@@ -805,16 +817,10 @@ define void @and_v3i16(ptr %p1, ptr %p2) {
805817
;
806818
; CHECK-GI-LABEL: and_v3i16:
807819
; CHECK-GI: // %bb.0: // %entry
808-
; CHECK-GI-NEXT: ldr h0, [x0]
809-
; CHECK-GI-NEXT: ldr h1, [x1]
820+
; CHECK-GI-NEXT: ldr d0, [x0]
821+
; CHECK-GI-NEXT: ldr d1, [x1]
810822
; CHECK-GI-NEXT: add x8, x0, #2
811-
; CHECK-GI-NEXT: add x9, x1, #2
812-
; CHECK-GI-NEXT: add x10, x1, #4
813-
; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
814-
; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
815823
; CHECK-GI-NEXT: add x9, x0, #4
816-
; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
817-
; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
818824
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
819825
; CHECK-GI-NEXT: str h0, [x0]
820826
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
@@ -842,16 +848,10 @@ define void @or_v3i16(ptr %p1, ptr %p2) {
842848
;
843849
; CHECK-GI-LABEL: or_v3i16:
844850
; CHECK-GI: // %bb.0: // %entry
845-
; CHECK-GI-NEXT: ldr h0, [x0]
846-
; CHECK-GI-NEXT: ldr h1, [x1]
851+
; CHECK-GI-NEXT: ldr d0, [x0]
852+
; CHECK-GI-NEXT: ldr d1, [x1]
847853
; CHECK-GI-NEXT: add x8, x0, #2
848-
; CHECK-GI-NEXT: add x9, x1, #2
849-
; CHECK-GI-NEXT: add x10, x1, #4
850-
; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
851-
; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
852854
; CHECK-GI-NEXT: add x9, x0, #4
853-
; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
854-
; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
855855
; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
856856
; CHECK-GI-NEXT: str h0, [x0]
857857
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
@@ -879,16 +879,10 @@ define void @xor_v3i16(ptr %p1, ptr %p2) {
879879
;
880880
; CHECK-GI-LABEL: xor_v3i16:
881881
; CHECK-GI: // %bb.0: // %entry
882-
; CHECK-GI-NEXT: ldr h0, [x0]
883-
; CHECK-GI-NEXT: ldr h1, [x1]
882+
; CHECK-GI-NEXT: ldr d0, [x0]
883+
; CHECK-GI-NEXT: ldr d1, [x1]
884884
; CHECK-GI-NEXT: add x8, x0, #2
885-
; CHECK-GI-NEXT: add x9, x1, #2
886-
; CHECK-GI-NEXT: add x10, x1, #4
887-
; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
888-
; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
889885
; CHECK-GI-NEXT: add x9, x0, #4
890-
; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
891-
; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
892886
; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b
893887
; CHECK-GI-NEXT: str h0, [x0]
894888
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]

llvm/test/CodeGen/AArch64/ctlz.ll

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,16 @@ define void @v3i8(ptr %p1) {
5656
;
5757
; CHECK-GI-LABEL: v3i8:
5858
; CHECK-GI: // %bb.0: // %entry
59-
; CHECK-GI-NEXT: ldr b0, [x0]
60-
; CHECK-GI-NEXT: add x8, x0, #1
59+
; CHECK-GI-NEXT: ldr w8, [x0]
6160
; CHECK-GI-NEXT: add x9, x0, #2
62-
; CHECK-GI-NEXT: ld1 { v0.b }[1], [x8]
63-
; CHECK-GI-NEXT: ld1 { v0.b }[2], [x9]
64-
; CHECK-GI-NEXT: clz v0.8b, v0.8b
61+
; CHECK-GI-NEXT: fmov s0, w8
62+
; CHECK-GI-NEXT: add x8, x0, #1
63+
; CHECK-GI-NEXT: mov b1, v0.b[1]
64+
; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
65+
; CHECK-GI-NEXT: mov b0, v0.b[2]
66+
; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
67+
; CHECK-GI-NEXT: mov v2.b[2], v0.b[0]
68+
; CHECK-GI-NEXT: clz v0.8b, v2.8b
6569
; CHECK-GI-NEXT: st1 { v0.b }[0], [x0]
6670
; CHECK-GI-NEXT: st1 { v0.b }[1], [x8]
6771
; CHECK-GI-NEXT: st1 { v0.b }[2], [x9]
@@ -181,11 +185,9 @@ define void @v3i16(ptr %p1) {
181185
;
182186
; CHECK-GI-LABEL: v3i16:
183187
; CHECK-GI: // %bb.0: // %entry
184-
; CHECK-GI-NEXT: ldr h0, [x0]
188+
; CHECK-GI-NEXT: ldr d0, [x0]
185189
; CHECK-GI-NEXT: add x8, x0, #2
186190
; CHECK-GI-NEXT: add x9, x0, #4
187-
; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
188-
; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
189191
; CHECK-GI-NEXT: clz v0.4h, v0.4h
190192
; CHECK-GI-NEXT: str h0, [x0]
191193
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]

llvm/test/CodeGen/AArch64/ctpop.ll

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,16 @@ define void @v3i8(ptr %p1) {
5555
;
5656
; CHECK-GI-LABEL: v3i8:
5757
; CHECK-GI: // %bb.0: // %entry
58-
; CHECK-GI-NEXT: ldr b0, [x0]
59-
; CHECK-GI-NEXT: add x8, x0, #1
58+
; CHECK-GI-NEXT: ldr w8, [x0]
6059
; CHECK-GI-NEXT: add x9, x0, #2
61-
; CHECK-GI-NEXT: ld1 { v0.b }[1], [x8]
62-
; CHECK-GI-NEXT: ld1 { v0.b }[2], [x9]
63-
; CHECK-GI-NEXT: cnt v0.8b, v0.8b
60+
; CHECK-GI-NEXT: fmov s0, w8
61+
; CHECK-GI-NEXT: add x8, x0, #1
62+
; CHECK-GI-NEXT: mov b1, v0.b[1]
63+
; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
64+
; CHECK-GI-NEXT: mov b0, v0.b[2]
65+
; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
66+
; CHECK-GI-NEXT: mov v2.b[2], v0.b[0]
67+
; CHECK-GI-NEXT: cnt v0.8b, v2.8b
6468
; CHECK-GI-NEXT: st1 { v0.b }[0], [x0]
6569
; CHECK-GI-NEXT: st1 { v0.b }[1], [x8]
6670
; CHECK-GI-NEXT: st1 { v0.b }[2], [x9]
@@ -181,11 +185,9 @@ define void @v3i16(ptr %p1) {
181185
;
182186
; CHECK-GI-LABEL: v3i16:
183187
; CHECK-GI: // %bb.0: // %entry
184-
; CHECK-GI-NEXT: ldr h0, [x0]
188+
; CHECK-GI-NEXT: ldr d0, [x0]
185189
; CHECK-GI-NEXT: add x8, x0, #2
186190
; CHECK-GI-NEXT: add x9, x0, #4
187-
; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
188-
; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
189191
; CHECK-GI-NEXT: cnt v0.8b, v0.8b
190192
; CHECK-GI-NEXT: uaddlp v0.4h, v0.8b
191193
; CHECK-GI-NEXT: str h0, [x0]

llvm/test/CodeGen/AArch64/cttz.ll

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -68,21 +68,23 @@ define void @v3i8(ptr %p1) {
6868
;
6969
; CHECK-GI-LABEL: v3i8:
7070
; CHECK-GI: // %bb.0: // %entry
71-
; CHECK-GI-NEXT: ldrb w9, [x0]
71+
; CHECK-GI-NEXT: ldr w9, [x0]
7272
; CHECK-GI-NEXT: mov w8, #65535 // =0xffff
73-
; CHECK-GI-NEXT: ldrb w10, [x0, #1]
74-
; CHECK-GI-NEXT: fmov s0, w8
75-
; CHECK-GI-NEXT: fmov s1, w9
76-
; CHECK-GI-NEXT: ldrb w9, [x0, #2]
77-
; CHECK-GI-NEXT: mov v0.h[1], w8
78-
; CHECK-GI-NEXT: mov v1.h[1], w10
79-
; CHECK-GI-NEXT: mov v0.h[2], w8
73+
; CHECK-GI-NEXT: fmov s2, w8
74+
; CHECK-GI-NEXT: fmov s0, w9
75+
; CHECK-GI-NEXT: mov v2.h[1], w8
76+
; CHECK-GI-NEXT: mov b1, v0.b[1]
77+
; CHECK-GI-NEXT: mov v2.h[2], w8
8078
; CHECK-GI-NEXT: add x8, x0, #1
81-
; CHECK-GI-NEXT: mov v1.h[2], w9
79+
; CHECK-GI-NEXT: fmov w9, s1
80+
; CHECK-GI-NEXT: mov b1, v0.b[2]
81+
; CHECK-GI-NEXT: mov v0.h[1], w9
82+
; CHECK-GI-NEXT: fmov w9, s1
83+
; CHECK-GI-NEXT: mov v0.h[2], w9
8284
; CHECK-GI-NEXT: add x9, x0, #2
83-
; CHECK-GI-NEXT: eor v2.8b, v1.8b, v0.8b
84-
; CHECK-GI-NEXT: add v0.4h, v1.4h, v0.4h
85-
; CHECK-GI-NEXT: and v0.8b, v2.8b, v0.8b
85+
; CHECK-GI-NEXT: eor v1.8b, v0.8b, v2.8b
86+
; CHECK-GI-NEXT: add v0.4h, v0.4h, v2.4h
87+
; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b
8688
; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b
8789
; CHECK-GI-NEXT: cnt v0.8b, v0.8b
8890
; CHECK-GI-NEXT: st1 { v0.b }[0], [x0]
@@ -275,22 +277,20 @@ define void @v3i16(ptr %p1) {
275277
; CHECK-GI-LABEL: v3i16:
276278
; CHECK-GI: // %bb.0: // %entry
277279
; CHECK-GI-NEXT: mov w8, #65535 // =0xffff
278-
; CHECK-GI-NEXT: ldr h1, [x0]
279-
; CHECK-GI-NEXT: add x9, x0, #2
280+
; CHECK-GI-NEXT: ldr d1, [x0]
281+
; CHECK-GI-NEXT: add x9, x0, #4
280282
; CHECK-GI-NEXT: fmov s0, w8
281-
; CHECK-GI-NEXT: add x10, x0, #4
282-
; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
283283
; CHECK-GI-NEXT: mov v0.h[1], w8
284-
; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
285284
; CHECK-GI-NEXT: mov v0.h[2], w8
285+
; CHECK-GI-NEXT: add x8, x0, #2
286286
; CHECK-GI-NEXT: eor v2.8b, v1.8b, v0.8b
287287
; CHECK-GI-NEXT: add v0.4h, v1.4h, v0.4h
288288
; CHECK-GI-NEXT: and v0.8b, v2.8b, v0.8b
289289
; CHECK-GI-NEXT: cnt v0.8b, v0.8b
290290
; CHECK-GI-NEXT: uaddlp v0.4h, v0.8b
291291
; CHECK-GI-NEXT: str h0, [x0]
292-
; CHECK-GI-NEXT: st1 { v0.h }[1], [x9]
293-
; CHECK-GI-NEXT: st1 { v0.h }[2], [x10]
292+
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
293+
; CHECK-GI-NEXT: st1 { v0.h }[2], [x9]
294294
; CHECK-GI-NEXT: ret
295295
entry:
296296
%d = load <3 x i16>, ptr %p1

0 commit comments

Comments
 (0)