Skip to content

Commit cad4275

Browse files
committed
[DAGCombiner] Enable SimplifyDemandedBits vector support for TRUNCATE
Add DemandedElts support inside the TRUNCATE analysis.

Differential Revision: https://reviews.llvm.org/D56387
1 parent 8ba442b commit cad4275

File tree

13 files changed: 222 additions (+222) and 153 deletions (-153)

13 files changed

+222
-153
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11952,8 +11952,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
1195211952
}
1195311953

1195411954
// Simplify the operands using demanded-bits information.
11955-
if (!VT.isVector() &&
11956-
SimplifyDemandedBits(SDValue(N, 0)))
11955+
if (SimplifyDemandedBits(SDValue(N, 0)))
1195711956
return SDValue(N, 0);
1195811957

1195911958
// (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1986,7 +1986,8 @@ bool TargetLowering::SimplifyDemandedBits(
19861986
// zero/one bits live out.
19871987
unsigned OperandBitWidth = Src.getScalarValueSizeInBits();
19881988
APInt TruncMask = DemandedBits.zext(OperandBitWidth);
1989-
if (SimplifyDemandedBits(Src, TruncMask, Known, TLO, Depth + 1))
1989+
if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, Known, TLO,
1990+
Depth + 1))
19901991
return true;
19911992
Known = Known.trunc(BitWidth);
19921993

@@ -2009,9 +2010,9 @@ bool TargetLowering::SimplifyDemandedBits(
20092010
// undesirable.
20102011
break;
20112012

2012-
SDValue ShAmt = Src.getOperand(1);
2013-
auto *ShAmtC = dyn_cast<ConstantSDNode>(ShAmt);
2014-
if (!ShAmtC || ShAmtC->getAPIntValue().uge(BitWidth))
2013+
const APInt *ShAmtC =
2014+
TLO.DAG.getValidShiftAmountConstant(Src, DemandedElts);
2015+
if (!ShAmtC)
20152016
break;
20162017
uint64_t ShVal = ShAmtC->getZExtValue();
20172018

@@ -2023,6 +2024,7 @@ bool TargetLowering::SimplifyDemandedBits(
20232024
if (!(HighBits & DemandedBits)) {
20242025
// None of the shifted in bits are needed. Add a truncate of the
20252026
// shift input, then shift it.
2027+
SDValue ShAmt = Src.getOperand(1);
20262028
if (TLO.LegalTypes())
20272029
ShAmt = TLO.DAG.getConstant(ShVal, dl, getShiftAmountTy(VT, DL));
20282030
SDValue NewTrunc =

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3399,6 +3399,7 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
33993399

34003400
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
34013401
return N->getOpcode() == ISD::SIGN_EXTEND ||
3402+
N->getOpcode() == ISD::ANY_EXTEND ||
34023403
isExtendedBUILD_VECTOR(N, DAG, true);
34033404
}
34043405

llvm/test/CodeGen/AArch64/aarch64-smull.ll

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ define <8 x i16> @amull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
9696
; CHECK: // %bb.0:
9797
; CHECK-NEXT: ldr d0, [x0]
9898
; CHECK-NEXT: ldr d1, [x1]
99-
; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
99+
; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
100100
; CHECK-NEXT: bic v0.8h, #255, lsl #8
101101
; CHECK-NEXT: ret
102102
%tmp1 = load <8 x i8>, <8 x i8>* %A
@@ -113,7 +113,7 @@ define <4 x i32> @amull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
113113
; CHECK: // %bb.0:
114114
; CHECK-NEXT: ldr d0, [x0]
115115
; CHECK-NEXT: ldr d1, [x1]
116-
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
116+
; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
117117
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
118118
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
119119
; CHECK-NEXT: ret
@@ -131,7 +131,7 @@ define <2 x i64> @amull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
131131
; CHECK: // %bb.0:
132132
; CHECK-NEXT: ldr d0, [x0]
133133
; CHECK-NEXT: ldr d1, [x1]
134-
; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
134+
; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
135135
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
136136
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
137137
; CHECK-NEXT: ret
@@ -258,7 +258,7 @@ define <8 x i16> @amlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) no
258258
; CHECK-NEXT: ldr q0, [x0]
259259
; CHECK-NEXT: ldr d1, [x1]
260260
; CHECK-NEXT: ldr d2, [x2]
261-
; CHECK-NEXT: umlal v0.8h, v1.8b, v2.8b
261+
; CHECK-NEXT: smlal v0.8h, v1.8b, v2.8b
262262
; CHECK-NEXT: bic v0.8h, #255, lsl #8
263263
; CHECK-NEXT: ret
264264
%tmp1 = load <8 x i16>, <8 x i16>* %A
@@ -278,7 +278,7 @@ define <4 x i32> @amlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C)
278278
; CHECK-NEXT: ldr q0, [x0]
279279
; CHECK-NEXT: ldr d1, [x1]
280280
; CHECK-NEXT: ldr d2, [x2]
281-
; CHECK-NEXT: umlal v0.4s, v1.4h, v2.4h
281+
; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
282282
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
283283
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
284284
; CHECK-NEXT: ret
@@ -299,7 +299,7 @@ define <2 x i64> @amlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C)
299299
; CHECK-NEXT: ldr q0, [x0]
300300
; CHECK-NEXT: ldr d1, [x1]
301301
; CHECK-NEXT: ldr d2, [x2]
302-
; CHECK-NEXT: umlal v0.2d, v1.2s, v2.2s
302+
; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s
303303
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
304304
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
305305
; CHECK-NEXT: ret
@@ -428,7 +428,7 @@ define <8 x i16> @amlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) no
428428
; CHECK-NEXT: ldr q0, [x0]
429429
; CHECK-NEXT: ldr d1, [x1]
430430
; CHECK-NEXT: ldr d2, [x2]
431-
; CHECK-NEXT: umlsl v0.8h, v1.8b, v2.8b
431+
; CHECK-NEXT: smlsl v0.8h, v1.8b, v2.8b
432432
; CHECK-NEXT: bic v0.8h, #255, lsl #8
433433
; CHECK-NEXT: ret
434434
%tmp1 = load <8 x i16>, <8 x i16>* %A
@@ -448,7 +448,7 @@ define <4 x i32> @amlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C)
448448
; CHECK-NEXT: ldr q0, [x0]
449449
; CHECK-NEXT: ldr d1, [x1]
450450
; CHECK-NEXT: ldr d2, [x2]
451-
; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.4h
451+
; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.4h
452452
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
453453
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
454454
; CHECK-NEXT: ret
@@ -469,7 +469,7 @@ define <2 x i64> @amlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C)
469469
; CHECK-NEXT: ldr q0, [x0]
470470
; CHECK-NEXT: ldr d1, [x1]
471471
; CHECK-NEXT: ldr d2, [x2]
472-
; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.2s
472+
; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.2s
473473
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
474474
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
475475
; CHECK-NEXT: ret
@@ -586,7 +586,7 @@ define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
586586
; CHECK-LABEL: amull_extvec_v8i8_v8i16:
587587
; CHECK: // %bb.0:
588588
; CHECK-NEXT: movi v1.8b, #12
589-
; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
589+
; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
590590
; CHECK-NEXT: bic v0.8h, #255, lsl #8
591591
; CHECK-NEXT: ret
592592
%tmp3 = zext <8 x i8> %arg to <8 x i16>
@@ -600,7 +600,7 @@ define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
600600
; CHECK: // %bb.0:
601601
; CHECK-NEXT: mov w8, #1234
602602
; CHECK-NEXT: dup v1.4h, w8
603-
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
603+
; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
604604
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
605605
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
606606
; CHECK-NEXT: ret
@@ -615,7 +615,7 @@ define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
615615
; CHECK: // %bb.0:
616616
; CHECK-NEXT: mov w8, #1234
617617
; CHECK-NEXT: dup v1.2s, w8
618-
; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
618+
; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
619619
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
620620
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
621621
; CHECK-NEXT: ret
@@ -752,8 +752,8 @@ define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
752752
define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
753753
; CHECK-LABEL: amull2_i8:
754754
; CHECK: // %bb.0:
755-
; CHECK-NEXT: umull v2.8h, v0.8b, v1.8b
756-
; CHECK-NEXT: umull2 v1.8h, v0.16b, v1.16b
755+
; CHECK-NEXT: smull v2.8h, v0.8b, v1.8b
756+
; CHECK-NEXT: smull2 v1.8h, v0.16b, v1.16b
757757
; CHECK-NEXT: bic v2.8h, #255, lsl #8
758758
; CHECK-NEXT: bic v1.8h, #255, lsl #8
759759
; CHECK-NEXT: mov v0.16b, v2.16b
@@ -768,8 +768,8 @@ define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
768768
define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
769769
; CHECK-LABEL: amull2_i16:
770770
; CHECK: // %bb.0:
771-
; CHECK-NEXT: umull v2.4s, v0.4h, v1.4h
772-
; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.8h
771+
; CHECK-NEXT: smull v2.4s, v0.4h, v1.4h
772+
; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h
773773
; CHECK-NEXT: movi v3.2d, #0x00ffff0000ffff
774774
; CHECK-NEXT: and v1.16b, v0.16b, v3.16b
775775
; CHECK-NEXT: and v0.16b, v2.16b, v3.16b
@@ -784,8 +784,8 @@ define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
784784
define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
785785
; CHECK-LABEL: amull2_i32:
786786
; CHECK: // %bb.0:
787-
; CHECK-NEXT: umull v2.2d, v0.2s, v1.2s
788-
; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.4s
787+
; CHECK-NEXT: smull v2.2d, v0.2s, v1.2s
788+
; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.4s
789789
; CHECK-NEXT: movi v3.2d, #0x000000ffffffff
790790
; CHECK-NEXT: and v1.16b, v0.16b, v3.16b
791791
; CHECK-NEXT: and v0.16b, v2.16b, v3.16b

llvm/test/CodeGen/AArch64/lowerMUL-newload.ll

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ define <4 x i16> @mlai16_trunc(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2
55
; CHECK-LABEL: mlai16_trunc:
66
; CHECK: // %bb.0: // %entry
77
; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
8-
; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h
8+
; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
99
; CHECK-NEXT: xtn v0.4h, v0.4s
1010
; CHECK-NEXT: ret
1111
entry:
@@ -21,7 +21,7 @@ entry:
2121
define <4 x i32> @mlai16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
2222
; CHECK-LABEL: mlai16_and:
2323
; CHECK: // %bb.0: // %entry
24-
; CHECK-NEXT: umull v0.4s, v1.4h, v0.4h
24+
; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
2525
; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
2626
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
2727
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -43,7 +43,7 @@ define void @mlai16_loadstore(i16* %a, i16* %b, i16* %c) {
4343
; CHECK-NEXT: ldr d1, [x1, #16]
4444
; CHECK-NEXT: ldr d2, [x2, #16]
4545
; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
46-
; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h
46+
; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
4747
; CHECK-NEXT: xtn v0.4h, v0.4s
4848
; CHECK-NEXT: str d0, [x0, #16]
4949
; CHECK-NEXT: ret
@@ -89,8 +89,8 @@ entry:
8989
define <4 x i32> @addmuli16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
9090
; CHECK-LABEL: addmuli16_and:
9191
; CHECK: // %bb.0: // %entry
92-
; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h
93-
; CHECK-NEXT: umlal v1.4s, v0.4h, v2.4h
92+
; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h
93+
; CHECK-NEXT: smlal v1.4s, v0.4h, v2.4h
9494
; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
9595
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
9696
; CHECK-NEXT: ret
@@ -141,7 +141,7 @@ define <2 x i32> @mlai32_trunc(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2
141141
; CHECK-LABEL: mlai32_trunc:
142142
; CHECK: // %bb.0: // %entry
143143
; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
144-
; CHECK-NEXT: saddw v0.2d, v0.2d, v2.2s
144+
; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
145145
; CHECK-NEXT: xtn v0.2s, v0.2d
146146
; CHECK-NEXT: ret
147147
entry:
@@ -157,7 +157,7 @@ entry:
157157
define <2 x i64> @mlai32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
158158
; CHECK-LABEL: mlai32_and:
159159
; CHECK: // %bb.0: // %entry
160-
; CHECK-NEXT: umull v0.2d, v1.2s, v0.2s
160+
; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
161161
; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
162162
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
163163
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -179,7 +179,7 @@ define void @mlai32_loadstore(i32* %a, i32* %b, i32* %c) {
179179
; CHECK-NEXT: ldr d1, [x1, #32]
180180
; CHECK-NEXT: ldr d2, [x2, #32]
181181
; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
182-
; CHECK-NEXT: saddw v0.2d, v0.2d, v2.2s
182+
; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
183183
; CHECK-NEXT: xtn v0.2s, v0.2d
184184
; CHECK-NEXT: str d0, [x0, #32]
185185
; CHECK-NEXT: ret
@@ -225,8 +225,8 @@ entry:
225225
define <2 x i64> @addmuli32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
226226
; CHECK-LABEL: addmuli32_and:
227227
; CHECK: // %bb.0: // %entry
228-
; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
229-
; CHECK-NEXT: umlal v1.2d, v0.2s, v2.2s
228+
; CHECK-NEXT: smull v1.2d, v1.2s, v2.2s
229+
; CHECK-NEXT: smlal v1.2d, v0.2s, v2.2s
230230
; CHECK-NEXT: movi v0.2d, #0x000000ffffffff
231231
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
232232
; CHECK-NEXT: ret
@@ -359,7 +359,7 @@ define void @func2(i16* %a, i16* %b, i16* %c) {
359359
; CHECK-NEXT: str d1, [x1, #16]
360360
; CHECK-NEXT: ldr d1, [x2, #16]
361361
; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
362-
; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h
362+
; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
363363
; CHECK-NEXT: xtn v0.4h, v0.4s
364364
; CHECK-NEXT: str d0, [x0, #16]
365365
; CHECK-NEXT: ret

llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,8 @@ define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg)
240240
; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
241241
; VI-NEXT: s_or_b32 s0, s1, 4
242242
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
243-
; VI-NEXT: s_and_b32 s0, s0, 0xff
244-
; VI-NEXT: v_or_b32_e32 v2, s0, v0
243+
; VI-NEXT: v_mov_b32_e32 v1, s0
244+
; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
245245
; VI-NEXT: v_mov_b32_e32 v0, 0
246246
; VI-NEXT: v_mov_b32_e32 v1, 0
247247
; VI-NEXT: flat_store_short v[0:1], v2

llvm/test/CodeGen/ARM/lowerMUL-newload.ll

Lines changed: 26 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
define arm_aapcs_vfpcc <4 x i16> @mla_args(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
55
; CHECK-LABEL: mla_args:
66
; CHECK: @ %bb.0: @ %entry
7-
; CHECK-NEXT: vmull.s16 q8, d1, d0
8-
; CHECK-NEXT: vaddw.s16 q8, q8, d2
7+
; CHECK-NEXT: vmull.u16 q8, d1, d0
8+
; CHECK-NEXT: vaddw.u16 q8, q8, d2
99
; CHECK-NEXT: vmovn.i32 d0, q8
1010
; CHECK-NEXT: bx lr
1111
entry:
@@ -24,8 +24,8 @@ define void @mla_loadstore(i16* %a, i16* %b, i16* %c) {
2424
; CHECK-NEXT: vldr d16, [r0, #16]
2525
; CHECK-NEXT: vldr d17, [r1, #16]
2626
; CHECK-NEXT: vldr d18, [r2, #16]
27-
; CHECK-NEXT: vmull.s16 q8, d17, d16
28-
; CHECK-NEXT: vaddw.s16 q8, q8, d18
27+
; CHECK-NEXT: vmull.u16 q8, d17, d16
28+
; CHECK-NEXT: vaddw.u16 q8, q8, d18
2929
; CHECK-NEXT: vmovn.i32 d16, q8
3030
; CHECK-NEXT: vstr d16, [r0, #16]
3131
; CHECK-NEXT: bx lr
@@ -54,8 +54,8 @@ entry:
5454
define arm_aapcs_vfpcc <4 x i16> @addmul_args(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
5555
; CHECK-LABEL: addmul_args:
5656
; CHECK: @ %bb.0: @ %entry
57-
; CHECK-NEXT: vmull.s16 q8, d1, d2
58-
; CHECK-NEXT: vmlal.s16 q8, d0, d2
57+
; CHECK-NEXT: vmull.u16 q8, d1, d2
58+
; CHECK-NEXT: vmlal.u16 q8, d0, d2
5959
; CHECK-NEXT: vmovn.i32 d0, q8
6060
; CHECK-NEXT: bx lr
6161
entry:
@@ -73,9 +73,9 @@ define void @addmul_loadstore(i16* %a, i16* %b, i16* %c) {
7373
; CHECK: @ %bb.0: @ %entry
7474
; CHECK-NEXT: vldr d16, [r2, #16]
7575
; CHECK-NEXT: vldr d17, [r1, #16]
76-
; CHECK-NEXT: vmull.s16 q9, d17, d16
76+
; CHECK-NEXT: vmull.u16 q9, d17, d16
7777
; CHECK-NEXT: vldr d17, [r0, #16]
78-
; CHECK-NEXT: vmlal.s16 q9, d17, d16
78+
; CHECK-NEXT: vmlal.u16 q9, d17, d16
7979
; CHECK-NEXT: vmovn.i32 d16, q9
8080
; CHECK-NEXT: vstr d16, [r0, #16]
8181
; CHECK-NEXT: bx lr
@@ -108,7 +108,7 @@ define void @func1(i16* %a, i16* %b, i16* %c) {
108108
; CHECK-NEXT: vldr d18, [r2, #16]
109109
; CHECK-NEXT: vld1.16 {d16}, [r3:64]
110110
; CHECK-NEXT: vmovl.u16 q8, d16
111-
; CHECK-NEXT: vaddw.s16 q10, q8, d18
111+
; CHECK-NEXT: vaddw.u16 q10, q8, d18
112112
; CHECK-NEXT: vmovn.i32 d19, q10
113113
; CHECK-NEXT: vldr d20, [r0, #16]
114114
; CHECK-NEXT: vstr d19, [r0, #16]
@@ -119,7 +119,7 @@ define void @func1(i16* %a, i16* %b, i16* %c) {
119119
; CHECK-NEXT: vmovn.i32 d16, q11
120120
; CHECK-NEXT: vstr d16, [r1, #16]
121121
; CHECK-NEXT: vldr d16, [r2, #16]
122-
; CHECK-NEXT: vmlal.s16 q11, d16, d20
122+
; CHECK-NEXT: vmlal.u16 q11, d16, d20
123123
; CHECK-NEXT: vmovn.i32 d16, q11
124124
; CHECK-NEXT: vstr d16, [r0, #16]
125125
; CHECK-NEXT: bx lr
@@ -175,23 +175,26 @@ entry:
175175
define void @func2(i16* %a, i16* %b, i16* %c) {
176176
; CHECK-LABEL: func2:
177177
; CHECK: @ %bb.0: @ %entry
178-
; CHECK-NEXT: add r3, r1, #16
178+
; CHECK-NEXT: vldr d16, [r1, #16]
179+
; CHECK-NEXT: add r3, r0, #16
180+
; CHECK-NEXT: vldr d17, [r2, #16]
181+
; CHECK-NEXT: vaddl.u16 q9, d17, d16
182+
; CHECK-NEXT: vmovn.i32 d18, q9
183+
; CHECK-NEXT: vld1.16 {d19}, [r3:64]
184+
; CHECK-NEXT: vstr d18, [r0, #16]
179185
; CHECK-NEXT: vldr d18, [r2, #16]
180-
; CHECK-NEXT: vld1.16 {d16}, [r3:64]
186+
; CHECK-NEXT: vmull.s16 q10, d17, d18
187+
; CHECK-NEXT: vmovl.s16 q11, d18
181188
; CHECK-NEXT: vmovl.u16 q8, d16
182-
; CHECK-NEXT: vaddw.s16 q10, q8, d18
183-
; CHECK-NEXT: vmovn.i32 d19, q10
184-
; CHECK-NEXT: vldr d20, [r0, #16]
185-
; CHECK-NEXT: vstr d19, [r0, #16]
186-
; CHECK-NEXT: vldr d19, [r2, #16]
187-
; CHECK-NEXT: vmull.s16 q11, d18, d19
188189
; CHECK-NEXT: vmovl.s16 q9, d19
189-
; CHECK-NEXT: vmla.i32 q11, q8, q9
190-
; CHECK-NEXT: vmovn.i32 d16, q11
190+
; CHECK-NEXT: vmla.i32 q10, q8, q11
191+
; CHECK-NEXT: vmovn.i32 d16, q10
191192
; CHECK-NEXT: vstr d16, [r1, #16]
192-
; CHECK-NEXT: vldr d16, [r2, #16]
193-
; CHECK-NEXT: vmlal.s16 q11, d16, d20
194-
; CHECK-NEXT: vaddw.s16 q8, q11, d20
193+
; CHECK-NEXT: add r1, r2, #16
194+
; CHECK-NEXT: vld1.16 {d16}, [r1:64]
195+
; CHECK-NEXT: vmovl.u16 q8, d16
196+
; CHECK-NEXT: vmla.i32 q10, q8, q9
197+
; CHECK-NEXT: vadd.i32 q8, q10, q9
195198
; CHECK-NEXT: vmovn.i32 d16, q8
196199
; CHECK-NEXT: vstr d16, [r0, #16]
197200
; CHECK-NEXT: bx lr

0 commit comments

Comments
 (0)