Skip to content

Commit a512260

Browse files
committed
Revert "[DAGCombiner] Enable SimplifyDemandedBits vector support for TRUNCATE"
It caused "Vector shift amounts must be in the same as their first arg" asserts in Chromium builds. See the code review for repro instructions.

> Add DemandedElts support inside the TRUNCATE analysis.
>
> Differential Revision: https://reviews.llvm.org/D56387

This reverts commit cad4275.
1 parent b270fd5 commit a512260

File tree

13 files changed

+153
-222
lines changed

13 files changed

+153
-222
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11952,7 +11952,8 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
1195211952
}
1195311953

1195411954
// Simplify the operands using demanded-bits information.
11955-
if (SimplifyDemandedBits(SDValue(N, 0)))
11955+
if (!VT.isVector() &&
11956+
SimplifyDemandedBits(SDValue(N, 0)))
1195611957
return SDValue(N, 0);
1195711958

1195811959
// (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1986,8 +1986,7 @@ bool TargetLowering::SimplifyDemandedBits(
19861986
// zero/one bits live out.
19871987
unsigned OperandBitWidth = Src.getScalarValueSizeInBits();
19881988
APInt TruncMask = DemandedBits.zext(OperandBitWidth);
1989-
if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, Known, TLO,
1990-
Depth + 1))
1989+
if (SimplifyDemandedBits(Src, TruncMask, Known, TLO, Depth + 1))
19911990
return true;
19921991
Known = Known.trunc(BitWidth);
19931992

@@ -2010,9 +2009,9 @@ bool TargetLowering::SimplifyDemandedBits(
20102009
// undesirable.
20112010
break;
20122011

2013-
const APInt *ShAmtC =
2014-
TLO.DAG.getValidShiftAmountConstant(Src, DemandedElts);
2015-
if (!ShAmtC)
2012+
SDValue ShAmt = Src.getOperand(1);
2013+
auto *ShAmtC = dyn_cast<ConstantSDNode>(ShAmt);
2014+
if (!ShAmtC || ShAmtC->getAPIntValue().uge(BitWidth))
20162015
break;
20172016
uint64_t ShVal = ShAmtC->getZExtValue();
20182017

@@ -2024,7 +2023,6 @@ bool TargetLowering::SimplifyDemandedBits(
20242023
if (!(HighBits & DemandedBits)) {
20252024
// None of the shifted in bits are needed. Add a truncate of the
20262025
// shift input, then shift it.
2027-
SDValue ShAmt = Src.getOperand(1);
20282026
if (TLO.LegalTypes())
20292027
ShAmt = TLO.DAG.getConstant(ShVal, dl, getShiftAmountTy(VT, DL));
20302028
SDValue NewTrunc =

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3399,7 +3399,6 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
33993399

34003400
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
34013401
return N->getOpcode() == ISD::SIGN_EXTEND ||
3402-
N->getOpcode() == ISD::ANY_EXTEND ||
34033402
isExtendedBUILD_VECTOR(N, DAG, true);
34043403
}
34053404

llvm/test/CodeGen/AArch64/aarch64-smull.ll

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ define <8 x i16> @amull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
9696
; CHECK: // %bb.0:
9797
; CHECK-NEXT: ldr d0, [x0]
9898
; CHECK-NEXT: ldr d1, [x1]
99-
; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
99+
; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
100100
; CHECK-NEXT: bic v0.8h, #255, lsl #8
101101
; CHECK-NEXT: ret
102102
%tmp1 = load <8 x i8>, <8 x i8>* %A
@@ -113,7 +113,7 @@ define <4 x i32> @amull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
113113
; CHECK: // %bb.0:
114114
; CHECK-NEXT: ldr d0, [x0]
115115
; CHECK-NEXT: ldr d1, [x1]
116-
; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
116+
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
117117
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
118118
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
119119
; CHECK-NEXT: ret
@@ -131,7 +131,7 @@ define <2 x i64> @amull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
131131
; CHECK: // %bb.0:
132132
; CHECK-NEXT: ldr d0, [x0]
133133
; CHECK-NEXT: ldr d1, [x1]
134-
; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
134+
; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
135135
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
136136
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
137137
; CHECK-NEXT: ret
@@ -258,7 +258,7 @@ define <8 x i16> @amlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) no
258258
; CHECK-NEXT: ldr q0, [x0]
259259
; CHECK-NEXT: ldr d1, [x1]
260260
; CHECK-NEXT: ldr d2, [x2]
261-
; CHECK-NEXT: smlal v0.8h, v1.8b, v2.8b
261+
; CHECK-NEXT: umlal v0.8h, v1.8b, v2.8b
262262
; CHECK-NEXT: bic v0.8h, #255, lsl #8
263263
; CHECK-NEXT: ret
264264
%tmp1 = load <8 x i16>, <8 x i16>* %A
@@ -278,7 +278,7 @@ define <4 x i32> @amlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C)
278278
; CHECK-NEXT: ldr q0, [x0]
279279
; CHECK-NEXT: ldr d1, [x1]
280280
; CHECK-NEXT: ldr d2, [x2]
281-
; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
281+
; CHECK-NEXT: umlal v0.4s, v1.4h, v2.4h
282282
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
283283
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
284284
; CHECK-NEXT: ret
@@ -299,7 +299,7 @@ define <2 x i64> @amlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C)
299299
; CHECK-NEXT: ldr q0, [x0]
300300
; CHECK-NEXT: ldr d1, [x1]
301301
; CHECK-NEXT: ldr d2, [x2]
302-
; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s
302+
; CHECK-NEXT: umlal v0.2d, v1.2s, v2.2s
303303
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
304304
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
305305
; CHECK-NEXT: ret
@@ -428,7 +428,7 @@ define <8 x i16> @amlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) no
428428
; CHECK-NEXT: ldr q0, [x0]
429429
; CHECK-NEXT: ldr d1, [x1]
430430
; CHECK-NEXT: ldr d2, [x2]
431-
; CHECK-NEXT: smlsl v0.8h, v1.8b, v2.8b
431+
; CHECK-NEXT: umlsl v0.8h, v1.8b, v2.8b
432432
; CHECK-NEXT: bic v0.8h, #255, lsl #8
433433
; CHECK-NEXT: ret
434434
%tmp1 = load <8 x i16>, <8 x i16>* %A
@@ -448,7 +448,7 @@ define <4 x i32> @amlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C)
448448
; CHECK-NEXT: ldr q0, [x0]
449449
; CHECK-NEXT: ldr d1, [x1]
450450
; CHECK-NEXT: ldr d2, [x2]
451-
; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.4h
451+
; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.4h
452452
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
453453
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
454454
; CHECK-NEXT: ret
@@ -469,7 +469,7 @@ define <2 x i64> @amlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C)
469469
; CHECK-NEXT: ldr q0, [x0]
470470
; CHECK-NEXT: ldr d1, [x1]
471471
; CHECK-NEXT: ldr d2, [x2]
472-
; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.2s
472+
; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.2s
473473
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
474474
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
475475
; CHECK-NEXT: ret
@@ -586,7 +586,7 @@ define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
586586
; CHECK-LABEL: amull_extvec_v8i8_v8i16:
587587
; CHECK: // %bb.0:
588588
; CHECK-NEXT: movi v1.8b, #12
589-
; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
589+
; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
590590
; CHECK-NEXT: bic v0.8h, #255, lsl #8
591591
; CHECK-NEXT: ret
592592
%tmp3 = zext <8 x i8> %arg to <8 x i16>
@@ -600,7 +600,7 @@ define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
600600
; CHECK: // %bb.0:
601601
; CHECK-NEXT: mov w8, #1234
602602
; CHECK-NEXT: dup v1.4h, w8
603-
; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
603+
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
604604
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
605605
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
606606
; CHECK-NEXT: ret
@@ -615,7 +615,7 @@ define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
615615
; CHECK: // %bb.0:
616616
; CHECK-NEXT: mov w8, #1234
617617
; CHECK-NEXT: dup v1.2s, w8
618-
; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
618+
; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
619619
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
620620
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
621621
; CHECK-NEXT: ret
@@ -752,8 +752,8 @@ define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
752752
define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
753753
; CHECK-LABEL: amull2_i8:
754754
; CHECK: // %bb.0:
755-
; CHECK-NEXT: smull v2.8h, v0.8b, v1.8b
756-
; CHECK-NEXT: smull2 v1.8h, v0.16b, v1.16b
755+
; CHECK-NEXT: umull v2.8h, v0.8b, v1.8b
756+
; CHECK-NEXT: umull2 v1.8h, v0.16b, v1.16b
757757
; CHECK-NEXT: bic v2.8h, #255, lsl #8
758758
; CHECK-NEXT: bic v1.8h, #255, lsl #8
759759
; CHECK-NEXT: mov v0.16b, v2.16b
@@ -768,8 +768,8 @@ define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
768768
define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
769769
; CHECK-LABEL: amull2_i16:
770770
; CHECK: // %bb.0:
771-
; CHECK-NEXT: smull v2.4s, v0.4h, v1.4h
772-
; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h
771+
; CHECK-NEXT: umull v2.4s, v0.4h, v1.4h
772+
; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.8h
773773
; CHECK-NEXT: movi v3.2d, #0x00ffff0000ffff
774774
; CHECK-NEXT: and v1.16b, v0.16b, v3.16b
775775
; CHECK-NEXT: and v0.16b, v2.16b, v3.16b
@@ -784,8 +784,8 @@ define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
784784
define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
785785
; CHECK-LABEL: amull2_i32:
786786
; CHECK: // %bb.0:
787-
; CHECK-NEXT: smull v2.2d, v0.2s, v1.2s
788-
; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.4s
787+
; CHECK-NEXT: umull v2.2d, v0.2s, v1.2s
788+
; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.4s
789789
; CHECK-NEXT: movi v3.2d, #0x000000ffffffff
790790
; CHECK-NEXT: and v1.16b, v0.16b, v3.16b
791791
; CHECK-NEXT: and v0.16b, v2.16b, v3.16b

llvm/test/CodeGen/AArch64/lowerMUL-newload.ll

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ define <4 x i16> @mlai16_trunc(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2
55
; CHECK-LABEL: mlai16_trunc:
66
; CHECK: // %bb.0: // %entry
77
; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
8-
; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
8+
; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h
99
; CHECK-NEXT: xtn v0.4h, v0.4s
1010
; CHECK-NEXT: ret
1111
entry:
@@ -21,7 +21,7 @@ entry:
2121
define <4 x i32> @mlai16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
2222
; CHECK-LABEL: mlai16_and:
2323
; CHECK: // %bb.0: // %entry
24-
; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
24+
; CHECK-NEXT: umull v0.4s, v1.4h, v0.4h
2525
; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
2626
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
2727
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -43,7 +43,7 @@ define void @mlai16_loadstore(i16* %a, i16* %b, i16* %c) {
4343
; CHECK-NEXT: ldr d1, [x1, #16]
4444
; CHECK-NEXT: ldr d2, [x2, #16]
4545
; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
46-
; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
46+
; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h
4747
; CHECK-NEXT: xtn v0.4h, v0.4s
4848
; CHECK-NEXT: str d0, [x0, #16]
4949
; CHECK-NEXT: ret
@@ -89,8 +89,8 @@ entry:
8989
define <4 x i32> @addmuli16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
9090
; CHECK-LABEL: addmuli16_and:
9191
; CHECK: // %bb.0: // %entry
92-
; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h
93-
; CHECK-NEXT: smlal v1.4s, v0.4h, v2.4h
92+
; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h
93+
; CHECK-NEXT: umlal v1.4s, v0.4h, v2.4h
9494
; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
9595
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
9696
; CHECK-NEXT: ret
@@ -141,7 +141,7 @@ define <2 x i32> @mlai32_trunc(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2
141141
; CHECK-LABEL: mlai32_trunc:
142142
; CHECK: // %bb.0: // %entry
143143
; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
144-
; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
144+
; CHECK-NEXT: saddw v0.2d, v0.2d, v2.2s
145145
; CHECK-NEXT: xtn v0.2s, v0.2d
146146
; CHECK-NEXT: ret
147147
entry:
@@ -157,7 +157,7 @@ entry:
157157
define <2 x i64> @mlai32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
158158
; CHECK-LABEL: mlai32_and:
159159
; CHECK: // %bb.0: // %entry
160-
; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
160+
; CHECK-NEXT: umull v0.2d, v1.2s, v0.2s
161161
; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
162162
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
163163
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -179,7 +179,7 @@ define void @mlai32_loadstore(i32* %a, i32* %b, i32* %c) {
179179
; CHECK-NEXT: ldr d1, [x1, #32]
180180
; CHECK-NEXT: ldr d2, [x2, #32]
181181
; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
182-
; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
182+
; CHECK-NEXT: saddw v0.2d, v0.2d, v2.2s
183183
; CHECK-NEXT: xtn v0.2s, v0.2d
184184
; CHECK-NEXT: str d0, [x0, #32]
185185
; CHECK-NEXT: ret
@@ -225,8 +225,8 @@ entry:
225225
define <2 x i64> @addmuli32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
226226
; CHECK-LABEL: addmuli32_and:
227227
; CHECK: // %bb.0: // %entry
228-
; CHECK-NEXT: smull v1.2d, v1.2s, v2.2s
229-
; CHECK-NEXT: smlal v1.2d, v0.2s, v2.2s
228+
; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
229+
; CHECK-NEXT: umlal v1.2d, v0.2s, v2.2s
230230
; CHECK-NEXT: movi v0.2d, #0x000000ffffffff
231231
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
232232
; CHECK-NEXT: ret
@@ -359,7 +359,7 @@ define void @func2(i16* %a, i16* %b, i16* %c) {
359359
; CHECK-NEXT: str d1, [x1, #16]
360360
; CHECK-NEXT: ldr d1, [x2, #16]
361361
; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
362-
; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
362+
; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h
363363
; CHECK-NEXT: xtn v0.4h, v0.4s
364364
; CHECK-NEXT: str d0, [x0, #16]
365365
; CHECK-NEXT: ret

llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,8 @@ define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg)
240240
; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
241241
; VI-NEXT: s_or_b32 s0, s1, 4
242242
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
243-
; VI-NEXT: v_mov_b32_e32 v1, s0
244-
; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
243+
; VI-NEXT: s_and_b32 s0, s0, 0xff
244+
; VI-NEXT: v_or_b32_e32 v2, s0, v0
245245
; VI-NEXT: v_mov_b32_e32 v0, 0
246246
; VI-NEXT: v_mov_b32_e32 v1, 0
247247
; VI-NEXT: flat_store_short v[0:1], v2

llvm/test/CodeGen/ARM/lowerMUL-newload.ll

Lines changed: 23 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
define arm_aapcs_vfpcc <4 x i16> @mla_args(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
55
; CHECK-LABEL: mla_args:
66
; CHECK: @ %bb.0: @ %entry
7-
; CHECK-NEXT: vmull.u16 q8, d1, d0
8-
; CHECK-NEXT: vaddw.u16 q8, q8, d2
7+
; CHECK-NEXT: vmull.s16 q8, d1, d0
8+
; CHECK-NEXT: vaddw.s16 q8, q8, d2
99
; CHECK-NEXT: vmovn.i32 d0, q8
1010
; CHECK-NEXT: bx lr
1111
entry:
@@ -24,8 +24,8 @@ define void @mla_loadstore(i16* %a, i16* %b, i16* %c) {
2424
; CHECK-NEXT: vldr d16, [r0, #16]
2525
; CHECK-NEXT: vldr d17, [r1, #16]
2626
; CHECK-NEXT: vldr d18, [r2, #16]
27-
; CHECK-NEXT: vmull.u16 q8, d17, d16
28-
; CHECK-NEXT: vaddw.u16 q8, q8, d18
27+
; CHECK-NEXT: vmull.s16 q8, d17, d16
28+
; CHECK-NEXT: vaddw.s16 q8, q8, d18
2929
; CHECK-NEXT: vmovn.i32 d16, q8
3030
; CHECK-NEXT: vstr d16, [r0, #16]
3131
; CHECK-NEXT: bx lr
@@ -54,8 +54,8 @@ entry:
5454
define arm_aapcs_vfpcc <4 x i16> @addmul_args(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
5555
; CHECK-LABEL: addmul_args:
5656
; CHECK: @ %bb.0: @ %entry
57-
; CHECK-NEXT: vmull.u16 q8, d1, d2
58-
; CHECK-NEXT: vmlal.u16 q8, d0, d2
57+
; CHECK-NEXT: vmull.s16 q8, d1, d2
58+
; CHECK-NEXT: vmlal.s16 q8, d0, d2
5959
; CHECK-NEXT: vmovn.i32 d0, q8
6060
; CHECK-NEXT: bx lr
6161
entry:
@@ -73,9 +73,9 @@ define void @addmul_loadstore(i16* %a, i16* %b, i16* %c) {
7373
; CHECK: @ %bb.0: @ %entry
7474
; CHECK-NEXT: vldr d16, [r2, #16]
7575
; CHECK-NEXT: vldr d17, [r1, #16]
76-
; CHECK-NEXT: vmull.u16 q9, d17, d16
76+
; CHECK-NEXT: vmull.s16 q9, d17, d16
7777
; CHECK-NEXT: vldr d17, [r0, #16]
78-
; CHECK-NEXT: vmlal.u16 q9, d17, d16
78+
; CHECK-NEXT: vmlal.s16 q9, d17, d16
7979
; CHECK-NEXT: vmovn.i32 d16, q9
8080
; CHECK-NEXT: vstr d16, [r0, #16]
8181
; CHECK-NEXT: bx lr
@@ -108,7 +108,7 @@ define void @func1(i16* %a, i16* %b, i16* %c) {
108108
; CHECK-NEXT: vldr d18, [r2, #16]
109109
; CHECK-NEXT: vld1.16 {d16}, [r3:64]
110110
; CHECK-NEXT: vmovl.u16 q8, d16
111-
; CHECK-NEXT: vaddw.u16 q10, q8, d18
111+
; CHECK-NEXT: vaddw.s16 q10, q8, d18
112112
; CHECK-NEXT: vmovn.i32 d19, q10
113113
; CHECK-NEXT: vldr d20, [r0, #16]
114114
; CHECK-NEXT: vstr d19, [r0, #16]
@@ -119,7 +119,7 @@ define void @func1(i16* %a, i16* %b, i16* %c) {
119119
; CHECK-NEXT: vmovn.i32 d16, q11
120120
; CHECK-NEXT: vstr d16, [r1, #16]
121121
; CHECK-NEXT: vldr d16, [r2, #16]
122-
; CHECK-NEXT: vmlal.u16 q11, d16, d20
122+
; CHECK-NEXT: vmlal.s16 q11, d16, d20
123123
; CHECK-NEXT: vmovn.i32 d16, q11
124124
; CHECK-NEXT: vstr d16, [r0, #16]
125125
; CHECK-NEXT: bx lr
@@ -175,26 +175,23 @@ entry:
175175
define void @func2(i16* %a, i16* %b, i16* %c) {
176176
; CHECK-LABEL: func2:
177177
; CHECK: @ %bb.0: @ %entry
178-
; CHECK-NEXT: vldr d16, [r1, #16]
179-
; CHECK-NEXT: add r3, r0, #16
180-
; CHECK-NEXT: vldr d17, [r2, #16]
181-
; CHECK-NEXT: vaddl.u16 q9, d17, d16
182-
; CHECK-NEXT: vmovn.i32 d18, q9
183-
; CHECK-NEXT: vld1.16 {d19}, [r3:64]
184-
; CHECK-NEXT: vstr d18, [r0, #16]
178+
; CHECK-NEXT: add r3, r1, #16
185179
; CHECK-NEXT: vldr d18, [r2, #16]
186-
; CHECK-NEXT: vmull.s16 q10, d17, d18
187-
; CHECK-NEXT: vmovl.s16 q11, d18
180+
; CHECK-NEXT: vld1.16 {d16}, [r3:64]
188181
; CHECK-NEXT: vmovl.u16 q8, d16
182+
; CHECK-NEXT: vaddw.s16 q10, q8, d18
183+
; CHECK-NEXT: vmovn.i32 d19, q10
184+
; CHECK-NEXT: vldr d20, [r0, #16]
185+
; CHECK-NEXT: vstr d19, [r0, #16]
186+
; CHECK-NEXT: vldr d19, [r2, #16]
187+
; CHECK-NEXT: vmull.s16 q11, d18, d19
189188
; CHECK-NEXT: vmovl.s16 q9, d19
190-
; CHECK-NEXT: vmla.i32 q10, q8, q11
191-
; CHECK-NEXT: vmovn.i32 d16, q10
189+
; CHECK-NEXT: vmla.i32 q11, q8, q9
190+
; CHECK-NEXT: vmovn.i32 d16, q11
192191
; CHECK-NEXT: vstr d16, [r1, #16]
193-
; CHECK-NEXT: add r1, r2, #16
194-
; CHECK-NEXT: vld1.16 {d16}, [r1:64]
195-
; CHECK-NEXT: vmovl.u16 q8, d16
196-
; CHECK-NEXT: vmla.i32 q10, q8, q9
197-
; CHECK-NEXT: vadd.i32 q8, q10, q9
192+
; CHECK-NEXT: vldr d16, [r2, #16]
193+
; CHECK-NEXT: vmlal.s16 q11, d16, d20
194+
; CHECK-NEXT: vaddw.s16 q8, q11, d20
198195
; CHECK-NEXT: vmovn.i32 d16, q8
199196
; CHECK-NEXT: vstr d16, [r0, #16]
200197
; CHECK-NEXT: bx lr

0 commit comments

Comments
 (0)