@@ -14,11 +14,11 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
14
14
; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0
15
15
; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
16
16
; CHECK: while.body.preheader:
17
- ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 8
17
+ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 4
18
18
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[WHILE_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]]
19
19
; CHECK: vector.ph:
20
- ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -8
21
- ; CHECK-NEXT: [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 7
20
+ ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[BLOCKSIZE]], -4
21
+ ; CHECK-NEXT: [[IND_END:%.*]] = and i32 [[BLOCKSIZE]], 3
22
22
; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[N_VEC]], 1
23
23
; CHECK-NEXT: [[IND_END7:%.*]] = getelementptr i8, ptr [[PSRCA:%.*]], i32 [[TMP0]]
24
24
; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[N_VEC]], 1
@@ -34,18 +34,19 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
34
34
; CHECK-NEXT: [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[PDST]], i32 [[TMP4]]
35
35
; CHECK-NEXT: [[TMP5:%.*]] = shl i32 [[INDEX]], 1
36
36
; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PSRCB]], i32 [[TMP5]]
37
- ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
38
- ; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
39
- ; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <8 x i16>, ptr [[NEXT_GEP14]], align 2
40
- ; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[WIDE_LOAD15]] to <8 x i32>
41
- ; CHECK-NEXT: [[TMP8:%.*]] = mul nsw <8 x i32> [[TMP7]], [[TMP6]]
42
- ; CHECK-NEXT: [[TMP9:%.*]] = ashr <8 x i32> [[TMP8]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
43
- ; CHECK-NEXT: [[TMP10:%.*]] = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP9]], <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
44
- ; CHECK-NEXT: [[TMP11:%.*]] = trunc <8 x i32> [[TMP10]] to <8 x i16>
45
- ; CHECK-NEXT: store <8 x i16> [[TMP11]], ptr [[NEXT_GEP13]], align 2
46
- ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
47
- ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
48
- ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
37
+ ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[NEXT_GEP]], align 2
38
+ ; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
39
+ ; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x i16>, ptr [[NEXT_GEP14]], align 2
40
+ ; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[WIDE_LOAD15]] to <4 x i32>
41
+ ; CHECK-NEXT: [[TMP8:%.*]] = mul nsw <4 x i32> [[TMP7]], [[TMP6]]
42
+ ; CHECK-NEXT: [[TMP9:%.*]] = lshr <4 x i32> [[TMP8]], <i32 15, i32 15, i32 15, i32 15>
43
+ ; CHECK-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[TMP8]], <i32 1073741824, i32 1073741824, i32 1073741824, i32 1073741824>
44
+ ; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i32> [[TMP9]] to <4 x i16>
45
+ ; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> <i16 32767, i16 32767, i16 32767, i16 32767>
46
+ ; CHECK-NEXT: store <4 x i16> [[TMP12]], ptr [[NEXT_GEP13]], align 2
47
+ ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
48
+ ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
49
+ ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
49
50
; CHECK: middle.block:
50
51
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]]
51
52
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[WHILE_BODY_PREHEADER16]]
@@ -61,15 +62,16 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS
61
62
; CHECK-NEXT: [[PDST_ADDR_04:%.*]] = phi ptr [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[PDST_ADDR_04_PH]], [[WHILE_BODY_PREHEADER16]] ]
62
63
; CHECK-NEXT: [[PSRCB_ADDR_03:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PSRCB_ADDR_03_PH]], [[WHILE_BODY_PREHEADER16]] ]
63
64
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRCA_ADDR_05]], i32 2
64
- ; CHECK-NEXT: [[TMP13 :%.*]] = load i16, ptr [[PSRCA_ADDR_05]], align 2
65
- ; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP13 ]] to i32
65
+ ; CHECK-NEXT: [[TMP14 :%.*]] = load i16, ptr [[PSRCA_ADDR_05]], align 2
66
+ ; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP14 ]] to i32
66
67
; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i8, ptr [[PSRCB_ADDR_03]], i32 2
67
- ; CHECK-NEXT: [[TMP14 :%.*]] = load i16, ptr [[PSRCB_ADDR_03]], align 2
68
- ; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP14 ]] to i32
68
+ ; CHECK-NEXT: [[TMP15 :%.*]] = load i16, ptr [[PSRCB_ADDR_03]], align 2
69
+ ; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP15 ]] to i32
69
70
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
70
- ; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[MUL]], 15
71
- ; CHECK-NEXT: [[SPEC_SELECT_I:%.*]] = tail call i32 @llvm.smin.i32(i32 [[SHR]], i32 32767)
72
- ; CHECK-NEXT: [[CONV3:%.*]] = trunc i32 [[SPEC_SELECT_I]] to i16
71
+ ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[MUL]], 15
72
+ ; CHECK-NEXT: [[CMP4_I:%.*]] = icmp slt i32 [[MUL]], 1073741824
73
+ ; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[SHR]] to i16
74
+ ; CHECK-NEXT: [[CONV3:%.*]] = select i1 [[CMP4_I]], i16 [[TMP16]], i16 32767
73
75
; CHECK-NEXT: [[INCDEC_PTR4]] = getelementptr inbounds i8, ptr [[PDST_ADDR_04]], i32 2
74
76
; CHECK-NEXT: store i16 [[CONV3]], ptr [[PDST_ADDR_04]], align 2
75
77
; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_06]], -1
0 commit comments