target triple = "riscv64-none-unknown-elf"

define i32 @vqdot(ptr %a, ptr %b) #0 {
- ; CHECK-LABEL: define i32 @vqdot(
- ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
- ; CHECK-NEXT: entry:
- ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
- ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
- ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
- ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
- ; CHECK: vector.ph:
- ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
- ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
- ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
- ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
- ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
- ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
- ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
- ; CHECK: vector.body:
- ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
- ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
- ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
- ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
- ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
- ; CHECK-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
- ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
- ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
- ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
- ; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
- ; CHECK-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
- ; CHECK-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
- ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
- ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
- ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
- ; CHECK: middle.block:
- ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
- ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
- ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
- ; CHECK: scalar.ph:
+ ; V-LABEL: define i32 @vqdot(
+ ; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+ ; V-NEXT: entry:
+ ; V-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+ ; V-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+ ; V-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+ ; V-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+ ; V: vector.ph:
+ ; V-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+ ; V-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+ ; V-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+ ; V-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+ ; V-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+ ; V-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+ ; V-NEXT: br label [[VECTOR_BODY:%.*]]
+ ; V: vector.body:
+ ; V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+ ; V-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+ ; V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+ ; V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+ ; V-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+ ; V-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+ ; V-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+ ; V-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+ ; V-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+ ; V-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+ ; V-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+ ; V-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
+ ; V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+ ; V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+ ; V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+ ; V: middle.block:
+ ; V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+ ; V-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+ ; V-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+ ; V: scalar.ph:
+ ;
+ ; ZVQDOTQ-LABEL: define i32 @vqdot(
+ ; ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+ ; ZVQDOTQ-NEXT: entry:
+ ; ZVQDOTQ-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+ ; ZVQDOTQ-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+ ; ZVQDOTQ-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+ ; ZVQDOTQ-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+ ; ZVQDOTQ: vector.ph:
+ ; ZVQDOTQ-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+ ; ZVQDOTQ-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+ ; ZVQDOTQ-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+ ; ZVQDOTQ-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+ ; ZVQDOTQ-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+ ; ZVQDOTQ-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+ ; ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]]
+ ; ZVQDOTQ: vector.body:
+ ; ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+ ; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+ ; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+ ; ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+ ; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+ ; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+ ; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+ ; ZVQDOTQ-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+ ; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+ ; ZVQDOTQ-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+ ; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+ ; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
+ ; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+ ; ZVQDOTQ-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+ ; ZVQDOTQ-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+ ; ZVQDOTQ: middle.block:
+ ; ZVQDOTQ-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> [[PARTIAL_REDUCE]])
+ ; ZVQDOTQ-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+ ; ZVQDOTQ-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+ ; ZVQDOTQ: scalar.ph:
;
entry:
br label %for.body
@@ -66,42 +103,79 @@ for.exit: ; preds = %for.body

define i32 @vqdotu(ptr %a, ptr %b) #0 {
- ; CHECK-LABEL: define i32 @vqdotu(
- ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
- ; CHECK-NEXT: entry:
- ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
- ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
- ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
- ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
- ; CHECK: vector.ph:
- ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
- ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
- ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
- ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
- ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
- ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
- ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
- ; CHECK: vector.body:
- ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
- ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
- ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
- ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
- ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
- ; CHECK-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
- ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
- ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
- ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
- ; CHECK-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
- ; CHECK-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
- ; CHECK-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
- ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
- ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
- ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
- ; CHECK: middle.block:
- ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
- ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
- ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
- ; CHECK: scalar.ph:
+ ; V-LABEL: define i32 @vqdotu(
+ ; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+ ; V-NEXT: entry:
+ ; V-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+ ; V-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+ ; V-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+ ; V-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+ ; V: vector.ph:
+ ; V-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+ ; V-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+ ; V-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+ ; V-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+ ; V-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+ ; V-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+ ; V-NEXT: br label [[VECTOR_BODY:%.*]]
+ ; V: vector.body:
+ ; V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+ ; V-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+ ; V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+ ; V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+ ; V-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+ ; V-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+ ; V-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+ ; V-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+ ; V-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+ ; V-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+ ; V-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+ ; V-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
+ ; V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+ ; V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+ ; V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+ ; V: middle.block:
+ ; V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+ ; V-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+ ; V-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+ ; V: scalar.ph:
+ ;
+ ; ZVQDOTQ-LABEL: define i32 @vqdotu(
+ ; ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+ ; ZVQDOTQ-NEXT: entry:
+ ; ZVQDOTQ-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+ ; ZVQDOTQ-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+ ; ZVQDOTQ-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+ ; ZVQDOTQ-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+ ; ZVQDOTQ: vector.ph:
+ ; ZVQDOTQ-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+ ; ZVQDOTQ-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+ ; ZVQDOTQ-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+ ; ZVQDOTQ-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+ ; ZVQDOTQ-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+ ; ZVQDOTQ-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+ ; ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]]
+ ; ZVQDOTQ: vector.body:
+ ; ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+ ; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+ ; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+ ; ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+ ; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+ ; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+ ; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+ ; ZVQDOTQ-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+ ; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+ ; ZVQDOTQ-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+ ; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+ ; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
+ ; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+ ; ZVQDOTQ-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+ ; ZVQDOTQ-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+ ; ZVQDOTQ: middle.block:
+ ; ZVQDOTQ-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> [[PARTIAL_REDUCE]])
+ ; ZVQDOTQ-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+ ; ZVQDOTQ-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+ ; ZVQDOTQ: scalar.ph:
;
entry:
br label %for.body
@@ -128,7 +202,7 @@ for.exit: ; preds = %for.body

define i32 @vqdotsu(ptr %a, ptr %b) #0 {
; CHECK-LABEL: define i32 @vqdotsu(
- ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+ ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
@@ -245,6 +319,3 @@ for.body: ; preds = %for.body, %entry
for.exit: ; preds = %for.body
ret i32 %add
}
- ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
- ; V: {{.*}}
- ; ZVQDOTQ: {{.*}}
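The substance of this diff: with zvqdotq enabled, the vectorizer keeps a narrow <vscale x 1 x i32> accumulator and feeds it through @llvm.experimental.vector.partial.reduce.add instead of accumulating in a full <vscale x 4 x i32> and widening-adding each iteration, which is the shape the backend can later lower to dot-product instructions. Below is a minimal standalone sketch of one such accumulation step, not part of the test itself; the function name @partial_step is illustrative, and the comment's description of the intrinsic (all lanes of the wider operand summed into the narrower accumulator, with the exact lane grouping left unspecified) follows my reading of the LangRef.

; Sketch: one partial-reduction step as it appears in the ZVQDOTQ vector body.
; The intrinsic adds every lane of %prod into %acc; how lanes of %prod map onto
; lanes of %acc is unspecified, so only the final llvm.vector.reduce.add over
; the accumulator (done once, in the middle block) has defined element order.
declare <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32>, <vscale x 4 x i32>)

define <vscale x 1 x i32> @partial_step(<vscale x 1 x i32> %acc, <vscale x 4 x i32> %prod) {
  %r = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> %acc, <vscale x 4 x i32> %prod)
  ret <vscale x 1 x i32> %r
}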