 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-unknown-elf"
 
-define i32 @cdotp(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+define i32 @cdotp() {
 ; CHECK-LABEL: define i32 @cdotp(
-; CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
-; CHECK-NEXT: br i1 [[CMP28_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
-; CHECK: [[FOR_BODY_PREHEADER]]:
-; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
-; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[INDEX_I:%.*]] = shl nuw nsw i64 [[INDEX]], 1
-; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX_I]]
-; CHECK-NEXT: [[A_LOAD:%.*]] = load <vscale x 32 x i8>, ptr [[A_PTR]], align 32
-; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX_I]]
-; CHECK-NEXT: [[B_LOAD:%.*]] = load <vscale x 32 x i8>, ptr [[B_PTR]], align 32
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A_LOAD]], i64 0)
-; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B_LOAD]], i64 0)
-; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A_LOAD]], i64 16)
-; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B_LOAD]], i64 16)
-; CHECK-NEXT: [[VEC_PHI:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
-; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
-; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i32 0)
-; CHECK-NEXT: [[TMP21:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP13]], <vscale x 16 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i32 0)
-; CHECK-NEXT: [[TMP22:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP10]], i64 0)
-; CHECK-NEXT: [[TMP20]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP21]], i64 4)
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer))
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[TMP1]], i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[TMP1]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[TMP1]], i64 16)
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[TMP1]], i64 16)
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP0]], i64 0)
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP0]], i64 4)
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i32 0)
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP7]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], i32 0)
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP11]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP9]], i64 4)
+; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP20]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]:
-; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], %[[FOR_BODY]] ], [ [[TMP23]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
-; CHECK: [[FOR_COND_CLEANUP]]:
-; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUB_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[RES_030:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUB]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
-; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP15]] to i32
-; CHECK-NEXT: [[TMP16:%.*]] = or disjoint i64 [[TMP14]], 1
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; CHECK-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX9]], align 1
-; CHECK-NEXT: [[CONV10:%.*]] = sext i8 [[TMP18]] to i32
-; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX14]], align 1
-; CHECK-NEXT: [[CONV15:%.*]] = sext i8 [[TMP19]] to i32
-; CHECK-NEXT: [[MUL16:%.*]] = mul nsw i32 [[CONV10]], [[CONV]]
-; CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[MUL16]], [[RES_030]]
-; CHECK-NEXT: [[MUL18:%.*]] = mul nsw i32 [[CONV15]], [[CONV5]]
-; CHECK-NEXT: [[SUB]] = sub i32 [[ADD17]], [[MUL18]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP11]])
+; CHECK-NEXT: ret i32 [[TMP12]]
 ;
 entry:
-  %cmp28.not = icmp ult i32 %N, 2
-  br i1 %cmp28.not, label %for.cond.cleanup, label %for.body.preheader
-for.body.preheader: ; preds = %entry
-  %div27 = lshr i32 %N, 1
-  %wide.trip.count = zext nneg i32 %div27 to i64
-  %0 = call i64 @llvm.vscale.i64()
-  %1 = mul i64 %0, 16
-  %min.iters.check = icmp ult i64 %wide.trip.count, %1
-  br i1 %min.iters.check, label %scalar.ph, label %vector.ph
-vector.ph: ; preds = %for.body.preheader
-  %2 = call i64 @llvm.vscale.i64()
-  %3 = mul i64 %2, 16
-  %n.mod.vf = urem i64 %wide.trip.count, %3
-  %n.vec = sub i64 %wide.trip.count, %n.mod.vf
-  %4 = call i64 @llvm.vscale.i64()
-  %5 = mul i64 %4, 16
   br label %vector.body
-vector.body:
-  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %partial.reduce.sub, %vector.body ]
-  %index.i = shl nuw nsw i64 %index, 1
-  %a.ptr = getelementptr inbounds i8, ptr %a, i64 %index.i
-  %a.load = load <vscale x 32 x i8>, ptr %a.ptr
-  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %a.load)
-  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
-  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
-  %b.ptr = getelementptr inbounds i8, ptr %b, i64 %index.i
-  %b.load = load <vscale x 32 x i8>, ptr %b.ptr
-  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %b.load)
-  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
-  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
-  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
-  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
-  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
-  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
-  %real.mul = mul nsw <vscale x 16 x i32> %b.real.ext, %a.real.ext
+
+vector.body: ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.real.ext = sext <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer) to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer) to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer) to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer) to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
   %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul)
-  %imag.mul = mul nsw <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
   %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
   %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
-  %index.next = add nuw i64 %index, %5
-  %22 = icmp eq i64 %index.next, %n.vec
-  br i1 %22, label %middle.block, label %vector.body
+  br i1 true, label %middle.block, label %vector.body
+
 middle.block: ; preds = %vector.body
-  %25 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
-  %cmp.n = icmp eq i64 %wide.trip.count, %n.vec
-  br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %scalar.ph
-scalar.ph: ; preds = %middle.block, %for.body.preheader
-  %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.preheader ]
-  %bc.merge.rdx = phi i32 [ %25, %middle.block ], [ 0, %for.body.preheader ]
-  br label %for.body
-for.cond.cleanup.loopexit: ; preds = %middle.block, %for.body
-  %sub.lcssa = phi i32 [ %sub, %for.body ], [ %25, %middle.block ]
-  br label %for.cond.cleanup
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
-  %res.0.lcssa = phi i32 [ 0, %entry ], [ %sub.lcssa, %for.cond.cleanup.loopexit ]
-  ret i32 %res.0.lcssa
-for.body: ; preds = %scalar.ph, %for.body
-  %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
-  %res.030 = phi i32 [ %bc.merge.rdx, %scalar.ph ], [ %sub, %for.body ]
-  %26 = shl nuw nsw i64 %indvars.iv, 1
-  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %26
-  %27 = load i8, ptr %arrayidx, align 1
-  %conv = sext i8 %27 to i32
-  %28 = or disjoint i64 %26, 1
-  %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 %28
-  %29 = load i8, ptr %arrayidx4, align 1
-  %conv5 = sext i8 %29 to i32
-  %arrayidx9 = getelementptr inbounds i8, ptr %b, i64 %26
-  %30 = load i8, ptr %arrayidx9, align 1
-  %conv10 = sext i8 %30 to i32
-  %arrayidx14 = getelementptr inbounds i8, ptr %b, i64 %28
-  %31 = load i8, ptr %arrayidx14, align 1
-  %conv15 = sext i8 %31 to i32
-  %mul16 = mul nsw i32 %conv10, %conv
-  %add17 = add nsw i32 %mul16, %res.030
-  %mul18 = mul nsw i32 %conv15, %conv5
-  %sub = sub i32 %add17, %mul18
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+  %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
+  ret i32 %0
 }
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>) #0
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>) #1
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }