Skip to content

Commit f893b47

Browse files
authored
[ARM] Fix instruction selection for MVE vsbciq intrinsic (#118284)
There were two bugs in the implementation of the MVE vsbciq (subtract with carry across vector, with initial carry value) intrinsics:

* The VSBCI instruction behaves as if the carry-in is always set, but we were selecting it when the carry-in is clear.
* The vsbciq intrinsics should generate IR with the carry-in set, but they were leaving it clear.

These two bugs almost cancelled each other out, but resulted in incorrect code when the vsbcq intrinsics (with a carry-in) were used and the carry-in was a compile-time constant.
1 parent efe4bfa commit f893b47

File tree

5 files changed

+101
-13
lines changed

5 files changed

+101
-13
lines changed

clang/include/clang/Basic/arm_mve.td

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1270,13 +1270,13 @@ defm sqrshr: ScalarSaturatingShiftReg<s32, s64>;
12701270
def lsll: LongScalarShift<u64, (args s32:$sh), (IRInt<"lsll"> $lo, $hi, $sh)>;
12711271
def asrl: LongScalarShift<s64, (args s32:$sh), (IRInt<"asrl"> $lo, $hi, $sh)>;
12721272

1273-
multiclass vadcsbc {
1273+
multiclass vadcsbc<dag initial_carry_in> {
12741274
def q: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<uint>:$carry),
12751275
(seq (IRInt<NAME, [Vector]> $a, $b, (shl (load $carry), 29)):$pair,
12761276
(store (and 1, (lshr (xval $pair, 1), 29)), $carry),
12771277
(xval $pair, 0))>;
12781278
def iq: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<uint>:$carry),
1279-
(seq (IRInt<NAME, [Vector]> $a, $b, 0):$pair,
1279+
(seq (IRInt<NAME, [Vector]> $a, $b, initial_carry_in):$pair,
12801280
(store (and 1, (lshr (xval $pair, 1), 29)), $carry),
12811281
(xval $pair, 0))>;
12821282
def q_m: Intrinsic<Vector, (args Vector:$inactive, Vector:$a, Vector:$b,
@@ -1288,13 +1288,13 @@ multiclass vadcsbc {
12881288
def iq_m: Intrinsic<Vector, (args Vector:$inactive, Vector:$a, Vector:$b,
12891289
Ptr<uint>:$carry, Predicate:$pred),
12901290
(seq (IRInt<NAME # "_predicated", [Vector, Predicate]> $inactive, $a, $b,
1291-
0, $pred):$pair,
1291+
initial_carry_in, $pred):$pair,
12921292
(store (and 1, (lshr (xval $pair, 1), 29)), $carry),
12931293
(xval $pair, 0))>;
12941294
}
12951295
let params = T.Int32 in {
1296-
defm vadc: vadcsbc;
1297-
defm vsbc: vadcsbc;
1296+
defm vadc: vadcsbc<(u32 0)>;
1297+
defm vsbc: vadcsbc<(shl 1, 29)>;
12981298
}
12991299

13001300
let params = T.Int in {

clang/test/CodeGen/arm-mve-intrinsics/vadc.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ int32x4_t test_vadcq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, unsigne
9292

9393
// CHECK-LABEL: @test_vsbciq_s32(
9494
// CHECK-NEXT: entry:
95-
// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
95+
// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912)
9696
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 1
9797
// CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 29
9898
// CHECK-NEXT: [[TMP3:%.*]] = and i32 1, [[TMP2]]
@@ -110,7 +110,7 @@ int32x4_t test_vsbciq_s32(int32x4_t a, int32x4_t b, unsigned *carry_out) {
110110

111111
// CHECK-LABEL: @test_vsbciq_u32(
112112
// CHECK-NEXT: entry:
113-
// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
113+
// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912)
114114
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 1
115115
// CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 29
116116
// CHECK-NEXT: [[TMP3:%.*]] = and i32 1, [[TMP2]]
@@ -170,7 +170,7 @@ uint32x4_t test_vsbcq_u32(uint32x4_t a, uint32x4_t b, unsigned *carry) {
170170
// CHECK-NEXT: entry:
171171
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
172172
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
173-
// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
173+
// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912, <4 x i1> [[TMP1]])
174174
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 1
175175
// CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 29
176176
// CHECK-NEXT: [[TMP5:%.*]] = and i32 1, [[TMP4]]
@@ -190,7 +190,7 @@ int32x4_t test_vsbciq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, unsign
190190
// CHECK-NEXT: entry:
191191
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
192192
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
193-
// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
193+
// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912, <4 x i1> [[TMP1]])
194194
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 1
195195
// CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 29
196196
// CHECK-NEXT: [[TMP5:%.*]] = and i32 1, [[TMP4]]

llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5229,7 +5229,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
52295229
return;
52305230
case Intrinsic::arm_mve_vsbc:
52315231
case Intrinsic::arm_mve_vsbc_predicated:
5232-
SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, true,
5232+
SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, false,
52335233
IntNo == Intrinsic::arm_mve_vsbc_predicated);
52345234
return;
52355235
case Intrinsic::arm_mve_vshlc:

llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll

Lines changed: 90 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_s32(<4 x i32> %a, <4 x i32> %b, pt
108108
; CHECK-NEXT: str r1, [r0]
109109
; CHECK-NEXT: bx lr
110110
entry:
111-
%0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
111+
%0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 536870912)
112112
%1 = extractvalue { <4 x i32>, i32 } %0, 1
113113
%2 = lshr i32 %1, 29
114114
%3 = and i32 %2, 1
@@ -125,6 +125,46 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_u32(<4 x i32> %a, <4 x i32> %b, pt
125125
; CHECK-NEXT: ubfx r1, r1, #29, #1
126126
; CHECK-NEXT: str r1, [r0]
127127
; CHECK-NEXT: bx lr
128+
entry:
129+
%0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 536870912)
130+
%1 = extractvalue { <4 x i32>, i32 } %0, 1
131+
%2 = lshr i32 %1, 29
132+
%3 = and i32 %2, 1
133+
store i32 %3, ptr %carry_out, align 4
134+
%4 = extractvalue { <4 x i32>, i32 } %0, 0
135+
ret <4 x i32> %4
136+
}
137+
138+
define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_s32_carry_in_zero(<4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out) {
139+
; CHECK-LABEL: test_vsbcq_s32_carry_in_zero:
140+
; CHECK: @ %bb.0: @ %entry
141+
; CHECK-NEXT: movs r1, #0
142+
; CHECK-NEXT: vmsr fpscr_nzcvqc, r1
143+
; CHECK-NEXT: vsbc.i32 q0, q0, q1
144+
; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
145+
; CHECK-NEXT: ubfx r1, r1, #29, #1
146+
; CHECK-NEXT: str r1, [r0]
147+
; CHECK-NEXT: bx lr
148+
entry:
149+
%0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
150+
%1 = extractvalue { <4 x i32>, i32 } %0, 1
151+
%2 = lshr i32 %1, 29
152+
%3 = and i32 %2, 1
153+
store i32 %3, ptr %carry_out, align 4
154+
%4 = extractvalue { <4 x i32>, i32 } %0, 0
155+
ret <4 x i32> %4
156+
}
157+
158+
define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_u32_carry_in_zero(<4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out) {
159+
; CHECK-LABEL: test_vsbcq_u32_carry_in_zero:
160+
; CHECK: @ %bb.0: @ %entry
161+
; CHECK-NEXT: movs r1, #0
162+
; CHECK-NEXT: vmsr fpscr_nzcvqc, r1
163+
; CHECK-NEXT: vsbc.i32 q0, q0, q1
164+
; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
165+
; CHECK-NEXT: ubfx r1, r1, #29, #1
166+
; CHECK-NEXT: str r1, [r0]
167+
; CHECK-NEXT: bx lr
128168
entry:
129169
%0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
130170
%1 = extractvalue { <4 x i32>, i32 } %0, 1
@@ -196,7 +236,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_m_s32(<4 x i32> %inactive, <4 x i3
196236
entry:
197237
%0 = zext i16 %p to i32
198238
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
199-
%2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1)
239+
%2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 536870912, <4 x i1> %1)
200240
%3 = extractvalue { <4 x i32>, i32 } %2, 1
201241
%4 = lshr i32 %3, 29
202242
%5 = and i32 %4, 1
@@ -215,6 +255,54 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_m_u32(<4 x i32> %inactive, <4 x i3
215255
; CHECK-NEXT: ubfx r1, r1, #29, #1
216256
; CHECK-NEXT: str r1, [r0]
217257
; CHECK-NEXT: bx lr
258+
entry:
259+
%0 = zext i16 %p to i32
260+
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
261+
%2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 536870912, <4 x i1> %1)
262+
%3 = extractvalue { <4 x i32>, i32 } %2, 1
263+
%4 = lshr i32 %3, 29
264+
%5 = and i32 %4, 1
265+
store i32 %5, ptr %carry_out, align 4
266+
%6 = extractvalue { <4 x i32>, i32 } %2, 0
267+
ret <4 x i32> %6
268+
}
269+
270+
define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_m_s32_carry_in_zero(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out, i16 zeroext %p) {
271+
; CHECK-LABEL: test_vsbcq_m_s32_carry_in_zero:
272+
; CHECK: @ %bb.0: @ %entry
273+
; CHECK-NEXT: movs r2, #0
274+
; CHECK-NEXT: vmsr p0, r1
275+
; CHECK-NEXT: vmsr fpscr_nzcvqc, r2
276+
; CHECK-NEXT: vpst
277+
; CHECK-NEXT: vsbct.i32 q0, q1, q2
278+
; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
279+
; CHECK-NEXT: ubfx r1, r1, #29, #1
280+
; CHECK-NEXT: str r1, [r0]
281+
; CHECK-NEXT: bx lr
282+
entry:
283+
%0 = zext i16 %p to i32
284+
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
285+
%2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1)
286+
%3 = extractvalue { <4 x i32>, i32 } %2, 1
287+
%4 = lshr i32 %3, 29
288+
%5 = and i32 %4, 1
289+
store i32 %5, ptr %carry_out, align 4
290+
%6 = extractvalue { <4 x i32>, i32 } %2, 0
291+
ret <4 x i32> %6
292+
}
293+
294+
define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_m_u32_carry_in_zero(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out, i16 zeroext %p) {
295+
; CHECK-LABEL: test_vsbcq_m_u32_carry_in_zero:
296+
; CHECK: @ %bb.0: @ %entry
297+
; CHECK-NEXT: movs r2, #0
298+
; CHECK-NEXT: vmsr p0, r1
299+
; CHECK-NEXT: vmsr fpscr_nzcvqc, r2
300+
; CHECK-NEXT: vpst
301+
; CHECK-NEXT: vsbct.i32 q0, q1, q2
302+
; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
303+
; CHECK-NEXT: ubfx r1, r1, #29, #1
304+
; CHECK-NEXT: str r1, [r0]
305+
; CHECK-NEXT: bx lr
218306
entry:
219307
%0 = zext i16 %p to i32
220308
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)

llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ define void @sub_256(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x
5959
; CHECK-NEXT: pop.w {r7, lr}
6060
; CHECK-NEXT: b use_int32x4_t
6161
entry:
62-
%adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
62+
%adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 536870912)
6363
%carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
6464
%result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
6565
tail call void @use_int32x4_t(<4 x i32> %result_low)

0 commit comments

Comments
 (0)