Skip to content

Commit 5b5af52

Browse files
authored
[AArch64] Extend efficient lowering of experimental.cttz.elts (#92114)
This patch extends the more efficient lowering of the experimental.cttz.elts intrinsic to fixed-width vector types. It does so by first creating an SVE predicate register mask from the fixed-width vector.
1 parent 4e0f8a4 commit 5b5af52

File tree

3 files changed

+179
-20
lines changed

3 files changed

+179
-20
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1871,9 +1871,11 @@ bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
18711871
if (!Subtarget->hasSVEorSME())
18721872
return true;
18731873

1874-
// We can only use the BRKB + CNTP sequence with legal predicate types.
1874+
// We can only use the BRKB + CNTP sequence with legal predicate types. We can
1875+
// also support fixed-width predicates.
18751876
return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
1876-
VT != MVT::nxv2i1;
1877+
VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
1878+
VT != MVT::v4i1 && VT != MVT::v2i1;
18771879
}
18781880

18791881
void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
@@ -5838,9 +5840,20 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
58385840
return SDValue();
58395841
}
58405842
case Intrinsic::experimental_cttz_elts: {
5841-
SDValue NewCttzElts =
5842-
DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
5843+
SDValue CttzOp = Op.getOperand(1);
5844+
EVT VT = CttzOp.getValueType();
5845+
assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
58435846

5847+
if (VT.isFixedLengthVector()) {
5848+
// We can use SVE instructions to lower this intrinsic by first creating
5849+
// an SVE predicate register mask from the fixed-width vector.
5850+
EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
5851+
SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, NewVT, CttzOp);
5852+
CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
5853+
}
5854+
5855+
SDValue NewCttzElts =
5856+
DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
58445857
return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
58455858
}
58465859
}

llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,15 @@ define void @foo_no_vscale_range() {
1313
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
1414
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
1515
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
16-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
17-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
18-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
19-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true)
16+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
17+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
18+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
19+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true)
2020
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i64.v32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 true)
21-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
22-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
23-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
24-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
21+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
22+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
23+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
24+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
2525
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true)
2626
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
2727
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
@@ -33,15 +33,15 @@ define void @foo_no_vscale_range() {
3333
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
3434
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
3535
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
36-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
37-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
38-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
39-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false)
36+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
37+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
38+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
39+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false)
4040
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i64.v32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 false)
41-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
42-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
43-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
44-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false)
41+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
42+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
43+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
44+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false)
4545
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false)
4646
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
4747
;

llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,152 @@ define i32 @add_i32_ctz_nxv16i1_poison(<vscale x 16 x i1> %a, i32 %b) {
359359
ret i32 %add
360360
}
361361

362+
; FIXED-WIDTH VECTOR TYPES
363+
364+
define i32 @ctz_v16i1(<16 x i1> %a) {
365+
; CHECK-LABEL: ctz_v16i1:
366+
; CHECK: // %bb.0:
367+
; CHECK-NEXT: shl v0.16b, v0.16b, #7
368+
; CHECK-NEXT: ptrue p0.b, vl16
369+
; CHECK-NEXT: ptrue p1.b
370+
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
371+
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
372+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
373+
; CHECK-NEXT: cntp x0, p0, p0.b
374+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
375+
; CHECK-NEXT: ret
376+
%res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 0)
377+
ret i32 %res
378+
}
379+
380+
define i32 @ctz_v16i1_poison(<16 x i1> %a) {
381+
; CHECK-LABEL: ctz_v16i1_poison:
382+
; CHECK: // %bb.0:
383+
; CHECK-NEXT: shl v0.16b, v0.16b, #7
384+
; CHECK-NEXT: ptrue p0.b, vl16
385+
; CHECK-NEXT: ptrue p1.b
386+
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
387+
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
388+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
389+
; CHECK-NEXT: cntp x0, p0, p0.b
390+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
391+
; CHECK-NEXT: ret
392+
%res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 1)
393+
ret i32 %res
394+
}
395+
396+
define i64 @add_i64_ctz_v16i1_poison(<16 x i1> %a, i64 %b) {
397+
; CHECK-LABEL: add_i64_ctz_v16i1_poison:
398+
; CHECK: // %bb.0:
399+
; CHECK-NEXT: shl v0.16b, v0.16b, #7
400+
; CHECK-NEXT: ptrue p0.b, vl16
401+
; CHECK-NEXT: ptrue p1.b
402+
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
403+
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
404+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
405+
; CHECK-NEXT: incp x0, p0.b
406+
; CHECK-NEXT: ret
407+
%res = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> %a, i1 1)
408+
%add = add i64 %res, %b
409+
ret i64 %add
410+
}
411+
412+
define i32 @ctz_v8i1(<8 x i1> %a) {
413+
; CHECK-LABEL: ctz_v8i1:
414+
; CHECK: // %bb.0:
415+
; CHECK-NEXT: shl v0.8b, v0.8b, #7
416+
; CHECK-NEXT: ptrue p0.b, vl8
417+
; CHECK-NEXT: ptrue p1.b
418+
; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
419+
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
420+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
421+
; CHECK-NEXT: cntp x0, p0, p0.b
422+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
423+
; CHECK-NEXT: ret
424+
%res = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> %a, i1 0)
425+
ret i32 %res
426+
}
427+
428+
define i32 @ctz_v8i1_poison(<8 x i1> %a) {
429+
; CHECK-LABEL: ctz_v8i1_poison:
430+
; CHECK: // %bb.0:
431+
; CHECK-NEXT: shl v0.8b, v0.8b, #7
432+
; CHECK-NEXT: ptrue p0.b, vl8
433+
; CHECK-NEXT: ptrue p1.b
434+
; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
435+
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
436+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
437+
; CHECK-NEXT: cntp x0, p0, p0.b
438+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
439+
; CHECK-NEXT: ret
440+
%res = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> %a, i1 1)
441+
ret i32 %res
442+
}
443+
444+
define i32 @ctz_v4i1(<4 x i1> %a) {
445+
; CHECK-LABEL: ctz_v4i1:
446+
; CHECK: // %bb.0:
447+
; CHECK-NEXT: shl v0.4h, v0.4h, #15
448+
; CHECK-NEXT: ptrue p0.h, vl4
449+
; CHECK-NEXT: ptrue p1.h
450+
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
451+
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
452+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
453+
; CHECK-NEXT: cntp x0, p0, p0.h
454+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
455+
; CHECK-NEXT: ret
456+
%res = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> %a, i1 0)
457+
ret i32 %res
458+
}
459+
460+
define i32 @ctz_v4i1_poison(<4 x i1> %a) {
461+
; CHECK-LABEL: ctz_v4i1_poison:
462+
; CHECK: // %bb.0:
463+
; CHECK-NEXT: shl v0.4h, v0.4h, #15
464+
; CHECK-NEXT: ptrue p0.h, vl4
465+
; CHECK-NEXT: ptrue p1.h
466+
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
467+
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
468+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
469+
; CHECK-NEXT: cntp x0, p0, p0.h
470+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
471+
; CHECK-NEXT: ret
472+
%res = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> %a, i1 1)
473+
ret i32 %res
474+
}
475+
476+
define i32 @ctz_v2i1(<2 x i1> %a) {
477+
; CHECK-LABEL: ctz_v2i1:
478+
; CHECK: // %bb.0:
479+
; CHECK-NEXT: shl v0.2s, v0.2s, #31
480+
; CHECK-NEXT: ptrue p0.s, vl2
481+
; CHECK-NEXT: ptrue p1.s
482+
; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
483+
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
484+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
485+
; CHECK-NEXT: cntp x0, p0, p0.s
486+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
487+
; CHECK-NEXT: ret
488+
%res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 0)
489+
ret i32 %res
490+
}
491+
492+
define i32 @ctz_v2i1_poison(<2 x i1> %a) {
493+
; CHECK-LABEL: ctz_v2i1_poison:
494+
; CHECK: // %bb.0:
495+
; CHECK-NEXT: shl v0.2s, v0.2s, #31
496+
; CHECK-NEXT: ptrue p0.s, vl2
497+
; CHECK-NEXT: ptrue p1.s
498+
; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
499+
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
500+
; CHECK-NEXT: brkb p0.b, p1/z, p0.b
501+
; CHECK-NEXT: cntp x0, p0, p0.s
502+
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
503+
; CHECK-NEXT: ret
504+
%res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 1)
505+
ret i32 %res
506+
}
507+
362508
declare i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1>, i1)
363509
declare i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1>, i1)
364510
declare i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1>, i1)

0 commit comments

Comments
 (0)