Skip to content

Commit 9054eff

Browse files
[LLVM][SVE] Improve legalisation of fixed length get.active.lane.mask operations.
We are effectively performing type and operation legalisation very early within the code generation flow. This results in worse code quality because the DAG is not in canonical form, which DAGCombiner corrects through the introduction of operations that are not legal. This patch splits and moves the code to where type and operation legalisation is typically implemented.
1 parent d6c4ebb commit 9054eff

File tree

2 files changed

+63
-55
lines changed

2 files changed

+63
-55
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 45 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1674,6 +1674,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
16741674
setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
16751675

16761676
setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1677+
1678+
for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1679+
setOperationAction(ISD::INTRINSIC_WO_CHAIN, VT, Custom);
16771680
}
16781681

16791682
if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
@@ -5686,8 +5689,24 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
56865689
case Intrinsic::get_active_lane_mask: {
56875690
SDValue ID =
56885691
DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5689-
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
5690-
Op.getOperand(1), Op.getOperand(2));
5692+
5693+
EVT VT = Op.getValueType();
5694+
if (VT.isScalableVector())
5695+
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
5696+
Op.getOperand(2));
5697+
5698+
// We can use the SVE whilelo instruction to lower this intrinsic by
5699+
// creating the appropriate sequence of scalable vector operations and
5700+
// then extracting a fixed-width subvector from the scalable vector.
5701+
5702+
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
5703+
EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
5704+
5705+
SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID,
5706+
Op.getOperand(1), Op.getOperand(2));
5707+
SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask);
5708+
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
5709+
DAG.getVectorIdxConstant(0, dl));
56915710
}
56925711
case Intrinsic::aarch64_neon_uaddlv: {
56935712
EVT OpVT = Op.getOperand(1).getValueType();
@@ -20462,39 +20481,6 @@ static SDValue performIntrinsicCombine(SDNode *N,
2046220481
switch (IID) {
2046320482
default:
2046420483
break;
20465-
case Intrinsic::get_active_lane_mask: {
20466-
SDValue Res = SDValue();
20467-
EVT VT = N->getValueType(0);
20468-
if (VT.isFixedLengthVector()) {
20469-
// We can use the SVE whilelo instruction to lower this intrinsic by
20470-
// creating the appropriate sequence of scalable vector operations and
20471-
// then extracting a fixed-width subvector from the scalable vector.
20472-
20473-
SDLoc DL(N);
20474-
SDValue ID =
20475-
DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
20476-
20477-
EVT WhileVT = EVT::getVectorVT(
20478-
*DAG.getContext(), MVT::i1,
20479-
ElementCount::getScalable(VT.getVectorNumElements()));
20480-
20481-
// Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
20482-
EVT PromVT = getPromotedVTForPredicate(WhileVT);
20483-
20484-
// Get the fixed-width equivalent of PromVT for extraction.
20485-
EVT ExtVT =
20486-
EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
20487-
VT.getVectorElementCount());
20488-
20489-
Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
20490-
N->getOperand(1), N->getOperand(2));
20491-
Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
20492-
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
20493-
DAG.getConstant(0, DL, MVT::i64));
20494-
Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
20495-
}
20496-
return Res;
20497-
}
2049820484
case Intrinsic::aarch64_neon_vcvtfxs2fp:
2049920485
case Intrinsic::aarch64_neon_vcvtfxu2fp:
2050020486
return tryCombineFixedPointConvert(N, DCI, DAG);
@@ -25568,15 +25554,15 @@ void AArch64TargetLowering::ReplaceNodeResults(
2556825554
return;
2556925555
case ISD::INTRINSIC_WO_CHAIN: {
2557025556
EVT VT = N->getValueType(0);
25571-
assert((VT == MVT::i8 || VT == MVT::i16) &&
25572-
"custom lowering for unexpected type");
2557325557

2557425558
Intrinsic::ID IntID =
2557525559
static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
2557625560
switch (IntID) {
2557725561
default:
2557825562
return;
2557925563
case Intrinsic::aarch64_sve_clasta_n: {
25564+
assert((VT == MVT::i8 || VT == MVT::i16) &&
25565+
"custom lowering for unexpected type");
2558025566
SDLoc DL(N);
2558125567
auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
2558225568
auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
@@ -25585,6 +25571,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
2558525571
return;
2558625572
}
2558725573
case Intrinsic::aarch64_sve_clastb_n: {
25574+
assert((VT == MVT::i8 || VT == MVT::i16) &&
25575+
"custom lowering for unexpected type");
2558825576
SDLoc DL(N);
2558925577
auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
2559025578
auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
@@ -25593,19 +25581,39 @@ void AArch64TargetLowering::ReplaceNodeResults(
2559325581
return;
2559425582
}
2559525583
case Intrinsic::aarch64_sve_lasta: {
25584+
assert((VT == MVT::i8 || VT == MVT::i16) &&
25585+
"custom lowering for unexpected type");
2559625586
SDLoc DL(N);
2559725587
auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
2559825588
N->getOperand(1), N->getOperand(2));
2559925589
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
2560025590
return;
2560125591
}
2560225592
case Intrinsic::aarch64_sve_lastb: {
25593+
assert((VT == MVT::i8 || VT == MVT::i16) &&
25594+
"custom lowering for unexpected type");
2560325595
SDLoc DL(N);
2560425596
auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
2560525597
N->getOperand(1), N->getOperand(2));
2560625598
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
2560725599
return;
2560825600
}
25601+
case Intrinsic::get_active_lane_mask: {
25602+
if (!VT.isFixedLengthVector())
25603+
return;
25604+
if (VT.getVectorElementType() != MVT::i1)
25605+
return;
25606+
25607+
// NOTE: Only trivial type promotion is supported.
25608+
EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
25609+
if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
25610+
return;
25611+
25612+
SDLoc DL(N);
25613+
auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
25614+
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25615+
return;
25616+
}
2560925617
}
2561025618
}
2561125619
case ISD::READ_REGISTER: {

llvm/test/CodeGen/AArch64/active_lane_mask.ll

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -353,9 +353,9 @@ define <16 x i1> @lane_mask_v16i1_i32(i32 %index, i32 %TC) {
353353
define <8 x i1> @lane_mask_v8i1_i32(i32 %index, i32 %TC) {
354354
; CHECK-LABEL: lane_mask_v8i1_i32:
355355
; CHECK: // %bb.0:
356-
; CHECK-NEXT: whilelo p0.h, w0, w1
357-
; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
358-
; CHECK-NEXT: xtn v0.8b, v0.8h
356+
; CHECK-NEXT: whilelo p0.b, w0, w1
357+
; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
358+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
359359
; CHECK-NEXT: ret
360360
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %TC)
361361
ret <8 x i1> %active.lane.mask
@@ -364,9 +364,9 @@ define <8 x i1> @lane_mask_v8i1_i32(i32 %index, i32 %TC) {
364364
define <4 x i1> @lane_mask_v4i1_i32(i32 %index, i32 %TC) {
365365
; CHECK-LABEL: lane_mask_v4i1_i32:
366366
; CHECK: // %bb.0:
367-
; CHECK-NEXT: whilelo p0.s, w0, w1
368-
; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
369-
; CHECK-NEXT: xtn v0.4h, v0.4s
367+
; CHECK-NEXT: whilelo p0.h, w0, w1
368+
; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
369+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
370370
; CHECK-NEXT: ret
371371
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %TC)
372372
ret <4 x i1> %active.lane.mask
@@ -375,9 +375,9 @@ define <4 x i1> @lane_mask_v4i1_i32(i32 %index, i32 %TC) {
375375
define <2 x i1> @lane_mask_v2i1_i32(i32 %index, i32 %TC) {
376376
; CHECK-LABEL: lane_mask_v2i1_i32:
377377
; CHECK: // %bb.0:
378-
; CHECK-NEXT: whilelo p0.d, w0, w1
379-
; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff
380-
; CHECK-NEXT: xtn v0.2s, v0.2d
378+
; CHECK-NEXT: whilelo p0.s, w0, w1
379+
; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
380+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
381381
; CHECK-NEXT: ret
382382
%active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 %index, i32 %TC)
383383
ret <2 x i1> %active.lane.mask
@@ -397,9 +397,9 @@ define <16 x i1> @lane_mask_v16i1_i64(i64 %index, i64 %TC) {
397397
define <8 x i1> @lane_mask_v8i1_i64(i64 %index, i64 %TC) {
398398
; CHECK-LABEL: lane_mask_v8i1_i64:
399399
; CHECK: // %bb.0:
400-
; CHECK-NEXT: whilelo p0.h, x0, x1
401-
; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
402-
; CHECK-NEXT: xtn v0.8b, v0.8h
400+
; CHECK-NEXT: whilelo p0.b, x0, x1
401+
; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
402+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
403403
; CHECK-NEXT: ret
404404
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 %index, i64 %TC)
405405
ret <8 x i1> %active.lane.mask
@@ -408,9 +408,9 @@ define <8 x i1> @lane_mask_v8i1_i64(i64 %index, i64 %TC) {
408408
define <4 x i1> @lane_mask_v4i1_i64(i64 %index, i64 %TC) {
409409
; CHECK-LABEL: lane_mask_v4i1_i64:
410410
; CHECK: // %bb.0:
411-
; CHECK-NEXT: whilelo p0.s, x0, x1
412-
; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
413-
; CHECK-NEXT: xtn v0.4h, v0.4s
411+
; CHECK-NEXT: whilelo p0.h, x0, x1
412+
; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
413+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
414414
; CHECK-NEXT: ret
415415
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %index, i64 %TC)
416416
ret <4 x i1> %active.lane.mask
@@ -419,9 +419,9 @@ define <4 x i1> @lane_mask_v4i1_i64(i64 %index, i64 %TC) {
419419
define <2 x i1> @lane_mask_v2i1_i64(i64 %index, i64 %TC) {
420420
; CHECK-LABEL: lane_mask_v2i1_i64:
421421
; CHECK: // %bb.0:
422-
; CHECK-NEXT: whilelo p0.d, x0, x1
423-
; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff
424-
; CHECK-NEXT: xtn v0.2s, v0.2d
422+
; CHECK-NEXT: whilelo p0.s, x0, x1
423+
; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
424+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
425425
; CHECK-NEXT: ret
426426
%active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 %index, i64 %TC)
427427
ret <2 x i1> %active.lane.mask

0 commit comments

Comments
 (0)