[LLVM][SVE] Improve legalisation of fixed length get.active.lane.mask #90213
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Paul Walker (paulwalker-arm)

Changes

… operations. We are effectively performing type and operation legalisation very early within the code generation flow. This results in worse code quality because the DAG is not in canonical form, which DAGCombiner corrects through the introduction of operations that are not legal. This patch splits and moves the code to where type and operation legalisation is typically implemented. (Concretely, the v8i1 case in the tests below previously emitted a halfword whilelo followed by an xtn narrow; it now emits a byte-granule whilelo directly.)
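For background, llvm.get.active.lane.mask is typically emitted by the loop vectoriser when tail-folding loops. A minimal sketch of creating the fixed-length form through IRBuilder (an illustration only: the helper name and the surrounding Idx/TC values are assumptions, not taken from this patch):

```cpp
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

// Builds: %m = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %idx, i32 %tc)
// Idx and TC are assumed to be existing i32 values in the enclosing function.
static Value *buildFixedLaneMask(IRBuilder<> &B, Value *Idx, Value *TC) {
  Type *MaskTy = FixedVectorType::get(B.getInt1Ty(), /*NumElts=*/8);
  // The intrinsic is overloaded on both the result vector type and the
  // index/trip-count type, hence the two overload types passed here.
  return B.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                           {MaskTy, Idx->getType()}, {Idx, TC});
}
```

Full diff: https://github.com/llvm/llvm-project/pull/90213.diff

2 Files Affected: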
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8e9782c1930c3c..f7e25d53e94262 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1674,6 +1674,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
setOperationAction(ISD::VSCALE, MVT::i32, Custom);
+
+ for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, VT, Custom);
}
if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
@@ -5686,8 +5689,24 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::get_active_lane_mask: {
SDValue ID =
DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
- Op.getOperand(1), Op.getOperand(2));
+
+ EVT VT = Op.getValueType();
+ if (VT.isScalableVector())
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
+ Op.getOperand(2));
+
+ // We can use the SVE whilelo instruction to lower this intrinsic by
+ // creating the appropriate sequence of scalable vector operations and
+ // then extracting a fixed-width subvector from the scalable vector.
+
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
+
+ SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID,
+ Op.getOperand(1), Op.getOperand(2));
+ SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
+ DAG.getVectorIdxConstant(0, dl));
}
case Intrinsic::aarch64_neon_uaddlv: {
EVT OpVT = Op.getOperand(1).getValueType();
@@ -20462,39 +20481,6 @@ static SDValue performIntrinsicCombine(SDNode *N,
switch (IID) {
default:
break;
- case Intrinsic::get_active_lane_mask: {
- SDValue Res = SDValue();
- EVT VT = N->getValueType(0);
- if (VT.isFixedLengthVector()) {
- // We can use the SVE whilelo instruction to lower this intrinsic by
- // creating the appropriate sequence of scalable vector operations and
- // then extracting a fixed-width subvector from the scalable vector.
-
- SDLoc DL(N);
- SDValue ID =
- DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
-
- EVT WhileVT = EVT::getVectorVT(
- *DAG.getContext(), MVT::i1,
- ElementCount::getScalable(VT.getVectorNumElements()));
-
- // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
- EVT PromVT = getPromotedVTForPredicate(WhileVT);
-
- // Get the fixed-width equivalent of PromVT for extraction.
- EVT ExtVT =
- EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
- VT.getVectorElementCount());
-
- Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
- N->getOperand(1), N->getOperand(2));
- Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
- DAG.getConstant(0, DL, MVT::i64));
- Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
- }
- return Res;
- }
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
@@ -25568,8 +25554,6 @@ void AArch64TargetLowering::ReplaceNodeResults(
return;
case ISD::INTRINSIC_WO_CHAIN: {
EVT VT = N->getValueType(0);
- assert((VT == MVT::i8 || VT == MVT::i16) &&
- "custom lowering for unexpected type");
Intrinsic::ID IntID =
static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
@@ -25577,6 +25561,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
default:
return;
case Intrinsic::aarch64_sve_clasta_n: {
+ assert((VT == MVT::i8 || VT == MVT::i16) &&
+ "custom lowering for unexpected type");
SDLoc DL(N);
auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
@@ -25585,6 +25571,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
return;
}
case Intrinsic::aarch64_sve_clastb_n: {
+ assert((VT == MVT::i8 || VT == MVT::i16) &&
+ "custom lowering for unexpected type");
SDLoc DL(N);
auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
@@ -25593,6 +25581,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
return;
}
case Intrinsic::aarch64_sve_lasta: {
+ assert((VT == MVT::i8 || VT == MVT::i16) &&
+ "custom lowering for unexpected type");
SDLoc DL(N);
auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
N->getOperand(1), N->getOperand(2));
@@ -25600,12 +25590,30 @@ void AArch64TargetLowering::ReplaceNodeResults(
return;
}
case Intrinsic::aarch64_sve_lastb: {
+ assert((VT == MVT::i8 || VT == MVT::i16) &&
+ "custom lowering for unexpected type");
SDLoc DL(N);
auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
N->getOperand(1), N->getOperand(2));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
+ case Intrinsic::get_active_lane_mask: {
+ if (!VT.isFixedLengthVector())
+ return;
+ if (VT.getVectorElementType() != MVT::i1)
+ return;
+
+ // NOTE: Only trivial type promotion is supported.
+ EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
+ if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
+ return;
+
+ SDLoc DL(N);
+ auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
+ return;
+ }
}
}
case ISD::READ_REGISTER: {
diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index 43122c8c953fc7..bd5d076d1ba82e 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -353,9 +353,9 @@ define <16 x i1> @lane_mask_v16i1_i32(i32 %index, i32 %TC) {
define <8 x i1> @lane_mask_v8i1_i32(i32 %index, i32 %TC) {
; CHECK-LABEL: lane_mask_v8i1_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: whilelo p0.h, w0, w1
-; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: whilelo p0.b, w0, w1
+; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %TC)
ret <8 x i1> %active.lane.mask
@@ -364,9 +364,9 @@ define <8 x i1> @lane_mask_v8i1_i32(i32 %index, i32 %TC) {
define <4 x i1> @lane_mask_v4i1_i32(i32 %index, i32 %TC) {
; CHECK-LABEL: lane_mask_v4i1_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: whilelo p0.s, w0, w1
-; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: whilelo p0.h, w0, w1
+; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %TC)
ret <4 x i1> %active.lane.mask
@@ -375,9 +375,9 @@ define <4 x i1> @lane_mask_v4i1_i32(i32 %index, i32 %TC) {
define <2 x i1> @lane_mask_v2i1_i32(i32 %index, i32 %TC) {
; CHECK-LABEL: lane_mask_v2i1_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: whilelo p0.d, w0, w1
-; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: whilelo p0.s, w0, w1
+; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 %index, i32 %TC)
ret <2 x i1> %active.lane.mask
@@ -397,9 +397,9 @@ define <16 x i1> @lane_mask_v16i1_i64(i64 %index, i64 %TC) {
define <8 x i1> @lane_mask_v8i1_i64(i64 %index, i64 %TC) {
; CHECK-LABEL: lane_mask_v8i1_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: whilelo p0.h, x0, x1
-; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: whilelo p0.b, x0, x1
+; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 %index, i64 %TC)
ret <8 x i1> %active.lane.mask
@@ -408,9 +408,9 @@ define <8 x i1> @lane_mask_v8i1_i64(i64 %index, i64 %TC) {
define <4 x i1> @lane_mask_v4i1_i64(i64 %index, i64 %TC) {
; CHECK-LABEL: lane_mask_v4i1_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: whilelo p0.s, x0, x1
-; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: whilelo p0.h, x0, x1
+; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %index, i64 %TC)
ret <4 x i1> %active.lane.mask
@@ -419,9 +419,9 @@ define <4 x i1> @lane_mask_v4i1_i64(i64 %index, i64 %TC) {
define <2 x i1> @lane_mask_v2i1_i64(i64 %index, i64 %TC) {
; CHECK-LABEL: lane_mask_v2i1_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: whilelo p0.d, x0, x1
-; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: whilelo p0.s, x0, x1
+; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 %index, i64 %TC)
ret <2 x i1> %active.lane.mask
Force-pushed from 0b96ba3 to 7b7fb40.
The failures are due to a breakage upstream.
Force-pushed from 7b7fb40 to 9054eff.
A work of art @paulwalker-arm! I just had one minor suggestion, but if you don't agree with my comment then I'll just accept it.
case Intrinsic::get_active_lane_mask: {
  if (!VT.isFixedLengthVector())
    return;
  if (VT.getVectorElementType() != MVT::i1)
Maybe it looks neater to combine these two if statements into one?
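Presumably something along these lines (a sketch of the suggested change, not the committed code):

```cpp
// Combined guard: bail out unless the result is a fixed-length vector of i1.
if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
  return;
```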
Makes sense given they are related. I'll post an update, likely tomorrow.
Done
LGTM!