[AArch64][SelectionDAG] Vector splitting and promotion for histogram intrinsic #103037
Conversation
Adds support for wider-than-legal vector types for the histogram intrinsic (llvm.experimental.vector.histogram.add) by splitting the vector.
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-aarch64

Author: Max Beck-Jones (DevM-uk)

Changes: Adds support for wider-than-legal vector types for the histogram intrinsic (llvm.experimental.vector.histogram.add) by splitting the vector.

Full diff: https://github.com/llvm/llvm-project/pull/103037.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7777aa4b50a370..7c9b34b272f17d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1128,6 +1128,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
+ setTargetDAGCombine(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM);
+
// In case of strict alignment, avoid an excessive number of byte wide stores.
MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemset =
@@ -25434,6 +25436,41 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return NVCAST;
}
+static SDValue performHistogramCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(N);
+ SDLoc DL(HG);
+ SDValue Chain = HG->getChain();
+ SDValue Inc = HG->getInc();
+ SDValue Mask = HG->getMask();
+ SDValue Ptr = HG->getBasePtr();
+ SDValue Index = HG->getIndex();
+ SDValue Scale = HG->getScale();
+ SDValue IntID = HG->getIntID();
+ EVT MemVT = HG->getMemoryVT();
+ EVT IndexVT = Index.getValueType();
+ MachineMemOperand *MMO = HG->getMemOperand();
+ ISD::MemIndexType IndexType = HG->getIndexType();
+
+ if (IndexVT == MVT::nxv4i32 || IndexVT == MVT::nxv2i64)
+ return SDValue();
+
+ // Split vectors which are too wide
+ SDValue IndexLo, IndexHi, MaskLo, MaskHi;
+ std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+ SDValue HistogramOpsLo[] = {Chain, Inc, MaskLo, Ptr, IndexLo, Scale, IntID};
+ SDValue HChain = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL,
+ HistogramOpsLo, MMO, IndexType);
+ SDValue HistogramOpsHi[] = {HChain, Inc, MaskHi, Ptr, IndexHi, Scale, IntID};
+ return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL,
+ HistogramOpsHi, MMO, IndexType);
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -25778,6 +25815,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performCTLZCombine(N, DAG, Subtarget);
case ISD::SCALAR_TO_VECTOR:
return performScalarToVectorCombine(N, DCI, DAG);
+ case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
+ return performHistogramCombine(N, DCI, DAG);
}
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/sve2-histcnt.ll b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
index 2874e47511e12f..56d5eb13ab12e3 100644
--- a/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
@@ -169,4 +169,99 @@ define void @histogram_i16_literal_3(ptr %base, <vscale x 4 x i32> %indices, <vs
ret void
}
+define void @histogram_i64_4_lane(<vscale x 4 x ptr> %buckets, i64 %inc, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i64_4_lane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: mov z4.d, x0
+; CHECK-NEXT: ptrue p2.d
+; CHECK-NEXT: histcnt z2.d, p1/z, z0.d, z0.d
+; CHECK-NEXT: ld1d { z3.d }, p1/z, [z0.d]
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: mad z2.d, p2/m, z4.d, z3.d
+; CHECK-NEXT: st1d { z2.d }, p1, [z0.d]
+; CHECK-NEXT: histcnt z0.d, p0/z, z1.d, z1.d
+; CHECK-NEXT: ld1d { z2.d }, p0/z, [z1.d]
+; CHECK-NEXT: mad z0.d, p2/m, z4.d, z2.d
+; CHECK-NEXT: st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 %inc, <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @histogram_i64_8_lane(<vscale x 8 x ptr> %buckets, i64 %inc, <vscale x 8 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i64_8_lane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: mov z6.d, x0
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: punpklo p3.h, p2.b
+; CHECK-NEXT: punpkhi p2.h, p2.b
+; CHECK-NEXT: histcnt z4.d, p3/z, z0.d, z0.d
+; CHECK-NEXT: ld1d { z5.d }, p3/z, [z0.d]
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: mad z4.d, p1/m, z6.d, z5.d
+; CHECK-NEXT: st1d { z4.d }, p3, [z0.d]
+; CHECK-NEXT: histcnt z0.d, p2/z, z1.d, z1.d
+; CHECK-NEXT: ld1d { z4.d }, p2/z, [z1.d]
+; CHECK-NEXT: mad z0.d, p1/m, z6.d, z4.d
+; CHECK-NEXT: st1d { z0.d }, p2, [z1.d]
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: histcnt z0.d, p2/z, z2.d, z2.d
+; CHECK-NEXT: ld1d { z1.d }, p2/z, [z2.d]
+; CHECK-NEXT: mad z0.d, p1/m, z6.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p2, [z2.d]
+; CHECK-NEXT: histcnt z0.d, p0/z, z3.d, z3.d
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [z3.d]
+; CHECK-NEXT: mad z0.d, p1/m, z6.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [z3.d]
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vector.histogram.add.nxv8p0.i64(<vscale x 8 x ptr> %buckets, i64 %inc, <vscale x 8 x i1> %mask)
+ ret void
+}
+
+define void @histogram_i32_8_lane(ptr %base, <vscale x 8 x i32> %indices, i32 %inc, <vscale x 8 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i32_8_lane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: mov z4.s, w1
+; CHECK-NEXT: ptrue p2.s
+; CHECK-NEXT: histcnt z2.s, p1/z, z0.s, z0.s
+; CHECK-NEXT: ld1w { z3.s }, p1/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: mad z2.s, p2/m, z4.s, z3.s
+; CHECK-NEXT: st1w { z2.s }, p1, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: histcnt z0.s, p0/z, z1.s, z1.s
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, z1.s, sxtw #2]
+; CHECK-NEXT: mad z0.s, p2/m, z4.s, z2.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, sxtw #2]
+; CHECK-NEXT: ret
+ %buckets = getelementptr i32, ptr %base, <vscale x 8 x i32> %indices
+ call void @llvm.experimental.vector.histogram.add.nxv8p0.i32(<vscale x 8 x ptr> %buckets, i32 %inc, <vscale x 8 x i1> %mask)
+ ret void
+}
+
+define void @histogram_i16_8_lane(ptr %base, <vscale x 8 x i32> %indices, i16 %inc, <vscale x 8 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i16_8_lane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: mov z4.s, w1
+; CHECK-NEXT: ptrue p2.s
+; CHECK-NEXT: histcnt z2.s, p1/z, z0.s, z0.s
+; CHECK-NEXT: ld1h { z3.s }, p1/z, [x0, z0.s, sxtw #1]
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: mad z2.s, p2/m, z4.s, z3.s
+; CHECK-NEXT: st1h { z2.s }, p1, [x0, z0.s, sxtw #1]
+; CHECK-NEXT: histcnt z0.s, p0/z, z1.s, z1.s
+; CHECK-NEXT: ld1h { z2.s }, p0/z, [x0, z1.s, sxtw #1]
+; CHECK-NEXT: mad z0.s, p2/m, z4.s, z2.s
+; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw #1]
+; CHECK-NEXT: ret
+ %buckets = getelementptr i16, ptr %base, <vscale x 8 x i32> %indices
+ call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 %inc, <vscale x 8 x i1> %mask)
+ ret void
+}
+
+
attributes #0 = { "target-features"="+sve2" vscale_range(1, 16) }
@@ -25434,6 +25436,41 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   return NVCAST;
 }
static SDValue performHistogramCombine(SDNode *N, |
Rather than doing this as a target-specific DAG combine, this should be done as generic type legalisation in LegalizeVectorTypes.cpp. See DAGTypeLegalizer::SplitVectorOperand for how this is done for other operations.
You're right. I've moved the splitting to DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM and added the Inc operand promotion to DAGTypeLegalizer::PromoteIntOp_VECTOR_HISTOGRAM.
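(For readers following the change: the split hook ends up looking essentially like the combine above, relocated into the generic legalizer. The sketch below is reconstructed from the review snippets further down, with the naming nits applied; treat it as illustrative rather than a verbatim copy of the upstream source.)

SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM(SDNode *N) {
  MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(N);
  SDLoc DL(HG);
  SDValue Inc = HG->getInc();
  SDValue Ptr = HG->getBasePtr();
  SDValue Scale = HG->getScale();
  SDValue IntID = HG->getIntID();
  EVT MemVT = HG->getMemoryVT();
  MachineMemOperand *MMO = HG->getMemOperand();
  ISD::MemIndexType IndexType = HG->getIndexType();

  // Split the index and mask in half and emit two narrower histogram nodes,
  // chaining the high half on the low half so the memory updates stay ordered.
  SDValue IndexLo, IndexHi, MaskLo, MaskHi;
  std::tie(IndexLo, IndexHi) = DAG.SplitVector(HG->getIndex(), DL);
  std::tie(MaskLo, MaskHi) = DAG.SplitVector(HG->getMask(), DL);

  SDValue OpsLo[] = {HG->getChain(), Inc, MaskLo, Ptr, IndexLo, Scale, IntID};
  SDValue Lo = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL,
                                      OpsLo, MMO, IndexType);

  SDValue OpsHi[] = {Lo, Inc, MaskHi, Ptr, IndexHi, Scale, IntID};
  return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL, OpsHi,
                                MMO, IndexType);
}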
Moves the splitting from a target-specific DAG combine to LegalizeVectorTypes and adds promotion to LegalizeIntegerTypes. Also updates how the histogram intrinsic is custom-lowered during legalize ops.
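(The integer-promotion side is smaller: only the Inc operand, operand 1 of the node, needs widening when its type, e.g. i8 or i16, is illegal. Below is a plausible sketch of DAGTypeLegalizer::PromoteIntOp_VECTOR_HISTOGRAM following the pattern used for other promoted operands; the exact upstream code may differ slightly.)

SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N,
                                                        unsigned OpNo) {
  assert(OpNo == 1 && "Unexpected operand for promotion");

  // Copy the operand list and swap in the promoted Inc value.
  SmallVector<SDValue, 7> NewOps(N->op_begin(), N->op_end());
  NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));

  return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}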
Left a few nits, but otherwise looks good to me!
SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM(SDNode *N) {
  MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(N);
  SDLoc DL(HG);
  SDValue Chain = HG->getChain();
nit: Chain has one use, just inline.
Done.
  SDValue Inc = HG->getInc();
  SDValue Mask = HG->getMask();
  SDValue Ptr = HG->getBasePtr();
  SDValue Index = HG->getIndex();
nit: Index has one use, just inline.
Done.
  SDLoc DL(HG);
  SDValue Chain = HG->getChain();
  SDValue Inc = HG->getInc();
  SDValue Mask = HG->getMask();
nit: Mask has one use, just inline.
Done.
  SDValue IndexLo, IndexHi, MaskLo, MaskHi;
  std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
  std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
  SDValue HistogramOpsLo[] = {Chain, Inc, MaskLo, Ptr, IndexLo, Scale, IntID};
nit:
-  SDValue HistogramOpsLo[] = {Chain, Inc, MaskLo, Ptr, IndexLo, Scale, IntID};
+  SDValue OpsLo[] = {Chain, Inc, MaskLo, Ptr, IndexLo, Scale, IntID};
or just inline the initializer-sequence into the getMaskedHistogram directly.
I've made the suggested change. Operand lists are passed in using arrays in other places (such as with getMaskedScatter) rather than passing the initializer-sequence inline.
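(For context on that convention, this is roughly how existing SelectionDAG code builds a masked scatter; it is paraphrased for illustration, not part of this patch, and the variable names are placeholders.)

  // Operand order for a masked scatter: {Chain, Value, Mask, BasePtr, Index, Scale}.
  SDValue ScatterOps[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
  SDValue Scatter =
      DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL, ScatterOps,
                           MMO, IndexType, /*IsTruncating=*/false);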
  std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
  std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
  SDValue HistogramOpsLo[] = {Chain, Inc, MaskLo, Ptr, IndexLo, Scale, IntID};
  SDValue HChain = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL,
nit:
-  SDValue HChain = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL,
+  SDValue Lo = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), MemVT, DL,
Done.
  setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::i8, Promote);
  setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::i16, Promote);
These two setOperationActions are unnecessary, because i8 and i16 are illegal types, so the legalizer will already try to legalize the operation, regardless of this setOperationAction.
Removed.
On the commit message: the patch is also doing type promotion, so maybe you want to rephrase it to something slightly more generic.
This PR updates the AArch64 cost model to consider the cheaper cost of <i32 histograms to reflect the improvements from #101017 and #103037. Work by Max Beck-Jones (@DevM-uk). Co-authored-by: DevM-uk <[email protected]>