[AArch64][SVE] Improve code quality of vector unsigned/signed add reductions. #97339
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Dinar Temirbulatov (dtemirbulatov)

Changes: For SVE we don't have to zero extend and sum part of the result before issuing the UADDV instruction. This change also allows vectors wider than a legal type to be handled more efficiently, and lowers fixed-length vector types to SVE's UADDV where appropriate.

Full diff: https://github.com/llvm/llvm-project/pull/97339.diff

4 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 130d1e9683f7e..74c11a32a3c73 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17455,6 +17455,99 @@ static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
}
+static SDValue
+performVecReduceAddZextCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64TargetLowering &TLI) {
+ if (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+ SDNode *ZEXT = N->getOperand(0).getNode();
+ EVT VecVT = ZEXT->getOperand(0).getValueType();
+ SDLoc DL(N);
+
+ SDValue VecOp = ZEXT->getOperand(0);
+ VecVT = VecOp.getValueType();
+ bool IsScalableType = VecVT.isScalableVector();
+
+ if (TLI.isTypeLegal(VecVT)) {
+ if (!IsScalableType &&
+ !TLI.useSVEForFixedLengthVectorVT(
+ VecVT,
+ /*OverrideNEON=*/Subtarget.useSVEForFixedLengthVectors(VecVT)))
+ return SDValue();
+
+ if (!IsScalableType) {
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VecVT);
+ VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
+ }
+ VecVT = VecOp.getValueType();
+ EVT RdxVT = N->getValueType(0);
+ RdxVT = getPackedSVEVectorVT(RdxVT);
+ SDValue Pg = getPredicateForVector(DAG, DL, VecVT);
+ SDValue Res = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
+ DAG.getConstant(Intrinsic::aarch64_sve_uaddv, DL, MVT::i64), Pg, VecOp);
+ EVT ResVT = MVT::i64;
+ if (ResVT != N->getValueType(0))
+ Res = DAG.getAnyExtOrTrunc(Res, DL, N->getValueType(0));
+ return Res;
+ }
+
+ SmallVector<SDValue, 4> SplitVals;
+ SmallVector<SDValue, 4> PrevVals;
+ PrevVals.push_back(VecOp);
+ while (true) {
+
+ if (!VecVT.isScalableVector() &&
+ !PrevVals[0].getValueType().getVectorElementCount().isKnownEven())
+ return SDValue();
+
+ for (SDValue Vec : PrevVals) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
+ SplitVals.push_back(Lo);
+ SplitVals.push_back(Hi);
+ }
+ if (TLI.isTypeLegal(SplitVals[0].getValueType()))
+ break;
+ PrevVals.clear();
+ std::copy(SplitVals.begin(), SplitVals.end(), std::back_inserter(PrevVals));
+ SplitVals.clear();
+ }
+ SDNode *VecRed = N;
+ EVT ElemType = VecRed->getValueType(0);
+ SmallVector<SDValue, 4> Results;
+
+ if (!IsScalableType &&
+ !TLI.useSVEForFixedLengthVectorVT(
+ SplitVals[0].getValueType(),
+ /*OverrideNEON=*/Subtarget.useSVEForFixedLengthVectors(
+ SplitVals[0].getValueType())))
+ return SDValue();
+
+ for (unsigned Num = 0; Num < SplitVals.size(); ++Num) {
+ SDValue Reg = SplitVals[Num];
+ EVT RdxVT = Reg->getValueType(0);
+ SDValue Pg = getPredicateForVector(DAG, DL, RdxVT);
+ if (!IsScalableType) {
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, RdxVT);
+ Reg = convertToScalableVector(DAG, ContainerVT, Reg);
+ }
+ SDValue Res = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
+ DAG.getConstant(Intrinsic::aarch64_sve_uaddv, DL, MVT::i64), Pg, Reg);
+ if (ElemType != MVT::i64)
+ Res = DAG.getAnyExtOrTrunc(Res, DL, ElemType);
+ Results.push_back(Res);
+ }
+ SDValue ToAdd = Results[0];
+ for (unsigned I = 1; I < SplitVals.size(); ++I)
+ ToAdd = DAG.getNode(ISD::ADD, DL, ElemType, ToAdd, Results[I]);
+ return ToAdd;
+}
+
// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
@@ -25128,8 +25221,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performInsertVectorEltCombine(N, DCI);
case ISD::EXTRACT_VECTOR_ELT:
return performExtractVectorEltCombine(N, DCI, Subtarget);
- case ISD::VECREDUCE_ADD:
- return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
+ case ISD::VECREDUCE_ADD: {
+ if (SDValue Val = performVecReduceAddCombine(N, DCI.DAG, Subtarget))
+ return Val;
+ return performVecReduceAddZextCombine(N, DCI, *this);
+ }
case AArch64ISD::UADDV:
return performUADDVCombine(N, DAG);
case AArch64ISD::SMULL:
diff --git a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
index 7bc31d44bb654..b289dfbec527c 100644
--- a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
+++ b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
@@ -103,17 +103,12 @@ define i32 @add_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: add_ext_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: uunpkhi z2.h, z0.b
-; CHECK-NEXT: uunpklo z0.h, z0.b
-; CHECK-NEXT: uunpkhi z3.h, z1.b
-; CHECK-NEXT: uunpklo z1.h, z1.b
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: add z0.h, z0.h, z2.h
-; CHECK-NEXT: add z1.h, z1.h, z3.h
-; CHECK-NEXT: add z0.h, z0.h, z1.h
-; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: uaddv d0, p0, z0.b
+; CHECK-NEXT: uaddv d1, p0, z1.b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov w9, s1
+; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: ret
%ae = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
@@ -126,21 +121,15 @@ define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: add_ext_v32i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: uunpklo z3.h, z1.b
-; CHECK-NEXT: uunpklo z4.h, z0.b
-; CHECK-NEXT: uunpkhi z1.h, z1.b
-; CHECK-NEXT: uunpkhi z0.h, z0.b
-; CHECK-NEXT: uunpkhi z5.h, z2.b
-; CHECK-NEXT: uunpklo z2.h, z2.b
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: add z0.h, z0.h, z1.h
-; CHECK-NEXT: add z1.h, z4.h, z3.h
-; CHECK-NEXT: add z0.h, z1.h, z0.h
-; CHECK-NEXT: add z1.h, z2.h, z5.h
-; CHECK-NEXT: add z0.h, z0.h, z1.h
-; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: uaddv d1, p0, z1.b
+; CHECK-NEXT: uaddv d0, p0, z0.b
+; CHECK-NEXT: uaddv d2, p0, z2.b
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: ret
%ae = zext <vscale x 32 x i8> %a to <vscale x 32 x i16>
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
index 8c1b5225b7f25..4b401731e6f80 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
@@ -188,6 +188,103 @@ define i64 @uaddv_nxv2i64(<vscale x 2 x i64> %a) {
ret i64 %res
}
+define i32 @uaddv_nxv16i8_nxv16i32(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: uaddv_nxv16i8_nxv16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: uaddv d0, p0, z0.b
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %1 = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+ %2 = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %1)
+ ret i32 %2
+}
+
+define i64 @uaddv_nxv16i16_nxv16i64(<vscale x 16 x i16> %a) {
+; CHECK-LABEL: uaddv_nxv16i16_nxv16i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: uaddv d1, p0, z1.h
+; CHECK-NEXT: uaddv d0, p0, z0.h
+; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: add x0, x9, x8
+; CHECK-NEXT: ret
+ %1 = zext <vscale x 16 x i16> %a to <vscale x 16 x i64>
+ %2 = call i64 @llvm.vector.reduce.add.nxv16i64(<vscale x 16 x i64> %1)
+ ret i64 %2
+}
+
+define i32 @uaddv_nxv16i16_nxv16i32(<vscale x 32 x i16> %a) {
+; CHECK-LABEL: uaddv_nxv16i16_nxv16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: uaddv d1, p0, z1.h
+; CHECK-NEXT: uaddv d0, p0, z0.h
+; CHECK-NEXT: uaddv d2, p0, z2.h
+; CHECK-NEXT: uaddv d3, p0, z3.h
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+ %1 = zext <vscale x 32 x i16> %a to <vscale x 32 x i32>
+ %2 = call i32 @llvm.vector.reduce.add.nxv32i64(<vscale x 32 x i32> %1)
+ ret i32 %2
+}
+
+define i32 @saddv_nxv16i8_nxv16i32(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: saddv_nxv16i8_nxv16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sunpkhi z1.h, z0.b
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: sunpklo z2.s, z1.h
+; CHECK-NEXT: sunpklo z3.s, z0.h
+; CHECK-NEXT: sunpkhi z1.s, z1.h
+; CHECK-NEXT: sunpkhi z0.s, z0.h
+; CHECK-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEXT: add z1.s, z3.s, z2.s
+; CHECK-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %1 = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+ %2 = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %1)
+ ret i32 %2
+}
+
+define i32 @uaddv_nxv32i16_nxv32i32(ptr %a) {
+; CHECK-LABEL: uaddv_nxv32i16_nxv32i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT: uaddv d0, p0, z0.h
+; CHECK-NEXT: uaddv d1, p0, z1.h
+; CHECK-NEXT: uaddv d2, p0, z2.h
+; CHECK-NEXT: uaddv d3, p0, z3.h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov w9, s1
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+ %1 = load <vscale x 32 x i16>, ptr %a, align 16
+ %2 = zext <vscale x 32 x i16> %1 to <vscale x 32 x i32>
+ %3 = call i32 @llvm.vector.reduce.add.nxv32i32(<vscale x 32 x i32> %2)
+ ret i32 %3
+}
+
; UMINV
define i8 @umin_nxv16i8(<vscale x 16 x i8> %a) {
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
new file mode 100644
index 0000000000000..9f076bd3c8e40
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
@@ -0,0 +1,135 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefixes=CHECK,NO_STREAMING
+; RUN: llc -mattr=+sve -force-streaming-compatible -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE_128
+; RUN: llc -mattr=+sve -force-streaming-compatible -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,SVE_MIN_256
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i32 @reduce_uadd_v16i8(<32 x i8> %a) #0 {
+; NO_STREAMING-LABEL: reduce_uadd_v16i8:
+; NO_STREAMING: // %bb.0:
+; NO_STREAMING-NEXT: ushll2 v2.8h, v1.16b, #0
+; NO_STREAMING-NEXT: ushll2 v3.8h, v0.16b, #0
+; NO_STREAMING-NEXT: ushll v1.8h, v1.8b, #0
+; NO_STREAMING-NEXT: ushll v0.8h, v0.8b, #0
+; NO_STREAMING-NEXT: uaddl2 v4.4s, v3.8h, v2.8h
+; NO_STREAMING-NEXT: uaddl v2.4s, v3.4h, v2.4h
+; NO_STREAMING-NEXT: uaddl2 v5.4s, v0.8h, v1.8h
+; NO_STREAMING-NEXT: uaddl v0.4s, v0.4h, v1.4h
+; NO_STREAMING-NEXT: add v1.4s, v5.4s, v4.4s
+; NO_STREAMING-NEXT: add v0.4s, v0.4s, v2.4s
+; NO_STREAMING-NEXT: add v0.4s, v0.4s, v1.4s
+; NO_STREAMING-NEXT: addv s0, v0.4s
+; NO_STREAMING-NEXT: fmov w0, s0
+; NO_STREAMING-NEXT: ret
+;
+; SVE_128-LABEL: reduce_uadd_v16i8:
+; SVE_128: // %bb.0:
+; SVE_128-NEXT: ptrue p0.b
+; SVE_128-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE_128-NEXT: uaddv d1, p0, z1.b
+; SVE_128-NEXT: uaddv d0, p0, z0.b
+; SVE_128-NEXT: fmov x8, d1
+; SVE_128-NEXT: fmov x9, d0
+; SVE_128-NEXT: add w0, w9, w8
+; SVE_128-NEXT: ret
+;
+; SVE_MIN_256-LABEL: reduce_uadd_v16i8:
+; SVE_MIN_256: // %bb.0:
+; SVE_MIN_256-NEXT: ptrue p0.b, vl16
+; SVE_MIN_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE_MIN_256-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE_MIN_256-NEXT: splice z0.b, p0, z0.b, z1.b
+; SVE_MIN_256-NEXT: ptrue p0.b
+; SVE_MIN_256-NEXT: uaddv d0, p0, z0.b
+; SVE_MIN_256-NEXT: fmov x0, d0
+; SVE_MIN_256-NEXT: // kill: def $w0 killed $w0 killed $x0
+; SVE_MIN_256-NEXT: ret
+ %1 = zext <32 x i8> %a to <32 x i32>
+ %2 = call i32 @llvm.vector.reduce.add.v16i32(<32 x i32> %1)
+ ret i32 %2
+}
+
+define i32 @reduce_sadd_v16i8(<32 x i8> %a) #0 {
+; NO_STREAMING-LABEL: reduce_sadd_v16i8:
+; NO_STREAMING: // %bb.0:
+; NO_STREAMING-NEXT: sshll2 v2.8h, v1.16b, #0
+; NO_STREAMING-NEXT: sshll2 v3.8h, v0.16b, #0
+; NO_STREAMING-NEXT: sshll v1.8h, v1.8b, #0
+; NO_STREAMING-NEXT: sshll v0.8h, v0.8b, #0
+; NO_STREAMING-NEXT: saddl2 v4.4s, v3.8h, v2.8h
+; NO_STREAMING-NEXT: saddl v2.4s, v3.4h, v2.4h
+; NO_STREAMING-NEXT: saddl2 v5.4s, v0.8h, v1.8h
+; NO_STREAMING-NEXT: saddl v0.4s, v0.4h, v1.4h
+; NO_STREAMING-NEXT: add v1.4s, v5.4s, v4.4s
+; NO_STREAMING-NEXT: add v0.4s, v0.4s, v2.4s
+; NO_STREAMING-NEXT: add v0.4s, v0.4s, v1.4s
+; NO_STREAMING-NEXT: addv s0, v0.4s
+; NO_STREAMING-NEXT: fmov w0, s0
+; NO_STREAMING-NEXT: ret
+;
+; SVE_128-LABEL: reduce_sadd_v16i8:
+; SVE_128: // %bb.0:
+; SVE_128-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE_128-NEXT: sunpklo z2.h, z1.b
+; SVE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE_128-NEXT: sunpklo z3.h, z0.b
+; SVE_128-NEXT: ptrue p0.s
+; SVE_128-NEXT: ext z1.b, z1.b, z1.b, #8
+; SVE_128-NEXT: ext z0.b, z0.b, z0.b, #8
+; SVE_128-NEXT: sunpklo z1.h, z1.b
+; SVE_128-NEXT: sunpklo z0.h, z0.b
+; SVE_128-NEXT: sunpklo z4.s, z2.h
+; SVE_128-NEXT: ext z2.b, z2.b, z2.b, #8
+; SVE_128-NEXT: sunpklo z6.s, z3.h
+; SVE_128-NEXT: ext z3.b, z3.b, z3.b, #8
+; SVE_128-NEXT: mov z5.d, z1.d
+; SVE_128-NEXT: sunpklo z7.s, z0.h
+; SVE_128-NEXT: ext z0.b, z0.b, z0.b, #8
+; SVE_128-NEXT: sunpklo z2.s, z2.h
+; SVE_128-NEXT: sunpklo z3.s, z3.h
+; SVE_128-NEXT: add z4.s, z6.s, z4.s
+; SVE_128-NEXT: ext z5.b, z5.b, z1.b, #8
+; SVE_128-NEXT: sunpklo z1.s, z1.h
+; SVE_128-NEXT: sunpklo z0.s, z0.h
+; SVE_128-NEXT: add z2.s, z3.s, z2.s
+; SVE_128-NEXT: sunpklo z5.s, z5.h
+; SVE_128-NEXT: add z1.s, z7.s, z1.s
+; SVE_128-NEXT: add z0.s, z0.s, z5.s
+; SVE_128-NEXT: add z1.s, z4.s, z1.s
+; SVE_128-NEXT: add z0.s, z2.s, z0.s
+; SVE_128-NEXT: add z0.s, z1.s, z0.s
+; SVE_128-NEXT: uaddv d0, p0, z0.s
+; SVE_128-NEXT: fmov x0, d0
+; SVE_128-NEXT: // kill: def $w0 killed $w0 killed $x0
+; SVE_128-NEXT: ret
+;
+; SVE_MIN_256-LABEL: reduce_sadd_v16i8:
+; SVE_MIN_256: // %bb.0:
+; SVE_MIN_256-NEXT: // kill: def $q1 killed $q1 def $z1
+; SVE_MIN_256-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE_MIN_256-NEXT: sunpklo z2.h, z1.b
+; SVE_MIN_256-NEXT: sunpklo z3.h, z0.b
+; SVE_MIN_256-NEXT: ptrue p0.s, vl8
+; SVE_MIN_256-NEXT: ext z1.b, z1.b, z1.b, #8
+; SVE_MIN_256-NEXT: ext z0.b, z0.b, z0.b, #8
+; SVE_MIN_256-NEXT: sunpklo z1.h, z1.b
+; SVE_MIN_256-NEXT: sunpklo z0.h, z0.b
+; SVE_MIN_256-NEXT: add z2.h, z3.h, z2.h
+; SVE_MIN_256-NEXT: add z0.h, z0.h, z1.h
+; SVE_MIN_256-NEXT: sunpklo z1.s, z2.h
+; SVE_MIN_256-NEXT: sunpklo z0.s, z0.h
+; SVE_MIN_256-NEXT: add z0.s, z1.s, z0.s
+; SVE_MIN_256-NEXT: uaddv d0, p0, z0.s
+; SVE_MIN_256-NEXT: fmov x0, d0
+; SVE_MIN_256-NEXT: // kill: def $w0 killed $w0 killed $x0
+; SVE_MIN_256-NEXT: ret
+ %1 = sext <32 x i8> %a to <32 x i32>
+ %2 = call i32 @llvm.vector.reduce.add.v16i32(<32 x i32> %1)
+ ret i32 %2
+}
+
+attributes #0 = { "target-features"="+sve" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
            SplitVals[0].getValueType())))
    return SDValue();

  for (unsigned Num = 0; Num < SplitVals.size(); ++Num) {
It seems like this is just repeating the work done in the code guarded by if (TLI.isTypeLegal(VecVT)). Could you treat a legal type as a subset of the wider problem, and just add the value directly to the list for processing?
You can also iterate directly over the members of SplitVals, instead of performing array indexing separately inside the loop.
Done.
Thanks for combining the two parts. You're still iterating via explicit indexing though, where you could use a range-based for loop for planting both the uaddv instructions and the subsequent scalar adds.
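A rough sketch of what that could look like, reusing the helper names from the patch above (SplitVals, getPredicateForVector, convertToScalableVector, IsScalableType, ElemType); this only illustrates the suggested range-based loop and is not the committed code:

  // Plant one UADDV per split part and accumulate the scalar results with
  // ADD nodes in the same pass.
  SDValue Acc;
  for (SDValue Part : SplitVals) {
    EVT PartVT = Part.getValueType();
    SDValue Pg = getPredicateForVector(DAG, DL, PartVT);
    if (!IsScalableType) {
      EVT ContainerVT = getContainerForFixedLengthVector(DAG, PartVT);
      Part = convertToScalableVector(DAG, ContainerVT, Part);
    }
    SDValue Red = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
        DAG.getConstant(Intrinsic::aarch64_sve_uaddv, DL, MVT::i64), Pg, Part);
    if (ElemType != MVT::i64)
      Red = DAG.getAnyExtOrTrunc(Red, DL, ElemType);
    Acc = Acc ? DAG.getNode(ISD::ADD, DL, ElemType, Acc, Red) : Red;
  }
  return Acc;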
Done.
if (TLI.isTypeLegal(ResultValues[0].getValueType()))
  break;
PrevValues.clear();
std::copy(ResultValues.begin(), ResultValues.end(),
This seems to be almost reinventing a deque? I think std::deque would be tidier. Get the front item and pop it, perform the split, then push the split results to the back until the front item is legal.
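Something along these lines, as a sketch of the deque-based worklist under the same assumptions as the patch (VecOp, TLI, DAG, DL in scope); the handling of the legal parts is illustrative:

  // Requires #include <deque> at the top of AArch64ISelLowering.cpp.
  // Pop the front value; if its type is still too wide, split it and push the
  // halves to the back, otherwise record it as a legal part to reduce.
  std::deque<SDValue> Worklist;
  Worklist.push_back(VecOp);
  SmallVector<SDValue, 4> LegalParts;
  while (!Worklist.empty()) {
    SDValue V = Worklist.front();
    Worklist.pop_front();
    if (TLI.isTypeLegal(V.getValueType())) {
      LegalParts.push_back(V);
      continue;
    }
    // Bail out if a fixed-length vector cannot be split evenly.
    if (!V.getValueType().getVectorElementCount().isKnownEven())
      return SDValue();
    auto [Lo, Hi] = DAG.SplitVector(V, DL);
    Worklist.push_back(Lo);
    Worklist.push_back(Hi);
  }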
Done.
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from 8c5adb6 to a152d0d.
ResultValues.push_back(VecOp);

// Split the input vectors if not legal.
while (!TLI.isTypeLegal(ResultValues.front().getValueType())) {
This checks only for isTypeLegal, but doesn't check what kind of legalization is required. I think the only type of legalization you want to catch here is type splitting (because the vector is too wide), and it's worth testing for that explicitly. Can you also write a test where the type is not legal but requires a different kind of legalization (e.g. promote or widen)? e.g. vecreduce_add(zext nxv2i8 -> nxv2i32)
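One way the splitting case could be tested explicitly, assuming the generic TargetLowering::getTypeAction query (sketch only, not part of the patch):

  // Only handle types whose legalization action is to split the vector;
  // types that would be promoted or widened (e.g. nxv2i8 zero-extended to
  // nxv2i32) fall through and keep the default lowering.
  EVT VT = ResultValues.front().getValueType();
  if (!TLI.isTypeLegal(VT) &&
      TLI.getTypeAction(*DAG.getContext(), VT) !=
          TargetLowering::TypeSplitVector)
    return SDValue();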
/*OverrideNEON=*/Subtarget.useSVEForFixedLengthVectors(
    ResultValues[0].getValueType())))
This doesn't look right. It should override NEON only when NEON is not available, i.e. when !Subtarget.isNeonAvailable()
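i.e., roughly the following, keeping the patch's variable names (sketch only):

  // Override NEON only when NEON is genuinely unavailable (e.g. in
  // streaming-compatible functions), not merely because SVE could be used.
  if (!IsScalableType &&
      !TLI.useSVEForFixedLengthVectorVT(
          ResultValues[0].getValueType(),
          /*OverrideNEON=*/!Subtarget.isNeonAvailable()))
    return SDValue();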
Done.
std::deque<SDValue> ResultValues;
ResultValues.push_back(VecOp);

// Split the input vectors if not legal.
My suggestion would be to use the free recursion that SelectionDAG gives, rather than writing an explicit loop. This aligns with how most legalization code in SelectionDAG works and makes the code a little simpler.
That would require distinguishing two cases:
CASE 1: If the input operand to the extend requires splitting, then split the operation into two VECREDUCE_ADD operations with a scalar ADD to combine their results, and return that value.
Example:
i32 (vecreduce_add (zext nxv32i8 %op to nxv32i32))
->
i32 (add
(i32 vecreduce_add (zext nxv16i8 %op.lo to nxv16i32)),
(i32 vecreduce_add (zext nxv16i8 %op.hi to nxv16i32)))
The DAGCombiner would revisit all nodes created above. So when it revisits the vecreduce_add nodes and the input operand to the extend are legal, then the combine can map this directly to a UADDV/SADDV operation.
CASE 2: If the input operand to the extend is legal, then map directly to UADDV/SADDV operation.
Example:
i32 (vecreduce_add (zext nxv16i8 %op to nxv16i32))
->
i32 (UADDV nxv16i8:%op)
Locally I made some hacky changes to your patch to try this, and found that it seems to improve codegen for fixed-length vectors a little bit, and it also simplifies the logic in this function because there is no longer a need for a dequeue or iterating parts of the input vector.
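A hedged sketch of that recursive shape, with CASE 1 splitting the extend's operand and letting the DAGCombiner revisit the new vecreduce nodes, and CASE 2 lowering directly to UADDV (helper names follow the patch; the fixed-length-to-scalable conversion is elided for brevity):

  EVT VT = VecOp.getValueType();
  EVT ResVT = N->getValueType(0);
  if (!TLI.isTypeLegal(VT)) {
    // CASE 1: the operand is too wide -- emit two half-sized reductions plus
    // a scalar ADD; the combiner will revisit the new VECREDUCE_ADD nodes.
    if (TLI.getTypeAction(*DAG.getContext(), VT) !=
        TargetLowering::TypeSplitVector)
      return SDValue();
    auto [Lo, Hi] = DAG.SplitVector(VecOp, DL);
    EVT HalfExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
        *DAG.getContext());
    SDValue RedLo =
        DAG.getNode(ISD::VECREDUCE_ADD, DL, ResVT,
                    DAG.getNode(ISD::ZERO_EXTEND, DL, HalfExtVT, Lo));
    SDValue RedHi =
        DAG.getNode(ISD::VECREDUCE_ADD, DL, ResVT,
                    DAG.getNode(ISD::ZERO_EXTEND, DL, HalfExtVT, Hi));
    return DAG.getNode(ISD::ADD, DL, ResVT, RedLo, RedHi);
  }
  // CASE 2: the operand is legal -- lower straight to a predicated UADDV.
  SDValue Pg = getPredicateForVector(DAG, DL, VT);
  SDValue Res = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
      DAG.getConstant(Intrinsic::aarch64_sve_uaddv, DL, MVT::i64), Pg, VecOp);
  if (ResVT != MVT::i64)
    Res = DAG.getAnyExtOrTrunc(Res, DL, ResVT);
  return Res;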
Done.
SDValue VecOp = N->getOperand(0).getOperand(0);
SDLoc DL(N);

bool IsScalableType = VecOp.getValueType().isScalableVector();
All places where you use IsScalableType, you invert its value first. So perhaps it makes more sense to have a variable named IsFixedLengthType = VecOp.getValueType().isFixedLengthVector().
Done.
For SVE we don't have to zero extend and sum part of the result before issuing the UADDV instruction. This change also allows vectors wider than a legal type to be handled more efficiently and lowers fixed-length vector types to SVE's UADDV where appropriate.
Force-pushed from a152d0d to 65d0900.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/123/builds/2157. Here is the relevant piece of the build log for reference:
This failure looks related to issue #98491.
…uctions. (#97339) Summary: For SVE we don't have to zero extend and sum part of the result before issuing UADDV instruction. Also this change allows to handle bigger than a legal vector type more efficiently and lower a fixed-length vector type to SVE's UADDV where appropriate. Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251320
… add reductions. (#97339)" Summary: This reverts commit b7b0071. The change caused regression in a performance testing. Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251123
The original pull-request llvm#97339 from @dtemirbulatov got reverted due to some regressions with NEON. This PR removes the additional type legalisation for vectors that are too wide, which should make this change less contentious.