[DAGCombiner] Add support for scalarising extracts of a vector setcc #116031
Conversation
@llvm/pr-subscribers-backend-webassembly @llvm/pr-subscribers-backend-aarch64

Author: David Sherwood (david-arm)

Changes

For IR like this:

  %icmp = icmp ult <4 x i32> %a, splat (i32 5)
  %res = extractelement <4 x i1> %icmp, i32 1

where there is only one use of %icmp, we can take a similar approach to what we already do for binary ops such as add, sub, etc. and convert this into:

  %ext = extractelement <4 x i32> %a, i32 1
  %res = icmp ult i32 %ext, 5

For AArch64 targets at least, the scalar boolean result will almost certainly need to be in a GPR anyway, since it will probably be used by branches for control flow. I've tried to reuse existing code in scalarizeExtractedBinop to also work for setcc.

NOTE: The optimisations don't apply for tests such as extract_icmp_v4i32_splat_rhs in the file CodeGen/AArch64/extract-vector-cmp.ll because scalarizeExtractedBinOp only works if one of the input operands is a constant.

Full diff: https://github.com/llvm/llvm-project/pull/116031.diff

4 Files Affected:
- llvm/include/llvm/CodeGen/TargetLowering.h
- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
- llvm/lib/Target/AArch64/AArch64ISelLowering.h
- llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
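At the SelectionDAG level, the new fold mirrors the existing binop one; paraphrasing the comments in the patch below (C' denotes the constant extracted from the splat at IndexC):

  extractelt (setcc X, splat(C), cc), IndexC
    --> setcc (extractelt X, IndexC), C', cc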
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e0b638201a0474..9c86dc5ee4f016 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3334,6 +3334,10 @@ class TargetLoweringBase {
return false;
}
+ /// Try to convert an extract element of a vector setcc operation into an
+ /// extract element followed by a scalar operation.
+ virtual bool shouldScalarizeSetCC(SDValue VecOp) const { return false; }
+
/// Return true if extraction of a scalar element from the given vector type
/// at the given index is cheap. For example, if scalar operations occur on
/// the same register file as vector operations, then an extract element may
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6059229cd6d9a4..164ac6dc50668e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22743,19 +22743,23 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
/// Transform a vector binary operation into a scalar binary operation by moving
/// the math/logic after an extract element of a vector.
-static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
- const SDLoc &DL, bool LegalOperations) {
+static bool scalarizeExtractedBinOpCommon(SDNode *ExtElt, SelectionDAG &DAG,
+ const SDLoc &DL, bool IsSetCC,
+ SDValue &ScalarOp1,
+ SDValue &ScalarOp2) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Vec = ExtElt->getOperand(0);
SDValue Index = ExtElt->getOperand(1);
auto *IndexC = dyn_cast<ConstantSDNode>(Index);
- if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() ||
- Vec->getNumValues() != 1)
- return SDValue();
+ if (!IndexC || !Vec.hasOneUse() || Vec->getNumValues() != 1)
+ return false;
// Targets may want to avoid this to prevent an expensive register transfer.
- if (!TLI.shouldScalarizeBinop(Vec))
- return SDValue();
+ if ((IsSetCC &&
+ (Vec.getOpcode() != ISD::SETCC || !TLI.shouldScalarizeSetCC(Vec))) ||
+ (!IsSetCC &&
+ (!TLI.isBinOp(Vec.getOpcode()) || !TLI.shouldScalarizeBinop(Vec))))
+ return false;
// Extracting an element of a vector constant is constant-folded, so this
// transform is just replacing a vector op with a scalar op while moving the
@@ -22769,13 +22773,38 @@ static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) {
// extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C'
// extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC)
- EVT VT = ExtElt->getValueType(0);
- SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index);
- SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
- return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1);
+ // extractelt (setcc X, C, op), IndexC -> setcc (extractelt X, IndexC)), C
+ // extractelt (setcc C, X, op), IndexC -> setcc (extractelt IndexC, X)), C
+ EVT VT = Op0->getValueType(0).getVectorElementType();
+ ScalarOp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index);
+ ScalarOp2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
+ return true;
}
- return SDValue();
+ return false;
+}
+
+static SDValue scalarizeExtractedBinOp(SDNode *ExtElt, SelectionDAG &DAG,
+ const SDLoc &DL) {
+ SDValue Op1, Op2;
+ SDValue Vec = ExtElt->getOperand(0);
+ if (!scalarizeExtractedBinOpCommon(ExtElt, DAG, DL, false, Op1, Op2))
+ return SDValue();
+
+ EVT VT = ExtElt->getValueType(0);
+ return DAG.getNode(Vec.getOpcode(), DL, VT, Op1, Op2);
+}
+
+static SDValue scalarizeExtractedSetCC(SDNode *ExtElt, SelectionDAG &DAG,
+ const SDLoc &DL) {
+ SDValue Op1, Op2;
+ SDValue Vec = ExtElt->getOperand(0);
+ if (!scalarizeExtractedBinOpCommon(ExtElt, DAG, DL, true, Op1, Op2))
+ return SDValue();
+
+ EVT VT = ExtElt->getValueType(0);
+ return DAG.getSetCC(DL, VT, Op1, Op2,
+ cast<CondCodeSDNode>(Vec->getOperand(2))->get());
}
// Given a ISD::EXTRACT_VECTOR_ELT, which is a glorified bit sequence extract,
@@ -23008,9 +23037,14 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
}
}
- if (SDValue BO = scalarizeExtractedBinop(N, DAG, DL, LegalOperations))
+ if (SDValue BO = scalarizeExtractedBinOp(N, DAG, DL))
return BO;
+ // extract (setcc x, splat(y)), i -> setcc (extract x, i)), y
+ if (ScalarVT == VecVT.getVectorElementType())
+ if (SDValue SetCC = scalarizeExtractedSetCC(N, DAG, DL))
+ return SetCC;
+
if (VecVT.isScalableVector())
return SDValue();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index d696355bb062a8..9c79f37a14779a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1346,6 +1346,8 @@ class AArch64TargetLowering : public TargetLowering {
unsigned getMinimumJumpTableEntries() const override;
bool softPromoteHalfType() const override { return true; }
+
+ bool shouldScalarizeSetCC(SDValue VecOp) const override { return true; }
};
namespace AArch64 {
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
new file mode 100644
index 00000000000000..2388a0f206a51c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+
+define i1 @extract_icmp_v4i32_const_splat_rhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_splat_rhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: cmp w8, #5
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %icmp = icmp ult <4 x i32> %a, splat (i32 5)
+ %ext = extractelement <4 x i1> %icmp, i32 1
+ ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_const_splat_lhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_splat_lhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: cmp w8, #7
+; CHECK-NEXT: cset w0, hi
+; CHECK-NEXT: ret
+ %icmp = icmp ult <4 x i32> splat(i32 7), %a
+ %ext = extractelement <4 x i1> %icmp, i32 1
+ ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_const_vec_rhs(<4 x i32> %a) {
+; CHECK-LABEL: extract_icmp_v4i32_const_vec_rhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: cmp w8, #234
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %icmp = icmp ult <4 x i32> %a, <i32 5, i32 234, i32 -1, i32 7>
+ %ext = extractelement <4 x i1> %icmp, i32 1
+ ret i1 %ext
+}
+
+define i1 @extract_fcmp_v4f32_const_splat_rhs(<4 x float> %a) {
+; CHECK-LABEL: extract_fcmp_v4f32_const_splat_rhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov s0, v0.s[1]
+; CHECK-NEXT: fmov s1, #4.00000000
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: ret
+ %fcmp = fcmp ult <4 x float> %a, splat(float 4.0e+0)
+ %ext = extractelement <4 x i1> %fcmp, i32 1
+ ret i1 %ext
+}
+
+define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: vector_loop_with_icmp:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: index z0.d, #0, #1
+; CHECK-NEXT: mov w8, #4 // =0x4
+; CHECK-NEXT: mov w9, #16 // =0x10
+; CHECK-NEXT: dup v2.2d, x8
+; CHECK-NEXT: add x8, x0, #8
+; CHECK-NEXT: mov w10, #1 // =0x1
+; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: add z1.d, z1.d, #2 // =0x2
+; CHECK-NEXT: b .LBB4_2
+; CHECK-NEXT: .LBB4_1: // %pred.store.continue18
+; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: add v1.2d, v1.2d, v2.2d
+; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT: subs x9, x9, #4
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: b.eq .LBB4_10
+; CHECK-NEXT: .LBB4_2: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: fmov x11, d0
+; CHECK-NEXT: cmp x11, #14
+; CHECK-NEXT: b.hi .LBB4_4
+; CHECK-NEXT: // %bb.3: // %pred.store.if
+; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: stur w10, [x8, #-8]
+; CHECK-NEXT: .LBB4_4: // %pred.store.continue
+; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: mov x11, v0.d[1]
+; CHECK-NEXT: cmp x11, #14
+; CHECK-NEXT: b.hi .LBB4_6
+; CHECK-NEXT: // %bb.5: // %pred.store.if5
+; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: stur w10, [x8, #-4]
+; CHECK-NEXT: .LBB4_6: // %pred.store.continue6
+; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: fmov x11, d1
+; CHECK-NEXT: cmp x11, #14
+; CHECK-NEXT: b.hi .LBB4_8
+; CHECK-NEXT: // %bb.7: // %pred.store.if7
+; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: str w10, [x8]
+; CHECK-NEXT: .LBB4_8: // %pred.store.continue8
+; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: mov x11, v1.d[1]
+; CHECK-NEXT: cmp x11, #14
+; CHECK-NEXT: b.hi .LBB4_1
+; CHECK-NEXT: // %bb.9: // %pred.store.if9
+; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: str w10, [x8, #4]
+; CHECK-NEXT: b .LBB4_1
+; CHECK-NEXT: .LBB4_10: // %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %pred.store.continue18 ]
+ %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %entry ], [ %vec.ind.next, %pred.store.continue18 ]
+ %0 = icmp ult <4 x i64> %vec.ind, <i64 15, i64 15, i64 15, i64 15>
+ %1 = extractelement <4 x i1> %0, i64 0
+ br i1 %1, label %pred.store.if, label %pred.store.continue
+
+pred.store.if:
+ %2 = getelementptr inbounds i32, ptr %dest, i64 %index
+ store i32 1, ptr %2, align 4
+ br label %pred.store.continue
+
+pred.store.continue:
+ %3 = extractelement <4 x i1> %0, i64 1
+ br i1 %3, label %pred.store.if5, label %pred.store.continue6
+
+pred.store.if5:
+ %4 = or disjoint i64 %index, 1
+ %5 = getelementptr inbounds i32, ptr %dest, i64 %4
+ store i32 1, ptr %5, align 4
+ br label %pred.store.continue6
+
+pred.store.continue6:
+ %6 = extractelement <4 x i1> %0, i64 2
+ br i1 %6, label %pred.store.if7, label %pred.store.continue8
+
+pred.store.if7:
+ %7 = or disjoint i64 %index, 2
+ %8 = getelementptr inbounds i32, ptr %dest, i64 %7
+ store i32 1, ptr %8, align 4
+ br label %pred.store.continue8
+
+pred.store.continue8:
+ %9 = extractelement <4 x i1> %0, i64 3
+ br i1 %9, label %pred.store.if9, label %pred.store.continue18
+
+pred.store.if9:
+ %10 = or disjoint i64 %index, 3
+ %11 = getelementptr inbounds i32, ptr %dest, i64 %10
+ store i32 1, ptr %11, align 4
+ br label %pred.store.continue18
+
+pred.store.continue18:
+ %index.next = add i64 %index, 4
+ %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
+ %24 = icmp eq i64 %index.next, 16
+ br i1 %24, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+ ret void
+}
+
+
+; Negative tests
+
+define i1 @extract_icmp_v4i32_splat_rhs(<4 x i32> %a, i32 %b) {
+; CHECK-LABEL: extract_icmp_v4i32_splat_rhs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v1.4s, w0
+; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: umov w8, v0.h[1]
+; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: ret
+ %ins = insertelement <4 x i32> poison, i32 %b, i32 0
+ %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
+ %icmp = icmp ult <4 x i32> %a, %splat
+ %ext = extractelement <4 x i1> %icmp, i32 1
+ ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_splat_rhs_mul_use(<4 x i32> %a, ptr %p) {
+; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_mul_use:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.4s, #235
+; CHECK-NEXT: adrp x9, .LCPI6_0
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI6_0]
+; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: xtn v1.4h, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: umov w9, v1.h[1]
+; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: and w0, w9, #0x1
+; CHECK-NEXT: strb w10, [x8]
+; CHECK-NEXT: ret
+ %icmp = icmp ult <4 x i32> %a, splat(i32 235)
+ %ext = extractelement <4 x i1> %icmp, i32 1
+ store <4 x i1> %icmp, ptr %p, align 4
+ ret i1 %ext
+}
+
+define i1 @extract_icmp_v4i32_splat_rhs_unknown_idx(<4 x i32> %a, i32 %c) {
+; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_unknown_idx:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movi v1.4s, #127
+; CHECK-NEXT: add x8, sp, #8
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: bfi x8, x0, #1, #2
+; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: str d0, [sp, #8]
+; CHECK-NEXT: ldrh w8, [x8]
+; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %icmp = icmp ult <4 x i32> %a, splat(i32 127)
+ %ext = extractelement <4 x i1> %icmp, i32 %c
+ ret i1 %ext
+}
@@ -3334,6 +3334,10 @@ class TargetLoweringBase {
     return false;
   }

+  /// Try to convert an extract element of a vector setcc operation into an
+  /// extract element followed by a scalar operation.
+  virtual bool shouldScalarizeSetCC(SDValue VecOp) const { return false; }
I don't see why this needs a special case hook. We already do this for extractelement other_op for the hasOneUse case
I was simply following the existing pattern for shouldScalarizeBinOp as it looks like not all targets prefer to do this type of optimisation. I did try enabling this for all targets and it seems to give regressions on X86. Or are you referring to something else?
I could just reuse the existing shouldScalarizeBinOp hook, but the setcc op feels like a special case because the result type is often different to the input (e.g. v4i1 output, v4i32 input, etc.) and may set condition registers.
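For example (LLVM IR, with the types used in the tests below), a vector compare changes the element type, whereas a binary op preserves it:

  %add  = add <4 x i32> %a, %b        ; result <4 x i32>, same as the inputs
  %icmp = icmp ult <4 x i32> %a, %b   ; result <4 x i1>, unlike the inputs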
Hmm, perhaps I didn't understand your comment. Are you implying that I can simply reuse shouldScalarizeBinOp and check the opcode of VecOp instead of adding an extra hook just for setcc? If so, then I'm happy to do that - it just requires modifying all existing shouldScalarizeBinOp hooks to bail out for the setcc case.
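For illustration only, a minimal sketch of that direction (the class name is hypothetical, and the real hooks carry per-target, per-opcode profitability logic that is elided here):

  // Hypothetical target override: route setcc through the existing
  // shouldScalarizeBinop hook and decide per opcode, rather than adding
  // a separate shouldScalarizeSetCC hook.
  bool MyTargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
    if (VecOp.getOpcode() == ISD::SETCC)
      return false; // Bail out on targets where the setcc fold regresses.
    // ... existing per-opcode profitability checks ...
    return true;
  }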
I haven't looked at the context that closely, but we need way fewer points of configuration. The ones we do have should be much more general. This stuff is not discoverable by backend maintainers, and is high effort to find and implement. But consider this a hard -1 on adding a new super-specific thing. If a target hook has no arguments, it's a major code smell.
But yes, shouldScalarizeBinOp would be preferable. The changing result type probably doesn't matter in practice; it's approximately fixed per target.
Done
(force-pushed from 63ecfc0 to 8d1fc17)
Fixed failing test and pushed some of the opcode-related checks from …
  SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
  return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1);
  // extractelt (setcc X, C, op), IndexC -> setcc (extractelt X, IndexC)), C
  // extractelt (setcc C, X, op), IndexC -> setcc (extractelt IndexC, X)), C
Suggested change:
- // extractelt (setcc C, X, op), IndexC -> setcc (extractelt IndexC, X)), C
+ // extractelt (setcc C, X, op), IndexC -> setcc C, (extractelt X, IndexC))
That said, I don't think this, now common, function needs to call out every operation. Perhaps just change binop to op to make the original comment more general?
Removed extra comments and changed wording
                                       const SDLoc &DL, bool LegalOperations) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
static bool scalarizeExtractedBinOpCommon(SDNode *ExtElt, SelectionDAG &DAG,
                                          const SDLoc &DL, bool IsSetCC,
IsSetCC is unused?
Yep, removed.
; CHECK-NEXT: fmov d0, xzr
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z16.d, #0 // =0x0
; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0
; CHECK-NEXT: uzp1 p0.s, p0.s, p0.s
; CHECK-NEXT: uzp1 p0.h, p0.h, p0.h
; CHECK-NEXT: uzp1 p0.b, p0.b, p0.b
; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1
This doesn't look like an intended change?
I've stopped the new setcc fold from happening by adding an extra use of the icmp!
(force-pushed from 8d1fc17 to 2e3f5db)
A couple of recommendations but otherwise this looks good to me.
  Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op0, Index);
  Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Op1, Index);
Given the ResVT == Vec.getValueType().getVectorElementType() restriction, can you use ResVT here and remove the need for OpVT?
Sorry, I just realised this question is bogus given the setcc result is unlikely to match its operands.
(force-pushed from 2e3f5db to 65b02f3)
Rebase. Hopefully tests pass this time!
Hi @david-arm, I've started seeing …
OK, I'll take a look now. If I can't provide an immediate fix I'll revert the patch.
Revert "[DAGCombiner] Add support for scalarising extracts of a vector setcc (llvm#116031)". This reverts commit 22ec44f.
Thanks!
For IR like this:
%icmp = icmp ult <4 x i32> %a, splat (i32 5)
%res = extractelement <4 x i1> %icmp, i32 1
where there is only one use of %icmp we can take a similar approach
to what we already do for binary ops such as add, sub, etc. and convert
this into
%ext = extractelement <4 x i32> %a, i32 1
%res = icmp ult i32 %ext, 5
For AArch64 targets at least the scalar boolean result will almost
certainly need to be in a GPR anyway, since it will probably be
used by branches for control flow. I've tried to reuse existing code
in scalarizeExtractedBinop to also work for setcc.
NOTE: The optimisations don't apply for tests such as
extract_icmp_v4i32_splat_rhs in the file
CodeGen/AArch64/extract-vector-cmp.ll
because scalarizeExtractedBinOp only works if one of the input
operands is a constant.
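For reference, one way to observe the new codegen locally (assuming an llc built with the AArch64 target; the test file sets its own triple, so only the SVE attribute from its RUN line is needed):

  llc -mattr=+sve llvm/test/CodeGen/AArch64/extract-vector-cmp.ll -o -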