Skip to content

Commit 3568333

Browse files
committed
[AArch64] Perform last active true vector combine
Testing the bit of lane EC-1 can use a P register directly. For example, materialize: Idx = (add (mul vscale, NumEls), -1); i1 = extract_vector_elt t37, Constant:i64<Idx> ... into: "ptrue p, all" + PTEST. Reviewed By: paulwalker-arm. Differential Revision: https://reviews.llvm.org/D121180
1 parent cf63e9d commit 3568333

File tree

2 files changed

+65
-2
lines changed

2 files changed

+65
-2
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14410,12 +14410,57 @@ performFirstTrueTestVectorCombine(SDNode *N,
1441014410
return getPTest(DAG, N->getValueType(0), Pg, SetCC, AArch64CC::FIRST_ACTIVE);
1441114411
}
1441214412

14413+
// Materialize : Idx = (add (mul vscale, NumEls), -1)
14414+
// i1 = extract_vector_elt t37, Constant:i64<Idx>
14415+
// ... into: "ptrue p, all" + PTEST
14416+
static SDValue
14417+
performLastTrueTestVectorCombine(SDNode *N,
14418+
TargetLowering::DAGCombinerInfo &DCI,
14419+
const AArch64Subtarget *Subtarget) {
14420+
assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
14421+
// Make sure PTEST is legal types.
14422+
if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
14423+
return SDValue();
14424+
14425+
SDValue SetCC = N->getOperand(0);
14426+
EVT OpVT = SetCC.getValueType();
14427+
14428+
if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
14429+
return SDValue();
14430+
14431+
// Idx == (add (mul vscale, NumEls), -1)
14432+
SDValue Idx = N->getOperand(1);
14433+
if (Idx.getOpcode() != ISD::ADD)
14434+
return SDValue();
14435+
14436+
SDValue VS = Idx.getOperand(0);
14437+
if (VS.getOpcode() != ISD::VSCALE)
14438+
return SDValue();
14439+
14440+
unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
14441+
if (VS.getConstantOperandVal(0) != NumEls)
14442+
return SDValue();
14443+
14444+
// Restricted the DAG combine to only cases where we're extracting from a
14445+
// flag-setting operation
14446+
auto *CI = dyn_cast<ConstantSDNode>(Idx.getOperand(1));
14447+
if (!CI || !CI->isAllOnes() || SetCC.getOpcode() != ISD::SETCC)
14448+
return SDValue();
14449+
14450+
// Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
14451+
SelectionDAG &DAG = DCI.DAG;
14452+
SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
14453+
return getPTest(DAG, N->getValueType(0), Pg, SetCC, AArch64CC::LAST_ACTIVE);
14454+
}
14455+
1441314456
static SDValue
1441414457
performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
1441514458
const AArch64Subtarget *Subtarget) {
1441614459
assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
1441714460
if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
1441814461
return Res;
14462+
if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
14463+
return Res;
1441914464

1442014465
SelectionDAG &DAG = DCI.DAG;
1442114466
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);

llvm/test/CodeGen/AArch64/sve-cmp-folds.ll

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,8 @@ define <vscale x 4 x i1> @not_fcmp_uge_nxv4f32(<vscale x 4 x float> %a, <vscale
5353
ret <vscale x 4 x i1> %not
5454
}
5555

56-
define i1 @foo(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
57-
; CHECK-LABEL: foo:
56+
define i1 @foo_first(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
57+
; CHECK-LABEL: foo_first:
5858
; CHECK: // %bb.0:
5959
; CHECK-NEXT: ptrue p0.s
6060
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
@@ -66,3 +66,21 @@ define i1 @foo(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
6666
ret i1 %bit
6767
}
6868

69+
; Extracts lane (vscale * 4) - 1, i.e. the last lane, of the <vscale x 4 x i1>
; result of an fcmp. The CHECK lines expect this to be lowered to a
; ptrue + fcmeq + ptest sequence followed by "cset w0, lo".
define i1 @foo_last(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
70+
; CHECK-LABEL: foo_last:
71+
; CHECK: // %bb.0:
72+
; CHECK-NEXT: ptrue p0.s
73+
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
74+
; CHECK-NEXT: ptest p0, p1.b
75+
; CHECK-NEXT: cset w0, lo
76+
; CHECK-NEXT: ret
77+
%vcond = fcmp oeq <vscale x 4 x float> %a, %b
78+
; Build the index of the last lane: (vscale << 2) + (-1) == vscale * 4 - 1.
%vscale = call i64 @llvm.vscale.i64()
79+
%shl2 = shl nuw nsw i64 %vscale, 2
80+
%idx = add nuw nsw i64 %shl2, -1
81+
; Read the predicate bit at that index and return it.
%bit = extractelement <vscale x 4 x i1> %vcond, i64 %idx
82+
ret i1 %bit
83+
}
84+
85+
86+
declare i64 @llvm.vscale.i64()

0 commit comments

Comments
 (0)