Skip to content

Commit 5474d49

Browse files
committed
[AArch64] Remove copy instruction between uaddlv and urshr
If copy instructions that transfer a value between GPR and FPR (in either direction) appear between uaddlv and urshr, try to remove them. Differential Revision: https://reviews.llvm.org/D159265
1 parent 94b1435 commit 5474d49

File tree

2 files changed

+75
-2
lines changed

2 files changed

+75
-2
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1013,6 +1013,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
10131013
setTargetDAGCombine(ISD::VECREDUCE_OR);
10141014
setTargetDAGCombine(ISD::VECREDUCE_XOR);
10151015

1016+
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
1017+
10161018
// In case of strict alignment, avoid an excessive number of byte wide stores.
10171019
MaxStoresPerMemsetOptSize = 8;
10181020
MaxStoresPerMemset =
@@ -23121,6 +23123,55 @@ static SDValue performMULLCombine(SDNode *N,
2312123123
return SDValue();
2312223124
}
2312323125

23126+
/// DAG combine for ISD::SCALAR_TO_VECTOR nodes.
///
/// Recognizes a v1i64 scalar_to_vector whose operand is the zero-extended
/// lane 0 of a v4i32 UADDLV over a v8i8 input, and replaces the
/// extract/extend/insert chain with a subvector extract plus NVCAST:
///
///         t34: v4i32 = AArch64ISD::UADDLV t2
///       t35: i32 = extract_vector_elt t34, Constant:i64<0>
///     t7: i64 = zero_extend t35
///   t20: v1i64 = scalar_to_vector t7
/// ==>
///      t34: v4i32 = AArch64ISD::UADDLV t2
///    t39: v2i32 = extract_subvector t34, Constant:i64<0>
///  t40: v1i64 = AArch64ISD::NVCAST t39
///
/// \param N   The scalar_to_vector node being combined.
/// \param DCI Combiner state; nothing is done while
///            DCI.isBeforeLegalizeOps() is true.
/// \param DAG The selection DAG used to create the replacement nodes.
/// \returns The new NVCAST value, or an empty SDValue when \p N does not
///          match the pattern above.
static SDValue
performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                             SelectionDAG &DAG) {
  // Bail out early unless we are past op legalization and produce v1i64.
  if (DCI.isBeforeLegalizeOps() || N->getValueType(0) != MVT::v1i64)
    return SDValue();

  // scalar_to_vector <- zero_extend to i64.
  SDValue ZExt = N->getOperand(0);
  if (ZExt.getOpcode() != ISD::ZERO_EXTEND || ZExt.getValueType() != MVT::i64)
    return SDValue();

  // zero_extend <- i32 extract_vector_elt of element 0.
  SDValue Extract = ZExt.getOperand(0);
  if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      Extract.getValueType() != MVT::i32 ||
      !isNullConstant(Extract.getOperand(1)))
    return SDValue();

  // extract_vector_elt <- v4i32 UADDLV of a v8i8 source. The operand type is
  // only inspected once the opcode is known to be UADDLV.
  SDValue AddLV = Extract.getOperand(0);
  const bool IsV8i8UAddLV = AddLV.getOpcode() == AArch64ISD::UADDLV &&
                            AddLV.getValueType() == MVT::v4i32 &&
                            AddLV.getOperand(0).getValueType() == MVT::v8i8;
  if (!IsV8i8UAddLV)
    return SDValue();

  // Build the replacement: take the v2i32 subvector starting at index 0 and
  // reinterpret it as v1i64 with AArch64ISD::NVCAST.
  SDLoc DL(N);
  SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, AddLV,
                               DAG.getConstant(0, DL, MVT::i64));
  return DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, SubVec);
}
23174+
2312423175
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
2312523176
DAGCombinerInfo &DCI) const {
2312623177
SelectionDAG &DAG = DCI.DAG;
@@ -23436,6 +23487,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
2343623487
return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
2343723488
case ISD::CTLZ:
2343823489
return performCTLZCombine(N, DAG, Subtarget);
23490+
case ISD::SCALAR_TO_VECTOR:
23491+
return performScalarToVectorCombine(N, DCI, DAG);
2343923492
}
2344023493
return SDValue();
2344123494
}

llvm/test/CodeGen/AArch64/neon-addlv.ll

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,8 +178,8 @@ entry:
178178
ret i32 %0
179179
}
180180

181-
define dso_local <8 x i8> @bar(<8 x i8> noundef %a) local_unnamed_addr #0 {
182-
; CHECK-LABEL: bar:
181+
define dso_local <8 x i8> @uaddlv_v8i8_dup(<8 x i8> %a) {
182+
; CHECK-LABEL: uaddlv_v8i8_dup:
183183
; CHECK: // %bb.0: // %entry
184184
; CHECK-NEXT: uaddlv h0, v0.8b
185185
; CHECK-NEXT: dup v0.8h, v0.h[0]
@@ -195,3 +195,23 @@ entry:
195195
}
196196

197197
declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
198+
199+
declare i64 @llvm.aarch64.neon.urshl.i64(i64, i64)
200+
201+
; Test that a v8i8 uaddlv feeding a scalar rounding shift needs no copy
; instructions: the expected assembly below is uaddlv immediately followed by
; urshr and dup, all operating on the same FP/SIMD register.
; The IR masks the uaddlv result to 16 bits, zero-extends it to i64, calls
; llvm.aarch64.neon.urshl with a shift amount of -3 (expected to be emitted as
; "urshr ..., #3"), truncates to i8 and splats the value across all 8 lanes.
define <8 x i8> @uaddlv_v8i8_urshr(<8 x i8> %a) {
202+
; CHECK-LABEL: uaddlv_v8i8_urshr:
203+
; CHECK: // %bb.0: // %entry
204+
; CHECK-NEXT: uaddlv h0, v0.8b
205+
; CHECK-NEXT: urshr d0, d0, #3
206+
; CHECK-NEXT: dup v0.8b, v0.b[0]
207+
; CHECK-NEXT: ret
208+
entry:
209+
%vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a)
210+
%0 = and i32 %vaddlv.i, 65535
211+
%conv = zext i32 %0 to i64
212+
%vrshr_n = tail call i64 @llvm.aarch64.neon.urshl.i64(i64 %conv, i64 -3)
213+
%conv1 = trunc i64 %vrshr_n to i8
214+
%vecinit.i = insertelement <8 x i8> undef, i8 %conv1, i64 0
215+
%vecinit7.i = shufflevector <8 x i8> %vecinit.i, <8 x i8> poison, <8 x i32> zeroinitializer
216+
ret <8 x i8> %vecinit7.i
217+
}

0 commit comments

Comments
 (0)