[AArch64] Don't expand RSHRN intrinsics to add+srl+trunc. #67451

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed · wants to merge 1 commit
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (0 additions, 11 deletions)

@@ -19390,17 +19390,6 @@ static SDValue performIntrinsicCombine(SDNode *N,
   case Intrinsic::aarch64_neon_sshl:
   case Intrinsic::aarch64_neon_ushl:
     return tryCombineShiftImm(IID, N, DAG);
-  case Intrinsic::aarch64_neon_rshrn: {
-    EVT VT = N->getOperand(1).getValueType();
-    SDLoc DL(N);
-    SDValue Imm =
-        DAG.getConstant(1LLU << (N->getConstantOperandVal(2) - 1), DL, VT);
-    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(1), Imm);
-    SDValue Sht =
-        DAG.getNode(ISD::SRL, DL, VT, Add,
-                    DAG.getConstant(N->getConstantOperandVal(2), DL, VT));
-    return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Sht);
-  }
   case Intrinsic::aarch64_neon_sabd:
     return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2));
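A minimal scalar sketch of what the deleted combine computed, assuming a
16-bit to 8-bit narrow (the helper name rshrn_expanded is illustrative, not
an LLVM API): the intrinsic was expanded into add-rounding-constant, logical
shift right, truncate, and as the test changes below show, later folds could
rewrite that sequence into forms that no longer matched the RSHRN patterns.

#include <cassert>
#include <cstdint>

// rshrn(x, n) as the removed combine expanded it: ADD the rounding constant
// 1 << (n - 1), SRL by n, then TRUNCATE to the narrow element type. The add
// wraps at the wide element width, as it does in the DAG expansion.
static uint8_t rshrn_expanded(uint16_t x, unsigned n) {
  uint16_t add = static_cast<uint16_t>(x + (1u << (n - 1)));
  uint16_t sht = static_cast<uint16_t>(add >> n);
  return static_cast<uint8_t>(sht);
}

int main() {
  assert(rshrn_expanded(300, 4) == 19); // 300 / 16 = 18.75, rounds up to 19
  return 0;
}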
llvm/lib/Target/AArch64/AArch64InstrInfo.td (28 additions, 2 deletions)

@@ -777,6 +777,9 @@ def AArch64faddp : PatFrags<(ops node:$Rn, node:$Rm),
                             [(AArch64addp_n node:$Rn, node:$Rm),
                              (int_aarch64_neon_faddp node:$Rn, node:$Rm)]>;
 def AArch64roundingvlshr : ComplexPattern<vAny, 2, "SelectRoundingVLShr", [AArch64vlshr]>;
+def AArch64rshrn : PatFrags<(ops node:$LHS, node:$RHS),
+                            [(trunc (AArch64roundingvlshr node:$LHS, node:$RHS)),
+                             (int_aarch64_neon_rshrn node:$LHS, node:$RHS)]>;
 def AArch64facge : PatFrags<(ops node:$Rn, node:$Rm),
                             [(AArch64fcmge (fabs node:$Rn), (fabs node:$Rm)),
                              (int_aarch64_neon_facge node:$Rn, node:$Rm)]>;

@@ -7191,8 +7194,7 @@ defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs
 defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>;
 defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf",
                                  int_aarch64_neon_vcvtfxs2fp>;
-defm RSHRN   : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
-                          BinOpFrag<(trunc (AArch64roundingvlshr node:$LHS, node:$RHS))>>;
+defm RSHRN   : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", AArch64rshrn>;
 defm SHL     : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>;

 // X << 1  ==>  X + X

@@ -7263,6 +7265,12 @@ def : Pat<(v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), (AArch64movi_shift
 let AddedComplexity = 5 in
 def : Pat<(v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), (AArch64dup (i64 2147483648))), (i32 32)))),
           (RADDHNv2i64_v2i32 V128:$Vn, (v2i64 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))),
+          (RADDHNv8i16_v8i8 V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v4i16 (int_aarch64_neon_rshrn (v4i32 V128:$Vn), (i32 16))),
+          (RADDHNv4i32_v4i16 V128:$Vn, (v4i32 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))),
+          (RADDHNv2i64_v2i32 V128:$Vn, (v2i64 (MOVIv2d_ns (i32 0))))>;

 // RADDHN2 patterns for when RSHRN shifts by half the size of the vector element

@@ -7284,6 +7292,24 @@ def : Pat<(v4i32 (concat_vectors
           (RADDHNv2i64_v4i32
             (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
             (v2i64 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v16i8 (concat_vectors
+                 (v8i8 V64:$Vd),
+                 (v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))))),
+          (RADDHNv8i16_v16i8
+                 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
+                 (v8i16 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v8i16 (concat_vectors
+                 (v4i16 V64:$Vd),
+                 (v4i16 (int_aarch64_neon_rshrn (v4i32 V128:$Vn), (i32 16))))),
+          (RADDHNv4i32_v8i16
+                 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
+                 (v4i32 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v4i32 (concat_vectors
+                 (v2i32 V64:$Vd),
+                 (v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))))),
+          (RADDHNv2i64_v4i32
+                 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
+                 (v2i64 (MOVIv2d_ns (i32 0))))>;

 // SHRN patterns for when a logical right shift was used instead of arithmetic
 // (the immediate guarantees no sign bits actually end up in the result so it
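The new standalone patterns encode the same fact as the earlier trunc-based
RADDHN patterns: an rshrn whose shift amount is exactly half the element
width is a rounding-add-narrow-high against a zero vector, which is why the
selected instruction pairs the source with a zeroed register (MOVIv2d_ns 0).
A scalar sketch of that equivalence for 16-bit elements, checked
exhaustively (helper names are illustrative):

#include <cassert>
#include <cstdint>

// rshrn: rounding shift right narrow.
static uint8_t rshrn16(uint16_t x, unsigned n) {
  return static_cast<uint8_t>((x + (1u << (n - 1))) >> n);
}

// raddhn: rounding add, keep the high half of each 16-bit sum.
static uint8_t raddhn16(uint16_t a, uint16_t b) {
  return static_cast<uint8_t>(
      (static_cast<uint32_t>(a) + b + (1u << 7)) >> 8);
}

int main() {
  for (uint32_t x = 0; x <= 0xFFFF; ++x) // all 16-bit inputs
    assert(rshrn16(static_cast<uint16_t>(x), 8) ==
           raddhn16(static_cast<uint16_t>(x), 0)); // shift == half the width
  return 0;
}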
llvm/test/CodeGen/AArch64/arm64-vshift.ll (6 additions, 12 deletions)

@@ -3531,11 +3531,8 @@ entry:
 define <4 x i32> @sext_rshrn(<4 x i32> noundef %a) {
 ; CHECK-LABEL: sext_rshrn:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi.4s v1, #16, lsl #8
-; CHECK-NEXT:    add.4s v0, v0, v1
-; CHECK-NEXT:    ushr.4s v0, v0, #13
-; CHECK-NEXT:    shl.4s v0, v0, #16
-; CHECK-NEXT:    sshr.4s v0, v0, #16
+; CHECK-NEXT:    rshrn.4h v0, v0, #13
+; CHECK-NEXT:    sshll.4s v0, v0, #0
 ; CHECK-NEXT:    ret
 entry:
   %vrshrn_n1 = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %a, i32 13)

@@ -3546,10 +3543,8 @@ entry:
 define <4 x i32> @zext_rshrn(<4 x i32> noundef %a) {
 ; CHECK-LABEL: zext_rshrn:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi.4s v1, #16, lsl #8
-; CHECK-NEXT:    add.4s v0, v0, v1
-; CHECK-NEXT:    ushr.4s v0, v0, #13
-; CHECK-NEXT:    bic.4s v0, #7, lsl #16
+; CHECK-NEXT:    rshrn.4h v0, v0, #13
+; CHECK-NEXT:    ushll.4s v0, v0, #0
 ; CHECK-NEXT:    ret
 entry:
   %vrshrn_n1 = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %a, i32 13)

@@ -3560,10 +3555,9 @@ entry:
 define <4 x i16> @mul_rshrn(<4 x i32> noundef %a) {
 ; CHECK-LABEL: mul_rshrn:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #4099 // =0x1003
-; CHECK-NEXT:    dup.4s v1, w8
+; CHECK-NEXT:    movi.4s v1, #3
 ; CHECK-NEXT:    add.4s v0, v0, v1
-; CHECK-NEXT:    shrn.4h v0, v0, #13
+; CHECK-NEXT:    rshrn.4h v0, v0, #13
 ; CHECK-NEXT:    ret
 entry:
   %b = add <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
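At the source level, the updated tests correspond to NEON intrinsic code
along these lines (a hypothetical sketch mirroring sext_rshrn; compile for
an AArch64 target to inspect the codegen):

#include <arm_neon.h>

// Rounding shift-right-narrow followed by a sign-extending widen. With this
// patch the pair lowers to rshrn + sshll instead of the former
// movi/add/ushr/shl/sshr sequence.
int32x4_t sext_rshrn_example(int32x4_t a) {
  int16x4_t narrowed = vrshrn_n_s32(a, 13); // rounding shift right narrow by 13
  return vmovl_s16(narrowed);               // sign-extend 4h -> 4s
}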