
Commit 7cc83c5

[AArch64] Don't expand RSHRN intrinsics to add+srl+trunc.
We expand aarch64_neon_rshrn intrinsics to trunc(srl(add)), with tablegen patterns to combine the result back into rshrn. See D140297. Unfortunately, but perhaps not surprisingly, other combines can fire in between and prevent us from converting back. For example, sext(rshrn) becomes sext(trunc(srl(add))), which then turns into sext_inreg(srl(add)), and the rshrn can no longer be selected.

This patch simply stops expanding rshrn intrinsics, reinstating the old tablegen patterns for selecting them. This should still allow us to recognize rshrn instructions from trunc+shift+add, without performing any negative optimizations on the intrinsics.

Closes #67451
1 parent 7fc25ae commit 7cc83c5
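As background, the sext_rshrn test updated below shows the kind of input that regressed. A minimal LLVM IR sketch of it (hand-reduced here for illustration; the function name is not part of the patch): before this change the intrinsic was expanded to add+srl+trunc, and the surrounding sext then folded away the trunc, so rshrn could no longer be selected.

; Illustrative reduction of the sext_rshrn case from arm64-vshift.ll.
define <4 x i32> @sext_rshrn_example(<4 x i32> %a) {
entry:
  ; rounding shift right narrow: v4i32 -> v4i16, shift by 13
  %narrow = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %a, i32 13)
  ; with the old expansion this sext combined with trunc(srl(add))
  ; into sext_inreg(srl(add)), losing the rshrn form
  %wide = sext <4 x i16> %narrow to <4 x i32>
  ret <4 x i32> %wide
}
declare <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32>, i32)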

File tree

3 files changed: +34, -25 lines changed


llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 0 additions & 11 deletions
@@ -19390,17 +19390,6 @@ static SDValue performIntrinsicCombine(SDNode *N,
   case Intrinsic::aarch64_neon_sshl:
   case Intrinsic::aarch64_neon_ushl:
     return tryCombineShiftImm(IID, N, DAG);
-  case Intrinsic::aarch64_neon_rshrn: {
-    EVT VT = N->getOperand(1).getValueType();
-    SDLoc DL(N);
-    SDValue Imm =
-        DAG.getConstant(1LLU << (N->getConstantOperandVal(2) - 1), DL, VT);
-    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(1), Imm);
-    SDValue Sht =
-        DAG.getNode(ISD::SRL, DL, VT, Add,
-                    DAG.getConstant(N->getConstantOperandVal(2), DL, VT));
-    return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Sht);
-  }
   case Intrinsic::aarch64_neon_sabd:
     return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2));

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 28 additions & 2 deletions
@@ -777,6 +777,9 @@ def AArch64faddp : PatFrags<(ops node:$Rn, node:$Rm),
                                 [(AArch64addp_n node:$Rn, node:$Rm),
                                  (int_aarch64_neon_faddp node:$Rn, node:$Rm)]>;
 def AArch64roundingvlshr : ComplexPattern<vAny, 2, "SelectRoundingVLShr", [AArch64vlshr]>;
+def AArch64rshrn : PatFrags<(ops node:$LHS, node:$RHS),
+                            [(trunc (AArch64roundingvlshr node:$LHS, node:$RHS)),
+                             (int_aarch64_neon_rshrn node:$LHS, node:$RHS)]>;
 def AArch64facge : PatFrags<(ops node:$Rn, node:$Rm),
                             [(AArch64fcmge (fabs node:$Rn), (fabs node:$Rm)),
                              (int_aarch64_neon_facge node:$Rn, node:$Rm)]>;
@@ -7191,8 +7194,7 @@ defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs
 defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>;
 defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf",
                                  int_aarch64_neon_vcvtfxs2fp>;
-defm RSHRN   : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
-                                         BinOpFrag<(trunc (AArch64roundingvlshr node:$LHS, node:$RHS))>>;
+defm RSHRN   : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", AArch64rshrn>;
 defm SHL     : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>;

 // X << 1 ==> X + X
@@ -7263,6 +7265,12 @@ def : Pat<(v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), (AArch64movi_shift
 let AddedComplexity = 5 in
 def : Pat<(v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), (AArch64dup (i64 2147483648))), (i32 32)))),
           (RADDHNv2i64_v2i32 V128:$Vn, (v2i64 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))),
+          (RADDHNv8i16_v8i8 V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v4i16 (int_aarch64_neon_rshrn (v4i32 V128:$Vn), (i32 16))),
+          (RADDHNv4i32_v4i16 V128:$Vn, (v4i32 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))),
+          (RADDHNv2i64_v2i32 V128:$Vn, (v2i64 (MOVIv2d_ns (i32 0))))>;

 // RADDHN2 patterns for when RSHRN shifts by half the size of the vector element
 def : Pat<(v16i8 (concat_vectors
@@ -7284,6 +7292,24 @@ def : Pat<(v4i32 (concat_vectors
           (RADDHNv2i64_v4i32
             (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
             (v2i64 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v16i8 (concat_vectors
+                 (v8i8 V64:$Vd),
+                 (v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))))),
+          (RADDHNv8i16_v16i8
+                 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
+                 (v8i16 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v8i16 (concat_vectors
+                 (v4i16 V64:$Vd),
+                 (v4i16 (int_aarch64_neon_rshrn (v4i32 V128:$Vn), (i32 16))))),
+          (RADDHNv4i32_v8i16
+                 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
+                 (v4i32 (MOVIv2d_ns (i32 0))))>;
+def : Pat<(v4i32 (concat_vectors
+                 (v2i32 V64:$Vd),
+                 (v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))))),
+          (RADDHNv2i64_v4i32
+                 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
+                 (v2i64 (MOVIv2d_ns (i32 0))))>;

 // SHRN patterns for when a logical right shift was used instead of arithmetic
 // (the immediate guarantees no sign bits actually end up in the result so it
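The raddhn patterns above rest on the identity that a rounding shift-right-narrow by the full half-width equals a rounding add-narrow-high against zero: for 32-bit elements narrowed to 16 bits, rshrn(x, 16) computes (x + (1 << 15)) >> 16, which is exactly what raddhn(x, 0) produces. A minimal IR sketch (illustrative, not taken from the patch) of an input the new v4i16 pattern is intended to match:

; rshrn by half the element width (16 for i32 elements) should now
; select to a raddhn against a zero vector instead of a real shift.
define <4 x i16> @rshrn_half_width_example(<4 x i32> %a) {
entry:
  %r = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %a, i32 16)
  ret <4 x i16> %r
}
declare <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32>, i32)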

llvm/test/CodeGen/AArch64/arm64-vshift.ll

Lines changed: 6 additions & 12 deletions
@@ -3531,11 +3531,8 @@ entry:
 define <4 x i32> @sext_rshrn(<4 x i32> noundef %a) {
 ; CHECK-LABEL: sext_rshrn:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi.4s v1, #16, lsl #8
-; CHECK-NEXT:    add.4s v0, v0, v1
-; CHECK-NEXT:    ushr.4s v0, v0, #13
-; CHECK-NEXT:    shl.4s v0, v0, #16
-; CHECK-NEXT:    sshr.4s v0, v0, #16
+; CHECK-NEXT:    rshrn.4h v0, v0, #13
+; CHECK-NEXT:    sshll.4s v0, v0, #0
 ; CHECK-NEXT:    ret
 entry:
   %vrshrn_n1 = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %a, i32 13)
@@ -3546,10 +3543,8 @@ entry:
 define <4 x i32> @zext_rshrn(<4 x i32> noundef %a) {
 ; CHECK-LABEL: zext_rshrn:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi.4s v1, #16, lsl #8
-; CHECK-NEXT:    add.4s v0, v0, v1
-; CHECK-NEXT:    ushr.4s v0, v0, #13
-; CHECK-NEXT:    bic.4s v0, #7, lsl #16
+; CHECK-NEXT:    rshrn.4h v0, v0, #13
+; CHECK-NEXT:    ushll.4s v0, v0, #0
 ; CHECK-NEXT:    ret
 entry:
   %vrshrn_n1 = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %a, i32 13)
@@ -3560,10 +3555,9 @@ entry:
 define <4 x i16> @mul_rshrn(<4 x i32> noundef %a) {
 ; CHECK-LABEL: mul_rshrn:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #4099 // =0x1003
-; CHECK-NEXT:    dup.4s v1, w8
+; CHECK-NEXT:    movi.4s v1, #3
 ; CHECK-NEXT:    add.4s v0, v0, v1
-; CHECK-NEXT:    shrn.4h v0, v0, #13
+; CHECK-NEXT:    rshrn.4h v0, v0, #13
 ; CHECK-NEXT:    ret
 entry:
   %b = add <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>