[AArch64] Fold away zext of extract of uzp. #107367

Merged 1 commit on Sep 5, 2024
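For context, the shape of IR this patch targets, taken verbatim from the updated zext-shuffle.ll tests below: a deinterleaving shufflevector feeding a zext.

define <4 x i32> @v4i32_1357(<8 x i16> %a, <8 x i16> %b) {
  %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %d = zext <4 x i16> %c to <4 x i32>
  ret <4 x i32> %d
}

Before this patch the case above selected to a uzp2 plus a zero-extending ushll; with the fold, a single ushr of the input viewed as .4s suffices, since shifting each odd 16-bit lane down within its 32-bit element already clears the high bits, letting the masking and fold away.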
66 changes: 66 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22240,6 +22240,70 @@ static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N,
DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
}

// This comes up in a similar way to the above when lowering deinterleaving
// shuffles from zexts. We have legalized the operations in the general case to
// zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
// the extract is from the low half and the uzp is uzp1. An extra shift is
// needed if the uzp was uzp2, to grab the upper half. Due to the combine above
// there could also be an existing and / shift that can be combined in, either
// before or after the extract.
static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
if (N->getOpcode() != ISD::ZERO_EXTEND ||
(VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16))
return SDValue();

SDValue Op = N->getOperand(0);
unsigned ExtOffset = (unsigned)-1;
if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
ExtOffset = Op.getConstantOperandVal(1);
Op = Op.getOperand(0);
}

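// The zext keeps only the low bits of the narrower source elements; track the
// surviving bits as a mask at the result element width, narrowing it if we
// look through an existing right shift.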
unsigned Shift = 0;
APInt Mask = APInt::getLowBitsSet(VT.getScalarSizeInBits(),
Op.getValueType().getScalarSizeInBits());

if (Op.getOpcode() == AArch64ISD::VLSHR) {
Shift = Op.getConstantOperandVal(1);
Op = Op.getOperand(0);
Mask = Mask.lshr(Shift);
}
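// An existing and (with a splat constant) or bic (with an immediate) tells us
// exactly which bits survive; use that as the mask instead.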
if (Op.getOpcode() == ISD::AND &&
ISD::isConstantSplatVector(Op.getOperand(1).getNode(), Mask)) {
Op = Op.getOperand(0);
Mask = Mask.zext(VT.getScalarSizeInBits());
} else if (Op.getOpcode() == AArch64ISD::BICi) {
Mask = ~APInt(Op.getValueType().getScalarSizeInBits(),
Op.getConstantOperandVal(1) << Op.getConstantOperandVal(2));
Mask = Mask.zext(VT.getScalarSizeInBits());
Op = Op.getOperand(0);
}

if (ExtOffset == (unsigned)-1) {
if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
ExtOffset = Op.getConstantOperandVal(1);
Op = Op.getOperand(0);
} else
return SDValue();
}
if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
return SDValue();

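// Only a uzp1/uzp2 deinterleave can be rewritten this way. uzp1 takes the even
// lanes, which sit in the low half of each wider element; uzp2 takes the odd
// lanes, so shift by an extra half element to bring them down.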
if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2)
return SDValue();
if (Op.getOpcode() == AArch64ISD::UZP2)
Shift += VT.getScalarSizeInBits() / 2;

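// An extract of the low half selects the uzp's first operand, the high half
// its second. NVCAST just reinterprets that operand at the result type and
// generates no code.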
SDLoc DL(N);
SDValue BC = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
Op.getOperand(ExtOffset == 0 ? 0 : 1));
if (Shift != 0)
BC = DAG.getNode(AArch64ISD::VLSHR, DL, VT, BC,
DAG.getConstant(Shift, DL, MVT::i32));
return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
}

static SDValue performExtendCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -22262,6 +22326,8 @@ static SDValue performExtendCombine(SDNode *N,

if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG))
return R;
if (SDValue R = performZExtUZPCombine(N, DAG))
return R;

if (N->getValueType(0).isFixedLengthVector() &&
N->getOpcode() == ISD::SIGN_EXTEND &&
162 changes: 69 additions & 93 deletions llvm/test/CodeGen/AArch64/zext-shuffle.ll
@@ -90,10 +90,11 @@ define <4 x i64> @v2i64_i16_04812(<16 x i16> %a) {
define <4 x i64> @v2i64_i16_15913(<16 x i16> %a) {
; CHECK-LABEL: v2i64_i16_15913:
; CHECK: // %bb.0:
; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ushr v0.4s, v0.4s, #16
; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: movi v2.2d, #0x0000000000ffff
; CHECK-NEXT: ushr v0.2d, v0.2d, #16
; CHECK-NEXT: ushr v1.2d, v1.2d, #16
; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%s1 = shufflevector <16 x i16> %a, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
%z1 = zext <4 x i16> %s1 to <4 x i64>
@@ -117,10 +118,8 @@ define <4 x i64> @v2i64_i16_261014(<16 x i16> %a) {
define <4 x i64> @v2i64_i16_371115(<16 x i16> %a) {
; CHECK-LABEL: v2i64_i16_371115:
; CHECK: // %bb.0:
; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ushr v0.4s, v0.4s, #16
; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: ushr v0.2d, v0.2d, #48
; CHECK-NEXT: ushr v1.2d, v1.2d, #48
; CHECK-NEXT: ret
%s1 = shufflevector <16 x i16> %a, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
%z1 = zext <4 x i16> %s1 to <4 x i64>
@@ -142,8 +141,7 @@ define <4 x i32> @v4i32_0246(<8 x i16> %a, <8 x i16> %b) {
define <4 x i32> @v4i32_1357(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: v4i32_1357:
; CHECK: // %bb.0:
; CHECK-NEXT: uzp2 v0.8h, v0.8h, v0.8h
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: ushr v0.4s, v0.4s, #16
; CHECK-NEXT: ret
%c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%d = zext <4 x i16> %c to <4 x i32>
@@ -210,8 +208,7 @@ define <8 x i16> @v8i16_0246(<16 x i8> %a, <16 x i8> %b) {
define <8 x i16> @v8i16_1357(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: v8i16_1357:
; CHECK: // %bb.0:
; CHECK-NEXT: uzp2 v0.16b, v0.16b, v0.16b
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: ushr v0.8h, v0.8h, #8
; CHECK-NEXT: ret
%c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%d = zext <8 x i8> %c to <8 x i16>
@@ -278,8 +275,7 @@ define <8 x i32> @v8i32_0246(<16 x i8> %a, <16 x i8> %b) {
define <8 x i32> @v8i32_1357(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: v8i32_1357:
; CHECK: // %bb.0:
; CHECK-NEXT: uzp2 v0.16b, v0.16b, v0.16b
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: ushr v0.8h, v0.8h, #8
; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: ret
@@ -291,10 +287,9 @@ define <8 x i32> @v8i32_1357(<16 x i8> %a, <16 x i8> %b) {
define <8 x i32> @v8i32_04812(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: v8i32_04812:
; CHECK: // %bb.0:
; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff
; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
%d = zext <8 x i8> %c to <8 x i32>
@@ -304,10 +299,11 @@ define <8 x i32> @v8i32_04812(<16 x i8> %a, <16 x i8> %b) {
define <8 x i32> @v8i32_15913(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: v8i32_15913:
; CHECK: // %bb.0:
; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ushr v0.8h, v0.8h, #8
; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff
; CHECK-NEXT: ushr v0.4s, v0.4s, #8
; CHECK-NEXT: ushr v1.4s, v1.4s, #8
; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
%d = zext <8 x i8> %c to <8 x i32>
@@ -317,10 +313,10 @@ define <8 x i32> @v8i32_15913(<16 x i8> %a, <16 x i8> %b) {
define <8 x i32> @v8i32_261014(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: v8i32_261014:
; CHECK: // %bb.0:
; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h
; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: ushr v0.4s, v0.4s, #16
; CHECK-NEXT: ushr v1.4s, v1.4s, #16
; CHECK-NEXT: bic v0.4s, #255, lsl #8
; CHECK-NEXT: bic v1.4s, #255, lsl #8
; CHECK-NEXT: ret
%c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
%d = zext <8 x i8> %c to <8 x i32>
@@ -330,10 +326,8 @@ define <8 x i32> @v8i32_261014(<16 x i8> %a, <16 x i8> %b) {
define <8 x i32> @v8i32_371115(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: v8i32_371115:
; CHECK: // %bb.0:
; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ushr v0.8h, v0.8h, #8
; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: ushr v0.4s, v0.4s, #24
; CHECK-NEXT: ushr v1.4s, v1.4s, #24
; CHECK-NEXT: ret
%c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
%d = zext <8 x i8> %c to <8 x i32>
@@ -407,77 +401,59 @@ define <8 x i64> @zext_load_add(ptr %p) {
define <8 x double> @uitofp_fadd(<32 x i16> %l) {
; CHECK-LABEL: uitofp_fadd:
; CHECK: // %bb.0:
; CHECK-NEXT: uzp1 v5.4s, v0.4s, v3.4s
; CHECK-NEXT: uzp1 v6.4s, v0.4s, v1.4s
; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT: movi d4, #0x00ffff0000ffff
; CHECK-NEXT: uzp1 v7.4s, v2.4s, v3.4s
; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
; CHECK-NEXT: ext v16.16b, v6.16b, v6.16b, #8
; CHECK-NEXT: ext v5.16b, v5.16b, v5.16b, #8
; CHECK-NEXT: uzp2 v1.4s, v0.4s, v3.4s
; CHECK-NEXT: and v17.8b, v6.8b, v4.8b
; CHECK-NEXT: and v18.8b, v7.8b, v4.8b
; CHECK-NEXT: ushr v6.2s, v6.2s, #16
; CHECK-NEXT: ushr v7.2s, v7.2s, #16
; CHECK-NEXT: and v21.8b, v0.8b, v4.8b
; CHECK-NEXT: and v22.8b, v2.8b, v4.8b
; CHECK-NEXT: ushr v2.2s, v2.2s, #16
; CHECK-NEXT: and v19.8b, v16.8b, v4.8b
; CHECK-NEXT: and v20.8b, v5.8b, v4.8b
; CHECK-NEXT: ushll v3.2d, v17.2s, #0
; CHECK-NEXT: ushll v17.2d, v18.2s, #0
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: ushr v16.2s, v16.2s, #16
; CHECK-NEXT: ushr v5.2s, v5.2s, #16
; CHECK-NEXT: ushll v6.2d, v6.2s, #0
; CHECK-NEXT: ushll v7.2d, v7.2s, #0
; CHECK-NEXT: ushll v18.2d, v19.2s, #0
; CHECK-NEXT: ushll v19.2d, v20.2s, #0
; CHECK-NEXT: ext v20.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ushr v0.2s, v0.2s, #16
; CHECK-NEXT: ushll v16.2d, v16.2s, #0
; CHECK-NEXT: ushll v21.2d, v21.2s, #0
; CHECK-NEXT: ushll v5.2d, v5.2s, #0
; CHECK-NEXT: ushll v22.2d, v22.2s, #0
; CHECK-NEXT: ushll v2.2d, v2.2s, #0
; CHECK-NEXT: ucvtf v3.2d, v3.2d
; CHECK-NEXT: ucvtf v17.2d, v17.2d
; CHECK-NEXT: ucvtf v6.2d, v6.2d
; CHECK-NEXT: and v23.8b, v20.8b, v4.8b
; CHECK-NEXT: and v4.8b, v1.8b, v4.8b
; CHECK-NEXT: ushr v20.2s, v20.2s, #16
; CHECK-NEXT: ushr v1.2s, v1.2s, #16
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: ucvtf v7.2d, v7.2d
; CHECK-NEXT: movi v4.2d, #0x0000000000ffff
; CHECK-NEXT: ushr v5.2d, v0.2d, #16
; CHECK-NEXT: ushr v6.2d, v1.2d, #16
; CHECK-NEXT: ushr v7.2d, v2.2d, #16
; CHECK-NEXT: ushr v17.2d, v3.2d, #16
; CHECK-NEXT: ushr v20.2d, v0.2d, #32
; CHECK-NEXT: ushr v22.2d, v1.2d, #32
; CHECK-NEXT: ushr v23.2d, v2.2d, #32
; CHECK-NEXT: ushr v24.2d, v3.2d, #32
; CHECK-NEXT: and v16.16b, v0.16b, v4.16b
; CHECK-NEXT: and v18.16b, v1.16b, v4.16b
; CHECK-NEXT: and v19.16b, v2.16b, v4.16b
; CHECK-NEXT: and v21.16b, v3.16b, v4.16b
; CHECK-NEXT: and v5.16b, v5.16b, v4.16b
; CHECK-NEXT: and v6.16b, v6.16b, v4.16b
; CHECK-NEXT: and v7.16b, v7.16b, v4.16b
; CHECK-NEXT: and v17.16b, v17.16b, v4.16b
; CHECK-NEXT: and v20.16b, v20.16b, v4.16b
; CHECK-NEXT: and v22.16b, v22.16b, v4.16b
; CHECK-NEXT: and v23.16b, v23.16b, v4.16b
; CHECK-NEXT: and v4.16b, v24.16b, v4.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #48
; CHECK-NEXT: ushr v1.2d, v1.2d, #48
; CHECK-NEXT: ushr v2.2d, v2.2d, #48
; CHECK-NEXT: ushr v3.2d, v3.2d, #48
; CHECK-NEXT: ucvtf v16.2d, v16.2d
; CHECK-NEXT: ucvtf v18.2d, v18.2d
; CHECK-NEXT: ucvtf v19.2d, v19.2d
; CHECK-NEXT: ucvtf v16.2d, v16.2d
; CHECK-NEXT: ushll v23.2d, v23.2s, #0
; CHECK-NEXT: ushll v4.2d, v4.2s, #0
; CHECK-NEXT: ushll v20.2d, v20.2s, #0
; CHECK-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-NEXT: ucvtf v5.2d, v5.2d
; CHECK-NEXT: ucvtf v21.2d, v21.2d
; CHECK-NEXT: ucvtf v5.2d, v5.2d
; CHECK-NEXT: ucvtf v6.2d, v6.2d
; CHECK-NEXT: ucvtf v7.2d, v7.2d
; CHECK-NEXT: ucvtf v17.2d, v17.2d
; CHECK-NEXT: ucvtf v20.2d, v20.2d
; CHECK-NEXT: ucvtf v22.2d, v22.2d
; CHECK-NEXT: ucvtf v0.2d, v0.2d
; CHECK-NEXT: ucvtf v2.2d, v2.2d
; CHECK-NEXT: ucvtf v23.2d, v23.2d
; CHECK-NEXT: ucvtf v4.2d, v4.2d
; CHECK-NEXT: ucvtf v20.2d, v20.2d
; CHECK-NEXT: ucvtf v0.2d, v0.2d
; CHECK-NEXT: ucvtf v1.2d, v1.2d
; CHECK-NEXT: fadd v16.2d, v18.2d, v16.2d
; CHECK-NEXT: fadd v7.2d, v17.2d, v7.2d
; CHECK-NEXT: fadd v5.2d, v19.2d, v5.2d
; CHECK-NEXT: fadd v3.2d, v3.2d, v6.2d
; CHECK-NEXT: fadd v0.2d, v21.2d, v0.2d
; CHECK-NEXT: fadd v2.2d, v22.2d, v2.2d
; CHECK-NEXT: fadd v4.2d, v4.2d, v1.2d
; CHECK-NEXT: fadd v1.2d, v23.2d, v20.2d
; CHECK-NEXT: fadd v0.2d, v3.2d, v0.2d
; CHECK-NEXT: ucvtf v2.2d, v2.2d
; CHECK-NEXT: ucvtf v3.2d, v3.2d
; CHECK-NEXT: fadd v5.2d, v16.2d, v5.2d
; CHECK-NEXT: fadd v17.2d, v21.2d, v17.2d
; CHECK-NEXT: fadd v7.2d, v19.2d, v7.2d
; CHECK-NEXT: fadd v6.2d, v18.2d, v6.2d
; CHECK-NEXT: fadd v0.2d, v20.2d, v0.2d
; CHECK-NEXT: fadd v1.2d, v22.2d, v1.2d
; CHECK-NEXT: fadd v3.2d, v4.2d, v3.2d
; CHECK-NEXT: fadd v2.2d, v23.2d, v2.2d
; CHECK-NEXT: fadd v0.2d, v5.2d, v0.2d
; CHECK-NEXT: fadd v1.2d, v6.2d, v1.2d
; CHECK-NEXT: fadd v2.2d, v7.2d, v2.2d
; CHECK-NEXT: fadd v1.2d, v16.2d, v1.2d
; CHECK-NEXT: fadd v3.2d, v5.2d, v4.2d
; CHECK-NEXT: fadd v3.2d, v17.2d, v3.2d
; CHECK-NEXT: ret
%s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
%z1 = uitofp <8 x i16> %s1 to <8 x double>