Skip to content

[AArch64] Handle v2i16 and v2i8 in concat load combine. #86264

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 12 additions & 9 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18629,32 +18629,35 @@ static SDValue performConcatVectorsCombine(SDNode *N,
}
}

if (N->getOperand(0).getValueType() == MVT::v4i8) {
if (N->getOperand(0).getValueType() == MVT::v4i8 ||
N->getOperand(0).getValueType() == MVT::v2i16 ||
N->getOperand(0).getValueType() == MVT::v2i8) {
EVT SrcVT = N->getOperand(0).getValueType();
// If we have a concat of v4i8, v2i16 or v2i8 loads, convert them to a
// buildvector of scalar FP loads of the same total width (f32 for v4i8 and
// v2i16, f16 for v2i8) to prevent having to go through the small-vector load
// legalization that needs to extend each element into a larger type.
if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
if (V.getValueType() != MVT::v4i8)
if (N->getNumOperands() % 2 == 0 &&
all_of(N->op_values(), [SrcVT](SDValue V) {
if (V.getValueType() != SrcVT)
return false;
if (V.isUndef())
return true;
LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
LD->getExtensionType() == ISD::NON_EXTLOAD;
})) {
EVT NVT =
EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
SmallVector<SDValue> Ops;

for (unsigned i = 0; i < N->getNumOperands(); i++) {
SDValue V = N->getOperand(i);
if (V.isUndef())
Ops.push_back(DAG.getUNDEF(MVT::f32));
Ops.push_back(DAG.getUNDEF(FVT));
else {
LoadSDNode *LD = cast<LoadSDNode>(V);
SDValue NewLoad =
DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
LD->getMemOperand());
SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
LD->getBasePtr(), LD->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
Ops.push_back(NewLoad);
}
Expand Down
85 changes: 16 additions & 69 deletions llvm/test/CodeGen/AArch64/insert-subvector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -377,12 +377,8 @@ define <16 x i8> @load_v16i8_8_2(float %tmp, <16 x i8> %b, ptr %a) {
define <8 x i8> @load_v8i8_2_1(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_1:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1 { v2.b }[0], [x0]
; CHECK-NEXT: add x8, x0, #1
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-NEXT: mov v2.b[1], v0.b[4]
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: ldr h2, [x0]
; CHECK-NEXT: mov v0.h[0], v2.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
Expand All @@ -395,12 +391,9 @@ define <8 x i8> @load_v8i8_2_1(float %tmp, <8 x i8> %b, ptr %a) {
define <8 x i8> @load_v8i8_2_15(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_15:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1 { v0.b }[0], [x0]
; CHECK-NEXT: add x8, x0, #1
; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-NEXT: adrp x8, .LCPI33_0
; CHECK-NEXT: mov v0.b[1], v0.b[4]
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI33_0]
; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
Expand All @@ -414,12 +407,8 @@ define <8 x i8> @load_v8i8_2_15(float %tmp, <8 x i8> %b, ptr %a) {
define <8 x i8> @load_v8i8_2_2(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_2:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1 { v2.b }[0], [x0]
; CHECK-NEXT: add x8, x0, #1
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-NEXT: mov v2.b[1], v0.b[4]
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: ldr h2, [x0]
; CHECK-NEXT: mov v0.h[1], v2.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
Expand All @@ -432,12 +421,8 @@ define <8 x i8> @load_v8i8_2_2(float %tmp, <8 x i8> %b, ptr %a) {
define <8 x i8> @load_v8i8_2_3(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_3:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1 { v2.b }[0], [x0]
; CHECK-NEXT: add x8, x0, #1
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-NEXT: mov v2.b[1], v0.b[4]
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: ldr h2, [x0]
; CHECK-NEXT: mov v0.h[2], v2.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
Expand All @@ -450,12 +435,8 @@ define <8 x i8> @load_v8i8_2_3(float %tmp, <8 x i8> %b, ptr %a) {
define <8 x i8> @load_v8i8_2_4(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_4:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1 { v2.b }[0], [x0]
; CHECK-NEXT: add x8, x0, #1
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-NEXT: mov v2.b[1], v0.b[4]
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: ldr h2, [x0]
; CHECK-NEXT: mov v0.h[3], v2.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
Expand All @@ -468,11 +449,9 @@ define <8 x i8> @load_v8i8_2_4(float %tmp, <8 x i8> %b, ptr %a) {
define <4 x i8> @load_v4i8_2_1(float %tmp, <4 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v4i8_2_1:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1 { v0.b }[0], [x0]
; CHECK-NEXT: add x8, x0, #1
; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-NEXT: zip1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: mov v0.s[1], v1.s[1]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
Expand All @@ -485,10 +464,8 @@ define <4 x i8> @load_v4i8_2_1(float %tmp, <4 x i8> %b, ptr %a) {
define <4 x i8> @load_v4i8_2_2(float %tmp, <4 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v4i8_2_2:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1 { v0.b }[0], [x0]
; CHECK-NEXT: add x8, x0, #1
; CHECK-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h
; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: zip1 v2.8b, v0.8b, v0.8b
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: mov v0.s[1], v2.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
Expand All @@ -504,13 +481,8 @@ define <4 x i8> @load_v4i8_2_2(float %tmp, <4 x i8> %b, ptr %a) {
define <8 x i16> @load_v8i16_2_1(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_1:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: add x9, x0, #2
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ld1 { v0.h }[2], [x9]
; CHECK-NEXT: xtn v2.4h, v0.4s
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[0], v2.s[0]
; CHECK-NEXT: ld1 { v0.s }[0], [x0]
; CHECK-NEXT: ret
%l = load <2 x i16>, ptr %a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
Expand All @@ -521,13 +493,9 @@ define <8 x i16> @load_v8i16_2_1(float %tmp, <8 x i16> %b, ptr %a) {
define <8 x i16> @load_v8i16_2_15(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_15:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: add x9, x0, #2
; CHECK-NEXT: // kill: def $q1 killed $q1 def $q0_q1
; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: adrp x8, .LCPI40_0
; CHECK-NEXT: ld1 { v2.h }[2], [x9]
; CHECK-NEXT: xtn v0.4h, v2.4s
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI40_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ret
Expand All @@ -540,13 +508,8 @@ define <8 x i16> @load_v8i16_2_15(float %tmp, <8 x i16> %b, ptr %a) {
define <8 x i16> @load_v8i16_2_2(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_2:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: add x9, x0, #2
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ld1 { v0.h }[2], [x9]
; CHECK-NEXT: xtn v2.4h, v0.4s
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[1], v2.s[0]
; CHECK-NEXT: ld1 { v0.s }[1], [x0]
; CHECK-NEXT: ret
%l = load <2 x i16>, ptr %a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
Expand All @@ -557,13 +520,8 @@ define <8 x i16> @load_v8i16_2_2(float %tmp, <8 x i16> %b, ptr %a) {
define <8 x i16> @load_v8i16_2_3(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_3:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: add x9, x0, #2
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ld1 { v0.h }[2], [x9]
; CHECK-NEXT: xtn v2.4h, v0.4s
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[2], v2.s[0]
; CHECK-NEXT: ld1 { v0.s }[2], [x0]
; CHECK-NEXT: ret
%l = load <2 x i16>, ptr %a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
Expand All @@ -574,13 +532,8 @@ define <8 x i16> @load_v8i16_2_3(float %tmp, <8 x i16> %b, ptr %a) {
define <8 x i16> @load_v8i16_2_4(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_4:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: add x9, x0, #2
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ld1 { v0.h }[2], [x9]
; CHECK-NEXT: xtn v2.4h, v0.4s
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[3], v2.s[0]
; CHECK-NEXT: ld1 { v0.s }[3], [x0]
; CHECK-NEXT: ret
%l = load <2 x i16>, ptr %a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
Expand All @@ -591,11 +544,8 @@ define <8 x i16> @load_v8i16_2_4(float %tmp, <8 x i16> %b, ptr %a) {
define <4 x i16> @load_v4i16_2_1(float %tmp, <4 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v4i16_2_1:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1 { v0.h }[0], [x0]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-NEXT: mov v0.s[1], v1.s[1]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
Expand All @@ -608,11 +558,8 @@ define <4 x i16> @load_v4i16_2_1(float %tmp, <4 x i16> %b, ptr %a) {
define <4 x i16> @load_v4i16_2_2(float %tmp, <4 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v4i16_2_2:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1 { v0.h }[0], [x0]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: ldr s2, [x0]
; CHECK-NEXT: mov v0.s[1], v2.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
Expand Down