
[AArch64] Do not generate uitofp(ld4) where and/shift can be used. #107538


Merged: 1 commit, Sep 7, 2024
10 changes: 10 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17079,6 +17079,16 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
return false;

// Check if the interleave is really a zext(shuffle) that would be better
// optimized into shift/and masks. For the moment we do this just for uitofp
// (not zext) to avoid issues with widening instructions.
if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
SI->getType()->getScalarSizeInBits() * 4 ==
SI->user_back()->getType()->getScalarSizeInBits();
}))
return false;

unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);

auto *FVTy = cast<FixedVectorType>(VTy);
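For reference, here is a minimal IR sketch of the shape this early-exit targets (the function name is hypothetical; the body is modeled on the uitofp_load_fadd test updated below): one wide load deinterleaved by four stride-4 shuffles, each with a single uitofp user that widens the scalar type by 4x (i16 to double, 16 * 4 == 64 bits). Returning false here keeps the load out of the ld4 path so the deinterleave-and-extend can instead be lowered as the ushr/and sequence the new test output shows.

define <8 x double> @uitofp_ld4_shape(ptr %p) {
  ; One wide load deinterleaved by four stride-4 shuffles: an ld4 candidate.
  %l = load <32 x i16>, ptr %p
  %s0 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %s2 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %s3 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  ; Each shuffle has exactly one use, a uitofp widening i16 to double, so the
  ; new check in lowerInterleavedLoad fires and no ld4 is formed.
  %f0 = uitofp <8 x i16> %s0 to <8 x double>
  %f1 = uitofp <8 x i16> %s1 to <8 x double>
  %f2 = uitofp <8 x i16> %s2 to <8 x double>
  %f3 = uitofp <8 x i16> %s3 to <8 x double>
  %a0 = fadd <8 x double> %f0, %f1
  %a1 = fadd <8 x double> %f2, %f3
  %r = fadd <8 x double> %a0, %a1
  ret <8 x double> %r
}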
94 changes: 48 additions & 46 deletions llvm/test/CodeGen/AArch64/zext-shuffle.ll
@@ -472,59 +472,61 @@ define <8 x double> @uitofp_fadd(<32 x i16> %l) {
define <8 x double> @uitofp_load_fadd(ptr %p) {
; CHECK-LABEL: uitofp_load_fadd:
; CHECK: // %bb.0:
; CHECK-NEXT: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
; CHECK-NEXT: ushll2 v4.4s, v0.8h, #0
; CHECK-NEXT: ushll v5.4s, v0.4h, #0
; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0
; CHECK-NEXT: ushll v7.4s, v1.4h, #0
; CHECK-NEXT: ushll2 v16.4s, v2.8h, #0
; CHECK-NEXT: ushll v17.4s, v2.4h, #0
; CHECK-NEXT: ushll2 v18.4s, v3.8h, #0
; CHECK-NEXT: ushll v0.4s, v3.4h, #0
; CHECK-NEXT: ushll2 v1.2d, v4.4s, #0
; CHECK-NEXT: ushll2 v2.2d, v5.4s, #0
; CHECK-NEXT: ushll v3.2d, v4.2s, #0
; CHECK-NEXT: ushll v4.2d, v5.2s, #0
; CHECK-NEXT: ushll2 v5.2d, v6.4s, #0
; CHECK-NEXT: ushll2 v19.2d, v7.4s, #0
; CHECK-NEXT: ushll v6.2d, v6.2s, #0
; CHECK-NEXT: ushll v7.2d, v7.2s, #0
; CHECK-NEXT: ushll2 v20.2d, v16.4s, #0
; CHECK-NEXT: ushll2 v21.2d, v17.4s, #0
; CHECK-NEXT: ushll v16.2d, v16.2s, #0
; CHECK-NEXT: ushll v17.2d, v17.2s, #0
; CHECK-NEXT: ushll v22.2d, v0.2s, #0
; CHECK-NEXT: ushll2 v23.2d, v18.4s, #0
; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-NEXT: ushll v18.2d, v18.2s, #0
; CHECK-NEXT: ucvtf v1.2d, v1.2d
; CHECK-NEXT: ucvtf v2.2d, v2.2d
; CHECK-NEXT: ucvtf v3.2d, v3.2d
; CHECK-NEXT: ucvtf v4.2d, v4.2d
; CHECK-NEXT: ucvtf v5.2d, v5.2d
; CHECK-NEXT: ldp q1, q2, [x0]
; CHECK-NEXT: movi v0.2d, #0x0000000000ffff
; CHECK-NEXT: ldp q3, q4, [x0, #32]
; CHECK-NEXT: ushr v5.2d, v1.2d, #16
; CHECK-NEXT: ushr v6.2d, v2.2d, #16
; CHECK-NEXT: ushr v20.2d, v1.2d, #32
; CHECK-NEXT: ushr v7.2d, v3.2d, #16
; CHECK-NEXT: ushr v17.2d, v4.2d, #16
; CHECK-NEXT: ushr v22.2d, v2.2d, #32
; CHECK-NEXT: ushr v23.2d, v3.2d, #32
; CHECK-NEXT: ushr v24.2d, v4.2d, #32
; CHECK-NEXT: and v16.16b, v1.16b, v0.16b
; CHECK-NEXT: and v18.16b, v2.16b, v0.16b
; CHECK-NEXT: and v19.16b, v3.16b, v0.16b
; CHECK-NEXT: and v21.16b, v4.16b, v0.16b
; CHECK-NEXT: and v5.16b, v5.16b, v0.16b
; CHECK-NEXT: and v6.16b, v6.16b, v0.16b
; CHECK-NEXT: and v7.16b, v7.16b, v0.16b
; CHECK-NEXT: and v17.16b, v17.16b, v0.16b
; CHECK-NEXT: and v20.16b, v20.16b, v0.16b
; CHECK-NEXT: and v22.16b, v22.16b, v0.16b
; CHECK-NEXT: and v23.16b, v23.16b, v0.16b
; CHECK-NEXT: and v0.16b, v24.16b, v0.16b
; CHECK-NEXT: ushr v1.2d, v1.2d, #48
; CHECK-NEXT: ushr v2.2d, v2.2d, #48
; CHECK-NEXT: ushr v3.2d, v3.2d, #48
; CHECK-NEXT: ushr v4.2d, v4.2d, #48
; CHECK-NEXT: ucvtf v16.2d, v16.2d
; CHECK-NEXT: ucvtf v18.2d, v18.2d
; CHECK-NEXT: ucvtf v19.2d, v19.2d
; CHECK-NEXT: ucvtf v21.2d, v21.2d
; CHECK-NEXT: ucvtf v5.2d, v5.2d
; CHECK-NEXT: ucvtf v6.2d, v6.2d
; CHECK-NEXT: ucvtf v7.2d, v7.2d
; CHECK-NEXT: ucvtf v20.2d, v20.2d
; CHECK-NEXT: ucvtf v21.2d, v21.2d
; CHECK-NEXT: ucvtf v16.2d, v16.2d
; CHECK-NEXT: ucvtf v17.2d, v17.2d
; CHECK-NEXT: ucvtf v20.2d, v20.2d
; CHECK-NEXT: ucvtf v22.2d, v22.2d
; CHECK-NEXT: ucvtf v23.2d, v23.2d
; CHECK-NEXT: ucvtf v0.2d, v0.2d
; CHECK-NEXT: ucvtf v18.2d, v18.2d
; CHECK-NEXT: fadd v1.2d, v1.2d, v5.2d
; CHECK-NEXT: fadd v4.2d, v4.2d, v7.2d
; CHECK-NEXT: fadd v6.2d, v3.2d, v6.2d
; CHECK-NEXT: fadd v2.2d, v2.2d, v19.2d
; CHECK-NEXT: fadd v3.2d, v17.2d, v22.2d
; CHECK-NEXT: fadd v5.2d, v16.2d, v18.2d
; CHECK-NEXT: fadd v7.2d, v21.2d, v0.2d
; CHECK-NEXT: fadd v16.2d, v20.2d, v23.2d
; CHECK-NEXT: fadd v0.2d, v4.2d, v3.2d
; CHECK-NEXT: fadd v3.2d, v1.2d, v16.2d
; CHECK-NEXT: fadd v1.2d, v2.2d, v7.2d
; CHECK-NEXT: fadd v2.2d, v6.2d, v5.2d
; CHECK-NEXT: ucvtf v1.2d, v1.2d
; CHECK-NEXT: ucvtf v2.2d, v2.2d
; CHECK-NEXT: ucvtf v3.2d, v3.2d
; CHECK-NEXT: ucvtf v4.2d, v4.2d
; CHECK-NEXT: fadd v6.2d, v18.2d, v6.2d
; CHECK-NEXT: fadd v5.2d, v16.2d, v5.2d
; CHECK-NEXT: fadd v17.2d, v21.2d, v17.2d
; CHECK-NEXT: fadd v7.2d, v19.2d, v7.2d
; CHECK-NEXT: fadd v1.2d, v20.2d, v1.2d
; CHECK-NEXT: fadd v3.2d, v23.2d, v3.2d
; CHECK-NEXT: fadd v2.2d, v22.2d, v2.2d
; CHECK-NEXT: fadd v4.2d, v0.2d, v4.2d
; CHECK-NEXT: fadd v0.2d, v5.2d, v1.2d
; CHECK-NEXT: fadd v1.2d, v6.2d, v2.2d
; CHECK-NEXT: fadd v2.2d, v7.2d, v3.2d
; CHECK-NEXT: fadd v3.2d, v17.2d, v4.2d
; CHECK-NEXT: ret
%l = load <32 x i16>, ptr %p
%s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>