Skip to content

Commit 307713a

Browse files
authored
[AArch64] Do not generate uitofp(ld4) where and/shift can be used. (llvm#107538)
After llvm#107201 and llvm#107367, the codegen for zext(ld4) can use and / shift to extract the lanes out of the original vector's elements. This avoids the need for the expensive ld4 operations, so it can lead to performance improvements over using the interleaving loads and ushll. This patch stops the generation of ld4 for uitofp(ld4) that would become uitofp(zext(ld4)). It doesn't handle zext yet, to make sure that widening instructions like mull and addl are not adversely affected.
1 parent 3ba0755 commit 307713a

File tree

2 files changed

+58
-46
lines changed

2 files changed

+58
-46
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17090,6 +17090,16 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
1709017090
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
1709117091
return false;
1709217092

17093+
// Check if the interleave is a zext(shuffle) that can be better optimized
17094+
// into shift / and masks. For the moment we do this just for uitofp (not
17095+
// zext) to avoid issues with widening instructions.
17096+
if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
17097+
return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
17098+
SI->getType()->getScalarSizeInBits() * 4 ==
17099+
SI->user_back()->getType()->getScalarSizeInBits();
17100+
}))
17101+
return false;
17102+
1709317103
unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
1709417104

1709517105
auto *FVTy = cast<FixedVectorType>(VTy);

llvm/test/CodeGen/AArch64/zext-shuffle.ll

Lines changed: 48 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -472,59 +472,61 @@ define <8 x double> @uitofp_fadd(<32 x i16> %l) {
472472
define <8 x double> @uitofp_load_fadd(ptr %p) {
473473
; CHECK-LABEL: uitofp_load_fadd:
474474
; CHECK: // %bb.0:
475-
; CHECK-NEXT: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
476-
; CHECK-NEXT: ushll2 v4.4s, v0.8h, #0
477-
; CHECK-NEXT: ushll v5.4s, v0.4h, #0
478-
; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0
479-
; CHECK-NEXT: ushll v7.4s, v1.4h, #0
480-
; CHECK-NEXT: ushll2 v16.4s, v2.8h, #0
481-
; CHECK-NEXT: ushll v17.4s, v2.4h, #0
482-
; CHECK-NEXT: ushll2 v18.4s, v3.8h, #0
483-
; CHECK-NEXT: ushll v0.4s, v3.4h, #0
484-
; CHECK-NEXT: ushll2 v1.2d, v4.4s, #0
485-
; CHECK-NEXT: ushll2 v2.2d, v5.4s, #0
486-
; CHECK-NEXT: ushll v3.2d, v4.2s, #0
487-
; CHECK-NEXT: ushll v4.2d, v5.2s, #0
488-
; CHECK-NEXT: ushll2 v5.2d, v6.4s, #0
489-
; CHECK-NEXT: ushll2 v19.2d, v7.4s, #0
490-
; CHECK-NEXT: ushll v6.2d, v6.2s, #0
491-
; CHECK-NEXT: ushll v7.2d, v7.2s, #0
492-
; CHECK-NEXT: ushll2 v20.2d, v16.4s, #0
493-
; CHECK-NEXT: ushll2 v21.2d, v17.4s, #0
494-
; CHECK-NEXT: ushll v16.2d, v16.2s, #0
495-
; CHECK-NEXT: ushll v17.2d, v17.2s, #0
496-
; CHECK-NEXT: ushll v22.2d, v0.2s, #0
497-
; CHECK-NEXT: ushll2 v23.2d, v18.4s, #0
498-
; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0
499-
; CHECK-NEXT: ushll v18.2d, v18.2s, #0
500-
; CHECK-NEXT: ucvtf v1.2d, v1.2d
501-
; CHECK-NEXT: ucvtf v2.2d, v2.2d
502-
; CHECK-NEXT: ucvtf v3.2d, v3.2d
503-
; CHECK-NEXT: ucvtf v4.2d, v4.2d
504-
; CHECK-NEXT: ucvtf v5.2d, v5.2d
475+
; CHECK-NEXT: ldp q1, q2, [x0]
476+
; CHECK-NEXT: movi v0.2d, #0x0000000000ffff
477+
; CHECK-NEXT: ldp q3, q4, [x0, #32]
478+
; CHECK-NEXT: ushr v5.2d, v1.2d, #16
479+
; CHECK-NEXT: ushr v6.2d, v2.2d, #16
480+
; CHECK-NEXT: ushr v20.2d, v1.2d, #32
481+
; CHECK-NEXT: ushr v7.2d, v3.2d, #16
482+
; CHECK-NEXT: ushr v17.2d, v4.2d, #16
483+
; CHECK-NEXT: ushr v22.2d, v2.2d, #32
484+
; CHECK-NEXT: ushr v23.2d, v3.2d, #32
485+
; CHECK-NEXT: ushr v24.2d, v4.2d, #32
486+
; CHECK-NEXT: and v16.16b, v1.16b, v0.16b
487+
; CHECK-NEXT: and v18.16b, v2.16b, v0.16b
488+
; CHECK-NEXT: and v19.16b, v3.16b, v0.16b
489+
; CHECK-NEXT: and v21.16b, v4.16b, v0.16b
490+
; CHECK-NEXT: and v5.16b, v5.16b, v0.16b
491+
; CHECK-NEXT: and v6.16b, v6.16b, v0.16b
492+
; CHECK-NEXT: and v7.16b, v7.16b, v0.16b
493+
; CHECK-NEXT: and v17.16b, v17.16b, v0.16b
494+
; CHECK-NEXT: and v20.16b, v20.16b, v0.16b
495+
; CHECK-NEXT: and v22.16b, v22.16b, v0.16b
496+
; CHECK-NEXT: and v23.16b, v23.16b, v0.16b
497+
; CHECK-NEXT: and v0.16b, v24.16b, v0.16b
498+
; CHECK-NEXT: ushr v1.2d, v1.2d, #48
499+
; CHECK-NEXT: ushr v2.2d, v2.2d, #48
500+
; CHECK-NEXT: ushr v3.2d, v3.2d, #48
501+
; CHECK-NEXT: ushr v4.2d, v4.2d, #48
502+
; CHECK-NEXT: ucvtf v16.2d, v16.2d
503+
; CHECK-NEXT: ucvtf v18.2d, v18.2d
505504
; CHECK-NEXT: ucvtf v19.2d, v19.2d
505+
; CHECK-NEXT: ucvtf v21.2d, v21.2d
506+
; CHECK-NEXT: ucvtf v5.2d, v5.2d
506507
; CHECK-NEXT: ucvtf v6.2d, v6.2d
507508
; CHECK-NEXT: ucvtf v7.2d, v7.2d
508-
; CHECK-NEXT: ucvtf v20.2d, v20.2d
509-
; CHECK-NEXT: ucvtf v21.2d, v21.2d
510-
; CHECK-NEXT: ucvtf v16.2d, v16.2d
511509
; CHECK-NEXT: ucvtf v17.2d, v17.2d
510+
; CHECK-NEXT: ucvtf v20.2d, v20.2d
512511
; CHECK-NEXT: ucvtf v22.2d, v22.2d
513512
; CHECK-NEXT: ucvtf v23.2d, v23.2d
514513
; CHECK-NEXT: ucvtf v0.2d, v0.2d
515-
; CHECK-NEXT: ucvtf v18.2d, v18.2d
516-
; CHECK-NEXT: fadd v1.2d, v1.2d, v5.2d
517-
; CHECK-NEXT: fadd v4.2d, v4.2d, v7.2d
518-
; CHECK-NEXT: fadd v6.2d, v3.2d, v6.2d
519-
; CHECK-NEXT: fadd v2.2d, v2.2d, v19.2d
520-
; CHECK-NEXT: fadd v3.2d, v17.2d, v22.2d
521-
; CHECK-NEXT: fadd v5.2d, v16.2d, v18.2d
522-
; CHECK-NEXT: fadd v7.2d, v21.2d, v0.2d
523-
; CHECK-NEXT: fadd v16.2d, v20.2d, v23.2d
524-
; CHECK-NEXT: fadd v0.2d, v4.2d, v3.2d
525-
; CHECK-NEXT: fadd v3.2d, v1.2d, v16.2d
526-
; CHECK-NEXT: fadd v1.2d, v2.2d, v7.2d
527-
; CHECK-NEXT: fadd v2.2d, v6.2d, v5.2d
514+
; CHECK-NEXT: ucvtf v1.2d, v1.2d
515+
; CHECK-NEXT: ucvtf v2.2d, v2.2d
516+
; CHECK-NEXT: ucvtf v3.2d, v3.2d
517+
; CHECK-NEXT: ucvtf v4.2d, v4.2d
518+
; CHECK-NEXT: fadd v6.2d, v18.2d, v6.2d
519+
; CHECK-NEXT: fadd v5.2d, v16.2d, v5.2d
520+
; CHECK-NEXT: fadd v17.2d, v21.2d, v17.2d
521+
; CHECK-NEXT: fadd v7.2d, v19.2d, v7.2d
522+
; CHECK-NEXT: fadd v1.2d, v20.2d, v1.2d
523+
; CHECK-NEXT: fadd v3.2d, v23.2d, v3.2d
524+
; CHECK-NEXT: fadd v2.2d, v22.2d, v2.2d
525+
; CHECK-NEXT: fadd v4.2d, v0.2d, v4.2d
526+
; CHECK-NEXT: fadd v0.2d, v5.2d, v1.2d
527+
; CHECK-NEXT: fadd v1.2d, v6.2d, v2.2d
528+
; CHECK-NEXT: fadd v2.2d, v7.2d, v3.2d
529+
; CHECK-NEXT: fadd v3.2d, v17.2d, v4.2d
528530
; CHECK-NEXT: ret
529531
%l = load <32 x i16>, ptr %p
530532
%s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>

0 commit comments

Comments
 (0)