Skip to content

Commit 4f0403f

Browse files
authored
[CodeGen][AArch64] Sink splat operands of FMul instructions (#116222)
Sink the shuffle operands of FMul instructions when they are splats, since lane-indexed variants of the multiply can then be generated for them.
1 parent 01a1ca7 commit 4f0403f

File tree

2 files changed

+159
-11
lines changed

2 files changed

+159
-11
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5239,6 +5239,22 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
52395239
// Is it profitable to sink if we found two of the same type of extends.
52405240
return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
52415241
}
5242+
case Instruction::FMul: {
5243+
// SVE lane-indexing is per 128-bit segment, so we can't fold splats.
5244+
if (I->getType()->isScalableTy())
5245+
return false;
5246+
5247+
if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
5248+
!ST->hasFullFP16())
5249+
return false;
5250+
5251+
// Sink splat operands so the lane-indexed variants can be used.
5252+
if (isSplatShuffle(I->getOperand(0)))
5253+
Ops.push_back(&I->getOperandUse(0));
5254+
if (isSplatShuffle(I->getOperand(1)))
5255+
Ops.push_back(&I->getOperandUse(1));
5256+
return !Ops.empty();
5257+
}
52425258
default:
52435259
return false;
52445260
}

llvm/test/CodeGen/AArch64/sinksplat.ll

Lines changed: 143 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -230,29 +230,34 @@ l2:
230230
ret <4 x i32> %c
231231
}
232232

233-
define <4 x float> @fmul(<4 x float> %x, ptr %y) {
233+
define <4 x float> @fmul(ptr %x, ptr %y) {
234234
; CHECK-LABEL: fmul:
235235
; CHECK: // %bb.0: // %entry
236-
; CHECK-NEXT: mov v1.16b, v0.16b
237-
; CHECK-NEXT: ldr q2, [x0]
238236
; CHECK-NEXT: movi v0.2d, #0000000000000000
239-
; CHECK-NEXT: mov w8, #1 // =0x1
240-
; CHECK-NEXT: fmul v1.4s, v2.4s, v1.s[3]
237+
; CHECK-NEXT: ldr s1, [x0]
238+
; CHECK-NEXT: mov x8, xzr
241239
; CHECK-NEXT: .LBB7_1: // %l1
242240
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
243-
; CHECK-NEXT: fadd v0.4s, v1.4s, v0.4s
244-
; CHECK-NEXT: subs w8, w8, #1
241+
; CHECK-NEXT: ldr q2, [x1, x8]
242+
; CHECK-NEXT: add x8, x8, #16
243+
; CHECK-NEXT: cmp w8, #16
244+
; CHECK-NEXT: fmul v2.4s, v2.4s, v1.s[0]
245+
; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s
245246
; CHECK-NEXT: b.eq .LBB7_1
246247
; CHECK-NEXT: // %bb.2: // %l2
247248
; CHECK-NEXT: ret
248249
entry:
249-
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
250+
%x.val = load float, ptr %x
251+
%x.ins = insertelement <4 x float> poison, float %x.val, i64 0
252+
%a = shufflevector <4 x float> %x.ins, <4 x float> undef, <4 x i32> zeroinitializer
250253
br label %l1
251254

252255
l1:
253256
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
254257
%q = phi <4 x float> [ zeroinitializer, %entry ], [ %c, %l1 ]
255-
%l = load <4 x float>, ptr %y
258+
%idx.y = mul nuw nsw i32 %p, 4
259+
%ptr.y = getelementptr float, ptr %y, i32 %idx.y
260+
%l = load <4 x float>, ptr %ptr.y
256261
%b = fmul <4 x float> %l, %a
257262
%c = fadd <4 x float> %b, %q
258263
%pa = add i32 %p, 1
@@ -270,10 +275,9 @@ define <4 x float> @fmuladd(<4 x float> %x, ptr %y) {
270275
; CHECK-NEXT: movi v0.2d, #0000000000000000
271276
; CHECK-NEXT: ldr q2, [x0]
272277
; CHECK-NEXT: mov w8, #1 // =0x1
273-
; CHECK-NEXT: dup v1.4s, v1.s[3]
274278
; CHECK-NEXT: .LBB8_1: // %l1
275279
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
276-
; CHECK-NEXT: fmla v0.4s, v1.4s, v2.4s
280+
; CHECK-NEXT: fmla v0.4s, v2.4s, v1.s[3]
277281
; CHECK-NEXT: subs w8, w8, #1
278282
; CHECK-NEXT: b.eq .LBB8_1
279283
; CHECK-NEXT: // %bb.2: // %l2
@@ -418,6 +422,134 @@ l2:
418422
ret <4 x i32> %r
419423
}
420424

425+
; We shouldn't sink without fullfp16.
426+
define <4 x half> @fmul_half(ptr %x, ptr %y) {
427+
; CHECK-LABEL: fmul_half:
428+
; CHECK: // %bb.0: // %entry
429+
; CHECK-NEXT: ld1r { v1.4h }, [x0]
430+
; CHECK-NEXT: movi d0, #0000000000000000
431+
; CHECK-NEXT: mov x8, xzr
432+
; CHECK-NEXT: fcvtl v1.4s, v1.4h
433+
; CHECK-NEXT: .LBB13_1: // %l1
434+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
435+
; CHECK-NEXT: ldr d2, [x1, x8]
436+
; CHECK-NEXT: fcvtl v0.4s, v0.4h
437+
; CHECK-NEXT: add x8, x8, #8
438+
; CHECK-NEXT: cmp w8, #8
439+
; CHECK-NEXT: fcvtl v2.4s, v2.4h
440+
; CHECK-NEXT: fmul v2.4s, v2.4s, v1.4s
441+
; CHECK-NEXT: fcvtn v2.4h, v2.4s
442+
; CHECK-NEXT: fcvtl v2.4s, v2.4h
443+
; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s
444+
; CHECK-NEXT: fcvtn v0.4h, v0.4s
445+
; CHECK-NEXT: b.eq .LBB13_1
446+
; CHECK-NEXT: // %bb.2: // %l2
447+
; CHECK-NEXT: ret
448+
entry:
449+
%x.val = load half, ptr %x
450+
%x.ins = insertelement <4 x half> poison, half %x.val, i64 0
451+
%a = shufflevector <4 x half> %x.ins, <4 x half> undef, <4 x i32> zeroinitializer
452+
br label %l1
453+
454+
l1:
455+
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
456+
%q = phi <4 x half> [ zeroinitializer, %entry ], [ %c, %l1 ]
457+
%idx.y = mul nuw nsw i32 %p, 4
458+
%ptr.y = getelementptr half, ptr %y, i32 %idx.y
459+
%l = load <4 x half>, ptr %ptr.y
460+
%b = fmul <4 x half> %l, %a
461+
%c = fadd <4 x half> %b, %q
462+
%pa = add i32 %p, 1
463+
%c1 = icmp eq i32 %p, 0
464+
br i1 %c1, label %l1, label %l2
465+
466+
l2:
467+
ret <4 x half> %c
468+
}
469+
470+
define <4 x half> @fmul_half_fullfp16(ptr %x, ptr %y) "target-features"="+fullfp16" {
471+
; CHECK-LABEL: fmul_half_fullfp16:
472+
; CHECK: // %bb.0: // %entry
473+
; CHECK-NEXT: movi d0, #0000000000000000
474+
; CHECK-NEXT: ldr h1, [x0]
475+
; CHECK-NEXT: mov x8, xzr
476+
; CHECK-NEXT: .LBB14_1: // %l1
477+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
478+
; CHECK-NEXT: ldr d2, [x1, x8]
479+
; CHECK-NEXT: add x8, x8, #8
480+
; CHECK-NEXT: cmp w8, #8
481+
; CHECK-NEXT: fmul v2.4h, v2.4h, v1.h[0]
482+
; CHECK-NEXT: fadd v0.4h, v2.4h, v0.4h
483+
; CHECK-NEXT: b.eq .LBB14_1
484+
; CHECK-NEXT: // %bb.2: // %l2
485+
; CHECK-NEXT: ret
486+
entry:
487+
%x.val = load half, ptr %x
488+
%x.ins = insertelement <4 x half> poison, half %x.val, i64 0
489+
%a = shufflevector <4 x half> %x.ins, <4 x half> undef, <4 x i32> zeroinitializer
490+
br label %l1
491+
492+
l1:
493+
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
494+
%q = phi <4 x half> [ zeroinitializer, %entry ], [ %c, %l1 ]
495+
%idx.y = mul nuw nsw i32 %p, 4
496+
%ptr.y = getelementptr half, ptr %y, i32 %idx.y
497+
%l = load <4 x half>, ptr %ptr.y
498+
%b = fmul <4 x half> %l, %a
499+
%c = fadd <4 x half> %b, %q
500+
%pa = add i32 %p, 1
501+
%c1 = icmp eq i32 %p, 0
502+
br i1 %c1, label %l1, label %l2
503+
504+
l2:
505+
ret <4 x half> %c
506+
}
507+
508+
; We shouldn't sink the splat operand for scalable vectors.
509+
define <vscale x 4 x float> @fmul_scalable(ptr %x, ptr %y) "target-features"="+sve" {
510+
; CHECK-LABEL: fmul_scalable:
511+
; CHECK: // %bb.0: // %entry
512+
; CHECK-NEXT: ptrue p0.s
513+
; CHECK-NEXT: rdvl x8, #1
514+
; CHECK-NEXT: mov z0.s, #0 // =0x0
515+
; CHECK-NEXT: sxtw x8, w8
516+
; CHECK-NEXT: mov w9, #1 // =0x1
517+
; CHECK-NEXT: ld1rw { z1.s }, p0/z, [x0]
518+
; CHECK-NEXT: lsl x8, x8, #2
519+
; CHECK-NEXT: .LBB15_1: // %l1
520+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
521+
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1]
522+
; CHECK-NEXT: subs w9, w9, #1
523+
; CHECK-NEXT: add x1, x1, x8
524+
; CHECK-NEXT: fmul z2.s, z2.s, z1.s
525+
; CHECK-NEXT: fadd z0.s, z2.s, z0.s
526+
; CHECK-NEXT: b.eq .LBB15_1
527+
; CHECK-NEXT: // %bb.2: // %l2
528+
; CHECK-NEXT: ret
529+
entry:
530+
%x.val = load float, ptr %x
531+
%x.ins = insertelement <vscale x 4 x float> poison, float %x.val, i64 0
532+
%a = shufflevector <vscale x 4 x float> %x.ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
533+
%33 = tail call i32 @llvm.vscale.i32()
534+
%34 = shl nuw nsw i32 %33, 4
535+
br label %l1
536+
537+
l1:
538+
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
539+
%q = phi <vscale x 4 x float> [ zeroinitializer, %entry ], [ %c, %l1 ]
540+
%idx.y = mul nuw nsw i32 %p, %34
541+
%ptr.y = getelementptr float, ptr %y, i32 %idx.y
542+
%l = load <vscale x 4 x float>, ptr %ptr.y
543+
%b = fmul <vscale x 4 x float> %l, %a
544+
%c = fadd <vscale x 4 x float> %b, %q
545+
%pa = add i32 %p, 1
546+
%c1 = icmp eq i32 %p, 0
547+
br i1 %c1, label %l1, label %l2
548+
549+
l2:
550+
ret <vscale x 4 x float> %c
551+
}
552+
421553

422554
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
423555
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)

0 commit comments

Comments
 (0)