Skip to content

Commit fb20017

Browse files
committed
[AArch64] Improve codegen for some fixed-width partial reductions
This patch teaches optimizeExtendOrTruncateConversion to bail out (leaving the zero-extend alone instead of lowering it with tbl) if the sole user of a zero-extend is a partial reduction intrinsic that we know will be lowered efficiently to a udot instruction.
1 parent e449634 commit fb20017

File tree

2 files changed

+42
-30
lines changed

2 files changed

+42
-30
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16866,9 +16866,14 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
1686616866
// mul(zext(i8), sext) can be transformed into smull(zext, sext) which
1686716867
// performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
1686816868
// most one extra extend step is needed and using tbl is not profitable.
16869+
// Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
16870+
// udot instruction.
1686916871
if (SrcWidth * 4 <= DstWidth && I->hasOneUser()) {
1687016872
auto *SingleUser = cast<Instruction>(*I->user_begin());
16871-
if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
16873+
if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))) ||
16874+
(isa<IntrinsicInst>(SingleUser) &&
16875+
!shouldExpandPartialReductionIntrinsic(
16876+
cast<IntrinsicInst>(SingleUser))))
1687216877
return false;
1687316878
}
1687416879

llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll

Lines changed: 36 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -574,35 +574,42 @@ define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
574574
}
575575

576576
define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){
577-
; CHECK-LABEL: udot_no_bin_op_in_loop:
578-
; CHECK: // %bb.0: // %entry
579-
; CHECK-NEXT: adrp x8, .LCPI16_0
580-
; CHECK-NEXT: movi v4.2d, #0000000000000000
581-
; CHECK-NEXT: adrp x9, .LCPI16_2
582-
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
583-
; CHECK-NEXT: adrp x8, .LCPI16_1
584-
; CHECK-NEXT: adrp x10, .LCPI16_3
585-
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1]
586-
; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI16_2]
587-
; CHECK-NEXT: ldr q5, [x10, :lo12:.LCPI16_3]
588-
; CHECK-NEXT: mov x8, xzr
589-
; CHECK-NEXT: .LBB16_1: // %vector.body
590-
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
591-
; CHECK-NEXT: ldr q6, [x0, x8]
592-
; CHECK-NEXT: mov v0.16b, v4.16b
593-
; CHECK-NEXT: add x8, x8, #16
594-
; CHECK-NEXT: cmp x8, #16
595-
; CHECK-NEXT: tbl v7.16b, { v6.16b }, v2.16b
596-
; CHECK-NEXT: tbl v4.16b, { v6.16b }, v1.16b
597-
; CHECK-NEXT: tbl v16.16b, { v6.16b }, v3.16b
598-
; CHECK-NEXT: tbl v6.16b, { v6.16b }, v5.16b
599-
; CHECK-NEXT: add v7.4s, v0.4s, v7.4s
600-
; CHECK-NEXT: add v6.4s, v6.4s, v16.4s
601-
; CHECK-NEXT: add v4.4s, v4.4s, v7.4s
602-
; CHECK-NEXT: add v4.4s, v6.4s, v4.4s
603-
; CHECK-NEXT: b.ne .LBB16_1
604-
; CHECK-NEXT: // %bb.2: // %end
605-
; CHECK-NEXT: ret
577+
; CHECK-DOT-LABEL: udot_no_bin_op_in_loop:
578+
; CHECK-DOT: // %bb.0: // %entry
579+
; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000
580+
; CHECK-DOT-NEXT: movi v2.16b, #1
581+
; CHECK-DOT-NEXT: mov x8, xzr
582+
; CHECK-DOT-NEXT: .LBB16_1: // %vector.body
583+
; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1
584+
; CHECK-DOT-NEXT: ldr q3, [x0, x8]
585+
; CHECK-DOT-NEXT: mov v0.16b, v1.16b
586+
; CHECK-DOT-NEXT: add x8, x8, #16
587+
; CHECK-DOT-NEXT: cmp x8, #16
588+
; CHECK-DOT-NEXT: udot v1.4s, v3.16b, v2.16b
589+
; CHECK-DOT-NEXT: b.ne .LBB16_1
590+
; CHECK-DOT-NEXT: // %bb.2: // %end
591+
; CHECK-DOT-NEXT: ret
592+
;
593+
; CHECK-NODOT-LABEL: udot_no_bin_op_in_loop:
594+
; CHECK-NODOT: // %bb.0: // %entry
595+
; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000
596+
; CHECK-NODOT-NEXT: mov x8, xzr
597+
; CHECK-NODOT-NEXT: .LBB16_1: // %vector.body
598+
; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1
599+
; CHECK-NODOT-NEXT: ldr q0, [x0, x8]
600+
; CHECK-NODOT-NEXT: add x8, x8, #16
601+
; CHECK-NODOT-NEXT: cmp x8, #16
602+
; CHECK-NODOT-NEXT: ushll v2.8h, v0.8b, #0
603+
; CHECK-NODOT-NEXT: ushll2 v3.8h, v0.16b, #0
604+
; CHECK-NODOT-NEXT: mov v0.16b, v1.16b
605+
; CHECK-NODOT-NEXT: ushll v1.4s, v3.4h, #0
606+
; CHECK-NODOT-NEXT: uaddw v4.4s, v0.4s, v2.4h
607+
; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v2.8h
608+
; CHECK-NODOT-NEXT: uaddw2 v2.4s, v4.4s, v3.8h
609+
; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s
610+
; CHECK-NODOT-NEXT: b.ne .LBB16_1
611+
; CHECK-NODOT-NEXT: // %bb.2: // %end
612+
; CHECK-NODOT-NEXT: ret
606613
entry:
607614
br label %vector.body
608615

0 commit comments

Comments
 (0)