Skip to content

Commit 58c8696

Browse files
authored
[AArch64] Generate usdot instruction with multiple zext users in loop (#129718)
Currently, `partial_reduce(acc, mul(sext, zext))` is reduced to `usdot` in a loop only if `zext` has a single user, i.e. the `mul`. If there are two partial reduce equations in the loop body such as: ``` partial_reduce1(acc1,mul1(sext1, zext)) partial_reduce2(acc2,mul2(sext2, zext)) ``` and `zext` has no users other than `mul1`/`mul2`, then this won't result in `usdot` instructions. This patch checks whether all users of `zext`, as in the example above, satisfy the same set of conditions that is required for a single user, so that `usdot` instructions are generated.
1 parent 92a3073 commit 58c8696

File tree

2 files changed

+63
-52
lines changed

2 files changed

+63
-52
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16905,14 +16905,18 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
1690516905
// most one extra extend step is needed and using tbl is not profitable.
1690616906
// Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
1690716907
// udot instruction.
16908-
if (SrcWidth * 4 <= DstWidth && I->hasOneUser()) {
16909-
auto *SingleUser = cast<Instruction>(*I->user_begin());
16910-
if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))) ||
16911-
(match(SingleUser,
16912-
m_Intrinsic<Intrinsic::experimental_vector_partial_reduce_add>(
16913-
m_Value(), m_Specific(I))) &&
16914-
!shouldExpandPartialReductionIntrinsic(
16915-
cast<IntrinsicInst>(SingleUser))))
16908+
if (SrcWidth * 4 <= DstWidth) {
16909+
if (all_of(I->users(), [&](auto *U) {
16910+
auto *SingleUser = cast<Instruction>(&*U);
16911+
return (
16912+
match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))) ||
16913+
(match(SingleUser,
16914+
m_Intrinsic<
16915+
Intrinsic::experimental_vector_partial_reduce_add>(
16916+
m_Value(), m_Specific(I))) &&
16917+
!shouldExpandPartialReductionIntrinsic(
16918+
cast<IntrinsicInst>(SingleUser))));
16919+
}))
1691616920
return false;
1691716921
}
1691816922

llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll

Lines changed: 51 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -916,50 +916,57 @@ entry:
916916
}
917917

918918
define <4 x i32> @usdot_multiple_zext_users(ptr %p1, ptr %p2, ptr %p3) {
919-
; CHECK-LABEL: usdot_multiple_zext_users:
920-
; CHECK: // %bb.0: // %entry
921-
; CHECK-NEXT: adrp x8, .LCPI28_0
922-
; CHECK-NEXT: movi v0.2d, #0000000000000000
923-
; CHECK-NEXT: movi v2.2d, #0000000000000000
924-
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI28_0]
925-
; CHECK-NEXT: adrp x8, .LCPI28_1
926-
; CHECK-NEXT: adrp x9, .LCPI28_2
927-
; CHECK-NEXT: adrp x10, .LCPI28_3
928-
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI28_1]
929-
; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI28_2]
930-
; CHECK-NEXT: ldr q5, [x10, :lo12:.LCPI28_3]
931-
; CHECK-NEXT: mov x8, xzr
932-
; CHECK-NEXT: .LBB28_1: // %vector.body
933-
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
934-
; CHECK-NEXT: ldr q6, [x2, x8]
935-
; CHECK-NEXT: ldr q18, [x0, x8]
936-
; CHECK-NEXT: ldr q19, [x1, x8]
937-
; CHECK-NEXT: add x8, x8, #16
938-
; CHECK-NEXT: tbl v7.16b, { v6.16b }, v1.16b
939-
; CHECK-NEXT: tbl v16.16b, { v6.16b }, v3.16b
940-
; CHECK-NEXT: tbl v17.16b, { v6.16b }, v4.16b
941-
; CHECK-NEXT: tbl v6.16b, { v6.16b }, v5.16b
942-
; CHECK-NEXT: cmp x8, #1024
943-
; CHECK-NEXT: uzp1 v7.8h, v16.8h, v7.8h
944-
; CHECK-NEXT: sshll v16.8h, v18.8b, #0
945-
; CHECK-NEXT: uzp1 v6.8h, v6.8h, v17.8h
946-
; CHECK-NEXT: sshll2 v17.8h, v18.16b, #0
947-
; CHECK-NEXT: sshll v18.8h, v19.8b, #0
948-
; CHECK-NEXT: sshll2 v19.8h, v19.16b, #0
949-
; CHECK-NEXT: smlal v0.4s, v16.4h, v7.4h
950-
; CHECK-NEXT: smlal v2.4s, v18.4h, v7.4h
951-
; CHECK-NEXT: smull v20.4s, v17.4h, v6.4h
952-
; CHECK-NEXT: smull v21.4s, v19.4h, v6.4h
953-
; CHECK-NEXT: smlal2 v0.4s, v17.8h, v6.8h
954-
; CHECK-NEXT: smlal2 v2.4s, v19.8h, v6.8h
955-
; CHECK-NEXT: smlal2 v20.4s, v16.8h, v7.8h
956-
; CHECK-NEXT: smlal2 v21.4s, v18.8h, v7.8h
957-
; CHECK-NEXT: add v0.4s, v20.4s, v0.4s
958-
; CHECK-NEXT: add v2.4s, v21.4s, v2.4s
959-
; CHECK-NEXT: b.ne .LBB28_1
960-
; CHECK-NEXT: // %bb.2: // %end
961-
; CHECK-NEXT: add v0.4s, v2.4s, v0.4s
962-
; CHECK-NEXT: ret
919+
; CHECK-NOI8MM-LABEL: usdot_multiple_zext_users:
920+
; CHECK-NOI8MM: // %bb.0: // %entry
921+
; CHECK-NOI8MM-NEXT: movi v0.2d, #0000000000000000
922+
; CHECK-NOI8MM-NEXT: movi v1.2d, #0000000000000000
923+
; CHECK-NOI8MM-NEXT: mov x8, xzr
924+
; CHECK-NOI8MM-NEXT: .LBB28_1: // %vector.body
925+
; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1
926+
; CHECK-NOI8MM-NEXT: ldr q2, [x0, x8]
927+
; CHECK-NOI8MM-NEXT: ldr q3, [x2, x8]
928+
; CHECK-NOI8MM-NEXT: ldr q4, [x1, x8]
929+
; CHECK-NOI8MM-NEXT: add x8, x8, #16
930+
; CHECK-NOI8MM-NEXT: sshll v5.8h, v2.8b, #0
931+
; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
932+
; CHECK-NOI8MM-NEXT: ushll2 v6.8h, v3.16b, #0
933+
; CHECK-NOI8MM-NEXT: ushll v3.8h, v3.8b, #0
934+
; CHECK-NOI8MM-NEXT: sshll v7.8h, v4.8b, #0
935+
; CHECK-NOI8MM-NEXT: sshll2 v4.8h, v4.16b, #0
936+
; CHECK-NOI8MM-NEXT: cmp x8, #1024
937+
; CHECK-NOI8MM-NEXT: smull v16.4s, v2.4h, v6.4h
938+
; CHECK-NOI8MM-NEXT: smlal v0.4s, v5.4h, v3.4h
939+
; CHECK-NOI8MM-NEXT: smull v17.4s, v4.4h, v6.4h
940+
; CHECK-NOI8MM-NEXT: smlal v1.4s, v7.4h, v3.4h
941+
; CHECK-NOI8MM-NEXT: smlal2 v16.4s, v5.8h, v3.8h
942+
; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v6.8h
943+
; CHECK-NOI8MM-NEXT: smlal2 v17.4s, v7.8h, v3.8h
944+
; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v6.8h
945+
; CHECK-NOI8MM-NEXT: add v0.4s, v16.4s, v0.4s
946+
; CHECK-NOI8MM-NEXT: add v1.4s, v17.4s, v1.4s
947+
; CHECK-NOI8MM-NEXT: b.ne .LBB28_1
948+
; CHECK-NOI8MM-NEXT: // %bb.2: // %end
949+
; CHECK-NOI8MM-NEXT: add v0.4s, v1.4s, v0.4s
950+
; CHECK-NOI8MM-NEXT: ret
951+
;
952+
; CHECK-I8MM-LABEL: usdot_multiple_zext_users:
953+
; CHECK-I8MM: // %bb.0: // %entry
954+
; CHECK-I8MM-NEXT: movi v0.2d, #0000000000000000
955+
; CHECK-I8MM-NEXT: movi v1.2d, #0000000000000000
956+
; CHECK-I8MM-NEXT: mov x8, xzr
957+
; CHECK-I8MM-NEXT: .LBB28_1: // %vector.body
958+
; CHECK-I8MM-NEXT: // =>This Inner Loop Header: Depth=1
959+
; CHECK-I8MM-NEXT: ldr q2, [x0, x8]
960+
; CHECK-I8MM-NEXT: ldr q3, [x1, x8]
961+
; CHECK-I8MM-NEXT: ldr q4, [x2, x8]
962+
; CHECK-I8MM-NEXT: add x8, x8, #16
963+
; CHECK-I8MM-NEXT: usdot v0.4s, v4.16b, v2.16b
964+
; CHECK-I8MM-NEXT: usdot v1.4s, v4.16b, v3.16b
965+
; CHECK-I8MM-NEXT: cmp x8, #1024
966+
; CHECK-I8MM-NEXT: b.ne .LBB28_1
967+
; CHECK-I8MM-NEXT: // %bb.2: // %end
968+
; CHECK-I8MM-NEXT: add v0.4s, v1.4s, v0.4s
969+
; CHECK-I8MM-NEXT: ret
963970
entry:
964971
br label %vector.body
965972

0 commit comments

Comments
 (0)