[AArch64] Improve operand sinking for mul instructions #116604
Conversation
- Sink splat operands to mul instructions for types where we can use the lane-indexed variants.
- When sinking operands for [su]mull, also sink the ext instruction.
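To make the first bullet concrete, here is a minimal LLVM IR sketch (a hypothetical reduction, not one of the in-tree tests; the function and value names are invented): the splat is loop-invariant, so today it is materialized as a dup in the preheader. With this change, isProfitableToSinkOperands reports the shufflevector as sinkable, CodeGenPrepare copies it into the loop block, and instruction selection can then fold it into a lane-indexed multiply such as mul v1.4s, v1.4s, v0.s[0].

; Hypothetical reduction: %splat is defined outside the loop. Sinking the
; splat next to the mul lets ISel select the lane-indexed MUL (by element)
; instead of keeping a live full-width dup across the loop.
define void @mul_by_splat_loop(ptr %p, i32 %s, i32 %n) {
entry:
  %ins = insertelement <4 x i32> poison, i32 %s, i32 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %v = load <4 x i32>, ptr %p
  %m = mul <4 x i32> %v, %splat        ; v4i32 has a lane-indexed mul variant
  store <4 x i32> %m, ptr %p
  %i.next = add i32 %i, 1
  %c = icmp slt i32 %i.next, %n
  br i1 %c, label %loop, label %exit

exit:
  ret void
}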
@llvm/pr-subscribers-backend-aarch64
Author: Hari Limaye (hazzlim)
Changes

Patch is 26.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116604.diff

5 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a97b0d3b1db92a..615c2854852824 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5168,26 +5168,41 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
return false;
}
case Instruction::Mul: {
+ auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
+ auto VT = MVT::getVT(V->getType(), /*HandleUnknown=*/true);
+ return (VT == MVT::v4i16 || VT == MVT::v8i16 || VT == MVT::v2i32 ||
+ VT == MVT::v4i32);
+ };
+
int NumZExts = 0, NumSExts = 0;
for (auto &Op : I->operands()) {
// Make sure we are not already sinking this operand
if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
continue;
- if (match(&Op, m_SExt(m_Value()))) {
- NumSExts++;
- continue;
- } else if (match(&Op, m_ZExt(m_Value()))) {
- NumZExts++;
+ if (match(&Op, m_ZExtOrSExt(m_Value()))) {
+ auto *Ext = cast<Instruction>(Op);
+ auto *ExtOp = Ext->getOperand(0);
+ if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
+ Ops.push_back(&Ext->getOperandUse(0));
+ Ops.push_back(&Op);
+
+ if (isa<SExtInst>(Ext))
+ NumSExts++;
+ else
+ NumZExts++;
+
continue;
}
ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
+ if (!Shuffle)
+ continue;
// If the Shuffle is a splat and the operand is a zext/sext, sinking the
// operand and the s/zext can help create indexed s/umull. This is
// especially useful to prevent i64 mul being scalarized.
- if (Shuffle && isSplatShuffle(Shuffle) &&
+ if (isSplatShuffle(Shuffle) &&
match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
Ops.push_back(&Shuffle->getOperandUse(0));
Ops.push_back(&Op);
@@ -5198,9 +5213,6 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
continue;
}
- if (!Shuffle)
- continue;
-
Value *ShuffleOperand = Shuffle->getOperand(0);
InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
if (!Insert)
@@ -5232,12 +5244,26 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
NumZExts++;
}
+ Ops.push_back(&Insert->getOperandUse(1));
Ops.push_back(&Shuffle->getOperandUse(0));
Ops.push_back(&Op);
}
- // Is it profitable to sink if we found two of the same type of extends.
- return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
+ // It is profitable to sink if we found two of the same type of extends.
+ if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
+ return true;
+
+ // Otherwise, see if we should sink splats for indexed variants.
+ if (!ShouldSinkSplatForIndexedVariant(I))
+ return false;
+
+ Ops.clear();
+ if (isSplatShuffle(I->getOperand(0)))
+ Ops.push_back(&I->getOperandUse(0));
+ if (isSplatShuffle(I->getOperand(1)))
+ Ops.push_back(&I->getOperandUse(1));
+
+ return !Ops.empty();
}
default:
return false;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll
index ef54cc4bbf7180..482135b721da49 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll
@@ -10,14 +10,18 @@ target triple = "aarch64-unknown-linux-gnu"
define dso_local i32 @dupext_crashtest(i32 %e) local_unnamed_addr {
; CHECK-LABEL: dupext_crashtest:
; CHECK: // %bb.0: // %for.body.lr.ph
-; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: dup v0.2s, w8
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr d1, [x8]
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: str d1, [x8]
+; CHECK-NEXT: ldr d0, [x8]
+; CHECK-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: mul w9, w0, w9
+; CHECK-NEXT: mul w8, w0, w8
+; CHECK-NEXT: fmov d0, x9
+; CHECK-NEXT: mov v0.d[1], x8
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: str d0, [x8]
; CHECK-NEXT: b .LBB0_1
for.body.lr.ph:
%conv314 = zext i32 %e to i64
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 0c7a61739695fb..3432b15abfbf22 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -5,9 +5,8 @@
define void @matrix_mul_unsigned(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) {
; CHECK-SD-LABEL: matrix_mul_unsigned:
; CHECK-SD: // %bb.0: // %vector.header
-; CHECK-SD-NEXT: and w8, w3, #0xffff
+; CHECK-SD-NEXT: dup v0.4h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT: dup v0.4h, w8
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
; CHECK-SD-NEXT: .LBB0_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
@@ -91,9 +90,8 @@ for.end12: ; preds = %vector.body
define void @matrix_mul_signed(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) {
; CHECK-SD-LABEL: matrix_mul_signed:
; CHECK-SD: // %bb.0: // %vector.header
-; CHECK-SD-NEXT: sxth w8, w3
+; CHECK-SD-NEXT: dup v0.4h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT: dup v0.4h, w8
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
; CHECK-SD-NEXT: .LBB1_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
@@ -179,9 +177,8 @@ for.end12: ; preds = %vector.body
define void @matrix_mul_double_shuffle(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) {
; CHECK-SD-LABEL: matrix_mul_double_shuffle:
; CHECK-SD: // %bb.0: // %vector.header
-; CHECK-SD-NEXT: and w8, w3, #0xffff
+; CHECK-SD-NEXT: dup v0.4h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT: dup v0.4h, w8
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 killed $x0 def $x0
; CHECK-SD-NEXT: .LBB2_1: // %vector.body
@@ -261,44 +258,44 @@ define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
; CHECK-SD-NEXT: cmp w3, #1
; CHECK-SD-NEXT: b.lt .LBB3_8
; CHECK-SD-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-SD-NEXT: sxth w8, w1
; CHECK-SD-NEXT: cmp w3, #15
-; CHECK-SD-NEXT: mov w9, w3
+; CHECK-SD-NEXT: mov w8, w3
; CHECK-SD-NEXT: b.hi .LBB3_3
; CHECK-SD-NEXT: // %bb.2:
-; CHECK-SD-NEXT: mov x10, xzr
+; CHECK-SD-NEXT: mov x9, xzr
; CHECK-SD-NEXT: b .LBB3_6
; CHECK-SD-NEXT: .LBB3_3: // %vector.ph
-; CHECK-SD-NEXT: dup v0.8h, w8
-; CHECK-SD-NEXT: and x10, x9, #0xfffffff0
-; CHECK-SD-NEXT: add x11, x2, #32
-; CHECK-SD-NEXT: add x12, x0, #16
-; CHECK-SD-NEXT: mov x13, x10
+; CHECK-SD-NEXT: dup v0.8h, w1
+; CHECK-SD-NEXT: and x9, x8, #0xfffffff0
+; CHECK-SD-NEXT: add x10, x2, #32
+; CHECK-SD-NEXT: add x11, x0, #16
+; CHECK-SD-NEXT: mov x12, x9
; CHECK-SD-NEXT: .LBB3_4: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-SD-NEXT: ldp q1, q2, [x12, #-16]
-; CHECK-SD-NEXT: subs x13, x13, #16
-; CHECK-SD-NEXT: add x12, x12, #32
+; CHECK-SD-NEXT: ldp q1, q2, [x11, #-16]
+; CHECK-SD-NEXT: subs x12, x12, #16
+; CHECK-SD-NEXT: add x11, x11, #32
; CHECK-SD-NEXT: smull2 v3.4s, v0.8h, v1.8h
; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: smull2 v4.4s, v0.8h, v2.8h
; CHECK-SD-NEXT: smull v2.4s, v0.4h, v2.4h
-; CHECK-SD-NEXT: stp q1, q3, [x11, #-32]
-; CHECK-SD-NEXT: stp q2, q4, [x11], #64
+; CHECK-SD-NEXT: stp q1, q3, [x10, #-32]
+; CHECK-SD-NEXT: stp q2, q4, [x10], #64
; CHECK-SD-NEXT: b.ne .LBB3_4
; CHECK-SD-NEXT: // %bb.5: // %middle.block
-; CHECK-SD-NEXT: cmp x10, x9
+; CHECK-SD-NEXT: cmp x9, x8
; CHECK-SD-NEXT: b.eq .LBB3_8
; CHECK-SD-NEXT: .LBB3_6: // %for.body.preheader1
-; CHECK-SD-NEXT: add x11, x2, x10, lsl #2
-; CHECK-SD-NEXT: add x12, x0, x10, lsl #1
-; CHECK-SD-NEXT: sub x9, x9, x10
+; CHECK-SD-NEXT: sxth w10, w1
+; CHECK-SD-NEXT: add x11, x2, x9, lsl #2
+; CHECK-SD-NEXT: add x12, x0, x9, lsl #1
+; CHECK-SD-NEXT: sub x8, x8, x9
; CHECK-SD-NEXT: .LBB3_7: // %for.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-SD-NEXT: ldrsh w10, [x12], #2
-; CHECK-SD-NEXT: subs x9, x9, #1
-; CHECK-SD-NEXT: mul w10, w10, w8
-; CHECK-SD-NEXT: str w10, [x11], #4
+; CHECK-SD-NEXT: ldrsh w9, [x12], #2
+; CHECK-SD-NEXT: subs x8, x8, #1
+; CHECK-SD-NEXT: mul w9, w9, w10
+; CHECK-SD-NEXT: str w9, [x11], #4
; CHECK-SD-NEXT: b.ne .LBB3_7
; CHECK-SD-NEXT: .LBB3_8: // %for.cond.cleanup
; CHECK-SD-NEXT: ret
@@ -424,43 +421,43 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
; CHECK-SD-NEXT: b.lt .LBB4_8
; CHECK-SD-NEXT: // %bb.1: // %for.body.preheader
; CHECK-SD-NEXT: cmp w3, #15
-; CHECK-SD-NEXT: and w8, w1, #0xffff
-; CHECK-SD-NEXT: mov w9, w3
+; CHECK-SD-NEXT: mov w8, w3
; CHECK-SD-NEXT: b.hi .LBB4_3
; CHECK-SD-NEXT: // %bb.2:
-; CHECK-SD-NEXT: mov x10, xzr
+; CHECK-SD-NEXT: mov x9, xzr
; CHECK-SD-NEXT: b .LBB4_6
; CHECK-SD-NEXT: .LBB4_3: // %vector.ph
-; CHECK-SD-NEXT: dup v0.8h, w8
-; CHECK-SD-NEXT: and x10, x9, #0xfffffff0
-; CHECK-SD-NEXT: add x11, x2, #32
-; CHECK-SD-NEXT: add x12, x0, #16
-; CHECK-SD-NEXT: mov x13, x10
+; CHECK-SD-NEXT: dup v0.8h, w1
+; CHECK-SD-NEXT: and x9, x8, #0xfffffff0
+; CHECK-SD-NEXT: add x10, x2, #32
+; CHECK-SD-NEXT: add x11, x0, #16
+; CHECK-SD-NEXT: mov x12, x9
; CHECK-SD-NEXT: .LBB4_4: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-SD-NEXT: ldp q1, q2, [x12, #-16]
-; CHECK-SD-NEXT: subs x13, x13, #16
-; CHECK-SD-NEXT: add x12, x12, #32
+; CHECK-SD-NEXT: ldp q1, q2, [x11, #-16]
+; CHECK-SD-NEXT: subs x12, x12, #16
+; CHECK-SD-NEXT: add x11, x11, #32
; CHECK-SD-NEXT: umull2 v3.4s, v0.8h, v1.8h
; CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: umull2 v4.4s, v0.8h, v2.8h
; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h
-; CHECK-SD-NEXT: stp q1, q3, [x11, #-32]
-; CHECK-SD-NEXT: stp q2, q4, [x11], #64
+; CHECK-SD-NEXT: stp q1, q3, [x10, #-32]
+; CHECK-SD-NEXT: stp q2, q4, [x10], #64
; CHECK-SD-NEXT: b.ne .LBB4_4
; CHECK-SD-NEXT: // %bb.5: // %middle.block
-; CHECK-SD-NEXT: cmp x10, x9
+; CHECK-SD-NEXT: cmp x9, x8
; CHECK-SD-NEXT: b.eq .LBB4_8
; CHECK-SD-NEXT: .LBB4_6: // %for.body.preheader1
-; CHECK-SD-NEXT: add x11, x2, x10, lsl #2
-; CHECK-SD-NEXT: add x12, x0, x10, lsl #1
-; CHECK-SD-NEXT: sub x9, x9, x10
+; CHECK-SD-NEXT: add x10, x2, x9, lsl #2
+; CHECK-SD-NEXT: add x11, x0, x9, lsl #1
+; CHECK-SD-NEXT: and w12, w1, #0xffff
+; CHECK-SD-NEXT: sub x8, x8, x9
; CHECK-SD-NEXT: .LBB4_7: // %for.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-SD-NEXT: ldrh w10, [x12], #2
-; CHECK-SD-NEXT: subs x9, x9, #1
-; CHECK-SD-NEXT: mul w10, w10, w8
-; CHECK-SD-NEXT: str w10, [x11], #4
+; CHECK-SD-NEXT: ldrh w9, [x11], #2
+; CHECK-SD-NEXT: subs x8, x8, #1
+; CHECK-SD-NEXT: mul w9, w9, w12
+; CHECK-SD-NEXT: str w9, [x10], #4
; CHECK-SD-NEXT: b.ne .LBB4_7
; CHECK-SD-NEXT: .LBB4_8: // %for.cond.cleanup
; CHECK-SD-NEXT: ret
@@ -470,47 +467,48 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
; CHECK-GI-NEXT: cmp w3, #0
; CHECK-GI-NEXT: b.le .LBB4_7
; CHECK-GI-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-GI-NEXT: mov x9, xzr
+; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: cmp w3, #16
-; CHECK-GI-NEXT: and w8, w1, #0xffff
-; CHECK-GI-NEXT: mov w10, w3
+; CHECK-GI-NEXT: mov w9, w3
; CHECK-GI-NEXT: b.lo .LBB4_5
; CHECK-GI-NEXT: // %bb.2: // %vector.ph
-; CHECK-GI-NEXT: dup v0.4s, w8
-; CHECK-GI-NEXT: and x9, x10, #0xfffffff0
-; CHECK-GI-NEXT: add x11, x2, #32
-; CHECK-GI-NEXT: add x12, x0, #16
-; CHECK-GI-NEXT: mov x13, x9
+; CHECK-GI-NEXT: and x8, x9, #0xfffffff0
+; CHECK-GI-NEXT: add x10, x2, #32
+; CHECK-GI-NEXT: add x11, x0, #16
+; CHECK-GI-NEXT: mov x12, x8
; CHECK-GI-NEXT: .LBB4_3: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NEXT: ldp q1, q2, [x12, #-16]
-; CHECK-GI-NEXT: mov x14, x11
-; CHECK-GI-NEXT: subs x13, x13, #16
-; CHECK-GI-NEXT: add x12, x12, #32
-; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT: ldp q0, q1, [x11, #-16]
+; CHECK-GI-NEXT: and w13, w1, #0xffff
+; CHECK-GI-NEXT: dup v2.4s, w13
+; CHECK-GI-NEXT: mov x13, x10
+; CHECK-GI-NEXT: subs x12, x12, #16
+; CHECK-GI-NEXT: add x11, x11, #32
+; CHECK-GI-NEXT: ushll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll v4.4s, v1.4h, #0
; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT: ushll v4.4s, v2.4h, #0
-; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0
-; CHECK-GI-NEXT: mul v3.4s, v0.4s, v3.4s
-; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: mul v4.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT: stp q3, q1, [x14, #-32]!
-; CHECK-GI-NEXT: stp q4, q2, [x11], #64
+; CHECK-GI-NEXT: mul v3.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT: mul v4.4s, v2.4s, v4.4s
+; CHECK-GI-NEXT: mul v1.4s, v2.4s, v1.4s
+; CHECK-GI-NEXT: stp q3, q0, [x13, #-32]!
+; CHECK-GI-NEXT: stp q4, q1, [x10], #64
; CHECK-GI-NEXT: b.ne .LBB4_3
; CHECK-GI-NEXT: // %bb.4: // %middle.block
-; CHECK-GI-NEXT: cmp x9, x10
+; CHECK-GI-NEXT: cmp x8, x9
; CHECK-GI-NEXT: b.eq .LBB4_7
; CHECK-GI-NEXT: .LBB4_5: // %for.body.preheader1
-; CHECK-GI-NEXT: add x11, x2, x9, lsl #2
-; CHECK-GI-NEXT: add x12, x0, x9, lsl #1
-; CHECK-GI-NEXT: sub x9, x10, x9
+; CHECK-GI-NEXT: add x10, x2, x8, lsl #2
+; CHECK-GI-NEXT: add x11, x0, x8, lsl #1
+; CHECK-GI-NEXT: and w12, w1, #0xffff
+; CHECK-GI-NEXT: sub x8, x9, x8
; CHECK-GI-NEXT: .LBB4_6: // %for.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NEXT: ldrh w10, [x12], #2
-; CHECK-GI-NEXT: subs x9, x9, #1
-; CHECK-GI-NEXT: mul w10, w10, w8
-; CHECK-GI-NEXT: str w10, [x11], #4
+; CHECK-GI-NEXT: ldrh w9, [x11], #2
+; CHECK-GI-NEXT: subs x8, x8, #1
+; CHECK-GI-NEXT: mul w9, w9, w12
+; CHECK-GI-NEXT: str w9, [x10], #4
; CHECK-GI-NEXT: b.ne .LBB4_6
; CHECK-GI-NEXT: .LBB4_7: // %for.cond.cleanup
; CHECK-GI-NEXT: ret
@@ -600,7 +598,7 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-NEXT: and x11, x10, #0xfffffff0
-; CHECK-SD-NEXT: dup v2.8h, w9
+; CHECK-SD-NEXT: fmov s2, w9
; CHECK-SD-NEXT: add x8, x0, #8
; CHECK-SD-NEXT: mov x12, x11
; CHECK-SD-NEXT: .LBB5_5: // %vector.body
@@ -610,8 +608,8 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
; CHECK-SD-NEXT: add x8, x8, #16
; CHECK-SD-NEXT: ushll v3.8h, v3.8b, #0
; CHECK-SD-NEXT: ushll v4.8h, v4.8b, #0
-; CHECK-SD-NEXT: mla v0.8h, v2.8h, v3.8h
-; CHECK-SD-NEXT: mla v1.8h, v2.8h, v4.8h
+; CHECK-SD-NEXT: mla v0.8h, v3.8h, v2.h[0]
+; CHECK-SD-NEXT: mla v1.8h, v4.8h, v2.h[0]
; CHECK-SD-NEXT: b.ne .LBB5_5
; CHECK-SD-NEXT: // %bb.6: // %middle.block
; CHECK-SD-NEXT: add v0.8h, v1.8h, v0.8h
@@ -1025,9 +1023,8 @@ exit:
define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
; CHECK-SD-LABEL: matrix_mul_unsigned_and:
; CHECK-SD: // %bb.0: // %vector.header
-; CHECK-SD-NEXT: and w8, w3, #0xffff
+; CHECK-SD-NEXT: dup v0.4h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT: dup v0.4h, w8
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
; CHECK-SD-NEXT: .LBB10_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
@@ -1111,9 +1108,8 @@ for.end12: ; preds = %vector.body
define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
; CHECK-SD-LABEL: matrix_mul_unsigned_and_double:
; CHECK-SD: // %bb.0: // %vector.header
-; CHECK-SD-NEXT: and w8, w3, #0xffff
+; CHECK-SD-NEXT: dup v0.8h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT: dup v0.8h, w8
; CHECK-SD-NEXT: and x8, x0, #0xfffffff0
; CHECK-SD-NEXT: .LBB11_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
@@ -1207,10 +1203,10 @@ for.end12: ; preds = %vector.body
define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
; CHECK-SD-LABEL: matrix_mul_signed_and:
; CHECK-SD: // %bb.0: // %vector.header
-; CHECK-SD-NEXT: and w8, w3, #0xffff
+; CHECK-SD-NEXT: and w9, w3, #0xffff
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT: dup v0.4s, w8
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
+; CHECK-SD-NEXT: fmov s0, w9
; CHECK-SD-NEXT: .LBB12_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
@@ -1220,8 +1216,8 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-SD-NEXT: add w0, w0, #8
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-SD-NEXT: mul v1.4s, v0.4s, v1.4s
-; CHECK-SD-NEXT: mul v2.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: mul v1.4s, v1.4s, v0.s[0]
+; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT: stp q1, q2, [x9]
; CHECK-SD-NEXT: b.ne .LBB12_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
diff --git a/llvm/test/CodeGen/AArch64/sink-mul-exts.ll b/llvm/test/CodeGen/AArch64/sink-mul-exts.ll
new file mode 100644
index 00000000000000..d52ac7847f8146
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sink-mul-exts.ll
@@ -0,0 +1,232 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+
+define <8 x i16> @mul_splat_sext_v8i16(ptr %x, ptr %y) {
+; CHECK-LABEL: mul_splat_sext_v8i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: dup v1.8b, v1.b[3]
+; CHECK-NEXT: .LBB0_1: // %l1
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d2, [x1, x8]
+; CHECK-NEXT: add x8, x8, #4
+; CHECK-NEXT: cmp w8, #4
+; CHECK-NEXT: smlal v0.8h, v2.8b, v1.8b
+; CHECK-NEXT: b.eq .LBB0_1
+; CHECK-NEXT: // %bb.2: // %l2
+; CHECK-NEXT: ret
+entry:
+ %x.val = load <8 x i8>, ptr %x
+ %x.ext = sext <8 x i8> %x.val to <8 x i16>
+ %a = shufflevector <8 x i16> %x.ext, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ br label %l1
+
+l1:
+ %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
+ %q = phi <8 x i16> [ zeroinitializer, %entry ], [ %c, %l1 ]
+ %y.idx = mul nuw nsw i32 %p, 4
+ %y.ptr = getelementptr i8, ptr %y, i32 %y.idx
+ %y.val = load <8 x i8>, ptr %y.ptr
+ %y.ext = sext <8 x i8> %y.val to <8 x i16>
+ %b = mul <8 x i16> %y.ext, %a
+ %c = add <8 x i16> %q, %b
+ %pa = add i32 %p, 1
+ %c1 = icmp eq i32 %p, 0
+ br i1 %c1, label %l1, label %l2
+
+l2:
+ ret <8 x i16> %c
+}
+
+define <4 x i32> @mul_splat_sext_v4i32(ptr %x, ptr %y) {
+; CHECK-LABEL: mul_splat_sext_v4i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: .LBB1_1: // %l1
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d2, [x1, x8]
+; CHECK-NEXT: add x8, x8, #8
+; CHECK-NEXT: cmp ...
[truncated]
; CHECK-NEXT: ldr d1, [x8]
; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: xtn v1.2s, v1.2d
; CHECK-NEXT: str d1, [x8]
; CHECK-NEXT: ldr d0, [x8]
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: mov x8, v0.d[1]
; CHECK-NEXT: mul w9, w0, w9
; CHECK-NEXT: mul w8, w0, w8
; CHECK-NEXT: fmov d0, x9
; CHECK-NEXT: mov v0.d[1], x8
; CHECK-NEXT: xtn v0.2s, v0.2d
; CHECK-NEXT: str d0, [x8]
This is a regression, but I have a patch that fixes it by teaching the relevant combine to handle ANY_EXTENDs, which seem to get generated via KnownBits queries when we visit the truncate nodes.
Sounds good. Is it possible to write a separate test for it too, with the anyext already in place?
It seemed to make sense to put this into a separate, follow-up PR - see #118308.
Is it possible to write a separate test for it too, with the anyext already in place?
I've added the test @dupzext_v2i32_v2i64_trunc in that PR, which should generate the anyext via the truncate - I'm not sure how I would do this otherwise, as unless I'm missing something there's no anyext in LLVM IR?
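For reference, a sketch of the shape of IR involved (hypothetical; the function name and exact types are assumptions, modelled on the dupext_crashtest pattern above): LLVM IR has no anyext instruction, but when the widened multiply is only consumed by a trunc, the high bits are dead, and SelectionDAG can relax the zexts to ISD::ANY_EXTEND nodes when it visits the truncate - which is the case the follow-up PR handles.

; Hypothetical reduction: the mul result is immediately truncated, so the
; upper half of each extended operand is dead and the extends may become
; ISD::ANY_EXTEND during selection.
define <2 x i32> @dup_ext_trunc(i32 %a, ptr %p) {
entry:
  %ins = insertelement <2 x i32> poison, i32 %a, i32 0
  %splat = shufflevector <2 x i32> %ins, <2 x i32> poison, <2 x i32> zeroinitializer
  %ext.a = zext <2 x i32> %splat to <2 x i64>
  %v = load <2 x i32>, ptr %p
  %ext.v = zext <2 x i32> %v to <2 x i64>
  %mul = mul <2 x i64> %ext.a, %ext.v
  %tr = trunc <2 x i64> %mul to <2 x i32>
  ret <2 x i32> %tr
}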
return (VT == MVT::v4i16 || VT == MVT::v8i16 || VT == MVT::v2i32 ||
        VT == MVT::v4i32);
Should we add handling for v16i16 and similar larger types too?
Good point - I've refactored it to sink any (non-scalable) vector type with i16 or i32 elements, rather than adding all the possible element counts, because that seemed to make more sense - I'm not sure if there's a reason not to do it this way?
LGTM - thanks.
auto ElemVT = MVT::getVT(Ty->getElementType(), /*HandleUnknown=*/true);
return (ElemVT == MVT::i16 || ElemVT == MVT::i32);
It is possible to avoid the MVT types by checking Ty->getScalarSizeInBits() == 16 || == 32.
Ah ok nice, that's a better way of doing it. I've updated to do this.
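Combining the two review threads above, the helper plausibly ends up along these lines (a sketch under those assumptions, not the exact committed code):

// Sketch only: accepts any fixed-width (non-scalable) vector whose elements
// are 16 or 32 bits wide, using a scalar-size check instead of enumerating
// MVTs. The name mirrors the patch; the committed form may differ.
auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
  auto *Ty = dyn_cast<FixedVectorType>(V->getType());
  if (!Ty)
    return false; // Scalars and scalable vectors: no indexed mul variant here.
  unsigned EltBits = Ty->getScalarSizeInBits();
  return EltBits == 16 || EltBits == 32;
};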