Skip to content

Commit bc8af08

Browse files
committed
Return lower cost for dupq
1 parent 1b8c7cb commit bc8af08

File tree

2 files changed

+25
-5
lines changed

2 files changed

+25
-5
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5583,6 +5583,26 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
55835583
Kind = TTI::SK_PermuteSingleSrc;
55845584
}
55855585

5586+
// Match shuffles that permute elements independently within each 128-bit segment.
5587+
if (ST->hasSVE2p1() && CostKind == TTI::TCK_RecipThroughput &&
5588+
Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(Tp) &&
5589+
Tp->getPrimitiveSizeInBits().isKnownMultipleOf(128)) {
5590+
5591+
FixedVectorType *VTy = cast<FixedVectorType>(Tp);
5592+
unsigned Segments = VTy->getPrimitiveSizeInBits() / 128;
5593+
unsigned SegmentElts = VTy->getNumElements() / Segments;
5594+
5595+
// dupq zd.t, zn.t[idx]: broadcast the indexed lane within each 128-bit segment.
5596+
unsigned Lane = (unsigned)Mask[0];
5597+
if (SegmentElts * Segments == Mask.size() && Lane < SegmentElts) {
5598+
bool IsDupQ = true;
5599+
for (unsigned I = 1; I < Mask.size(); ++I)
5600+
IsDupQ &= (unsigned)Mask[I] == Lane + ((I / SegmentElts) * SegmentElts);
5601+
if (IsDupQ)
5602+
return LT.first;
5603+
}
5604+
}
5605+
55865606
// Check for broadcast loads, which are supported by the LD1R instruction.
55875607
// In terms of code-size, the shuffle vector is free when a load + dup get
55885608
// folded into a LD1R. That's what we check and return here. For performance

llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx])
55
define void @dup_within_each_segment() #0 {
66
; CHECK-LABEL: 'dup_within_each_segment'
7-
; CHECK-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
8-
; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
9-
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
10-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
11-
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
7+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
8+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
9+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
10+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
11+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
1212
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
1313
;
1414
%dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11,

0 commit comments

Comments
 (0)