Skip to content

Commit 2db3cc4

Browse files
authored
[AArch64][CostModel] Lower cost of dupq (SVE2.1) (#144918)
With codegen in place to match shuffles to dupq, we can now lower the cost to something reasonable.
1 parent bb72424 commit 2db3cc4

File tree

5 files changed

+160
-35
lines changed

5 files changed

+160
-35
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 9 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -13404,30 +13404,6 @@ static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
1340413404
return true;
1340513405
}
1340613406

13407-
/// isDUPQMask - matches a splat of equivalent lanes within 128b segments in
13408-
/// the first vector operand.
13409-
static std::optional<unsigned> isDUPQMask(ArrayRef<int> M, EVT VT) {
13410-
assert(VT.getFixedSizeInBits() % 128 == 0 && "Unsupported SVE vector size");
13411-
unsigned Lane = (unsigned)M[0];
13412-
unsigned Segments = VT.getFixedSizeInBits() / 128;
13413-
unsigned SegmentElts = VT.getVectorNumElements() / Segments;
13414-
13415-
// Make sure there's no size changes.
13416-
if (SegmentElts * Segments != M.size())
13417-
return std::nullopt;
13418-
13419-
// Check the first index corresponds to one of the lanes in the first segment.
13420-
if (Lane >= SegmentElts)
13421-
return std::nullopt;
13422-
13423-
// Check that all lanes match the first, adjusted for segment.
13424-
for (unsigned I = 0; I < M.size(); ++I)
13425-
if ((unsigned)M[I] != (Lane + ((I / SegmentElts) * SegmentElts)))
13426-
return std::nullopt;
13427-
13428-
return Lane;
13429-
}
13430-
1343113407
/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
1343213408
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
1343313409
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
@@ -30029,8 +30005,15 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
3002930005
DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
3003030006
}
3003130007

30032-
if (Subtarget->hasSVE2p1()) {
30033-
if (std::optional<unsigned> Lane = isDUPQMask(ShuffleMask, VT)) {
30008+
if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) &&
30009+
Subtarget->isSVEorStreamingSVEAvailable()) {
30010+
assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 &&
30011+
"Unsupported SVE vector size");
30012+
30013+
unsigned Segments = VT.getFixedSizeInBits() / AArch64::SVEBitsPerBlock;
30014+
unsigned SegmentElts = VT.getVectorNumElements() / Segments;
30015+
if (std::optional<unsigned> Lane =
30016+
isDUPQMask(ShuffleMask, Segments, SegmentElts)) {
3003430017
SDValue IID =
3003530018
DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64);
3003630019
return convertFromScalableVector(

llvm/lib/Target/AArch64/AArch64PerfectShuffle.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
1616

1717
#include "llvm/ADT/ArrayRef.h"
18+
#include "llvm/ADT/STLExtras.h"
1819

1920
namespace llvm {
2021

@@ -6723,6 +6724,32 @@ inline bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,
67236724
return true;
67246725
}
67256726

6727+
/// isDUPQMask - matches a splat of equivalent lanes within segments of a given
6728+
/// number of elements.
6729+
inline std::optional<unsigned> isDUPQMask(ArrayRef<int> Mask, unsigned Segments,
6730+
unsigned SegmentSize) {
6731+
unsigned Lane = unsigned(Mask[0]);
6732+
6733+
// Make sure there's no size changes.
6734+
if (SegmentSize * Segments != Mask.size())
6735+
return std::nullopt;
6736+
6737+
// Check the first index corresponds to one of the lanes in the first segment.
6738+
if (Lane >= SegmentSize)
6739+
return std::nullopt;
6740+
6741+
// Check that all lanes match the first, adjusted for segment.
6742+
// Undef/poison lanes (<0) are also accepted.
6743+
if (all_of(enumerate(Mask), [&](auto P) {
6744+
const unsigned SegmentIndex = P.index() / SegmentSize;
6745+
return P.value() < 0 ||
6746+
unsigned(P.value()) == Lane + SegmentIndex * SegmentSize;
6747+
}))
6748+
return Lane;
6749+
6750+
return std::nullopt;
6751+
}
6752+
67266753
} // namespace llvm
67276754

67286755
#endif

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5599,6 +5599,23 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
55995599
SrcTy = DstTy;
56005600
}
56015601

5602+
// Segmented shuffle matching.
5603+
if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
5604+
ST->isSVEorStreamingSVEAvailable() && Kind == TTI::SK_PermuteSingleSrc &&
5605+
isa<FixedVectorType>(SrcTy) && !Mask.empty() &&
5606+
SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
5607+
AArch64::SVEBitsPerBlock)) {
5608+
5609+
FixedVectorType *VTy = cast<FixedVectorType>(SrcTy);
5610+
unsigned Segments =
5611+
VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock;
5612+
unsigned SegmentElts = VTy->getNumElements() / Segments;
5613+
5614+
// dupq zd.t, zn.t[idx]
5615+
if (isDUPQMask(Mask, Segments, SegmentElts))
5616+
return LT.first;
5617+
}
5618+
56025619
// Check for broadcast loads, which are supported by the LD1R instruction.
56035620
// In terms of code-size, the shuffle vector is free when a load + dup get
56045621
// folded into a LD1R. That's what we check and return here. For performance
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve2p1 < %s | FileCheck %s
3+
; RUN: opt -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sme2p1 -force-streaming < %s | FileCheck %s
4+
5+
;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx])
6+
define void @dup_within_each_segment_256b() #0 {
7+
; CHECK-LABEL: 'dup_within_each_segment_256b'
8+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
9+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
10+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
11+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
12+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
13+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 poison, i32 3, i32 7, i32 poison, i32 7, i32 7>
14+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
15+
;
16+
%dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11,
17+
i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
18+
%dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
19+
i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
20+
%dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
21+
i32 7, i32 7, i32 7, i32 7>
22+
%dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
23+
%dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
24+
%dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 poison, i32 3,
25+
i32 7, i32 poison, i32 7, i32 7>
26+
ret void
27+
}
28+
29+
define void @dup_within_each_segment_512b() #1 {
30+
; CHECK-LABEL: 'dup_within_each_segment_512b'
31+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
32+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
33+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
34+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
35+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
36+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 poison, i32 3, i32 7, i32 poison, i32 7, i32 7>
37+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
38+
;
39+
%dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11,
40+
i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
41+
%dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
42+
i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
43+
%dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
44+
i32 7, i32 7, i32 7, i32 7>
45+
%dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
46+
%dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
47+
%dupq_s3_with_poison = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 poison, i32 3,
48+
i32 7, i32 poison, i32 7, i32 7>
49+
ret void
50+
}
51+
52+
attributes #0 = { noinline vscale_range(2,2) }
53+
attributes #1 = { noinline vscale_range(4,4) }

llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
2+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,SVE
3+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1,+bf16 -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SME
34

45
define void @dupq_i8_256b(ptr %addr) #0 {
56
; CHECK-LABEL: dupq_i8_256b:
@@ -71,13 +72,43 @@ define void @dupq_f16_256b(ptr %addr) #0 {
7172
}
7273

7374
define void @dupq_bf16_256b(ptr %addr) #0 {
74-
; CHECK-LABEL: dupq_bf16_256b:
75-
; CHECK: // %bb.0:
76-
; CHECK-NEXT: ldp q0, q1, [x0]
77-
; CHECK-NEXT: dup v0.8h, v0.h[2]
78-
; CHECK-NEXT: dup v1.8h, v1.h[2]
79-
; CHECK-NEXT: stp q0, q1, [x0]
80-
; CHECK-NEXT: ret
75+
; SVE-LABEL: dupq_bf16_256b:
76+
; SVE: // %bb.0:
77+
; SVE-NEXT: ldp q0, q1, [x0]
78+
; SVE-NEXT: dup v0.8h, v0.h[2]
79+
; SVE-NEXT: dup v1.8h, v1.h[2]
80+
; SVE-NEXT: stp q0, q1, [x0]
81+
; SVE-NEXT: ret
82+
;
83+
; SME-LABEL: dupq_bf16_256b:
84+
; SME: // %bb.0:
85+
; SME-NEXT: ldp q1, q0, [x0]
86+
; SME-NEXT: str q0, [sp, #-64]!
87+
; SME-NEXT: .cfi_def_cfa_offset 64
88+
; SME-NEXT: ldr h0, [sp, #4]
89+
; SME-NEXT: str q1, [sp, #32]
90+
; SME-NEXT: str h0, [sp, #30]
91+
; SME-NEXT: str h0, [sp, #28]
92+
; SME-NEXT: str h0, [sp, #26]
93+
; SME-NEXT: str h0, [sp, #24]
94+
; SME-NEXT: str h0, [sp, #22]
95+
; SME-NEXT: str h0, [sp, #20]
96+
; SME-NEXT: str h0, [sp, #18]
97+
; SME-NEXT: str h0, [sp, #16]
98+
; SME-NEXT: ldr h0, [sp, #36]
99+
; SME-NEXT: ldr q1, [sp, #16]
100+
; SME-NEXT: str h0, [sp, #62]
101+
; SME-NEXT: str h0, [sp, #60]
102+
; SME-NEXT: str h0, [sp, #58]
103+
; SME-NEXT: str h0, [sp, #56]
104+
; SME-NEXT: str h0, [sp, #54]
105+
; SME-NEXT: str h0, [sp, #52]
106+
; SME-NEXT: str h0, [sp, #50]
107+
; SME-NEXT: str h0, [sp, #48]
108+
; SME-NEXT: ldr q0, [sp, #48]
109+
; SME-NEXT: stp q0, q1, [x0]
110+
; SME-NEXT: add sp, sp, #64
111+
; SME-NEXT: ret
81112
%load = load <16 x bfloat>, ptr %addr
82113
%splat.lanes = shufflevector <16 x bfloat> %load, <16 x bfloat> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
83114
i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
@@ -112,4 +143,18 @@ define void @dupq_f64_256b(ptr %addr) #0 {
112143
ret void
113144
}
114145

115-
attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1,+bf16" }
146+
define void @dupq_f32_256b_with_poison(ptr %addr) #0 {
147+
; CHECK-LABEL: dupq_f32_256b_with_poison:
148+
; CHECK: // %bb.0:
149+
; CHECK-NEXT: ldr z0, [x0]
150+
; CHECK-NEXT: dupq z0.s, z0.s[3]
151+
; CHECK-NEXT: str z0, [x0]
152+
; CHECK-NEXT: ret
153+
%load = load <8 x float>, ptr %addr
154+
%splat.lanes = shufflevector <8 x float> %load, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 3, i32 3,
155+
i32 7, i32 7, i32 7, i32 poison>
156+
store <8 x float> %splat.lanes, ptr %addr
157+
ret void
158+
}
159+
160+
attributes #0 = { noinline vscale_range(2,2) }

0 commit comments

Comments
 (0)