Skip to content

Commit 8b8a369

Browse files
authored
[AArch64] Use dupq (SVE2.1) for segmented lane splats (#144482)
Use the dupq instructions (when available) to represent a splat of the same lane within each 128b segment of a wider fixed vector.
1 parent 3af4d4e commit 8b8a369

File tree

2 files changed

+152
-0
lines changed

2 files changed

+152
-0
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 37 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13392,6 +13392,30 @@ static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
1339213392
return true;
1339313393
}
1339413394

13395+
/// isDUPQMask - matches a splat of equivalent lanes within 128b segments in
13396+
/// the first vector operand.
13397+
static std::optional<unsigned> isDUPQMask(ArrayRef<int> M, EVT VT) {
13398+
assert(VT.getFixedSizeInBits() % 128 == 0 && "Unsupported SVE vector size");
13399+
unsigned Lane = (unsigned)M[0];
13400+
unsigned Segments = VT.getFixedSizeInBits() / 128;
13401+
unsigned SegmentElts = VT.getVectorNumElements() / Segments;
13402+
13403+
// Make sure there's no size changes.
13404+
if (SegmentElts * Segments != M.size())
13405+
return std::nullopt;
13406+
13407+
// Check the first index corresponds to one of the lanes in the first segment.
13408+
if (Lane >= SegmentElts)
13409+
return std::nullopt;
13410+
13411+
// Check that all lanes match the first, adjusted for segment.
13412+
for (unsigned I = 0; I < M.size(); ++I)
13413+
if ((unsigned)M[I] != (Lane + ((I / SegmentElts) * SegmentElts)))
13414+
return std::nullopt;
13415+
13416+
return Lane;
13417+
}
13418+
1339513419
/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
1339613420
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
1339713421
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
@@ -29981,6 +30005,19 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
2998130005
return convertFromScalableVector(
2998230006
DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
2998330007
}
30008+
30009+
if (Subtarget->hasSVE2p1()) {
30010+
if (std::optional<unsigned> Lane = isDUPQMask(ShuffleMask, VT)) {
30011+
SDValue IID =
30012+
DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64);
30013+
return convertFromScalableVector(
30014+
DAG, VT,
30015+
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
30016+
{IID, Op1,
30017+
DAG.getConstant(*Lane, DL, MVT::i64,
30018+
/*isTarget=*/true)}));
30019+
}
30020+
}
2998430021
}
2998530022

2998630023
// Try to widen the shuffle before generating a possibly expensive SVE TBL.
Lines changed: 115 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,115 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
3+
4+
; <32 x i8>: every result byte takes lane 15 of its own 16-element half
; (indices 15 and 31). The expected code is a single DUPQ on .b elements.
define void @dupq_i8_256b(ptr %addr) #0 {
5+
; CHECK-LABEL: dupq_i8_256b:
6+
; CHECK: // %bb.0:
7+
; CHECK-NEXT: ldr z0, [x0]
8+
; CHECK-NEXT: dupq z0.b, z0.b[15]
9+
; CHECK-NEXT: str z0, [x0]
10+
; CHECK-NEXT: ret
11+
%load = load <32 x i8>, ptr %addr
12+
%splat.lanes = shufflevector <32 x i8> %load, <32 x i8> poison, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15,
13+
i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
14+
store <32 x i8> %splat.lanes, ptr %addr
15+
ret void
16+
}
17+
18+
; <16 x i16>: every result element takes lane 2 of its own 8-element half
; (indices 2 and 10). The expected code is a single DUPQ on .h elements.
define void @dupq_i16_256b(ptr %addr) #0 {
19+
; CHECK-LABEL: dupq_i16_256b:
20+
; CHECK: // %bb.0:
21+
; CHECK-NEXT: ldr z0, [x0]
22+
; CHECK-NEXT: dupq z0.h, z0.h[2]
23+
; CHECK-NEXT: str z0, [x0]
24+
; CHECK-NEXT: ret
25+
%load = load <16 x i16>, ptr %addr
26+
%splat.lanes = shufflevector <16 x i16> %load, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
27+
i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
28+
store <16 x i16> %splat.lanes, ptr %addr
29+
ret void
30+
}
31+
32+
; <8 x i32>: every result element takes lane 3 of its own 4-element half
; (indices 3 and 7). The expected code is a single DUPQ on .s elements.
define void @dupq_i32_256b(ptr %addr) #0 {
33+
; CHECK-LABEL: dupq_i32_256b:
34+
; CHECK: // %bb.0:
35+
; CHECK-NEXT: ldr z0, [x0]
36+
; CHECK-NEXT: dupq z0.s, z0.s[3]
37+
; CHECK-NEXT: str z0, [x0]
38+
; CHECK-NEXT: ret
39+
%load = load <8 x i32>, ptr %addr
40+
%splat.lanes = shufflevector <8 x i32> %load, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
41+
i32 7, i32 7, i32 7, i32 7>
42+
store <8 x i32> %splat.lanes, ptr %addr
43+
ret void
44+
}
45+
46+
; <4 x i64> with mask <0, 0, 2, 2>: a per-half splat of lane 0. The expected
; code uses TRN1 with both operands equal, not DUPQ.
; NOTE(review): presumably the TRN "v, undef" mask match is tried before the
; DUPQ match in the lowering - confirm against the lowering order.
define void @dupq_i64_256b(ptr %addr) #0 {
47+
; CHECK-LABEL: dupq_i64_256b:
48+
; CHECK: // %bb.0:
49+
; CHECK-NEXT: ldr z0, [x0]
50+
; CHECK-NEXT: trn1 z0.d, z0.d, z0.d
51+
; CHECK-NEXT: str z0, [x0]
52+
; CHECK-NEXT: ret
53+
%load = load <4 x i64>, ptr %addr
54+
%splat.lanes = shufflevector <4 x i64> %load, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
55+
store <4 x i64> %splat.lanes, ptr %addr
56+
ret void
57+
}
58+
59+
; <16 x half>: same mask as the i16 test (lane 2 of each 8-element half).
; The expected code is a single DUPQ on .h elements.
define void @dupq_f16_256b(ptr %addr) #0 {
60+
; CHECK-LABEL: dupq_f16_256b:
61+
; CHECK: // %bb.0:
62+
; CHECK-NEXT: ldr z0, [x0]
63+
; CHECK-NEXT: dupq z0.h, z0.h[2]
64+
; CHECK-NEXT: str z0, [x0]
65+
; CHECK-NEXT: ret
66+
%load = load <16 x half>, ptr %addr
67+
%splat.lanes = shufflevector <16 x half> %load, <16 x half> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
68+
i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
69+
store <16 x half> %splat.lanes, ptr %addr
70+
ret void
71+
}
72+
73+
; <16 x bfloat>: same mask as the f16 test, but the expected code does not use
; DUPQ. It loads two Q registers and splats lane 2 of each with NEON DUP.
; NOTE(review): presumably fixed-length bfloat shuffles do not reach the SVE
; lowering path here - confirm whether this is intended or a missed case.
define void @dupq_bf16_256b(ptr %addr) #0 {
74+
; CHECK-LABEL: dupq_bf16_256b:
75+
; CHECK: // %bb.0:
76+
; CHECK-NEXT: ldp q0, q1, [x0]
77+
; CHECK-NEXT: dup v0.8h, v0.h[2]
78+
; CHECK-NEXT: dup v1.8h, v1.h[2]
79+
; CHECK-NEXT: stp q0, q1, [x0]
80+
; CHECK-NEXT: ret
81+
%load = load <16 x bfloat>, ptr %addr
82+
%splat.lanes = shufflevector <16 x bfloat> %load, <16 x bfloat> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
83+
i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
84+
store <16 x bfloat> %splat.lanes, ptr %addr
85+
ret void
86+
}
87+
88+
; <8 x float>: same mask as the i32 test (lane 3 of each 4-element half).
; The expected code is a single DUPQ on .s elements.
define void @dupq_f32_256b(ptr %addr) #0 {
89+
; CHECK-LABEL: dupq_f32_256b:
90+
; CHECK: // %bb.0:
91+
; CHECK-NEXT: ldr z0, [x0]
92+
; CHECK-NEXT: dupq z0.s, z0.s[3]
93+
; CHECK-NEXT: str z0, [x0]
94+
; CHECK-NEXT: ret
95+
%load = load <8 x float>, ptr %addr
96+
%splat.lanes = shufflevector <8 x float> %load, <8 x float> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
97+
i32 7, i32 7, i32 7, i32 7>
98+
store <8 x float> %splat.lanes, ptr %addr
99+
ret void
100+
}
101+
102+
; <4 x double> with mask <0, 0, 2, 2>: same shape as the i64 test. The
; expected code uses TRN1 with both operands equal, not DUPQ.
; NOTE(review): presumably the TRN "v, undef" mask match is tried before the
; DUPQ match in the lowering - confirm against the lowering order.
define void @dupq_f64_256b(ptr %addr) #0 {
103+
; CHECK-LABEL: dupq_f64_256b:
104+
; CHECK: // %bb.0:
105+
; CHECK-NEXT: ldr z0, [x0]
106+
; CHECK-NEXT: trn1 z0.d, z0.d, z0.d
107+
; CHECK-NEXT: str z0, [x0]
108+
; CHECK-NEXT: ret
109+
%load = load <4 x double>, ptr %addr
110+
%splat.lanes = shufflevector <4 x double> %load, <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
111+
store <4 x double> %splat.lanes, ptr %addr
112+
ret void
113+
}
114+
115+
attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1,+bf16" }

0 commit comments

Comments
 (0)