Skip to content

[AArch64] Use dupq (SVE2.1) for segmented lane splats #144482

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jun 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13430,6 +13430,30 @@ static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
return true;
}

/// isDUPQMask - matches a splat of equivalent lanes within 128b segments in
/// the first vector operand.
///
/// \param M  Shuffle mask to inspect. Negative entries denote undef/poison
///           result lanes and are treated as wildcards, since any value is
///           acceptable in those lanes.
/// \param VT Fixed-length vector type being shuffled; its size in bits must
///           be a multiple of 128.
/// \returns the lane index, relative to the start of each 128b segment, that
/// is replicated across every segment, or std::nullopt if the mask does not
/// have that form (including the case where every mask entry is undef).
static std::optional<unsigned> isDUPQMask(ArrayRef<int> M, EVT VT) {
  assert(VT.getFixedSizeInBits() % 128 == 0 && "Unsupported SVE vector size");
  const unsigned Segments = VT.getFixedSizeInBits() / 128;
  const unsigned SegmentElts = VT.getVectorNumElements() / Segments;

  // Make sure there are no size changes. This also guarantees SegmentElts is
  // non-zero whenever the mask is non-empty, so the division below is safe.
  if (SegmentElts * Segments != M.size())
    return std::nullopt;

  // Lane selected within each segment, taken from the first defined entry.
  std::optional<unsigned> Lane;

  for (unsigned I = 0; I < M.size(); ++I) {
    // Undef/poison lanes place no constraint on the result.
    if (M[I] < 0)
      continue;

    // Each defined index must stay inside the segment it is written to. As
    // every segment lies in the first operand, this also rejects any index
    // that refers to the second operand.
    const unsigned SegmentBase = (I / SegmentElts) * SegmentElts;
    const unsigned Index = static_cast<unsigned>(M[I]);
    if (Index < SegmentBase || Index - SegmentBase >= SegmentElts)
      return std::nullopt;

    // All defined entries must agree on the same lane within their segment.
    const unsigned Candidate = Index - SegmentBase;
    if (Lane && *Lane != Candidate)
      return std::nullopt;
    Lane = Candidate;
  }

  // An all-undef mask leaves Lane empty; do not claim it as a DUPQ.
  return Lane;
}

/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
Expand Down Expand Up @@ -30013,6 +30037,19 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
return convertFromScalableVector(
DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
}

if (Subtarget->hasSVE2p1()) {
if (std::optional<unsigned> Lane = isDUPQMask(ShuffleMask, VT)) {
SDValue IID =
DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64);
return convertFromScalableVector(
DAG, VT,
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
{IID, Op1,
DAG.getConstant(*Lane, DL, MVT::i64,
/*isTarget=*/true)}));
}
}
}

// Try to widen the shuffle before generating a possibly expensive SVE TBL.
Expand Down
115 changes: 115 additions & 0 deletions llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s

; Splat of byte lane 15 within each 128-bit segment of a 256-bit vector:
; result elements 0-15 read index 15 and elements 16-31 read index 31.
; The expected output is a single DUPQ with lane index 15.
define void @dupq_i8_256b(ptr %addr) #0 {
; CHECK-LABEL: dupq_i8_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: dupq z0.b, z0.b[15]
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%load = load <32 x i8>, ptr %addr
%splat.lanes = shufflevector <32 x i8> %load, <32 x i8> poison, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15,
i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
store <32 x i8> %splat.lanes, ptr %addr
ret void
}

; Splat of halfword lane 2 within each 128-bit segment of a 256-bit vector:
; result elements 0-7 read index 2 and elements 8-15 read index 10.
; The expected output is a single DUPQ with lane index 2.
define void @dupq_i16_256b(ptr %addr) #0 {
; CHECK-LABEL: dupq_i16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: dupq z0.h, z0.h[2]
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%load = load <16 x i16>, ptr %addr
%splat.lanes = shufflevector <16 x i16> %load, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
store <16 x i16> %splat.lanes, ptr %addr
ret void
}

; Splat of word lane 3 within each 128-bit segment of a 256-bit vector:
; result elements 0-3 read index 3 and elements 4-7 read index 7.
; The expected output is a single DUPQ with lane index 3.
define void @dupq_i32_256b(ptr %addr) #0 {
; CHECK-LABEL: dupq_i32_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: dupq z0.s, z0.s[3]
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%load = load <8 x i32>, ptr %addr
%splat.lanes = shufflevector <8 x i32> %load, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
i32 7, i32 7, i32 7, i32 7>
store <8 x i32> %splat.lanes, ptr %addr
ret void
}

; Splat of doubleword lane 0 within each 128-bit segment of a 256-bit vector
; (mask <0, 0, 2, 2>). Despite the test name, the expected output is TRN1
; rather than DUPQ.
; NOTE(review): presumably the existing TRN mask matching runs before the
; DUPQ check and claims this mask first - confirm against the lowering code.
define void @dupq_i64_256b(ptr %addr) #0 {
; CHECK-LABEL: dupq_i64_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: trn1 z0.d, z0.d, z0.d
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%load = load <4 x i64>, ptr %addr
%splat.lanes = shufflevector <4 x i64> %load, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
store <4 x i64> %splat.lanes, ptr %addr
ret void
}

; Half-precision variant of the halfword test: lane 2 is splatted within each
; 128-bit segment (indices 2 and 10). The expected output is a single DUPQ
; with lane index 2.
define void @dupq_f16_256b(ptr %addr) #0 {
; CHECK-LABEL: dupq_f16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: dupq z0.h, z0.h[2]
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%load = load <16 x half>, ptr %addr
%splat.lanes = shufflevector <16 x half> %load, <16 x half> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
store <16 x half> %splat.lanes, ptr %addr
ret void
}

; bfloat variant of the halfword test: lane 2 is splatted within each 128-bit
; segment (indices 2 and 10). Unlike the f16 case, the expected output does
; not use DUPQ; each 128-bit half is loaded as a Q register and splatted with
; a NEON DUP.
; NOTE(review): the reason bf16 does not reach the SVE DUPQ lowering is not
; visible in this test - confirm whether this is intentional.
define void @dupq_bf16_256b(ptr %addr) #0 {
; CHECK-LABEL: dupq_bf16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: dup v0.8h, v0.h[2]
; CHECK-NEXT: dup v1.8h, v1.h[2]
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
%load = load <16 x bfloat>, ptr %addr
%splat.lanes = shufflevector <16 x bfloat> %load, <16 x bfloat> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
store <16 x bfloat> %splat.lanes, ptr %addr
ret void
}

; Single-precision variant of the word test: lane 3 is splatted within each
; 128-bit segment (indices 3 and 7). The expected output is a single DUPQ
; with lane index 3.
define void @dupq_f32_256b(ptr %addr) #0 {
; CHECK-LABEL: dupq_f32_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: dupq z0.s, z0.s[3]
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%load = load <8 x float>, ptr %addr
%splat.lanes = shufflevector <8 x float> %load, <8 x float> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
i32 7, i32 7, i32 7, i32 7>
store <8 x float> %splat.lanes, ptr %addr
ret void
}

; Double-precision variant of the doubleword test (mask <0, 0, 2, 2>). As in
; the i64 case, the expected output is TRN1 rather than DUPQ.
; NOTE(review): presumably the existing TRN mask matching runs before the
; DUPQ check and claims this mask first - confirm against the lowering code.
define void @dupq_f64_256b(ptr %addr) #0 {
; CHECK-LABEL: dupq_f64_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: trn1 z0.d, z0.d, z0.d
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%load = load <4 x double>, ptr %addr
%splat.lanes = shufflevector <4 x double> %load, <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
store <4 x double> %splat.lanes, ptr %addr
ret void
}

attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1,+bf16" }
Loading