[AArch64] Use indexed dup for 128b segmented splat #144688

Merged: 4 commits, Jun 20, 2025. Changes from all commits.

10 changes: 10 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29317,6 +29317,16 @@ SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
EVT VT = Op.getValueType();
EVT SrcVT = SrcOp1.getValueType();

// Match a splat of 128b segments that fit in a single register.
if (SrcVT.is128BitVector() && all_equal(Op.getNode()->op_values())) {
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
Collaborator:

nit: Is this check actually required? (if it got to the point of lowering the operation, I would think this condition is always true)

Collaborator Author:

Apparently not. I did include a test (concat_i32q_512_with_256_vectors) to exercise this, expecting it to be split into a concat of concats, then to see each turned into a duplane128 here, but it seems that legalization might kick in to split it first. This function is only called if a check with useSVEForFixedLengthVectorVT succeeds, which does exclude wider VTs. Will remove.

SDValue Splat =
DAG.getNode(AArch64ISD::DUPLANE128, DL, ContainerVT,
convertToScalableVector(DAG, ContainerVT, SrcOp1),
DAG.getConstant(0, DL, MVT::i64, /*isTarget=*/true));
return convertFromScalableVector(DAG, VT, Splat);
}

if (NumOperands > 2) {
SmallVector<SDValue, 4> Ops;
EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
142 changes: 142 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-fixed-length-splat-segment.ll
@@ -0,0 +1,142 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s

;; Patterns that lower to concat_vectors where all incoming operands are the same.

define void @concat_i8q_256(<16 x i8> %data, ptr %addr) #0 {
; CHECK-LABEL: concat_i8q_256:
Collaborator:

For the tests, could you add some cases that result in an indexed dup with an index other than 0? For example: a load of a 256-bit vector (which would live in a z-register), then a shufflevector that splats the top 128 bits of that vector over the low and high parts, which should result in an indexed dup instruction with an index of 1.

Collaborator Author:

I think that's more for a follow-up patch; this PR is focused on concat_vectors. concat_i16q_256 below does select the second segment in the shufflevector in IR, but is still lowered to concat_vectors instead of a vector_shuffle.

; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: mov z0.q, q0
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%splat = shufflevector <16 x i8> %data, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
store <32 x i8> %splat, ptr %addr, align 1
ret void
}
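
For reference, a minimal sketch of the kind of test the reviewer describes above, assuming the 256-bit input is loaded from memory and its upper 128-bit segment (elements 4..7) is splatted over both halves. The function name is hypothetical and no CHECK lines are asserted, since the expected lowering (an indexed dup such as mov z0.q, z0.q[1]) is exactly what the follow-up patch would need to establish:

define void @splat_upper_i32q_256(ptr %in, ptr %addr) #0 {
  ; Load a 256-bit fixed-length vector; with vscale_range(2,2) this is held in a z-register.
  %data = load <8 x i32>, ptr %in, align 1
  ; Splat the top 128-bit segment (elements 4..7) over both the low and high halves.
  %splat = shufflevector <8 x i32> %data, <8 x i32> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7,
                                                                       i32 4, i32 5, i32 6, i32 7>
  store <8 x i32> %splat, ptr %addr, align 1
  ret void
}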

define void @concat_i16q_256(<8 x i16> %data, ptr %addr) #0 {
; CHECK-LABEL: concat_i16q_256:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: mov z0.q, q0
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%splat = shufflevector <8 x i16> %data, <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <16 x i16> %splat, ptr %addr, align 1
ret void
}

define void @concat_i32q_256(<4 x i32> %data, ptr %addr) #0 {
; CHECK-LABEL: concat_i32q_256:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: mov z0.q, q0
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%splat = shufflevector <4 x i32> %data, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 0, i32 1, i32 2, i32 3>
store <8 x i32> %splat, ptr %addr, align 1
ret void
}

define void @concat_i64q_256(<2 x i64> %data, ptr %addr) #0 {
; CHECK-LABEL: concat_i64q_256:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: mov z0.q, q0
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%splat = shufflevector <2 x i64> %data, <2 x i64> poison, <4 x i32> <i32 0, i32 1,
i32 0, i32 1>
store <4 x i64> %splat, ptr %addr, align 1
ret void
}

define void @concat_f16q_256(<8 x half> %data, ptr %addr) #0 {
; CHECK-LABEL: concat_f16q_256:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: mov z0.q, q0
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%splat = shufflevector <8 x half> %data, <8 x half> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <16 x half> %splat, ptr %addr, align 1
ret void
}

define void @concat_bf16q_256(<8 x bfloat> %data, ptr %addr) #0 {
; CHECK-LABEL: concat_bf16q_256:
; CHECK: // %bb.0:
; CHECK-NEXT: stp q0, q0, [x0]
; CHECK-NEXT: ret
%splat = shufflevector <8 x bfloat> %data, <8 x bfloat> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <16 x bfloat> %splat, ptr %addr, align 1
ret void
}

define void @concat_f32q_256(<4 x float> %data, ptr %addr) #0 {
; CHECK-LABEL: concat_f32q_256:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: mov z0.q, q0
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%splat = shufflevector <4 x float> %data, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 0, i32 1, i32 2, i32 3>
store <8 x float> %splat, ptr %addr, align 1
ret void
}

define void @concat_f64q_256(<2 x double> %data, ptr %addr) #0 {
; CHECK-LABEL: concat_f64q_256:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: mov z0.q, q0
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%splat = shufflevector <2 x double> %data, <2 x double> poison, <4 x i32> <i32 0, i32 1,
i32 0, i32 1>
store <4 x double> %splat, ptr %addr, align 1
ret void
}

;; Test a wider (512-bit) splat, with both 256-bit and 512-bit SVE registers.

define void @concat_i32q_512_with_256_vectors(<4 x i32> %data, ptr %addr) #0 {
; CHECK-LABEL: concat_i32q_512_with_256_vectors:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: mov z0.q, q0
; CHECK-NEXT: str z0, [x0, #1, mul vl]
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%splat = shufflevector <4 x i32> %data, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 0, i32 1, i32 2, i32 3,
i32 0, i32 1, i32 2, i32 3,
i32 0, i32 1, i32 2, i32 3>
store <16 x i32> %splat, ptr %addr, align 1
ret void
}

define void @concat_i32q_512_with_512_vectors(<4 x i32> %data, ptr %addr) #1 {
; CHECK-LABEL: concat_i32q_512_with_512_vectors:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: mov z0.q, q0
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%splat = shufflevector <4 x i32> %data, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 0, i32 1, i32 2, i32 3,
i32 0, i32 1, i32 2, i32 3,
i32 0, i32 1, i32 2, i32 3>
store <16 x i32> %splat, ptr %addr, align 1
ret void
}

attributes #0 = { vscale_range(2,2) "target-features"="+sve,+bf16" }
attributes #1 = { vscale_range(4,4) "target-features"="+sve,+bf16" }