[AArch64] Use indexed dup for 128b segmented splat #144688


Merged

merged 4 commits into llvm:main from concat-segments-to-splat on Jun 20, 2025

Conversation

huntergr-arm (Collaborator)

Matches a splat of 128b segments into a wider Z register, expressed as a concat_vectors sdnode, and generates a dup zd.q, zn.q[0] instruction.
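Concretely, drawing on one of the tests added in this PR, a fixed-length splat such as the following reaches lowering as a concat_vectors node whose operands are all the same 128-bit value, and is now emitted as a single indexed dup (shown as the mov z0.q, q0 alias in the test checks):

  ; From concat_i32q_256 in the new test file: both halves of the
  ; <8 x i32> result repeat the same 128-bit <4 x i32> segment.
  %splat = shufflevector <4 x i32> %data, <4 x i32> poison,
                         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                                    i32 0, i32 1, i32 2, i32 3>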

@llvmbot
Member

llvmbot commented Jun 18, 2025

@llvm/pr-subscribers-backend-aarch64

Author: Graham Hunter (huntergr-arm)

Changes

Matches a splat of 128b segments into a wider Z register, expressed as a concat_vectors sdnode, and generates a dup zd.q, zn.q[0] instruction.


Full diff: https://github.com/llvm/llvm-project/pull/144688.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+11)
  • (added) llvm/test/CodeGen/AArch64/sve-fixed-length-splat-segment.ll (+142)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1169efce3123f..2546a49aaed2c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29317,6 +29317,17 @@ SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
   EVT VT = Op.getValueType();
   EVT SrcVT = SrcOp1.getValueType();
 
+  // Match a splat of 128b segments that fit in a single register.
+  if (SrcVT.is128BitVector() && all_equal(Op.getNode()->op_values()) &&
+      VT.getSizeInBits() <= Subtarget->getMinSVEVectorSizeInBits()) {
+    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+    SDValue Splat =
+        DAG.getNode(AArch64ISD::DUPLANE128, DL, ContainerVT,
+                    convertToScalableVector(DAG, ContainerVT, SrcOp1),
+                    DAG.getConstant(0, DL, MVT::i64, /*isTarget=*/true));
+    return convertFromScalableVector(DAG, VT, Splat);
+  }
+
   if (NumOperands > 2) {
     SmallVector<SDValue, 4> Ops;
     EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-segment.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-segment.ll
new file mode 100644
index 0000000000000..f1ff37b640026
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-segment.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+
+;; Patterns that lower to concat_vectors where all incoming operands are the same.
+
+define void @concat_i8q_256(<16 x i8> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_i8q_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <16 x i8> %data, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+                                                                        i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <32 x i8> %splat, ptr %addr, align 1
+  ret void
+}
+
+define void @concat_i16q_256(<8 x i16> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_i16q_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <8 x i16> poison, <8 x i16> %data, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+                                                                        i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x i16> %splat, ptr %addr, align 1
+  ret void
+}
+
+define void @concat_i32q_256(<4 x i32> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_i32q_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <4 x i32> %data, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                                                                       i32 0, i32 1, i32 2, i32 3>
+  store <8 x i32> %splat, ptr %addr, align 1
+  ret void
+}
+
+define void @concat_i64q_256(<2 x i64> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_i64q_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <2 x i64> poison, <2 x i64> %data, <4 x i32> <i32 2, i32 3,
+                                                                       i32 2, i32 3>
+  store <4 x i64> %splat, ptr %addr, align 1
+  ret void
+}
+
+define void @concat_f16q_256(<8 x half> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_f16q_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <8 x half> poison, <8 x half> %data, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+                                                                          i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x half> %splat, ptr %addr, align 1
+  ret void
+}
+
+define void @concat_bf16q_256(<8 x bfloat> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_bf16q_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp q0, q0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <8 x bfloat> poison, <8 x bfloat> %data, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+                                                                              i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x bfloat> %splat, ptr %addr, align 1
+  ret void
+}
+
+define void @concat_f32q_256(<4 x float> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_f32q_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <4 x float> %data, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                                                                           i32 0, i32 1, i32 2, i32 3>
+  store <8 x float> %splat, ptr %addr, align 1
+  ret void
+}
+
+define void @concat_f64q_256(<2 x double> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_f64q_256:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <2 x double> poison, <2 x double> %data, <4 x i32> <i32 2, i32 3,
+                                                                             i32 2, i32 3>
+  store <4 x double> %splat, ptr %addr, align 1
+  ret void
+}
+
+;; Test a wider vector
+
+define void @concat_i32q_512_with_256_vectors(<4 x i32> %data, ptr %addr) #0 {
+; CHECK-LABEL: concat_i32q_512_with_256_vectors:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0, #1, mul vl]
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <4 x i32> %data, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                                                                        i32 0, i32 1, i32 2, i32 3,
+                                                                        i32 0, i32 1, i32 2, i32 3,
+                                                                        i32 0, i32 1, i32 2, i32 3>
+  store <16 x i32> %splat, ptr %addr, align 1
+  ret void
+}
+
+define void @concat_i32q_512_with_512_vectors(<4 x i32> %data, ptr %addr) #1 {
+; CHECK-LABEL: concat_i32q_512_with_512_vectors:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    str z0, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <4 x i32> %data, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                                                                        i32 0, i32 1, i32 2, i32 3,
+                                                                        i32 0, i32 1, i32 2, i32 3,
+                                                                        i32 0, i32 1, i32 2, i32 3>
+  store <16 x i32> %splat, ptr %addr, align 1
+  ret void
+}
+
+attributes #0 = { vscale_range(2,2) "target-features"="+sve,+bf16" }
+attributes #1 = { vscale_range(4,4) "target-features"="+sve,+bf16" }

@@ -29317,6 +29317,17 @@ SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
   EVT VT = Op.getValueType();
   EVT SrcVT = SrcOp1.getValueType();

+  // Match a splat of 128b segments that fit in a single register.
+  if (SrcVT.is128BitVector() && all_equal(Op.getNode()->op_values()) &&
+      VT.getSizeInBits() <= Subtarget->getMinSVEVectorSizeInBits()) {
Collaborator

nit: Is this check actually required? (if it got to the point of lowering the operation, I would think this condition is always true)

huntergr-arm (Collaborator, Author)

Apparently not. I did include a test (concat_i32q_512_with_256_vectors) to exercise this, expecting the wide concat to be split into a concat of concats and each half then turned into a duplane128 here, but it seems that legalization might kick in to split it first. This function is only called if a check with useSVEForFixedLengthVectorVT succeeds, which already excludes wider VTs. Will remove.
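For reference, a sketch of what the guard looks like with the width check dropped, assuming (per the above) that useSVEForFixedLengthVectorVT has already excluded wider VTs; the body is unchanged from the diff in this PR:

  // Match a splat of 128b segments that fit in a single register.
  // (Width check removed; useSVEForFixedLengthVectorVT gates entry here.)
  if (SrcVT.is128BitVector() && all_equal(Op.getNode()->op_values())) {
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
    SDValue Splat =
        DAG.getNode(AArch64ISD::DUPLANE128, DL, ContainerVT,
                    convertToScalableVector(DAG, ContainerVT, SrcOp1),
                    DAG.getConstant(0, DL, MVT::i64, /*isTarget=*/true));
    return convertFromScalableVector(DAG, VT, Splat);
  }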

;; Patterns that lower to concat_vectors where all incoming operands are the same.

define void @concat_i8q_256(<16 x i8> %data, ptr %addr) #0 {
; CHECK-LABEL: concat_i8q_256:
Collaborator

For the tests, could you add some cases that result in an indexed dup with an index other than 0? For example: load a 256-bit vector (which would land in a z register), then use a shufflevector that splats the top 128 bits of that vector over both the low and high halves, which should result in an indexed dup instruction with an index of 1.
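A hypothetical sketch of such a test (the function name and details are illustrative, not part of this PR):

  ; Hypothetical: with vscale_range(2,2) as in attribute #0, %vec is a
  ; 256-bit vector held in a z register; splatting its high 128-bit
  ; segment over both halves could lower to an indexed dup with index 1
  ; (mov z0.q, z0.q[1]).
  define void @splat_high_segment(ptr %in, ptr %out) #0 {
    %vec = load <8 x i32>, ptr %in, align 4
    %splat = shufflevector <8 x i32> %vec, <8 x i32> poison,
                           <8 x i32> <i32 4, i32 5, i32 6, i32 7,
                                      i32 4, i32 5, i32 6, i32 7>
    store <8 x i32> %splat, ptr %out, align 4
    ret void
  }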

huntergr-arm (Collaborator, Author)

I think that's more for a follow-up patch; this PR is focused on concat_vectors. concat_i16q_256 below does select the second segment in the shufflevector in IR, but is still lowered to concat_vectors instead of a vector_shuffle.

@huntergr-arm huntergr-arm merged commit 152d4b8 into llvm:main Jun 20, 2025
5 of 7 checks passed
@huntergr-arm huntergr-arm deleted the concat-segments-to-splat branch June 20, 2025 13:26
4 participants