Skip to content

Commit 602f436

Browse files
[AArch64] Add patterns for constructive splice. (#113912)
SVE2 adds the constructive splice instruction, which takes its two source vectors as a register tuple. Even though the register allocator must ensure that the tuple uses consecutive registers, it's likely to be more efficient than using the destructive splice instruction when the first operand is reused.
1 parent 84b7bcf commit 602f436

9 files changed

+837
-1267
lines changed

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3851,7 +3851,7 @@ let Predicates = [HasSVE2] in {
38513851

38523852
let Predicates = [HasSVE2orSME] in {
38533853
// SVE2 vector splice (constructive)
3854-
defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;
3854+
defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice", AArch64splice>;
38553855
} // End HasSVE2orSME
38563856

38573857
let Predicates = [HasSVE2] in {

llvm/lib/Target/AArch64/SVEInstrFormats.td

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7314,11 +7314,33 @@ class sve2_int_perm_splice_cons<bits<2> sz8_64, string asm,
73147314
let hasSideEffects = 0;
73157315
}
73167316

7317-
multiclass sve2_int_perm_splice_cons<string asm> {
7317+
multiclass sve2_int_perm_splice_cons<string asm, SDPatternOperator op> {
73187318
def _B : sve2_int_perm_splice_cons<0b00, asm, ZPR8, ZZ_b>;
73197319
def _H : sve2_int_perm_splice_cons<0b01, asm, ZPR16, ZZ_h>;
73207320
def _S : sve2_int_perm_splice_cons<0b10, asm, ZPR32, ZZ_s>;
73217321
def _D : sve2_int_perm_splice_cons<0b11, asm, ZPR64, ZZ_d>;
7322+
7323+
let AddedComplexity = 2 in {
7324+
foreach VT = [nxv16i8] in
7325+
def : Pat<(VT (op nxv16i1:$pred, VT:$zn1, VT:$zn2)),
7326+
(!cast<Instruction>(NAME # _B)
7327+
nxv16i1:$pred, (REG_SEQUENCE ZPR2, VT:$zn1, zsub0, VT:$zn2, zsub1))>;
7328+
7329+
foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in
7330+
def : Pat<(VT (op nxv8i1:$pred, VT:$zn1, VT:$zn2)),
7331+
(!cast<Instruction>(NAME # _H)
7332+
nxv8i1:$pred, (REG_SEQUENCE ZPR2, VT:$zn1, zsub0, VT:$zn2, zsub1))>;
7333+
7334+
foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in
7335+
def : Pat<(VT (op nxv4i1:$pred, VT:$zn1, VT:$zn2)),
7336+
(!cast<Instruction>(NAME # _S)
7337+
nxv4i1:$pred, (REG_SEQUENCE ZPR2, VT:$zn1, zsub0, VT:$zn2, zsub1))>;
7338+
7339+
foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in
7340+
def : Pat<(VT (op nxv2i1:$pred, VT:$zn1, VT:$zn2)),
7341+
(!cast<Instruction>(NAME # _D)
7342+
nxv2i1:$pred, (REG_SEQUENCE ZPR2, VT:$zn1, zsub0, VT:$zn2, zsub1))>;
7343+
}
73227344
}
73237345

73247346
class sve2_int_perm_expand<bits<2> sz, string asm,

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll

Lines changed: 50 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
3-
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
2+
; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
3+
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SME
44
; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
55

66
target triple = "aarch64-unknown-linux-gnu"
@@ -61,10 +61,10 @@ define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2) {
6161
define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2) {
6262
; CHECK-LABEL: concat_v16i8:
6363
; CHECK: // %bb.0:
64+
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
6465
; CHECK-NEXT: ptrue p0.b, vl8
65-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
66-
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
67-
; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
66+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
67+
; CHECK-NEXT: splice z0.b, p0, { z0.b, z1.b }
6868
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
6969
; CHECK-NEXT: ret
7070
;
@@ -172,10 +172,10 @@ define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2) {
172172
define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2) {
173173
; CHECK-LABEL: concat_v8i16:
174174
; CHECK: // %bb.0:
175+
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
175176
; CHECK-NEXT: ptrue p0.h, vl4
176-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
177-
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
178-
; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
177+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
178+
; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h }
179179
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
180180
; CHECK-NEXT: ret
181181
;
@@ -270,10 +270,10 @@ define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2) {
270270
define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2) {
271271
; CHECK-LABEL: concat_v4i32:
272272
; CHECK: // %bb.0:
273+
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
273274
; CHECK-NEXT: ptrue p0.s, vl2
274-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
275-
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
276-
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
275+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
276+
; CHECK-NEXT: splice z0.s, p0, { z0.s, z1.s }
277277
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
278278
; CHECK-NEXT: ret
279279
;
@@ -340,10 +340,10 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) {
340340
define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2) {
341341
; CHECK-LABEL: concat_v2i64:
342342
; CHECK: // %bb.0:
343+
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
343344
; CHECK-NEXT: ptrue p0.d, vl1
344-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
345-
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
346-
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
345+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
346+
; CHECK-NEXT: splice z0.d, p0, { z0.d, z1.d }
347347
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
348348
; CHECK-NEXT: ret
349349
;
@@ -406,17 +406,33 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) {
406406
;
407407

408408
define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) {
409-
; CHECK-LABEL: concat_v4f16:
410-
; CHECK: // %bb.0:
411-
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
412-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
413-
; CHECK-NEXT: mov z2.h, z1.h[1]
414-
; CHECK-NEXT: mov z3.h, z0.h[1]
415-
; CHECK-NEXT: zip1 z1.h, z1.h, z2.h
416-
; CHECK-NEXT: zip1 z0.h, z0.h, z3.h
417-
; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
418-
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
419-
; CHECK-NEXT: ret
409+
; SVE2-LABEL: concat_v4f16:
410+
; SVE2: // %bb.0:
411+
; SVE2-NEXT: cnth x8
412+
; SVE2-NEXT: adrp x9, .LCPI15_0
413+
; SVE2-NEXT: adrp x10, .LCPI15_1
414+
; SVE2-NEXT: mov z2.h, w8
415+
; SVE2-NEXT: ldr q3, [x9, :lo12:.LCPI15_0]
416+
; SVE2-NEXT: ldr q4, [x10, :lo12:.LCPI15_1]
417+
; SVE2-NEXT: ptrue p0.h, vl8
418+
; SVE2-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
419+
; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
420+
; SVE2-NEXT: mad z2.h, p0/m, z3.h, z4.h
421+
; SVE2-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h
422+
; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0
423+
; SVE2-NEXT: ret
424+
;
425+
; SME-LABEL: concat_v4f16:
426+
; SME: // %bb.0:
427+
; SME-NEXT: // kill: def $d1 killed $d1 def $z1
428+
; SME-NEXT: // kill: def $d0 killed $d0 def $z0
429+
; SME-NEXT: mov z2.h, z1.h[1]
430+
; SME-NEXT: mov z3.h, z0.h[1]
431+
; SME-NEXT: zip1 z1.h, z1.h, z2.h
432+
; SME-NEXT: zip1 z0.h, z0.h, z3.h
433+
; SME-NEXT: zip1 z0.s, z0.s, z1.s
434+
; SME-NEXT: // kill: def $d0 killed $d0 killed $z0
435+
; SME-NEXT: ret
420436
;
421437
; NONEON-NOSVE-LABEL: concat_v4f16:
422438
; NONEON-NOSVE: // %bb.0:
@@ -436,10 +452,10 @@ define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) {
436452
define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2) {
437453
; CHECK-LABEL: concat_v8f16:
438454
; CHECK: // %bb.0:
455+
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
439456
; CHECK-NEXT: ptrue p0.h, vl4
440-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
441-
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
442-
; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
457+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
458+
; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h }
443459
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
444460
; CHECK-NEXT: ret
445461
;
@@ -534,10 +550,10 @@ define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2) {
534550
define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2) {
535551
; CHECK-LABEL: concat_v4f32:
536552
; CHECK: // %bb.0:
553+
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
537554
; CHECK-NEXT: ptrue p0.s, vl2
538-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
539-
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
540-
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
555+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
556+
; CHECK-NEXT: splice z0.s, p0, { z0.s, z1.s }
541557
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
542558
; CHECK-NEXT: ret
543559
;
@@ -604,10 +620,10 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) {
604620
define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2) {
605621
; CHECK-LABEL: concat_v2f64:
606622
; CHECK: // %bb.0:
623+
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
607624
; CHECK-NEXT: ptrue p0.d, vl1
608-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
609-
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
610-
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
625+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
626+
; CHECK-NEXT: splice z0.d, p0, { z0.d, z1.d }
611627
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
612628
; CHECK-NEXT: ret
613629
;

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE
3-
; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
4-
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SVE2
2+
; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=SVE
3+
; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=SVE2
4+
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=SVE2
55
; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
66

77
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
@@ -842,16 +842,16 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) {
842842
;
843843
; SVE2-LABEL: test_copysign_v4f32_v4f64:
844844
; SVE2: // %bb.0:
845-
; SVE2-NEXT: ldp q0, q1, [x1]
845+
; SVE2-NEXT: ldp q1, q0, [x1]
846846
; SVE2-NEXT: ptrue p0.d
847-
; SVE2-NEXT: ldr q2, [x0]
848-
; SVE2-NEXT: fcvt z1.s, p0/m, z1.d
849847
; SVE2-NEXT: fcvt z0.s, p0/m, z0.d
848+
; SVE2-NEXT: fcvt z1.s, p0/m, z1.d
850849
; SVE2-NEXT: ptrue p0.s, vl2
851-
; SVE2-NEXT: uzp1 z1.s, z1.s, z1.s
852-
; SVE2-NEXT: uzp1 z0.s, z0.s, z0.s
853-
; SVE2-NEXT: splice z0.s, p0, z0.s, z1.s
850+
; SVE2-NEXT: uzp1 z3.s, z0.s, z0.s
851+
; SVE2-NEXT: uzp1 z2.s, z1.s, z1.s
854852
; SVE2-NEXT: mov z1.s, #0x7fffffff
853+
; SVE2-NEXT: splice z0.s, p0, { z2.s, z3.s }
854+
; SVE2-NEXT: ldr q2, [x0]
855855
; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d
856856
; SVE2-NEXT: str q2, [x0]
857857
; SVE2-NEXT: ret
@@ -1237,16 +1237,16 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) {
12371237
;
12381238
; SVE2-LABEL: test_copysign_v8f16_v8f32:
12391239
; SVE2: // %bb.0:
1240-
; SVE2-NEXT: ldp q0, q1, [x1]
1240+
; SVE2-NEXT: ldp q1, q0, [x1]
12411241
; SVE2-NEXT: ptrue p0.s
1242-
; SVE2-NEXT: ldr q2, [x0]
1243-
; SVE2-NEXT: fcvt z1.h, p0/m, z1.s
12441242
; SVE2-NEXT: fcvt z0.h, p0/m, z0.s
1243+
; SVE2-NEXT: fcvt z1.h, p0/m, z1.s
12451244
; SVE2-NEXT: ptrue p0.h, vl4
1246-
; SVE2-NEXT: uzp1 z1.h, z1.h, z1.h
1247-
; SVE2-NEXT: uzp1 z0.h, z0.h, z0.h
1248-
; SVE2-NEXT: splice z0.h, p0, z0.h, z1.h
1245+
; SVE2-NEXT: uzp1 z3.h, z0.h, z0.h
1246+
; SVE2-NEXT: uzp1 z2.h, z1.h, z1.h
12491247
; SVE2-NEXT: mov z1.h, #32767 // =0x7fff
1248+
; SVE2-NEXT: splice z0.h, p0, { z2.h, z3.h }
1249+
; SVE2-NEXT: ldr q2, [x0]
12501250
; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d
12511251
; SVE2-NEXT: str q2, [x0]
12521252
; SVE2-NEXT: ret
@@ -1349,5 +1349,3 @@ declare <8 x float> @llvm.copysign.v8f32(<8 x float> %a, <8 x float> %b) #0
13491349

13501350
declare <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b) #0
13511351
declare <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b) #0
1352-
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
1353-
; CHECK: {{.*}}

0 commit comments

Comments
 (0)