Skip to content

Commit 37ad5b2

Browse files
[AArch64][SME] Extend FORM_TRANSPOSED pseudos to more SME multi-vector intrinsics
All uses of REG_SEQUENCE by multiclasses contained in SMEInstrFormats.td now use the FORM_TRANSPOSED_REG_TUPLE pseudos so that they can benefit from register allocation hints. One test has been added for each multiclass changed.
1 parent 57e2713 commit 37ad5b2

File tree

8 files changed

+215
-443
lines changed

8 files changed

+215
-443
lines changed

llvm/lib/Target/AArch64/SMEInstrFormats.td

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -177,15 +177,15 @@ class SME2_ZA_TwoOp_VG4_Multi_Single_Pat<string name, SDPatternOperator intrinsi
177177
class SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ValueType vt, ComplexPattern tileslice>
178178
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm1, vt:$Zm2),
179179
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
180-
(REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1),
181-
(REG_SEQUENCE ZPR2Mul2, vt:$Zm1, zsub0, vt:$Zm2, zsub1))>;
180+
(FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1, vt:$Zn2),
181+
(FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zm1, vt:$Zm2))>;
182182

183183
class SME2_ZA_TwoOp_VG4_Multi_Multi_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ValueType vt, ComplexPattern tileslice>
184184
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
185185
vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm1, vt:$Zm2, vt:$Zm3, vt:$Zm4),
186186
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
187-
(REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
188-
(REG_SEQUENCE ZPR4Mul4, vt:$Zm1, zsub0, vt:$Zm2, zsub1, vt:$Zm3, zsub2, vt:$Zm4, zsub3))>;
187+
(FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
188+
(FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zm1, vt:$Zm2, vt:$Zm3, vt:$Zm4))>;
189189

190190
class SME2_ZA_TwoOp_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
191191
Operand imm_ty, ComplexPattern tileslice>
@@ -209,32 +209,32 @@ class SME2_ZA_TwoOp_VG4_Multi_Index_Pat<string name, SDPatternOperator intrinsic
209209

210210
class SME2_Sat_Shift_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, ValueType in_vt, Operand imm_ty>
211211
: Pat<(out_vt (intrinsic in_vt:$Zn1, in_vt:$Zn2, (i32 imm_ty:$i))),
212-
(!cast<Instruction>(name) (REG_SEQUENCE ZPR2Mul2, in_vt:$Zn1, zsub0, in_vt:$Zn2, zsub1), imm_ty:$i)>;
212+
(!cast<Instruction>(name) (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO in_vt:$Zn1, in_vt:$Zn2), imm_ty:$i)>;
213213

214214
class SME2_Sat_Shift_VG4_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, ValueType in_vt, Operand imm_ty>
215215
: Pat<(out_vt (intrinsic in_vt:$Zn1, in_vt:$Zn2, in_vt:$Zn3, in_vt:$Zn4, (i32 imm_ty:$i))),
216-
(!cast<Instruction>(name) (REG_SEQUENCE ZPR4Mul4, in_vt:$Zn1, zsub0, in_vt:$Zn2, zsub1, in_vt:$Zn3, zsub2, in_vt:$Zn4, zsub3),
216+
(!cast<Instruction>(name) (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO in_vt:$Zn1, in_vt:$Zn2, in_vt:$Zn3, in_vt:$Zn4),
217217
imm_ty:$i)>;
218218

219219
class SME2_Cvt_VG4_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, ValueType in_vt>
220220
: Pat<(out_vt (intrinsic in_vt:$Zn1, in_vt:$Zn2, in_vt:$Zn3, in_vt:$Zn4)),
221-
(!cast<Instruction>(name) (REG_SEQUENCE ZPR4Mul4, in_vt:$Zn1, zsub0, in_vt:$Zn2, zsub1, in_vt:$Zn3, zsub2, in_vt:$Zn4, zsub3))>;
221+
(!cast<Instruction>(name) (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO in_vt:$Zn1, in_vt:$Zn2, in_vt:$Zn3, in_vt:$Zn4))>;
222222

223223
class SME2_ZA_VG1x2_Multi_Pat<string name, SDPatternOperator intrinsic, ValueType vt, Operand index_ty, ComplexPattern tileslice>
224224
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2),
225-
(!cast<Instruction>(name # _PSEUDO) $base, $offset, (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1))>;
225+
(!cast<Instruction>(name # _PSEUDO) $base, $offset, (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1, vt:$Zn2))>;
226226

227227
class SME2_ZA_VG1x4_Multi_Pat<string name, SDPatternOperator intrinsic, ValueType vt, Operand index_ty, ComplexPattern tileslice>
228228
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
229-
(!cast<Instruction>(name # _PSEUDO) $base, $offset, (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3))>;
229+
(!cast<Instruction>(name # _PSEUDO) $base, $offset, (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4))>;
230230

231231
class SME2_Tile_VG2_Multi_Pat<string name, SDPatternOperator intrinsic, Operand tile_imm, ValueType vt, Operand index_ty, ComplexPattern tileslice>
232232
: Pat<(intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2),
233-
(!cast<Instruction>(name # _PSEUDO) $tile, $base, $offset, (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1))>;
233+
(!cast<Instruction>(name # _PSEUDO) $tile, $base, $offset, (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1, vt:$Zn2))>;
234234

235235
class SME2_Tile_VG4_Multi_Pat<string name, SDPatternOperator intrinsic, Operand tile_imm, ValueType vt, Operand index_ty, ComplexPattern tileslice>
236236
: Pat<(intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
237-
(!cast<Instruction>(name # _PSEUDO) $tile, $base, $offset, (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3))>;
237+
(!cast<Instruction>(name # _PSEUDO) $tile, $base, $offset, (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4))>;
238238

239239
class SME2_Zero_Matrix_Pat<string name, SDPatternOperator intrinsic, Operand offset_ty, ComplexPattern tileslice>
240240
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset))),
@@ -2446,7 +2446,7 @@ multiclass sme2_fp8_cvt_vg2_single<string mnemonic, bit op, ValueType in_vt, SDP
24462446
let Uses = [FPMR, FPCR];
24472447
}
24482448
def : Pat<(nxv16i8 (intrinsic in_vt:$Zn1, in_vt:$Zn2)),
2449-
(!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2Mul2, in_vt:$Zn1, zsub0, in_vt:$Zn2, zsub1))>;
2449+
(!cast<Instruction>(NAME) (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO in_vt:$Zn1, in_vt:$Zn2))>;
24502450
}
24512451

24522452
class sme2_cvt_unpk_vector_vg2<bits<2>sz, bits<3> op, bit u, RegisterOperand first_ty,

llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll

Lines changed: 48 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -26,60 +26,53 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
2626
; CHECK-LABEL: fcvt_x4_tuple:
2727
; CHECK: // %bb.0: // %entry
2828
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
29-
; CHECK-NEXT: addvl sp, sp, #-10
29+
; CHECK-NEXT: addvl sp, sp, #-9
3030
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
31-
; CHECK-NEXT: ptrue pn8.b
32-
; CHECK-NEXT: str z19, [sp, #1, mul vl] // 16-byte Folded Spill
33-
; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
34-
; CHECK-NEXT: str z18, [sp, #2, mul vl] // 16-byte Folded Spill
35-
; CHECK-NEXT: str z17, [sp, #3, mul vl] // 16-byte Folded Spill
36-
; CHECK-NEXT: str z16, [sp, #4, mul vl] // 16-byte Folded Spill
37-
; CHECK-NEXT: str z14, [sp, #5, mul vl] // 16-byte Folded Spill
38-
; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
39-
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 80 * VG
31+
; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
32+
; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
33+
; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
34+
; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
35+
; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
36+
; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
37+
; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
38+
; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
39+
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG
4040
; CHECK-NEXT: .cfi_offset w29, -16
4141
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
4242
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
4343
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
4444
; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
45-
; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 40 * VG
46-
; CHECK-NEXT: ptrue pn8.b
45+
; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
46+
; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
47+
; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
48+
; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
4749
; CHECK-NEXT: lsl x9, x0, #1
50+
; CHECK-NEXT: ptrue pn8.b
4851
; CHECK-NEXT: add x8, x1, x0
49-
; CHECK-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x1]
50-
; CHECK-NEXT: ld1w { z4.s - z7.s }, pn8/z, [x8]
52+
; CHECK-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x1]
53+
; CHECK-NEXT: ld1w { z1.s, z5.s, z9.s, z13.s }, pn8/z, [x8]
5154
; CHECK-NEXT: add x10, x1, x9
5255
; CHECK-NEXT: add x8, x8, x9
53-
; CHECK-NEXT: mov z8.d, z2.d
5456
; CHECK-NEXT: ld1w { z2.s, z6.s, z10.s, z14.s }, pn8/z, [x10]
55-
; CHECK-NEXT: ld1w { z16.s - z19.s }, pn8/z, [x8]
56-
; CHECK-NEXT: mov z24.d, z0.d
57-
; CHECK-NEXT: mov z28.d, z1.d
58-
; CHECK-NEXT: mov z25.d, z4.d
59-
; CHECK-NEXT: mov z29.d, z5.d
60-
; CHECK-NEXT: mov z9.d, z5.d
61-
; CHECK-NEXT: ptrue pn8.b
62-
; CHECK-NEXT: mov z26.d, z2.d
63-
; CHECK-NEXT: mov z30.d, z6.d
64-
; CHECK-NEXT: mov z27.d, z16.d
65-
; CHECK-NEXT: mov z31.d, z17.d
66-
; CHECK-NEXT: mov z11.d, z18.d
67-
; CHECK-NEXT: mov z16.d, z3.d
68-
; CHECK-NEXT: mov z17.d, z7.d
69-
; CHECK-NEXT: mov z18.d, z14.d
70-
; CHECK-NEXT: fcvt z0.b, { z24.s - z27.s }
71-
; CHECK-NEXT: fcvt z1.b, { z28.s - z31.s }
72-
; CHECK-NEXT: fcvt z2.b, { z8.s - z11.s }
73-
; CHECK-NEXT: fcvt z3.b, { z16.s - z19.s }
74-
; CHECK-NEXT: ldr z19, [sp, #1, mul vl] // 16-byte Folded Reload
75-
; CHECK-NEXT: ldr z18, [sp, #2, mul vl] // 16-byte Folded Reload
76-
; CHECK-NEXT: ldr z17, [sp, #3, mul vl] // 16-byte Folded Reload
77-
; CHECK-NEXT: ldr z16, [sp, #4, mul vl] // 16-byte Folded Reload
78-
; CHECK-NEXT: ldr z14, [sp, #5, mul vl] // 16-byte Folded Reload
79-
; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
80-
; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
57+
; CHECK-NEXT: ld1w { z3.s, z7.s, z11.s, z15.s }, pn8/z, [x8]
58+
; CHECK-NEXT: mov z24.d, z8.d
59+
; CHECK-NEXT: mov z25.d, z5.d
60+
; CHECK-NEXT: mov z26.d, z10.d
61+
; CHECK-NEXT: mov z27.d, z11.d
62+
; CHECK-NEXT: fcvt z0.b, { z0.s - z3.s }
63+
; CHECK-NEXT: fcvt z1.b, { z4.s - z7.s }
64+
; CHECK-NEXT: fcvt z2.b, { z24.s - z27.s }
65+
; CHECK-NEXT: fcvt z3.b, { z12.s - z15.s }
66+
; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
67+
; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
68+
; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
69+
; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
70+
; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
71+
; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
72+
; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
73+
; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
8174
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
82-
; CHECK-NEXT: addvl sp, sp, #10
75+
; CHECK-NEXT: addvl sp, sp, #9
8376
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
8477
; CHECK-NEXT: ret
8578
entry:
@@ -144,21 +137,24 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8> } @bfcvt_tuple(i64 %stride, ptr
144137
; CHECK-LABEL: bfcvt_tuple:
145138
; CHECK: // %bb.0: // %entry
146139
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
147-
; CHECK-NEXT: addvl sp, sp, #-1
140+
; CHECK-NEXT: addvl sp, sp, #-3
148141
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
149-
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
142+
; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
143+
; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
144+
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
150145
; CHECK-NEXT: .cfi_offset w29, -16
146+
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
147+
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
151148
; CHECK-NEXT: ptrue pn8.b
152149
; CHECK-NEXT: add x8, x1, x0
153-
; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x1]
154-
; CHECK-NEXT: ld1h { z2.h, z3.h }, pn8/z, [x8]
155-
; CHECK-NEXT: mov z4.d, z0.d
156-
; CHECK-NEXT: mov z5.d, z2.d
157-
; CHECK-NEXT: mov z2.d, z1.d
158-
; CHECK-NEXT: bfcvt z0.b, { z4.h, z5.h }
159-
; CHECK-NEXT: bfcvt z1.b, { z2.h, z3.h }
150+
; CHECK-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x1]
151+
; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x8]
152+
; CHECK-NEXT: bfcvt z0.b, { z0.h, z1.h }
153+
; CHECK-NEXT: bfcvt z1.b, { z8.h, z9.h }
154+
; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
155+
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
160156
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
161-
; CHECK-NEXT: addvl sp, sp, #1
157+
; CHECK-NEXT: addvl sp, sp, #3
162158
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
163159
; CHECK-NEXT: ret
164160
entry:

llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll

Lines changed: 14 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -247,12 +247,9 @@ define void @multi_vector_add_za_vg1x2_f64_tuple(i64 %stride, ptr %ptr) {
247247
; CHECK-NEXT: add x9, x1, x0
248248
; CHECK-NEXT: mov w8, wzr
249249
; CHECK-NEXT: ld1d { z16.d, z24.d }, pn8/z, [x1]
250-
; CHECK-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x9]
251-
; CHECK-NEXT: mov z2.d, z16.d
252-
; CHECK-NEXT: mov z3.d, z0.d
253-
; CHECK-NEXT: mov z0.d, z24.d
254-
; CHECK-NEXT: fadd za.d[w8, 0, vgx2], { z2.d, z3.d }
255-
; CHECK-NEXT: fadd za.d[w8, 0, vgx2], { z0.d, z1.d }
250+
; CHECK-NEXT: ld1d { z17.d, z25.d }, pn8/z, [x9]
251+
; CHECK-NEXT: fadd za.d[w8, 0, vgx2], { z16.d, z17.d }
252+
; CHECK-NEXT: fadd za.d[w8, 0, vgx2], { z24.d, z25.d }
256253
; CHECK-NEXT: ret
257254
entry:
258255
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -324,52 +321,20 @@ define void @multi_vector_add_za_vg1x4_f32(i32 %slice, <vscale x 4 x float> %zn0
324321
define void @multi_vector_add_za_vg1x4_f32_tuple(i64 %stride, ptr %ptr) {
325322
; CHECK-LABEL: multi_vector_add_za_vg1x4_f32_tuple:
326323
; CHECK: // %bb.0: // %entry
327-
; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
328-
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
329-
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
330-
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
331-
; CHECK-NEXT: .cfi_def_cfa_offset 64
332-
; CHECK-NEXT: .cfi_offset b8, -8
333-
; CHECK-NEXT: .cfi_offset b9, -16
334-
; CHECK-NEXT: .cfi_offset b10, -24
335-
; CHECK-NEXT: .cfi_offset b11, -32
336-
; CHECK-NEXT: .cfi_offset b12, -40
337-
; CHECK-NEXT: .cfi_offset b13, -48
338-
; CHECK-NEXT: .cfi_offset b14, -56
339-
; CHECK-NEXT: .cfi_offset b15, -64
324+
; CHECK-NEXT: lsl x9, x0, #1
325+
; CHECK-NEXT: add x10, x1, x0
340326
; CHECK-NEXT: ptrue pn8.b
341-
; CHECK-NEXT: add x9, x1, x0
342-
; CHECK-NEXT: lsl x10, x0, #1
343-
; CHECK-NEXT: ld1w { z17.s, z21.s, z25.s, z29.s }, pn8/z, [x1]
344-
; CHECK-NEXT: ld1w { z16.s, z20.s, z24.s, z28.s }, pn8/z, [x9]
327+
; CHECK-NEXT: ld1w { z16.s, z20.s, z24.s, z28.s }, pn8/z, [x1]
328+
; CHECK-NEXT: ld1w { z17.s, z21.s, z25.s, z29.s }, pn8/z, [x10]
345329
; CHECK-NEXT: mov w8, wzr
346-
; CHECK-NEXT: add x11, x1, x10
347-
; CHECK-NEXT: add x9, x9, x10
348-
; CHECK-NEXT: ld1w { z8.s - z11.s }, pn8/z, [x11]
349-
; CHECK-NEXT: mov z4.d, z17.d
350-
; CHECK-NEXT: mov z5.d, z16.d
351-
; CHECK-NEXT: ld1w { z16.s - z19.s }, pn8/z, [x9]
352-
; CHECK-NEXT: mov z0.d, z21.d
353-
; CHECK-NEXT: mov z1.d, z20.d
354-
; CHECK-NEXT: mov z12.d, z25.d
355-
; CHECK-NEXT: mov z6.d, z8.d
356-
; CHECK-NEXT: mov z2.d, z9.d
357-
; CHECK-NEXT: mov z13.d, z24.d
358-
; CHECK-NEXT: mov z7.d, z16.d
359-
; CHECK-NEXT: mov z3.d, z17.d
360-
; CHECK-NEXT: mov z14.d, z10.d
361-
; CHECK-NEXT: mov z15.d, z18.d
362-
; CHECK-NEXT: mov z16.d, z29.d
363-
; CHECK-NEXT: mov z17.d, z28.d
364-
; CHECK-NEXT: mov z18.d, z11.d
365-
; CHECK-NEXT: fadd za.s[w8, 0, vgx4], { z4.s - z7.s }
366-
; CHECK-NEXT: fadd za.s[w8, 0, vgx4], { z0.s - z3.s }
367-
; CHECK-NEXT: fadd za.s[w8, 0, vgx4], { z12.s - z15.s }
330+
; CHECK-NEXT: add x11, x1, x9
331+
; CHECK-NEXT: add x9, x10, x9
332+
; CHECK-NEXT: ld1w { z18.s, z22.s, z26.s, z30.s }, pn8/z, [x11]
333+
; CHECK-NEXT: ld1w { z19.s, z23.s, z27.s, z31.s }, pn8/z, [x9]
368334
; CHECK-NEXT: fadd za.s[w8, 0, vgx4], { z16.s - z19.s }
369-
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
370-
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
371-
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
372-
; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
335+
; CHECK-NEXT: fadd za.s[w8, 0, vgx4], { z20.s - z23.s }
336+
; CHECK-NEXT: fadd za.s[w8, 0, vgx4], { z24.s - z27.s }
337+
; CHECK-NEXT: fadd za.s[w8, 0, vgx4], { z28.s - z31.s }
373338
; CHECK-NEXT: ret
374339
entry:
375340
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()

0 commit comments

Comments (0)