Commit 25daf7b
[AArch64][SME] Extend FORM_TRANSPOSED pseudos to all multi-vector intrinsics (llvm#124258)
All patterns for multi-vector intrinsics should try to use the FORM_TRANSPOSED pseudos so that they can benefit from register allocation hints when SME is available. This patch removes the post-isel hook for the pseudo and instead extends the SMEPeepholeOpt pass to replace a REG_SEQUENCE with the pseudo if the expected pattern of StridedOrContiguous copies is found. With this change, the tablegen patterns for the intrinsics can remain unchanged. One test has been added for each multiclass this affects.
1 parent 5ca136d commit 25daf7b

13 files changed, +1537 −335 lines
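
For reference, the rewrite that the extended SMEPeepholeOpt pass performs looks roughly like this (a minimal MIR sketch adapted from the comment added in SMEPeepholeOpt.cpp below; virtual register numbers are illustrative):

    %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
    %5:zpr = COPY %3.zsub0
    %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
    %8:zpr = COPY %6.zsub0
    %9:zpr2mul2 = REG_SEQUENCE %5, %subreg.zsub0, %8, %subreg.zsub1
    ; becomes
    %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5, %8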

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 0 additions & 66 deletions
@@ -8787,51 +8787,6 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
   return ZExtBool;
 }
 
-// The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the
-// input operands are copy nodes where the source register is in a
-// StridedOrContiguous class. For example:
-//
-//   %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
-//   %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
-//   %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
-//   %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
-//   %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
-//   %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
-//   %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
-//
-bool shouldUseFormStridedPseudo(MachineInstr &MI) {
-  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
-
-  assert((MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
-          MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) &&
-         "Unexpected opcode.");
-
-  MCRegister SubReg = MCRegister::NoRegister;
-  for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
-    MachineOperand &MO = MI.getOperand(I);
-    assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE");
-
-    MachineOperand *Def = MRI.getOneDef(MO.getReg());
-    if (!Def || !Def->getParent()->isCopy())
-      return false;
-
-    const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
-    unsigned OpSubReg = CopySrc.getSubReg();
-    if (SubReg == MCRegister::NoRegister)
-      SubReg = OpSubReg;
-
-    MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
-    const TargetRegisterClass *CopySrcClass =
-        MRI.getRegClass(CopySrcOp->getReg());
-    if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
-        (CopySrcClass != &AArch64::ZPR2StridedOrContiguousRegClass &&
-         CopySrcClass != &AArch64::ZPR4StridedOrContiguousRegClass))
-      return false;
-  }
-
-  return true;
-}
-
 void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                           SDNode *Node) const {
   // Live-in physreg copies that are glued to SMSTART are applied as
@@ -8857,27 +8812,6 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
     }
   }
 
-  if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
-      MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
-    // If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies
-    // from a StridedOrContiguous class, fall back on REG_SEQUENCE node.
-    if (shouldUseFormStridedPseudo(MI))
-      return;
-
-    const TargetInstrInfo *TII = Subtarget->getInstrInfo();
-    MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
-                                      TII->get(TargetOpcode::REG_SEQUENCE),
-                                      MI.getOperand(0).getReg());
-
-    for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
-      MIB.add(MI.getOperand(I));
-      MIB.addImm(AArch64::zsub0 + (I - 1));
-    }
-
-    MI.eraseFromParent();
-    return;
-  }
-
   // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
   // have nothing to do with VG, were it not that they are used to materialise a
   // frame-address. If they contain a frame-index to a scalable vector, this

llvm/lib/Target/AArch64/SMEInstrFormats.td

Lines changed: 13 additions & 14 deletions
@@ -36,27 +36,26 @@ let WantsRoot = true in
 def am_sme_indexed_b4 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0, 15>">;
 
 // The FORM_TRANSPOSED_REG_TUPLE pseudos defined below are intended to
-// improve register allocation for intrinsics which use strided and contiguous
-// multi-vector registers, avoiding unnecessary copies.
-// If the operands of the pseudo are copies where the source register is in
-// the StridedOrContiguous class, the pseudo is used to provide a hint to the
-// register allocator suggesting a contigious multi-vector register which
-// matches the subregister sequence used by the operands.
-// If the operands do not match this pattern, the pseudos are expanded
-// to a REG_SEQUENCE using the post-isel hook.
+// improve register allocation for intrinsics which use strided and
+// contiguous multi-vector registers, avoiding unnecessary copies.
+// The SMEPeepholeOpt pass will replace a REG_SEQUENCE instruction with the
+// FORM_TRANSPOSED_REG_TUPLE pseudo if the operands are copies where the
+// source register is in the StridedOrContiguous class. The operands in the
+// sequence must all have the same subreg index.
+// The pseudo is then used to provide a hint to the register allocator
+// suggesting a contigious multi-vector register which matches the
+// subregister sequence used by the operands.
 
 def FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO :
   Pseudo<(outs ZPR2:$tup),
          (ins ZPR:$zn0, ZPR:$zn1), []>, Sched<[]>{
   let hasSideEffects = 0;
-  let hasPostISelHook = 1;
 }
 
 def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO :
   Pseudo<(outs ZPR4:$tup),
          (ins ZPR:$zn0, ZPR:$zn1, ZPR:$zn2, ZPR:$zn3), []>, Sched<[]>{
   let hasSideEffects = 0;
-  let hasPostISelHook = 1;
 }
 
 def SPILL_PPR_TO_ZPR_SLOT_PSEUDO :
@@ -178,14 +177,14 @@ class SME2_ZA_TwoOp_Multi_Single_Pat<string name, SDPatternOperator intrinsic, O
 class SME2_ZA_TwoOp_VG2_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty,
                                          ValueType vt, ComplexPattern tileslice>
     : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm),
-          (!cast<Instruction>(name # _PSEUDO) $base, $offset, (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1, vt:$Zn2),
+          (!cast<Instruction>(name # _PSEUDO) $base, $offset, (REG_SEQUENCE ZPR2, vt:$Zn1, zsub0, vt:$Zn2, zsub1),
           zpr_ty:$Zm)>;
 class SME2_ZA_TwoOp_VG4_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty,
                                          ValueType vt, ComplexPattern tileslice>
     : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
                      vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm),
           (!cast<Instruction>(name # _PSEUDO) $base, $offset,
-           (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
+           (REG_SEQUENCE ZPR4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
           zpr_ty:$Zm)>;
 
 class SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ValueType vt, ComplexPattern tileslice>
@@ -211,14 +210,14 @@ class SME2_ZA_TwoOp_VG2_Multi_Index_Pat<string name, SDPatternOperator intrinsic
                                         Operand imm_ty, ComplexPattern tileslice>
     : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
           (!cast<Instruction>(name # _PSEUDO) $base, $offset,
-           (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1,vt:$Zn2), zpr_ty:$Zm, imm_ty:$i)>;
+           (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm, imm_ty:$i)>;
 
 class SME2_ZA_TwoOp_VG4_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
                                         Operand imm_ty, ComplexPattern tileslice>
     : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
                      vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)),
           (!cast<Instruction>(name # _PSEUDO) $base, $offset,
-           (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
+           (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
           zpr_ty:$Zm, imm_ty:$i)>;
 
 class SME2_Sat_Shift_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, ValueType in_vt, Operand imm_ty>
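
With this change the VG2/VG4 patterns above emit a plain REG_SEQUENCE at instruction selection time rather than the pseudo directly. A hedged sketch of the MIR a VG2 indexed intrinsic would now select into (the ZA pseudo placeholder and register names are illustrative, not taken from the patch):

    %zn1:zpr = ...
    %zn2:zpr = ...
    %tup:zpr2mul2 = REG_SEQUENCE %zn1, %subreg.zsub0, %zn2, %subreg.zsub1
    <ZA_pseudo> $base, $offset, %tup, %zm, imm

The REG_SEQUENCE is only converted to FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO later, by the SMEPeepholeOpt change below, when its inputs are copies from a StridedOrContiguous register.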

llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp

Lines changed: 82 additions & 0 deletions
@@ -45,6 +45,7 @@ struct SMEPeepholeOpt : public MachineFunctionPass {
 
   bool optimizeStartStopPairs(MachineBasicBlock &MBB,
                               bool &HasRemovedAllSMChanges) const;
+  bool visitRegSequence(MachineInstr &MI);
 };
 
 char SMEPeepholeOpt::ID = 0;
@@ -225,6 +226,81 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
   return Changed;
 }
 
+// Using the FORM_TRANSPOSED_REG_TUPLE pseudo can improve register allocation
+// of multi-vector intrinsics. However, the psuedo should only be emitted if
+// the input registers of the REG_SEQUENCE are copy nodes where the source
+// register is in a StridedOrContiguous class. For example:
+//
+//   %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
+//   %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
+//   %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
+//   %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
+//   %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
+//   %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
+//   %9:zpr2mul2 = REG_SEQUENCE %5:zpr, %subreg.zsub0, %8:zpr, %subreg.zsub1
+//
+// -> %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
+//
+bool SMEPeepholeOpt::visitRegSequence(MachineInstr &MI) {
+  assert(MI.getMF()->getRegInfo().isSSA() && "Expected to be run on SSA form!");
+
+  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+  switch (MRI.getRegClass(MI.getOperand(0).getReg())->getID()) {
+  case AArch64::ZPR2RegClassID:
+  case AArch64::ZPR4RegClassID:
+  case AArch64::ZPR2Mul2RegClassID:
+  case AArch64::ZPR4Mul4RegClassID:
+    break;
+  default:
+    return false;
+  }
+
+  // The first operand is the register class created by the REG_SEQUENCE.
+  // Each operand pair after this consists of a vreg + subreg index, so
+  // for example a sequence of 2 registers will have a total of 5 operands.
+  if (MI.getNumOperands() != 5 && MI.getNumOperands() != 9)
+    return false;
+
+  MCRegister SubReg = MCRegister::NoRegister;
+  for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
+    MachineOperand &MO = MI.getOperand(I);
+
+    MachineOperand *Def = MRI.getOneDef(MO.getReg());
+    if (!Def || !Def->getParent()->isCopy())
+      return false;
+
+    const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
+    unsigned OpSubReg = CopySrc.getSubReg();
+    if (SubReg == MCRegister::NoRegister)
+      SubReg = OpSubReg;
+
+    MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
+    if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
+        CopySrcOp->getReg().isPhysical())
+      return false;
+
+    const TargetRegisterClass *CopySrcClass =
+        MRI.getRegClass(CopySrcOp->getReg());
+    if (CopySrcClass != &AArch64::ZPR2StridedOrContiguousRegClass &&
+        CopySrcClass != &AArch64::ZPR4StridedOrContiguousRegClass)
+      return false;
+  }
+
+  unsigned Opc = MI.getNumOperands() == 5
+                     ? AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO
+                     : AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO;
+
+  const TargetInstrInfo *TII =
+      MI.getMF()->getSubtarget<AArch64Subtarget>().getInstrInfo();
+  MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+                                    TII->get(Opc), MI.getOperand(0).getReg());
+  for (unsigned I = 1; I < MI.getNumOperands(); I += 2)
+    MIB.addReg(MI.getOperand(I).getReg());
+
+  MI.eraseFromParent();
+  return true;
+}
+
 INITIALIZE_PASS(SMEPeepholeOpt, "aarch64-sme-peephole-opt",
                 "SME Peephole Optimization", false, false)
 
@@ -247,6 +323,12 @@ bool SMEPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
     bool BlockHasAllSMChangesRemoved;
     Changed |= optimizeStartStopPairs(MBB, BlockHasAllSMChangesRemoved);
    FunctionHasAllSMChangesRemoved |= BlockHasAllSMChangesRemoved;
+
+    if (MF.getSubtarget<AArch64Subtarget>().isStreaming()) {
+      for (MachineInstr &MI : make_early_inc_range(MBB))
+        if (MI.getOpcode() == AArch64::REG_SEQUENCE)
+          Changed |= visitRegSequence(MI);
+    }
   }
 
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
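
The four-register case follows the same shape: a REG_SEQUENCE with nine operands (one def plus four vreg/subreg pairs) whose inputs are copies out of a ZPR4StridedOrContiguous register is rewritten to the X4 pseudo. A hedged sketch (register names illustrative):

    %t:zpr4mul4 = REG_SEQUENCE %a, %subreg.zsub0, %b, %subreg.zsub1, %c, %subreg.zsub2, %d, %subreg.zsub3
    ; becomes
    %t:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %a, %b, %c, %d

Note the rewrite only runs inside streaming functions, per the isStreaming() check in runOnMachineFunction above.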
Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sme2,+fp8 -enable-subreg-liveness --force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @cvtn_f16_tuple(i64 %stride, ptr %ptr) {
+; CHECK-LABEL: cvtn_f16_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x8, x1, x0
+; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x1]
+; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x8]
+; CHECK-NEXT: fcvtn z0.b, { z2.h, z3.h }
+; CHECK-NEXT: fcvtn z1.b, { z10.h, z11.h }
+; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %1, 0
+  %3 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %1, 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %4 = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %5 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %4, 0
+  %6 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %4, 1
+  %res1 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8f16(<vscale x 8 x half> %2, <vscale x 8 x half> %5)
+  %res2 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8f16(<vscale x 8 x half> %3, <vscale x 8 x half> %6)
+  %ins1 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> %res1, 0
+  %ins2 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ins1, <vscale x 16 x i8> %res2, 1
+  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %ins2
+}
+
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @cvtnt_f32_tuple(i64 %stride, ptr %ptr, <vscale x 16 x i8> %d) {
+; CHECK-LABEL: cvtnt_f32_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x8, x1, x0
+; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: ld1w { z2.s, z10.s }, pn8/z, [x1]
+; CHECK-NEXT: ld1w { z3.s, z11.s }, pn8/z, [x8]
+; CHECK-NEXT: fcvtnt z0.b, { z2.s, z3.s }
+; CHECK-NEXT: fcvtnt z1.b, { z10.s, z11.s }
+; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+  %1 = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %0, ptr %ptr)
+  %2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %1, 0
+  %3 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %1, 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+  %4 = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx2)
+  %5 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %4, 0
+  %6 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %4, 1
+  %res1 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnt.nxv4f32(<vscale x 16 x i8> %d, <vscale x 4 x float> %2, <vscale x 4 x float> %5)
+  %res2 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnt.nxv4f32(<vscale x 16 x i8> %d, <vscale x 4 x float> %3, <vscale x 4 x float> %6)
+  %ins1 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> %res1, 0
+  %ins2 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ins1, <vscale x 16 x i8> %res2, 1
+  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %ins2
+}
