[AArch64][SME] Extend FORM_TRANSPOSED pseudos to all multi-vector intrinsics #124258

Merged: 7 commits, Feb 4, 2025
66 changes: 0 additions & 66 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8787,51 +8787,6 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
return ZExtBool;
}

// The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the
// input operands are copy nodes where the source register is in a
// StridedOrContiguous class. For example:
//
// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
// %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
//
bool shouldUseFormStridedPseudo(MachineInstr &MI) {
MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();

assert((MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) &&
"Unexpected opcode.");

MCRegister SubReg = MCRegister::NoRegister;
for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
MachineOperand &MO = MI.getOperand(I);
assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE");

MachineOperand *Def = MRI.getOneDef(MO.getReg());
if (!Def || !Def->getParent()->isCopy())
return false;

const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
unsigned OpSubReg = CopySrc.getSubReg();
if (SubReg == MCRegister::NoRegister)
SubReg = OpSubReg;

MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
const TargetRegisterClass *CopySrcClass =
MRI.getRegClass(CopySrcOp->getReg());
if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
(CopySrcClass != &AArch64::ZPR2StridedOrContiguousRegClass &&
CopySrcClass != &AArch64::ZPR4StridedOrContiguousRegClass))
return false;
}

return true;
}

void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
// Live-in physreg copies that are glued to SMSTART are applied as
@@ -8857,27 +8812,6 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
}
}

if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
// If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies
// from a StridedOrContiguous class, fall back on REG_SEQUENCE node.
if (shouldUseFormStridedPseudo(MI))
return;

const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
TII->get(TargetOpcode::REG_SEQUENCE),
MI.getOperand(0).getReg());

for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
MIB.add(MI.getOperand(I));
MIB.addImm(AArch64::zsub0 + (I - 1));
}

MI.eraseFromParent();
return;
}

// Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
// have nothing to do with VG, were it not that they are used to materialise a
// frame-address. If they contain a frame-index to a scalable vector, this
27 changes: 13 additions & 14 deletions llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -36,27 +36,26 @@ let WantsRoot = true in
def am_sme_indexed_b4 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0, 15>">;

// The FORM_TRANSPOSED_REG_TUPLE pseudos defined below are intended to
// improve register allocation for intrinsics which use strided and contiguous
// multi-vector registers, avoiding unnecessary copies.
// If the operands of the pseudo are copies where the source register is in
// the StridedOrContiguous class, the pseudo is used to provide a hint to the
// register allocator suggesting a contiguous multi-vector register which
// matches the subregister sequence used by the operands.
// If the operands do not match this pattern, the pseudos are expanded
// to a REG_SEQUENCE using the post-isel hook.
// improve register allocation for intrinsics which use strided and
// contiguous multi-vector registers, avoiding unnecessary copies.
// The SMEPeepholeOpt pass will replace a REG_SEQUENCE instruction with the
// FORM_TRANSPOSED_REG_TUPLE pseudo if the operands are copies where the
// source register is in the StridedOrContiguous class. The operands in the
// sequence must all have the same subreg index.
// The pseudo is then used to provide a hint to the register allocator
// suggesting a contiguous multi-vector register which matches the
// subregister sequence used by the operands.

def FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO :
Pseudo<(outs ZPR2:$tup),
(ins ZPR:$zn0, ZPR:$zn1), []>, Sched<[]>{
let hasSideEffects = 0;
let hasPostISelHook = 1;
}

def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO :
Pseudo<(outs ZPR4:$tup),
(ins ZPR:$zn0, ZPR:$zn1, ZPR:$zn2, ZPR:$zn3), []>, Sched<[]>{
let hasSideEffects = 0;
let hasPostISelHook = 1;
}
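As a concrete illustration of the comment above, this is roughly the rewrite SMEPeepholeOpt performs for the two-vector case (a hand-written MIR sketch; the load opcodes and virtual register numbers are illustrative, borrowed from the example later in this patch):

  %1:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
  %2:zpr = COPY %1.zsub0
  %3:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
  %4:zpr = COPY %3.zsub0
  %5:zpr2mul2 = REG_SEQUENCE %2, %subreg.zsub0, %4, %subreg.zsub1

becomes

  %5:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %2, %4

Both REG_SEQUENCE operands are copies of the same subregister index (zsub0 here) from registers in a StridedOrContiguous class, which is what the peephole checks for before substituting the pseudo.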

def SPILL_PPR_TO_ZPR_SLOT_PSEUDO :
@@ -178,14 +177,14 @@ class SME2_ZA_TwoOp_Multi_Single_Pat<string name, SDPatternOperator intrinsic, O
class SME2_ZA_TwoOp_VG2_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty,
ValueType vt, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm),
(!cast<Instruction>(name # _PSEUDO) $base, $offset, (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1, vt:$Zn2),
(!cast<Instruction>(name # _PSEUDO) $base, $offset, (REG_SEQUENCE ZPR2, vt:$Zn1, zsub0, vt:$Zn2, zsub1),
zpr_ty:$Zm)>;
class SME2_ZA_TwoOp_VG4_Multi_Single_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty,
ValueType vt, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
(FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
(REG_SEQUENCE ZPR4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
zpr_ty:$Zm)>;

class SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ValueType vt, ComplexPattern tileslice>
@@ -211,14 +210,14 @@ class SME2_ZA_TwoOp_VG2_Multi_Index_Pat<string name, SDPatternOperator intrinsic
Operand imm_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
(FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1,vt:$Zn2), zpr_ty:$Zm, imm_ty:$i)>;
(REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm, imm_ty:$i)>;

class SME2_ZA_TwoOp_VG4_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
Operand imm_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
(FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
(REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
zpr_ty:$Zm, imm_ty:$i)>;

class SME2_Sat_Shift_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, ValueType in_vt, Operand imm_ty>
82 changes: 82 additions & 0 deletions llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
@@ -45,6 +45,7 @@ struct SMEPeepholeOpt : public MachineFunctionPass {

bool optimizeStartStopPairs(MachineBasicBlock &MBB,
bool &HasRemovedAllSMChanges) const;
bool visitRegSequence(MachineInstr &MI);
};

char SMEPeepholeOpt::ID = 0;
@@ -225,6 +226,81 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
return Changed;
}

// Using the FORM_TRANSPOSED_REG_TUPLE pseudo can improve register allocation
// of multi-vector intrinsics. However, the pseudo should only be emitted if
// the input registers of the REG_SEQUENCE are copy nodes where the source
// register is in a StridedOrContiguous class. For example:
//
// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
// %9:zpr2mul2 = REG_SEQUENCE %5:zpr, %subreg.zsub0, %8:zpr, %subreg.zsub1
//
// -> %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
//
bool SMEPeepholeOpt::visitRegSequence(MachineInstr &MI) {
assert(MI.getMF()->getRegInfo().isSSA() && "Expected to be run on SSA form!");

MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
switch (MRI.getRegClass(MI.getOperand(0).getReg())->getID()) {
case AArch64::ZPR2RegClassID:
case AArch64::ZPR4RegClassID:
case AArch64::ZPR2Mul2RegClassID:
case AArch64::ZPR4Mul4RegClassID:
break;
default:
return false;
}

// The first operand is the register class created by the REG_SEQUENCE.
// Each operand pair after this consists of a vreg + subreg index, so
// for example a sequence of 2 registers will have a total of 5 operands.
if (MI.getNumOperands() != 5 && MI.getNumOperands() != 9)
return false;

MCRegister SubReg = MCRegister::NoRegister;
for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
MachineOperand &MO = MI.getOperand(I);

MachineOperand *Def = MRI.getOneDef(MO.getReg());
if (!Def || !Def->getParent()->isCopy())
return false;

const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
unsigned OpSubReg = CopySrc.getSubReg();
if (SubReg == MCRegister::NoRegister)
SubReg = OpSubReg;

MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
[Inline review comment, Collaborator] nit: This only returns a value when the MIR is in SSA form. There is an assert for this in runOnMachineFunction, but maybe it's worth adding it to this function too in case that one accidentally gets removed?

if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
CopySrcOp->getReg().isPhysical())
return false;

const TargetRegisterClass *CopySrcClass =
MRI.getRegClass(CopySrcOp->getReg());
if (CopySrcClass != &AArch64::ZPR2StridedOrContiguousRegClass &&
CopySrcClass != &AArch64::ZPR4StridedOrContiguousRegClass)
return false;
}

unsigned Opc = MI.getNumOperands() == 5
? AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO
: AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO;

const TargetInstrInfo *TII =
MI.getMF()->getSubtarget<AArch64Subtarget>().getInstrInfo();
MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
TII->get(Opc), MI.getOperand(0).getReg());
for (unsigned I = 1; I < MI.getNumOperands(); I += 2)
MIB.addReg(MI.getOperand(I).getReg());

MI.eraseFromParent();
return true;
}
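For completeness, a qualifying input for the four-vector form (the 9-operand case checked above) would look like the following hand-written sketch; the load opcode and register numbers are illustrative, and each operand is a copy of the same subregister index taken from a different ZPR4StridedOrContiguous register:

  %1:zpr4stridedorcontiguous = LD1B_4Z_IMM_PSEUDO ..
  %2:zpr = COPY %1.zsub0
  %3:zpr4stridedorcontiguous = LD1B_4Z_IMM_PSEUDO ..
  %4:zpr = COPY %3.zsub0
  %5:zpr4stridedorcontiguous = LD1B_4Z_IMM_PSEUDO ..
  %6:zpr = COPY %5.zsub0
  %7:zpr4stridedorcontiguous = LD1B_4Z_IMM_PSEUDO ..
  %8:zpr = COPY %7.zsub0
  %9:zpr4mul4 = REG_SEQUENCE %2, %subreg.zsub0, %4, %subreg.zsub1, %6, %subreg.zsub2, %8, %subreg.zsub3

which visitRegSequence would rewrite to

  %9:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %2, %4, %6, %8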

INITIALIZE_PASS(SMEPeepholeOpt, "aarch64-sme-peephole-opt",
"SME Peephole Optimization", false, false)

@@ -247,6 +323,12 @@ bool SMEPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
bool BlockHasAllSMChangesRemoved;
Changed |= optimizeStartStopPairs(MBB, BlockHasAllSMChangesRemoved);
FunctionHasAllSMChangesRemoved |= BlockHasAllSMChangesRemoved;

if (MF.getSubtarget<AArch64Subtarget>().isStreaming()) {
for (MachineInstr &MI : make_early_inc_range(MBB))
if (MI.getOpcode() == AArch64::REG_SEQUENCE)
Changed |= visitRegSequence(MI);
}
}

AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
86 changes: 86 additions & 0 deletions llvm/test/CodeGen/AArch64/fp8-sme2-cvtn.ll
@@ -0,0 +1,86 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mattr=+sme2,+fp8 -enable-subreg-liveness --force-streaming < %s | FileCheck %s

target triple = "aarch64-linux"

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @cvtn_f16_tuple(i64 %stride, ptr %ptr) {
; CHECK-LABEL: cvtn_f16_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 8 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 16 * VG
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: add x8, x1, x0
; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x1]
; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x8]
; CHECK-NEXT: fcvtn z0.b, { z2.h, z3.h }
; CHECK-NEXT: fcvtn z1.b, { z10.h, z11.h }
; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %1, 0
%3 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %1, 1
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%4 = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %4, 0
%6 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %4, 1
%res1 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8f16(<vscale x 8 x half> %2, <vscale x 8 x half> %5)
%res2 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8f16(<vscale x 8 x half> %3, <vscale x 8 x half> %6)
%ins1 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> %res1, 0
%ins2 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ins1, <vscale x 16 x i8> %res2, 1
ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %ins2
}


define { <vscale x 16 x i8>, <vscale x 16 x i8> } @cvtnt_f32_tuple(i64 %stride, ptr %ptr, <vscale x 16 x i8> %d) {
; CHECK-LABEL: cvtnt_f32_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 8 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 16 * VG
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: add x8, x1, x0
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: ld1w { z2.s, z10.s }, pn8/z, [x1]
; CHECK-NEXT: ld1w { z3.s, z11.s }, pn8/z, [x8]
; CHECK-NEXT: fcvtnt z0.b, { z2.s, z3.s }
; CHECK-NEXT: fcvtnt z1.b, { z10.s, z11.s }
; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %1, 0
%3 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %1, 1
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%4 = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %4, 0
%6 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %4, 1
%res1 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnt.nxv4f32(<vscale x 16 x i8> %d, <vscale x 4 x float> %2, <vscale x 4 x float> %5)
%res2 = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnt.nxv4f32(<vscale x 16 x i8> %d, <vscale x 4 x float> %3, <vscale x 4 x float> %6)
%ins1 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } poison, <vscale x 16 x i8> %res1, 0
%ins2 = insertvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ins1, <vscale x 16 x i8> %res2, 1
ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %ins2
}