Skip to content

Commit 6704d6a

Browse files
authored
[SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (#73317)
See ARM-software/acle#217 Patch by: Hassnaa Hamdi <[email protected]>
1 parent b3f0fa8 commit 6704d6a

File tree

11 files changed

+766
-7
lines changed

11 files changed

+766
-7
lines changed

clang/include/clang/Basic/arm_sme.td

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,9 +321,18 @@ let TargetGuard = "sme2" in {
321321
let TargetGuard = "sme2" in {
322322
def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", MergeNone, "aarch64_sme_ldr_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA], [ImmCheck<0, ImmCheck0_0>]>;
323323
def SVSTR_ZT : Inst<"svstr_zt", "vi%", "", MergeNone, "aarch64_sme_str_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>]>;
324+
}
324325

325326
//
326327
// Zero ZT0
327328
//
329+
let TargetGuard = "sme2" in {
328330
def SVZERO_ZT : Inst<"svzero_zt", "vi", "", MergeNone, "aarch64_sme_zero_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA], [ImmCheck<0, ImmCheck0_0>]>;
329331
}
332+
333+
// Lookup table expand four contiguous registers
334+
//
335+
let TargetGuard = "sme2" in {
336+
def SVLUTI2_LANE_ZT_X4 : Inst<"svluti2_lane_zt_{d}_x4", "4.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
337+
def SVLUTI4_LANE_ZT_X4 : Inst<"svluti4_lane_zt_{d}_x4", "4.di[i", "sUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_1>]>;
338+
}

clang/include/clang/Basic/arm_sve_sme_incl.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@
9393
// m: uint32_t
9494
// n: uint64_t
9595

96+
// [: svuint8_t
9697
// t: svint32_t
9798
// z: svuint32_t
9899
// g: svuint64_t

clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c

Lines changed: 280 additions & 0 deletions
Large diffs are not rendered by default.

clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c

Lines changed: 233 additions & 0 deletions
Large diffs are not rendered by default.

clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,52 @@ void test_str_zt(void *base) __arm_streaming_compatible __arm_shared_za __arm_pr
2828
svstr_zt(1, base); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
2929
}
3030

31+
void test_svluti2_lane_zt_x4(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
32+
// Test Reg Offset
33+
svluti2_lane_zt_u8_x4(1, zn, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
34+
// Test index value range
35+
svluti2_lane_zt_u8_x4(0, zn, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
36+
// Test Reg Offset
37+
svluti2_lane_zt_u16_x4(1, zn, 3); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
38+
// Test index value range
39+
svluti2_lane_zt_u16_x4(0, zn, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
40+
// Test Reg Offset
41+
svluti2_lane_zt_u32_x4(1, zn, 3); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
42+
// Test index value range
43+
svluti2_lane_zt_u32_x4(0, zn, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
44+
// Test Reg Offset
45+
svluti2_lane_zt_f16_x4(1, zn, 3); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
46+
// Test index value range
47+
svluti2_lane_zt_f16_x4(0, zn, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
48+
// Test Reg Offset
49+
svluti2_lane_zt_bf16_x4(1, zn, 3); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
50+
// Test index value range
51+
svluti2_lane_zt_bf16_x4(0, zn, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
52+
// Test Reg Offset
53+
svluti2_lane_zt_f32_x4(1, zn, 3); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
54+
// Test index value range
55+
svluti2_lane_zt_f32_x4(0, zn, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
56+
}
57+
58+
void test_svluti4_lane_zt_x4(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za {
59+
// Test Reg Offset
60+
svluti4_lane_zt_u16_x4(1, zn, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
61+
// Test index value range
62+
svluti4_lane_zt_u16_x4(0, zn, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
63+
// Test Reg Offset
64+
svluti4_lane_zt_u32_x4(1, zn, 1); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
65+
// Test index value range
66+
svluti4_lane_zt_u32_x4(0, zn, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
67+
// Test Reg Offset
68+
svluti4_lane_zt_f16_x4(1, zn, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
69+
// Test index value range
70+
svluti4_lane_zt_f16_x4(0, zn, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
71+
// Test Reg Offset
72+
svluti4_lane_zt_bf16_x4(1, zn, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
73+
// Test index value range
74+
svluti4_lane_zt_bf16_x4(0, zn, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
75+
// Test Reg Offset
76+
svluti4_lane_zt_f32_x4(1, zn, 1); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
77+
// Test index value range
78+
svluti4_lane_zt_f32_x4(0, zn, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
79+
}

clang/utils/TableGen/SveEmitter.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -744,6 +744,12 @@ void SVEType::applyModifier(char Mod) {
744744
BFloat = false;
745745
ElementBitwidth = 64;
746746
break;
747+
case '[':
748+
Signed = false;
749+
Float = false;
750+
BFloat = false;
751+
ElementBitwidth = 8;
752+
break;
747753
case 't':
748754
Signed = true;
749755
Float = false;

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3548,6 +3548,18 @@ let TargetPrefix = "aarch64" in {
35483548
// Zero ZT0
35493549
//
35503550
def int_aarch64_sme_zero_zt : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrWriteMem]>;
3551+
3552+
//
3553+
// Lookup table expand four registers
3554+
//
3555+
def int_aarch64_sme_luti2_lane_zt_x4
3556+
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
3557+
[llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
3558+
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
3559+
def int_aarch64_sme_luti4_lane_zt_x4
3560+
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
3561+
[llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
3562+
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
35513563
}
35523564

35533565
// SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2

llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
327327
}
328328

329329
template <unsigned BaseReg, unsigned Max>
330-
bool ImmToTile(SDValue N, SDValue &Imm) {
330+
bool ImmToReg(SDValue N, SDValue &Imm) {
331331
if (auto *CI = dyn_cast<ConstantSDNode>(N)) {
332332
uint64_t C = CI->getZExtValue();
333333

@@ -404,6 +404,9 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
404404
return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
405405
}
406406

407+
void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
408+
uint32_t MaxImm);
409+
407410
template <unsigned MaxIdx, unsigned Scale>
408411
bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
409412
return SelectSMETileSlice(N, MaxIdx, Vector, Offset, Scale);
@@ -1864,6 +1867,34 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs,
18641867
SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode);
18651868
}
18661869

1870+
/// Select a multi-vector lookup-table (LUTI2/LUTI4) read from ZT0.
///
/// Operands read from \p Node: 0 = incoming chain (the node also produces a
/// chain result, see below), 2 = ZT register immediate, 3 = index vector,
/// 4 = lane immediate.
///
/// \param Node       The intrinsic node being selected.
/// \param NumOutVecs Number of vector results; the chain result of \p Node
///                   sits at result index \p NumOutVecs.
/// \param Opc        Machine opcode to emit.
/// \param MaxImm     Largest lane immediate accepted; if operand 4 is a
///                   constant above this, the node is left unselected.
///
/// The function also returns without selecting when operand 2 cannot be
/// mapped to ZT0 by ImmToReg.
void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
                                                unsigned NumOutVecs,
                                                unsigned Opc, uint32_t MaxImm) {
  // Reject a constant lane index that is out of range for this opcode.
  if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Node->getOperand(4)))
    if (Imm->getZExtValue() > MaxImm)
      return;

  SDValue ZtValue;
  if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
    return;

  // Thread the incoming chain into the machine node. The node is created
  // with an MVT::Other result, so without the chain as its last operand it
  // would not be ordered after earlier chained operations on ZT0.
  SDValue Chain = Node->getOperand(0);
  SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4), Chain};
  SDLoc DL(Node);
  EVT VT = Node->getValueType(0);

  // Result 0 is the untyped register tuple, result 1 is the outgoing chain.
  SDNode *Instruction =
      CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
  SDValue SuperReg = SDValue(Instruction, 0);

  // Each vector result becomes a sub-register extract of the tuple.
  for (unsigned I = 0; I < NumOutVecs; ++I)
    ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg(
                                      AArch64::zsub0 + I, DL, VT, SuperReg));

  // Copy chain
  unsigned ChainIdx = NumOutVecs;
  ReplaceUses(SDValue(Node, ChainIdx), SDValue(Instruction, 1));
  CurDAG->RemoveDeadNode(Node);
}
1897+
18671898
void AArch64DAGToDAGISel::SelectClamp(SDNode *N, unsigned NumVecs,
18681899
unsigned Op) {
18691900
SDLoc DL(N);
@@ -5072,6 +5103,23 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
50725103
MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
50735104
return;
50745105
}
5106+
case Intrinsic::aarch64_sme_luti2_lane_zt_x4: {
5107+
if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
5108+
Node->getValueType(0),
5109+
{AArch64::LUTI2_4ZTZI_B, AArch64::LUTI2_4ZTZI_H,
5110+
AArch64::LUTI2_4ZTZI_S}))
5111+
// Second Immediate must be <= 3:
5112+
SelectMultiVectorLuti(Node, 4, Opc, 3);
5113+
return;
5114+
}
5115+
case Intrinsic::aarch64_sme_luti4_lane_zt_x4: {
5116+
if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
5117+
Node->getValueType(0),
5118+
{0, AArch64::LUTI4_4ZTZI_H, AArch64::LUTI4_4ZTZI_S}))
5119+
// Second Immediate must be <= 1:
5120+
SelectMultiVectorLuti(Node, 4, Opc, 1);
5121+
return;
5122+
}
50755123
}
50765124
} break;
50775125
case ISD::INTRINSIC_WO_CHAIN: {

llvm/lib/Target/AArch64/SMEInstrFormats.td

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13-
def imm_to_tile8 : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAB0, 0>", []>;
14-
def imm_to_tile16 : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAH0, 1>", []>;
15-
def imm_to_tile32 : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAS0, 3>", []>;
16-
def imm_to_tile64 : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAD0, 7>", []>;
17-
def imm_to_tile128 : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZAQ0, 15>", []>;
18-
def imm_to_zt : ComplexPattern<i32, 1, "ImmToTile<AArch64::ZT0, 0>", []>;
13+
def imm_to_tile8 : ComplexPattern<i32, 1, "ImmToReg<AArch64::ZAB0, 0>", []>;
14+
def imm_to_tile16 : ComplexPattern<i32, 1, "ImmToReg<AArch64::ZAH0, 1>", []>;
15+
def imm_to_tile32 : ComplexPattern<i32, 1, "ImmToReg<AArch64::ZAS0, 3>", []>;
16+
def imm_to_tile64 : ComplexPattern<i32, 1, "ImmToReg<AArch64::ZAD0, 7>", []>;
17+
def imm_to_tile128 : ComplexPattern<i32, 1, "ImmToReg<AArch64::ZAQ0, 15>", []>;
18+
def imm_to_zt : ComplexPattern<i32, 1, "ImmToReg<AArch64::ZT0, 0>", []>;
1919

2020
def tileslice8 : ComplexPattern<i32 , 2, "SelectSMETileSlice<15, 1>", []>;
2121
def tileslice16 : ComplexPattern<i32 , 2, "SelectSMETileSlice<7, 1>", []>;
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s
3+
4+
; lookup table expand four registers
5+
6+
define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @luti2_i8(<vscale x 16 x i8> %x) {
7+
; CHECK-LABEL: luti2_i8:
8+
; CHECK: // %bb.0:
9+
; CHECK-NEXT: luti2 { z0.b - z3.b }, zt0, z0[3]
10+
; CHECK-NEXT: ret
11+
%res = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> %x, i32 3)
12+
ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %res
13+
}
14+
15+
define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @luti2_i16(<vscale x 16 x i8> %x) {
16+
; CHECK-LABEL: luti2_i16:
17+
; CHECK: // %bb.0:
18+
; CHECK-NEXT: luti2 { z0.h - z3.h }, zt0, z0[3]
19+
; CHECK-NEXT: ret
20+
%res = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8i16(i32 0, <vscale x 16 x i8> %x, i32 3)
21+
ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %res
22+
}
23+
24+
define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @luti2_i32(<vscale x 16 x i8> %x) {
25+
; CHECK-LABEL: luti2_i32:
26+
; CHECK: // %bb.0:
27+
; CHECK-NEXT: luti2 { z0.s - z3.s }, zt0, z0[3]
28+
; CHECK-NEXT: ret
29+
%res = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4i32(i32 0, <vscale x 16 x i8> %x, i32 3)
30+
ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>}%res
31+
}
32+
33+
define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @luti2_f16(<vscale x 16 x i8> %x) {
34+
; CHECK-LABEL: luti2_f16:
35+
; CHECK: // %bb.0:
36+
; CHECK-NEXT: luti2 { z0.h - z3.h }, zt0, z0[3]
37+
; CHECK-NEXT: ret
38+
%res = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8f16(i32 0, <vscale x 16 x i8> %x, i32 3)
39+
ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res
40+
}
41+
42+
define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @luti2_bf16(<vscale x 16 x i8> %x) {
43+
; CHECK-LABEL: luti2_bf16:
44+
; CHECK: // %bb.0:
45+
; CHECK-NEXT: luti2 { z0.h - z3.h }, zt0, z0[3]
46+
; CHECK-NEXT: ret
47+
%res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8bf16(i32 0, <vscale x 16 x i8> %x, i32 3)
48+
ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res
49+
}
50+
51+
define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @luti2_f32(<vscale x 16 x i8> %x) {
52+
; CHECK-LABEL: luti2_f32:
53+
; CHECK: // %bb.0:
54+
; CHECK-NEXT: luti2 { z0.s - z3.s }, zt0, z0[3]
55+
; CHECK-NEXT: ret
56+
%res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4f32(i32 0, <vscale x 16 x i8> %x, i32 3)
57+
ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>}%res
58+
}
59+
60+
61+
declare {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32, <vscale x 16 x i8>, i32)
62+
declare {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8i16(i32, <vscale x 16 x i8>, i32)
63+
declare {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4i32(i32, <vscale x 16 x i8>, i32)
64+
declare {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8f16(i32, <vscale x 16 x i8>, i32)
65+
declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.luti2.lane.zt.x4.nxv8bf16(i32, <vscale x 16 x i8>, i32)
66+
declare {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.luti2.lane.zt.x4.nxv4f32(i32, <vscale x 16 x i8>, i32)
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s
3+
4+
; lookup table expand four registers
5+
6+
define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @luti4_i16(<vscale x 16 x i8> %x) {
7+
; CHECK-LABEL: luti4_i16:
8+
; CHECK: // %bb.0:
9+
; CHECK-NEXT: luti4 { z0.h - z3.h }, zt0, z0[1]
10+
; CHECK-NEXT: ret
11+
%res = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32 0, <vscale x 16 x i8> %x, i32 1)
12+
ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %res
13+
}
14+
15+
define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @luti4_i32(<vscale x 16 x i8> %x) {
16+
; CHECK-LABEL: luti4_i32:
17+
; CHECK: // %bb.0:
18+
; CHECK-NEXT: luti4 { z0.s - z3.s }, zt0, z0[1]
19+
; CHECK-NEXT: ret
20+
%res = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32 0, <vscale x 16 x i8> %x, i32 1)
21+
ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %res
22+
}
23+
24+
define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @luti4_bf16(<vscale x 16 x i8> %x) {
25+
; CHECK-LABEL: luti4_bf16:
26+
; CHECK: // %bb.0:
27+
; CHECK-NEXT: luti4 { z0.h - z3.h }, zt0, z0[1]
28+
; CHECK-NEXT: ret
29+
%res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8bf16(i32 0, <vscale x 16 x i8> %x, i32 1)
30+
ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res
31+
}
32+
33+
define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @luti4_f16(<vscale x 16 x i8> %x) {
34+
; CHECK-LABEL: luti4_f16:
35+
; CHECK: // %bb.0:
36+
; CHECK-NEXT: luti4 { z0.h - z3.h }, zt0, z0[1]
37+
; CHECK-NEXT: ret
38+
%res = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8f16(i32 0, <vscale x 16 x i8> %x, i32 1)
39+
ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res
40+
}
41+
42+
define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @luti4_f32(<vscale x 16 x i8> %x) {
43+
; CHECK-LABEL: luti4_f32:
44+
; CHECK: // %bb.0:
45+
; CHECK-NEXT: luti4 { z0.s - z3.s }, zt0, z0[1]
46+
; CHECK-NEXT: ret
47+
%res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4f32(i32 0, <vscale x 16 x i8> %x, i32 1)
48+
ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res
49+
}
50+
51+
declare {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32, <vscale x 16 x i8>, i32)
52+
declare {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32, <vscale x 16 x i8>, i32)
53+
declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8bf16(i32, <vscale x 16 x i8>, i32)
54+
declare {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8f16(i32, <vscale x 16 x i8>, i32)
55+
declare {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4f32(i32, <vscale x 16 x i8>, i32)

0 commit comments

Comments
 (0)