Skip to content

Commit 30faf19

Browse files
authored
[SME2] Add LUTI2 and LUTI4 double Builtins and Intrinsics (#73305)
See ARM-software/acle#217. Patch by: Hassnaa Hamdi <[email protected]>
1 parent 721558a commit 30faf19

File tree

8 files changed

+640
-0
lines changed

8 files changed

+640
-0
lines changed

clang/include/clang/Basic/arm_sme.td

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,9 +330,18 @@ let TargetGuard = "sme2" in {
330330
def SVZERO_ZT : Inst<"svzero_zt", "vi", "", MergeNone, "aarch64_sme_zero_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA], [ImmCheck<0, ImmCheck0_0>]>;
331331
}
332332

333+
//
333334
// lookup table expand four contiguous registers
334335
//
335336
let TargetGuard = "sme2" in {
336337
def SVLUTI2_LANE_ZT_X4 : Inst<"svluti2_lane_zt_{d}_x4", "4.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
337338
def SVLUTI4_LANE_ZT_X4 : Inst<"svluti4_lane_zt_{d}_x4", "4.di[i", "sUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_1>]>;
338339
}
340+
341+
//
342+
// lookup table expand two contiguous registers
343+
//
344+
let TargetGuard = "sme2" in {
345+
def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x2", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>;
346+
def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
347+
}

clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c

Lines changed: 209 additions & 0 deletions
Large diffs are not rendered by default.

clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c

Lines changed: 209 additions & 0 deletions
Large diffs are not rendered by default.

clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,3 +77,57 @@ void test_svluti4_lane_zt_x4(svuint8_t zn) __arm_streaming __arm_shared_za __arm
7777
// Test index value range
7878
svluti4_lane_zt_f32_x4(0, zn, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
7979
}
80+
81+
void test_svluti2_lane_zt_x2(svuint8_t zn_u8) __arm_streaming __arm_shared_za __arm_preserves_za {
82+
// Test Reg Offset
83+
svluti2_lane_zt_u8_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
84+
// Test index value range
85+
svluti2_lane_zt_u8_x2(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
86+
// Test Reg Offset
87+
svluti2_lane_zt_u16_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
88+
// Test index value range
89+
svluti2_lane_zt_u16_x2(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
90+
// Test Reg Offset
91+
svluti2_lane_zt_u32_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
92+
// Test index value range
93+
svluti2_lane_zt_u32_x2(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
94+
// Test Reg Offset
95+
svluti2_lane_zt_f16_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
96+
// Test index value range
97+
svluti2_lane_zt_f16_x2(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
98+
// Test Reg Offset
99+
svluti2_lane_zt_bf16_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
100+
// Test index value range
101+
svluti2_lane_zt_bf16_x2(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
102+
// Test Reg Offset
103+
svluti2_lane_zt_f32_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
104+
// Test index value range
105+
svluti2_lane_zt_f32_x2(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
106+
}
107+
108+
void test_svluti4_lane_zt_x2(svuint8_t zn_u8) __arm_streaming __arm_shared_za __arm_preserves_za {
109+
// Test Reg Offset
110+
svluti4_lane_zt_u8_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
111+
// Test index value range
112+
svluti4_lane_zt_u8_x2(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
113+
// Test Reg Offset
114+
svluti4_lane_zt_u16_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
115+
// Test index value range
116+
svluti4_lane_zt_u16_x2(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
117+
// Test Reg Offset
118+
svluti4_lane_zt_u32_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
119+
// Test index value range
120+
svluti4_lane_zt_u32_x2(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
121+
// Test Reg Offset
122+
svluti4_lane_zt_f16_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
123+
// Test index value range
124+
svluti4_lane_zt_f16_x2(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
125+
// Test Reg Offset
126+
svluti4_lane_zt_bf16_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
127+
// Test index value range
128+
svluti4_lane_zt_bf16_x2(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
129+
// Test Reg Offset
130+
svluti4_lane_zt_f32_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
131+
// Test index value range
132+
svluti4_lane_zt_f32_x2(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
133+
}

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3560,6 +3560,17 @@ let TargetPrefix = "aarch64" in {
35603560
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
35613561
[llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
35623562
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
3563+
3564+
//
3565+
// Lookup table expand two registers
3566+
//
3567+
def int_aarch64_sme_luti2_lane_zt_x2
3568+
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
3569+
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
3570+
def int_aarch64_sme_luti4_lane_zt_x2
3571+
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
3572+
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
3573+
35633574
}
35643575

35653576
// SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2

llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5120,6 +5120,24 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
51205120
SelectMultiVectorLuti(Node, 4, Opc, 1);
51215121
return;
51225122
}
5123+
case Intrinsic::aarch64_sme_luti2_lane_zt_x2: {
5124+
if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
5125+
Node->getValueType(0),
5126+
{AArch64::LUTI2_2ZTZI_B, AArch64::LUTI2_2ZTZI_H,
5127+
AArch64::LUTI2_2ZTZI_S}))
5128+
// Second Immediate must be <= 7:
5129+
SelectMultiVectorLuti(Node, 2, Opc, 7);
5130+
return;
5131+
}
5132+
case Intrinsic::aarch64_sme_luti4_lane_zt_x2: {
5133+
if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
5134+
Node->getValueType(0),
5135+
{AArch64::LUTI4_2ZTZI_B, AArch64::LUTI4_2ZTZI_H,
5136+
AArch64::LUTI4_2ZTZI_S}))
5137+
// Second Immediate must be <= 3:
5138+
SelectMultiVectorLuti(Node, 2, Opc, 3);
5139+
return;
5140+
}
51235141
}
51245142
} break;
51255143
case ISD::INTRINSIC_WO_CHAIN: {
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s
3+
4+
; lookup table expand two registers
5+
6+
define {<vscale x 16 x i8>, <vscale x 16 x i8>} @luti2_i8(<vscale x 16 x i8> %x) {
7+
; CHECK-LABEL: luti2_i8:
8+
; CHECK: // %bb.0:
9+
; CHECK-NEXT: luti2 { z0.b, z1.b }, zt0, z0[7]
10+
; CHECK-NEXT: ret
11+
%res = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32 0, <vscale x 16 x i8> %x, i32 7)
12+
ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %res
13+
}
14+
15+
define {<vscale x 8 x i16>, <vscale x 8 x i16>} @luti2_i16(<vscale x 16 x i8> %x) {
16+
; CHECK-LABEL: luti2_i16:
17+
; CHECK: // %bb.0:
18+
; CHECK-NEXT: luti2 { z0.h, z1.h }, zt0, z0[7]
19+
; CHECK-NEXT: ret
20+
%res = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32 0, <vscale x 16 x i8> %x, i32 7)
21+
ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %res
22+
}
23+
24+
define {<vscale x 4 x i32>, <vscale x 4 x i32>} @luti2_i32(<vscale x 16 x i8> %x) {
25+
; CHECK-LABEL: luti2_i32:
26+
; CHECK: // %bb.0:
27+
; CHECK-NEXT: luti2 { z0.s, z1.s }, zt0, z0[7]
28+
; CHECK-NEXT: ret
29+
%res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4i32(i32 0, <vscale x 16 x i8> %x, i32 7)
30+
ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res
31+
}
32+
33+
define {<vscale x 8 x half>, <vscale x 8 x half>} @luti2_f16(<vscale x 16 x i8> %x) {
34+
; CHECK-LABEL: luti2_f16:
35+
; CHECK: // %bb.0:
36+
; CHECK-NEXT: luti2 { z0.h, z1.h }, zt0, z0[7]
37+
; CHECK-NEXT: ret
38+
%res = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8f16(i32 0, <vscale x 16 x i8> %x, i32 7)
39+
ret {<vscale x 8 x half>, <vscale x 8 x half>} %res
40+
}
41+
42+
define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @luti2_bf16(<vscale x 16 x i8> %x) {
43+
; CHECK-LABEL: luti2_bf16:
44+
; CHECK: // %bb.0:
45+
; CHECK-NEXT: luti2 { z0.h, z1.h }, zt0, z0[7]
46+
; CHECK-NEXT: ret
47+
%res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8bf16(i32 0, <vscale x 16 x i8> %x, i32 7)
48+
ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res
49+
}
50+
51+
define {<vscale x 4 x float>, <vscale x 4 x float>} @luti2_f32(<vscale x 16 x i8> %x) {
52+
; CHECK-LABEL: luti2_f32:
53+
; CHECK: // %bb.0:
54+
; CHECK-NEXT: luti2 { z0.s, z1.s }, zt0, z0[7]
55+
; CHECK-NEXT: ret
56+
%res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4f32(i32 0, <vscale x 16 x i8> %x, i32 7)
57+
ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
58+
}
59+
60+
declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32, <vscale x 16 x i8>, i32)
61+
declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32, <vscale x 16 x i8>, i32)
62+
declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4i32(i32, <vscale x 16 x i8>, i32)
63+
declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8f16(i32, <vscale x 16 x i8>, i32)
64+
declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8bf16(i32, <vscale x 16 x i8>, i32)
65+
declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4f32(i32, <vscale x 16 x i8>, i32)
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s
3+
4+
; lookup table expand two registers
5+
6+
define {<vscale x 16 x i8>, <vscale x 16 x i8>} @luti4_i8(<vscale x 16 x i8> %x) {
7+
; CHECK-LABEL: luti4_i8:
8+
; CHECK: // %bb.0:
9+
; CHECK-NEXT: luti4 { z0.b, z1.b }, zt0, z0[3]
10+
; CHECK-NEXT: ret
11+
%res = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32 0, <vscale x 16 x i8> %x, i32 3)
12+
ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %res
13+
}
14+
15+
define {<vscale x 8 x i16>, <vscale x 8 x i16>} @luti4_i16(<vscale x 16 x i8> %x) {
16+
; CHECK-LABEL: luti4_i16:
17+
; CHECK: // %bb.0:
18+
; CHECK-NEXT: luti4 { z0.h, z1.h }, zt0, z0[3]
19+
; CHECK-NEXT: ret
20+
%res = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32 0, <vscale x 16 x i8> %x, i32 3)
21+
ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %res
22+
}
23+
24+
define {<vscale x 4 x i32>, <vscale x 4 x i32>} @luti4_i32(<vscale x 16 x i8> %x) {
25+
; CHECK-LABEL: luti4_i32:
26+
; CHECK: // %bb.0:
27+
; CHECK-NEXT: luti4 { z0.s, z1.s }, zt0, z0[3]
28+
; CHECK-NEXT: ret
29+
%res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4i32(i32 0, <vscale x 16 x i8> %x, i32 3)
30+
ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res
31+
}
32+
33+
define {<vscale x 8 x half>, <vscale x 8 x half>} @luti4_f16(<vscale x 16 x i8> %x) {
34+
; CHECK-LABEL: luti4_f16:
35+
; CHECK: // %bb.0:
36+
; CHECK-NEXT: luti4 { z0.h, z1.h }, zt0, z0[3]
37+
; CHECK-NEXT: ret
38+
%res = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8f16(i32 0, <vscale x 16 x i8> %x, i32 3)
39+
ret {<vscale x 8 x half>, <vscale x 8 x half>} %res
40+
}
41+
42+
define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @luti4_bf16(<vscale x 16 x i8> %x) {
43+
; CHECK-LABEL: luti4_bf16:
44+
; CHECK: // %bb.0:
45+
; CHECK-NEXT: luti4 { z0.h, z1.h }, zt0, z0[3]
46+
; CHECK-NEXT: ret
47+
%res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8bf16(i32 0, <vscale x 16 x i8> %x, i32 3)
48+
ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res
49+
}
50+
51+
define {<vscale x 4 x float>, <vscale x 4 x float>} @luti4_f32(<vscale x 16 x i8> %x) {
52+
; CHECK-LABEL: luti4_f32:
53+
; CHECK: // %bb.0:
54+
; CHECK-NEXT: luti4 { z0.s, z1.s }, zt0, z0[3]
55+
; CHECK-NEXT: ret
56+
%res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4f32(i32 0, <vscale x 16 x i8> %x, i32 3)
57+
ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
58+
}
59+
60+
declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32, <vscale x 16 x i8>, i32)
61+
declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32, <vscale x 16 x i8>, i32)
62+
declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4i32(i32, <vscale x 16 x i8>, i32)
63+
declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8f16(i32, <vscale x 16 x i8>, i32)
64+
declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8bf16(i32, <vscale x 16 x i8>, i32)
65+
declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4f32(i32, <vscale x 16 x i8>, i32)

0 commit comments

Comments
 (0)