Skip to content

[SME2] Add LUTI2 and LUTI4 double Builtins and Intrinsics #73305

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Dec 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions clang/include/clang/Basic/arm_sme.td
Original file line number Diff line number Diff line change
Expand Up @@ -330,9 +330,18 @@ let TargetGuard = "sme2" in {
def SVZERO_ZT : Inst<"svzero_zt", "vi", "", MergeNone, "aarch64_sme_zero_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA], [ImmCheck<0, ImmCheck0_0>]>;
}

//
// Lookup table expand (LUTI2 / LUTI4), four contiguous registers.
//
// Immediate checks: argument 0 only accepts the value 0; argument 2 (the lane
// index) is limited to [0, 3] for LUTI2 and [0, 1] for LUTI4.
// NOTE(review): the LUTI4 type list omits the leading "cUc" entries that the
// LUTI2 list has -- presumably the 8-bit element types; confirm against the ACLE.
//
let TargetGuard = "sme2" in {
def SVLUTI2_LANE_ZT_X4 : Inst<"svluti2_lane_zt_{d}_x4", "4.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
def SVLUTI4_LANE_ZT_X4 : Inst<"svluti4_lane_zt_{d}_x4", "4.di[i", "sUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_1>]>;
}

//
// Lookup table expand (LUTI2 / LUTI4), two contiguous registers.
//
// Immediate checks: argument 0 only accepts the value 0; argument 2 (the lane
// index) is limited to [0, 7] for LUTI2 and [0, 3] for LUTI4.
//
let TargetGuard = "sme2" in {
def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x2", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>;
def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
}

Large diffs are not rendered by default.

Large diffs are not rendered by default.

54 changes: 54 additions & 0 deletions clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,57 @@ void test_svluti4_lane_zt_x4(svuint8_t zn) __arm_streaming __arm_shared_za __arm
// Test index value range
svluti4_lane_zt_f32_x4(0, zn, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
}

// Immediate-range checks for the svluti2_lane_zt_*_x2 intrinsics. For each
// element-type suffix, one call passes an out-of-range first argument (only 0
// is accepted) and one call passes an out-of-range lane index (valid range is
// [0, 7]); each call must produce the range diagnostic noted on its line.
// NOTE(review): only the u8/u16/u32/f16/bf16/f32 suffixes are exercised here;
// signed-integer variants are presumably generated too -- consider covering them.
void test_svluti2_lane_zt_x2(svuint8_t zn_u8) __arm_streaming __arm_shared_za __arm_preserves_za {
// Test Reg Offset
svluti2_lane_zt_u8_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
// Test index value range
svluti2_lane_zt_u8_x2(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
// Test Reg Offset
svluti2_lane_zt_u16_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
// Test index value range
svluti2_lane_zt_u16_x2(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
// Test Reg Offset
svluti2_lane_zt_u32_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
// Test index value range
svluti2_lane_zt_u32_x2(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
// Test Reg Offset
svluti2_lane_zt_f16_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
// Test index value range
svluti2_lane_zt_f16_x2(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
// Test Reg Offset
svluti2_lane_zt_bf16_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
// Test index value range
svluti2_lane_zt_bf16_x2(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
// Test Reg Offset
svluti2_lane_zt_f32_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
// Test index value range
svluti2_lane_zt_f32_x2(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
}

// Immediate-range checks for the svluti4_lane_zt_*_x2 intrinsics. For each
// element-type suffix, one call passes an out-of-range first argument (only 0
// is accepted) and one call passes an out-of-range lane index (valid range is
// [0, 3]); each call must produce the range diagnostic noted on its line.
// NOTE(review): only the u8/u16/u32/f16/bf16/f32 suffixes are exercised here;
// signed-integer variants are presumably generated too -- consider covering them.
void test_svluti4_lane_zt_x2(svuint8_t zn_u8) __arm_streaming __arm_shared_za __arm_preserves_za {
// Test Reg Offset
svluti4_lane_zt_u8_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
// Test index value range
svluti4_lane_zt_u8_x2(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
// Test Reg Offset
svluti4_lane_zt_u16_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
// Test index value range
svluti4_lane_zt_u16_x2(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
// Test Reg Offset
svluti4_lane_zt_u32_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
// Test index value range
svluti4_lane_zt_u32_x2(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
// Test Reg Offset
svluti4_lane_zt_f16_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
// Test index value range
svluti4_lane_zt_f16_x2(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
// Test Reg Offset
svluti4_lane_zt_bf16_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
// Test index value range
svluti4_lane_zt_bf16_x2(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
// Test Reg Offset
svluti4_lane_zt_f32_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
// Test index value range
svluti4_lane_zt_f32_x2(0, zn_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
}
11 changes: 11 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAArch64.td
Original file line number Diff line number Diff line change
Expand Up @@ -3560,6 +3560,17 @@ let TargetPrefix = "aarch64" in {
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;

//
// Lookup table expand two registers
//
// Both intrinsics return two vectors of the same overloaded vector type and
// take (i32, nxv16i8, i32) operands; operands 0 and 2 must be immediates.
// Both are marked IntrReadMem.
//
def int_aarch64_sme_luti2_lane_zt_x2
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
def int_aarch64_sme_luti4_lane_zt_x2
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;

}

// SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2
Expand Down
18 changes: 18 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5120,6 +5120,24 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectMultiVectorLuti(Node, 4, Opc, 1);
return;
}
case Intrinsic::aarch64_sme_luti2_lane_zt_x2: {
  // Pick the B/H/S flavour of the two-vector LUTI2 from the value type of
  // result 0. When no opcode is found, return without selecting anything.
  auto Opcode = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
      Node->getValueType(0),
      {AArch64::LUTI2_2ZTZI_B, AArch64::LUTI2_2ZTZI_H, AArch64::LUTI2_2ZTZI_S});
  if (Opcode) {
    // Second Immediate must be <= 7:
    SelectMultiVectorLuti(Node, 2, Opcode, 7);
  }
  return;
}
case Intrinsic::aarch64_sme_luti4_lane_zt_x2: {
  // Pick the B/H/S flavour of the two-vector LUTI4 from the value type of
  // result 0. When no opcode is found, return without selecting anything.
  auto Opcode = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
      Node->getValueType(0),
      {AArch64::LUTI4_2ZTZI_B, AArch64::LUTI4_2ZTZI_H, AArch64::LUTI4_2ZTZI_S});
  if (Opcode) {
    // Second Immediate must be <= 3:
    SelectMultiVectorLuti(Node, 2, Opcode, 3);
  }
  return;
}
}
} break;
case ISD::INTRINSIC_WO_CHAIN: {
Expand Down
65 changes: 65 additions & 0 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane-x2.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s

; lookup table expand two registers

; i8 result pair: the two-register luti2 should be emitted in its .b element
; form with lane index 7.
define {<vscale x 16 x i8>, <vscale x 16 x i8>} @luti2_i8(<vscale x 16 x i8> %x) {
; CHECK-LABEL: luti2_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: luti2 { z0.b, z1.b }, zt0, z0[7]
; CHECK-NEXT: ret
%res = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32 0, <vscale x 16 x i8> %x, i32 7)
ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %res
}

; i16 result pair: the two-register luti2 should be emitted in its .h element
; form with lane index 7.
define {<vscale x 8 x i16>, <vscale x 8 x i16>} @luti2_i16(<vscale x 16 x i8> %x) {
; CHECK-LABEL: luti2_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: luti2 { z0.h, z1.h }, zt0, z0[7]
; CHECK-NEXT: ret
%res = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32 0, <vscale x 16 x i8> %x, i32 7)
ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %res
}

; i32 result pair: the two-register luti2 should be emitted in its .s element
; form with lane index 7.
define {<vscale x 4 x i32>, <vscale x 4 x i32>} @luti2_i32(<vscale x 16 x i8> %x) {
; CHECK-LABEL: luti2_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: luti2 { z0.s, z1.s }, zt0, z0[7]
; CHECK-NEXT: ret
%res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4i32(i32 0, <vscale x 16 x i8> %x, i32 7)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res
}

; half result pair: expected to use the same .h element form of the
; two-register luti2 as the i16 case, with lane index 7.
define {<vscale x 8 x half>, <vscale x 8 x half>} @luti2_f16(<vscale x 16 x i8> %x) {
; CHECK-LABEL: luti2_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: luti2 { z0.h, z1.h }, zt0, z0[7]
; CHECK-NEXT: ret
%res = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8f16(i32 0, <vscale x 16 x i8> %x, i32 7)
ret {<vscale x 8 x half>, <vscale x 8 x half>} %res
}

; bfloat result pair: expected to use the same .h element form of the
; two-register luti2 as the i16 case, with lane index 7.
define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @luti2_bf16(<vscale x 16 x i8> %x) {
; CHECK-LABEL: luti2_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: luti2 { z0.h, z1.h }, zt0, z0[7]
; CHECK-NEXT: ret
%res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8bf16(i32 0, <vscale x 16 x i8> %x, i32 7)
ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res
}

; float result pair: expected to use the same .s element form of the
; two-register luti2 as the i32 case, with lane index 7.
define {<vscale x 4 x float>, <vscale x 4 x float>} @luti2_f32(<vscale x 16 x i8> %x) {
; CHECK-LABEL: luti2_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: luti2 { z0.s, z1.s }, zt0, z0[7]
; CHECK-NEXT: ret
%res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4f32(i32 0, <vscale x 16 x i8> %x, i32 7)
ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
}

declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv16i8(i32, <vscale x 16 x i8>, i32)
declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8i16(i32, <vscale x 16 x i8>, i32)
declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4i32(i32, <vscale x 16 x i8>, i32)
declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8f16(i32, <vscale x 16 x i8>, i32)
declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv8bf16(i32, <vscale x 16 x i8>, i32)
declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.luti2.lane.zt.x2.nxv4f32(i32, <vscale x 16 x i8>, i32)
65 changes: 65 additions & 0 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x2.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s

; lookup table expand two registers

; i8 result pair: the two-register luti4 should be emitted in its .b element
; form with lane index 3.
define {<vscale x 16 x i8>, <vscale x 16 x i8>} @luti4_i8(<vscale x 16 x i8> %x) {
; CHECK-LABEL: luti4_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: luti4 { z0.b, z1.b }, zt0, z0[3]
; CHECK-NEXT: ret
%res = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32 0, <vscale x 16 x i8> %x, i32 3)
ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %res
}

; i16 result pair: the two-register luti4 should be emitted in its .h element
; form with lane index 3.
define {<vscale x 8 x i16>, <vscale x 8 x i16>} @luti4_i16(<vscale x 16 x i8> %x) {
; CHECK-LABEL: luti4_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: luti4 { z0.h, z1.h }, zt0, z0[3]
; CHECK-NEXT: ret
%res = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32 0, <vscale x 16 x i8> %x, i32 3)
ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %res
}

; i32 result pair: the two-register luti4 should be emitted in its .s element
; form with lane index 3.
define {<vscale x 4 x i32>, <vscale x 4 x i32>} @luti4_i32(<vscale x 16 x i8> %x) {
; CHECK-LABEL: luti4_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: luti4 { z0.s, z1.s }, zt0, z0[3]
; CHECK-NEXT: ret
%res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4i32(i32 0, <vscale x 16 x i8> %x, i32 3)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res
}

; half result pair: expected to use the same .h element form of the
; two-register luti4 as the i16 case, with lane index 3.
define {<vscale x 8 x half>, <vscale x 8 x half>} @luti4_f16(<vscale x 16 x i8> %x) {
; CHECK-LABEL: luti4_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: luti4 { z0.h, z1.h }, zt0, z0[3]
; CHECK-NEXT: ret
%res = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8f16(i32 0, <vscale x 16 x i8> %x, i32 3)
ret {<vscale x 8 x half>, <vscale x 8 x half>} %res
}

; bfloat result pair: expected to use the same .h element form of the
; two-register luti4 as the i16 case, with lane index 3.
define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @luti4_bf16(<vscale x 16 x i8> %x) {
; CHECK-LABEL: luti4_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: luti4 { z0.h, z1.h }, zt0, z0[3]
; CHECK-NEXT: ret
%res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8bf16(i32 0, <vscale x 16 x i8> %x, i32 3)
ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res
}

; float result pair: expected to use the same .s element form of the
; two-register luti4 as the i32 case, with lane index 3.
define {<vscale x 4 x float>, <vscale x 4 x float>} @luti4_f32(<vscale x 16 x i8> %x) {
; CHECK-LABEL: luti4_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: luti4 { z0.s, z1.s }, zt0, z0[3]
; CHECK-NEXT: ret
%res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4f32(i32 0, <vscale x 16 x i8> %x, i32 3)
ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
}

declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv16i8(i32, <vscale x 16 x i8>, i32)
declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8i16(i32, <vscale x 16 x i8>, i32)
declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4i32(i32, <vscale x 16 x i8>, i32)
declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8f16(i32, <vscale x 16 x i8>, i32)
declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv8bf16(i32, <vscale x 16 x i8>, i32)
declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.luti4.lane.zt.x2.nxv4f32(i32, <vscale x 16 x i8>, i32)