Skip to content

[AArch64] Add intrinsics for SME FP8 FVDOT, FVDOTB and FVDOTT intrinsics #119922

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions clang/include/clang/Basic/arm_sme.td
Original file line number Diff line number Diff line change
Expand Up @@ -749,6 +749,9 @@ let SMETargetGuard = "sme-f8f32" in {
def SVDOT_LANE_FP8_ZA32_VG1x2 : Inst<"svdot_lane_za32[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>;
def SVDOT_LANE_FP8_ZA32_VG1x4 : Inst<"svdot_lane_za32[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>;

def SVVDOTB_LANE_FP8_ZA32_VG1x4 : Inst<"svvdotb_lane_za32[_mf8]_vg1x4_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fvdotb_lane_za32_vg1x4", [IsOverloadNone, IsStreaming, IsInOutZA, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>;
def SVVDOTT_LANE_FP8_ZA32_VG1x4 : Inst<"svvdott_lane_za32[_mf8]_vg1x4_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fvdott_lane_za32_vg1x4", [IsOverloadNone, IsStreaming, IsInOutZA, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>;

def SVDOT_SINGLE_FP8_ZA32_VG1x2 : Inst<"svdot[_single]_za32[_mf8]_vg1x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
def SVDOT_SINGLE_FP8_ZA32_VG1x4 : Inst<"svdot[_single]_za32[_mf8]_vg1x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;

Expand All @@ -760,6 +763,8 @@ let SMETargetGuard = "sme-f8f16" in {
def SVDOT_LANE_FP8_ZA16_VG1x2 : Inst<"svdot_lane_za16[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
def SVDOT_LANE_FP8_ZA16_VG1x4 : Inst<"svdot_lane_za16[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;

def SVVDOT_LANE_FP8_ZA16_VG1x2 : Inst<"svvdot_lane_za16[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fvdot_lane_za16_vg1x2", [IsOverloadNone, IsStreaming, IsInOutZA, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>;

def SVDOT_SINGLE_FP8_ZA16_VG1x2 : Inst<"svdot[_single]_za16[_mf8]_vg1x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
def SVDOT_SINGLE_FP8_ZA16_VG1x4 : Inst<"svdot[_single]_za16[_mf8]_vg1x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;

Expand Down
80 changes: 80 additions & 0 deletions clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fvdot.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4

// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s

// REQUIRES: aarch64-registered-target

#include <arm_sme.h>

#ifdef SVE_OVERLOADED_FORMS
#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3) A1##A3
#else
#define SVE_ACLE_FUNC(A1, A2, A3) A1##A2##A3
#endif

// CHECK-LABEL: define dso_local void @test_svvdot_lane_za16_mf8_vg1x2(
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fvdot.lane.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 7)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z31test_svvdot_lane_za16_mf8_vg1x2j13svmfloat8x2_tu13__SVMfloat8_tm(
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fvdot.lane.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 7)
// CPP-CHECK-NEXT: ret void
//
void test_svvdot_lane_za16_mf8_vg1x2(uint32_t slice, svmfloat8x2_t zn,
svmfloat8_t zm,
fpm_t fpmr) __arm_streaming
__arm_inout("za") {
SVE_ACLE_FUNC(svvdot_lane_za16, _mf8, _vg1x2_fpm)(slice, zn, zm, 7, fpmr);
}

// CHECK-LABEL: define dso_local void @test_svvdotb_lane_za32_mf8_vg1x4(
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fvdotb.lane.za32.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 3)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z32test_svvdotb_lane_za32_mf8_vg1x4j13svmfloat8x2_tu13__SVMfloat8_tm(
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fvdotb.lane.za32.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 3)
// CPP-CHECK-NEXT: ret void
//
void test_svvdotb_lane_za32_mf8_vg1x4(uint32_t slice, svmfloat8x2_t zn,
svmfloat8_t zm,
fpm_t fpmr) __arm_streaming
__arm_inout("za") {
SVE_ACLE_FUNC(svvdotb_lane_za32, _mf8, _vg1x4_fpm)(slice, zn, zm, 3, fpmr);
}

// CHECK-LABEL: define dso_local void @test_svvdott_lane_za32_mf8_vg1x4(
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fvdott.lane.za32.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 3)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z32test_svvdott_lane_za32_mf8_vg1x4j13svmfloat8x2_tu13__SVMfloat8_tm(
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fvdott.lane.za32.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 3)
// CPP-CHECK-NEXT: ret void
//
void test_svvdott_lane_za32_mf8_vg1x4(uint32_t slice, svmfloat8x2_t zn,
svmfloat8_t zm,
fpm_t fpmr) __arm_streaming
__arm_inout("za") {
SVE_ACLE_FUNC(svvdott_lane_za32, _mf8, _vg1x4_fpm)(slice, zn, zm, 3, fpmr);
}
32 changes: 32 additions & 0 deletions clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fp8_fvdot.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -verify -emit-llvm -o - %s

// REQUIRES: aarch64-registered-target

#include <arm_sme.h>

void test_features(uint32_t slice, fpm_t fpmr, svmfloat8x2_t zn,
svmfloat8_t zm) __arm_streaming __arm_inout("za") {
// expected-error@+1 {{'svvdot_lane_za16_mf8_vg1x2_fpm' needs target feature sme,sme-f8f16}}
svvdot_lane_za16_mf8_vg1x2_fpm(slice, zn, zm, 7, fpmr);
// expected-error@+1 {{'svvdotb_lane_za32_mf8_vg1x4_fpm' needs target feature sme,sme-f8f32}}
svvdotb_lane_za32_mf8_vg1x4_fpm(slice, zn, zm, 3, fpmr);
// expected-error@+1 {{'svvdott_lane_za32_mf8_vg1x4_fpm' needs target feature sme,sme-f8f32}}
svvdott_lane_za32_mf8_vg1x4_fpm(slice, zn, zm, 3, fpmr);
}

void test_imm(uint32_t slice, fpm_t fpmr, svmfloat8x2_t zn,
svmfloat8_t zm) __arm_streaming __arm_inout("za") {
// expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
svvdot_lane_za16_mf8_vg1x2_fpm(slice, zn, zm, -1, fpmr);
// expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
svvdotb_lane_za32_mf8_vg1x4_fpm(slice, zn, zm, -1, fpmr);
// expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
svvdott_lane_za32_mf8_vg1x4_fpm(slice, zn, zm, -1, fpmr);

// expected-error@+1{{argument value 8 is outside the valid range [0, 7]}}
svvdot_lane_za16_mf8_vg1x2_fpm(slice, zn, zm, 8, fpmr);
// expected-error@+1{{argument value 4 is outside the valid range [0, 3]}}
svvdotb_lane_za32_mf8_vg1x4_fpm(slice, zn, zm, 4, fpmr);
// expected-error@+1{{argument value 4 is outside the valid range [0, 3]}}
svvdott_lane_za32_mf8_vg1x4_fpm(slice, zn, zm, 4, fpmr);
}
5 changes: 5 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAArch64.td
Original file line number Diff line number Diff line change
Expand Up @@ -3904,6 +3904,11 @@ class SME2_FP8_FDOT_MULTI_VG1x4 :
def int_aarch64_sme_fp8_fdot_lane_za32_vg1x2 : SME2_FP8_FDOT_LANE_VG1x2;
def int_aarch64_sme_fp8_fdot_lane_za32_vg1x4 : SME2_FP8_FDOT_LANE_VG1x4;

def int_aarch64_sme_fp8_fvdot_lane_za16_vg1x2 : SME2_FP8_FDOT_LANE_VG1x2;

def int_aarch64_sme_fp8_fvdotb_lane_za32_vg1x4 : SME2_FP8_FDOT_LANE_VG1x2;
def int_aarch64_sme_fp8_fvdott_lane_za32_vg1x4 : SME2_FP8_FDOT_LANE_VG1x2;

def int_aarch64_sme_fp8_fdot_single_za16_vg1x2 : SME2_FP8_FDOT_SINGLE_VG1x2;
def int_aarch64_sme_fp8_fdot_single_za16_vg1x4 : SME2_FP8_FDOT_SINGLE_VG1x4;

Expand Down
8 changes: 4 additions & 4 deletions llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -985,8 +985,8 @@ def LUTI4_S_4ZZT2Z : sme2_luti4_vector_vg4_strided<0b00, 0b00, "luti4">;
} //[HasSME2p1, HasSME_LUTv2]

let Predicates = [HasSMEF8F16] in {
defm FVDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fvdot", 0b11, 0b110, ZZ_b_mul_r, ZPR4b8>;
defm FDOT_VG2_M2ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x2<"fdot", int_aarch64_sme_fp8_fdot_lane_za16_vg1x2>;
defm FVDOT_VG2_M2ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x2<"fvdot", 0b110, int_aarch64_sme_fp8_fvdot_lane_za16_vg1x2>;
defm FDOT_VG2_M2ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x2<"fdot", 0b010, int_aarch64_sme_fp8_fdot_lane_za16_vg1x2>;
defm FDOT_VG4_M4ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x4<"fdot", int_aarch64_sme_fp8_fdot_lane_za16_vg1x4>;

defm FDOT_VG2_M2ZZ_BtoH : sme2_fp8_fdot_single_vg1x2<"fdot", 0b0010001, MatrixOp16, int_aarch64_sme_fp8_fdot_single_za16_vg1x2>;
Expand Down Expand Up @@ -1016,8 +1016,8 @@ defm FDOT_VG4_M4ZZ_BtoS : sme2_fp8_fdot_single_vg1x4<"fdot", 0b0110011, MatrixO
defm FDOT_VG2_M2Z2Z_BtoS : sme2_fp8_fdot_multi_vg1x2 <"fdot", 0b0100110, MatrixOp32, int_aarch64_sme_fp8_fdot_multi_za32_vg1x2>;
defm FDOT_VG4_M4Z4Z_BtoS : sme2_fp8_fdot_multi_vg1x4 <"fdot", 0b0100110, MatrixOp32, int_aarch64_sme_fp8_fdot_multi_za32_vg1x4>;

def FVDOTB_VG4_M2ZZI_BtoS : sme2_fp8_multi_vec_array_vg4_index<"fvdotb", 0b0>;
def FVDOTT_VG4_M2ZZI_BtoS : sme2_fp8_multi_vec_array_vg4_index<"fvdott", 0b1>;
defm FVDOTB_VG4_M2ZZI_BtoS : sme2_fp8_fdotv_index_za32_vg1x4<"fvdotb", 0b0, int_aarch64_sme_fp8_fvdotb_lane_za32_vg1x4>;
defm FVDOTT_VG4_M2ZZI_BtoS : sme2_fp8_fdotv_index_za32_vg1x4<"fvdott", 0b1, int_aarch64_sme_fp8_fvdott_lane_za32_vg1x4>;

defm FMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"fmlall", 0b01, 0b000, null_frag>;
defm FMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"fmlall", 0b10, 0b100, null_frag>;
Expand Down
13 changes: 11 additions & 2 deletions llvm/lib/Target/AArch64/SMEInstrFormats.td
Original file line number Diff line number Diff line change
Expand Up @@ -5797,9 +5797,9 @@ multiclass sme2_fmop4a_fp8_fp16_2way<string mnemonic> {

// FP8 SME FDOT instructions

multiclass sme2_fp8_fdot_index_za16_vg1x2<string mnemonic,
multiclass sme2_fp8_fdot_index_za16_vg1x2<string mnemonic, bits<3> op,
SDPatternOperator intrinsic> {
def NAME : sme2_multi_vec_array_vg2_index<0b11, {0b0,?,?,0b10,?}, MatrixOp16,
def NAME : sme2_multi_vec_array_vg2_index<0b11, {op{2},?,?,op{1-0},?}, MatrixOp16,
ZZ_b_mul_r, ZPR4b8,
VectorIndexH32b_timm, mnemonic>,
SMEPseudo2Instr<NAME, 1>{
Expand Down Expand Up @@ -5883,6 +5883,15 @@ multiclass sme2_fp8_fdot_index_za32_vg1x4<string mnemonic,
def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, ZPR4b8, nxv16i8, VectorIndexS32b_timm, tileslice16>;
}

multiclass sme2_fp8_fdotv_index_za32_vg1x4<string mnemonic, bit T, SDPatternOperator intrinsic> {
def NAME : sme2_fp8_multi_vec_array_vg4_index<mnemonic, T>,
SMEPseudo2Instr<NAME, 1>;

def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, ZZ_b_mul_r, ZPR4b8, VectorIndexS32b_timm, SMEMatrixArray>;

def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, ZPR4b8, nxv16i8, VectorIndexS32b_timm, tileslice16>;
}

multiclass sme2_fp8_fdot_single_vg1x2<string mnemonic, bits<7> op,
MatrixOperand matrix_op,
SDPatternOperator intrinsic> {
Expand Down
50 changes: 50 additions & 0 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-fp8-fvdot.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "// kill:" --version 4
; RUN: llc -force-streaming < %s | FileCheck %s
target triple = "aarch64-linux"

define void @test_fvdot16_1x2_indexed(i32 %slice.0,
; CHECK-LABEL: test_fvdot16_1x2_indexed:
; CHECK: // %bb.0:
; CHECK: mov w8, w0
; CHECK: fvdot za.h[w8, 7, vgx2], { z0.b, z1.b }, z2.b[3]
; CHECK: ret
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
<vscale x 16 x i8> %zm) #0 {
%slice = add i32 %slice.0, 7
call void @llvm.aarch64.sme.fp8.fvdot.lane.za16.vg1x2(i32 %slice,
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
<vscale x 16 x i8> %zm, i32 3)
ret void
}

define void @test_fvdot32_bottom_1x4_indexed(i32 %slice.0,
; CHECK-LABEL: test_fvdot32_bottom_1x4_indexed:
; CHECK: // %bb.0:
; CHECK: mov w8, w0
; CHECK: fvdotb za.s[w8, 7, vgx4], { z0.b, z1.b }, z2.b[3]
; CHECK: ret
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
<vscale x 16 x i8> %zm) #0 {
%slice = add i32 %slice.0, 7
call void @llvm.aarch64.sme.fp8.fvdotb.lane.za32.vg1x4(i32 %slice,
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
<vscale x 16 x i8> %zm, i32 3)
ret void
}

define void @test_fvdot32_top_1x4_indexed(i32 %slice.0,
; CHECK-LABEL: test_fvdot32_top_1x4_indexed:
; CHECK: // %bb.0:
; CHECK: mov w8, w0
; CHECK: fvdott za.s[w8, 7, vgx4], { z0.b, z1.b }, z2.b[3]
; CHECK: ret
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
<vscale x 16 x i8> %zm) #0 {
%slice = add i32 %slice.0, 7
call void @llvm.aarch64.sme.fp8.fvdott.lane.za32.vg1x4(i32 %slice,
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
<vscale x 16 x i8> %zm, i32 3)
ret void
}

attributes #0 = { "target-features" = "+sme-f8f32,+sme-f8f16" }
Loading