Skip to content

[Clang][AArch64] Add FP8 variants of Neon store intrinsics #145346

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions clang/include/clang/Basic/arm_neon.td
Original file line number Diff line number Diff line change
Expand Up @@ -466,15 +466,15 @@ def VLD1_LANE : WInst<"vld1_lane", ".(c*!).I",
def VLD1_DUP : WInst<"vld1_dup", ".(c*!)",
"QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPs">;
def VST1 : WInst<"vst1", "v*(.!)",
"QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPs">;
"QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPsmQm">;
def VST1_X2 : WInst<"vst1_x2", "v*(2!)",
"cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPs">;
"cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPsmQm">;
def VST1_X3 : WInst<"vst1_x3", "v*(3!)",
"cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPs">;
"cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPsmQm">;
def VST1_X4 : WInst<"vst1_x4", "v*(4!)",
"cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPs">;
"cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPsmQm">;
def VST1_LANE : WInst<"vst1_lane", "v*(.!)I",
"QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPs",
"QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPsmQm",
[ImmCheck<2, ImmCheckLaneIndex, 1>]>;

let ArchGuard = "(__ARM_FP & 2)" in {
Expand Down Expand Up @@ -510,14 +510,14 @@ def VLD3_LANE : WInst<"vld3_lane", "3(c*!)3I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs",
[ImmCheck<5, ImmCheckLaneIndex, 1>]>;
def VLD4_LANE : WInst<"vld4_lane", "4(c*!)4I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs",
[ImmCheck<6, ImmCheckLaneIndex, 1>]>;
def VST2 : WInst<"vst2", "v*(2!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">;
def VST3 : WInst<"vst3", "v*(3!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">;
def VST4 : WInst<"vst4", "v*(4!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">;
def VST2_LANE : WInst<"vst2_lane", "v*(2!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs",
def VST2 : WInst<"vst2", "v*(2!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPsmQm">;
def VST3 : WInst<"vst3", "v*(3!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPsmQm">;
def VST4 : WInst<"vst4", "v*(4!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPsmQm">;
def VST2_LANE : WInst<"vst2_lane", "v*(2!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPsmQm",
Copy link
Contributor

@CarolineConcatto CarolineConcatto Jun 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am a bit puzzled about how the st2|3|4_lane intrinsics work for 128-bit vectors (Qc or QUc). I can see them in the tests, but they are not listed here.

uint8x16x4_t test_vld4q_lane_u8(uint8_t *a, uint8x16x4_t b) {
return vld4q_lane_u8(a, b, 15);
}

OK, it looks like these definitions are split in TableGen, some as VST* and some as ST*. Can we merge them all into one? It does not need to be done in this patch, but I am fine if it is.

def ST2 : WInst<"vst2", "v*(2!)", "QUlQldQdPlQPl">;
def ST3 : WInst<"vst3", "v*(3!)", "QUlQldQdPlQPl">;
def ST4 : WInst<"vst4", "v*(4!)", "QUlQldQdPlQPl">;

def ST1_LANE : WInst<"vst1_lane", "v*(.!)I", "dQdPlQPl",
[ImmCheck<2, ImmCheckLaneIndex, 1>]>;
def ST2_LANE : WInst<"vst2_lane", "v*(2!)I", "lUlQcQUcQPcQlQUldQdPlQPl",
[ImmCheck<3, ImmCheckLaneIndex, 1>]>;
def ST3_LANE : WInst<"vst3_lane", "v*(3!)I", "lUlQcQUcQPcQlQUldQdPlQPl",
[ImmCheck<4, ImmCheckLaneIndex, 1>]>;
def ST4_LANE : WInst<"vst4_lane", "v*(4!)I", "lUlQcQUcQPcQlQUldQdPlQPl",
[ImmCheck<5, ImmCheckLaneIndex, 1>]>;

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed the same after you asked about the Qc variants of VST2. I will try to merge these in a follow-up patch.

[ImmCheck<3, ImmCheckLaneIndex, 1>]>;
def VST3_LANE : WInst<"vst3_lane", "v*(3!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs",
def VST3_LANE : WInst<"vst3_lane", "v*(3!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPsmQm",
[ImmCheck<4, ImmCheckLaneIndex, 1>]>;
def VST4_LANE : WInst<"vst4_lane", "v*(4!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs",
def VST4_LANE : WInst<"vst4_lane", "v*(4!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPsmQm",
[ImmCheck<5, ImmCheckLaneIndex, 1>]>;
let ArchGuard = "(__ARM_FP & 2)" in {
def VLD2_F16 : WInst<"vld2", "2(c*!)", "hQh">;
Expand Down Expand Up @@ -2194,4 +2194,4 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
// fscale
def FSCALE_V128 : WInst<"vscale", "..(.S)", "QdQfQh">;
def FSCALE_V64 : WInst<"vscale", "(.q)(.q)(.qS)", "fh">;
}
}
176 changes: 176 additions & 0 deletions clang/test/CodeGen/AArch64/neon-intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -14732,6 +14732,16 @@ void test_vst1q_s64(int64_t *a, int64x2_t b) {
vst1q_s64(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst1q_mf8(
// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: store <16 x i8> [[VAL]], ptr [[A]], align 1
// CHECK-NEXT: ret void
//
void test_vst1q_mf8(mfloat8_t *a, mfloat8x16_t val) {
// Stores the 16-lane FP8 vector `val` through `a`. The CHECK lines above
// expect a plain `store <16 x i8>` with align 1 rather than an intrinsic call.
vst1q_mf8(a, val);
}

// CHECK-LABEL: define dso_local void @test_vst1q_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
Expand Down Expand Up @@ -14885,6 +14895,16 @@ void test_vst1_s64(int64_t *a, int64x1_t b) {
vst1_s64(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst1_mf8(
// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> [[VAL:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: store <8 x i8> [[VAL]], ptr [[A]], align 1
// CHECK-NEXT: ret void
//
void test_vst1_mf8(mfloat8_t *a, mfloat8x8_t val) {
// Stores the 8-lane FP8 vector `val` through `a`. The CHECK lines above
// expect a plain `store <8 x i8>` with align 1 rather than an intrinsic call.
vst1_mf8(a, val);
}

// CHECK-LABEL: define dso_local void @test_vst1_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
Expand Down Expand Up @@ -15067,6 +15087,18 @@ void test_vst2q_s64(int64_t *a, int64x2x2_t b) {
vst2q_s64(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst2q_mf8(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
// CHECK-NEXT: ret void
//
void test_vst2q_mf8(mfloat8_t *a, mfloat8x16x2_t b) {
// `b` arrives coerced to [2 x <16 x i8>]. The CHECK lines above expect both
// elements to be extracted and passed to @llvm.aarch64.neon.st2.v16i8.p0.
vst2q_mf8(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst2q_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
Expand Down Expand Up @@ -15269,6 +15301,18 @@ void test_vst2_s64(int64_t *a, int64x1x2_t b) {
vst2_s64(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst2_mf8(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
// CHECK-NEXT: ret void
//
void test_vst2_mf8(mfloat8_t *a, mfloat8x8x2_t b) {
// `b` arrives coerced to [2 x <8 x i8>]. The CHECK lines above expect both
// elements to be extracted and passed to @llvm.aarch64.neon.st2.v8i8.p0.
vst2_mf8(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst2_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
Expand Down Expand Up @@ -15493,6 +15537,19 @@ void test_vst3q_s64(int64_t *a, int64x2x3_t b) {
vst3q_s64(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst3q_mf8(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], ptr [[A]])
// CHECK-NEXT: ret void
//
void test_vst3q_mf8(mfloat8_t *a, mfloat8x16x3_t b) {
// `b` arrives coerced to [3 x <16 x i8>]. The CHECK lines above expect all
// three elements to be extracted and passed to @llvm.aarch64.neon.st3.v16i8.p0.
vst3q_mf8(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst3q_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
Expand Down Expand Up @@ -15731,6 +15788,19 @@ void test_vst3_s64(int64_t *a, int64x1x3_t b) {
vst3_s64(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst3_mf8(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], ptr [[A]])
// CHECK-NEXT: ret void
//
void test_vst3_mf8(mfloat8_t *a, mfloat8x8x3_t b) {
// `b` arrives coerced to [3 x <8 x i8>]. The CHECK lines above expect all
// three elements to be extracted and passed to @llvm.aarch64.neon.st3.v8i8.p0.
vst3_mf8(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst3_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
Expand Down Expand Up @@ -15992,6 +16062,20 @@ void test_vst4q_s64(int64_t *a, int64x2x4_t b) {
vst4q_s64(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst4q_mf8(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]])
// CHECK-NEXT: ret void
//
void test_vst4q_mf8(mfloat8_t *a, mfloat8x16x4_t b) {
// `b` arrives coerced to [4 x <16 x i8>]. The CHECK lines above expect all
// four elements to be extracted and passed to @llvm.aarch64.neon.st4.v16i8.p0.
vst4q_mf8(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst4q_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
Expand Down Expand Up @@ -16266,6 +16350,20 @@ void test_vst4_s64(int64_t *a, int64x1x4_t b) {
vst4_s64(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst4_mf8(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]])
// CHECK-NEXT: ret void
//
void test_vst4_mf8(mfloat8_t *a, mfloat8x8x4_t b) {
// `b` arrives coerced to [4 x <8 x i8>]. The CHECK lines above expect all
// four elements to be extracted and passed to @llvm.aarch64.neon.st4.v8i8.p0.
vst4_mf8(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst4_f16(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
Expand Down Expand Up @@ -16576,6 +16674,18 @@ poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) {
return vld1_p64_x4(a);
}

// CHECK-LABEL: define dso_local void @test_vst1q_mf8_x2(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
// CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
// CHECK-NEXT: ret void
//
void test_vst1q_mf8_x2(mfloat8_t *a, mfloat8x16x2_t b) {
// `b` arrives coerced to [2 x <16 x i8>]. The CHECK lines above expect both
// elements to be extracted and passed to @llvm.aarch64.neon.st1x2.v16i8.p0.
vst1q_mf8_x2(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst1q_f64_x2(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
Expand Down Expand Up @@ -16610,6 +16720,18 @@ void test_vst1q_p64_x2(poly64_t *a, poly64x2x2_t b) {
vst1q_p64_x2(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst1_mf8_x2(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
// CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
// CHECK-NEXT: ret void
//
void test_vst1_mf8_x2(mfloat8_t *a, mfloat8x8x2_t b) {
// `b` arrives coerced to [2 x <8 x i8>]. The CHECK lines above expect both
// elements to be extracted and passed to @llvm.aarch64.neon.st1x2.v8i8.p0.
vst1_mf8_x2(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst1_f64_x2(
// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
Expand Down Expand Up @@ -16646,6 +16768,19 @@ void test_vst1_p64_x2(poly64_t *a, poly64x1x2_t b) {
vst1_p64_x2(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst1q_mf8_x3(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
// CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], ptr [[A]])
// CHECK-NEXT: ret void
//
void test_vst1q_mf8_x3(mfloat8_t *a, mfloat8x16x3_t b) {
// `b` arrives coerced to [3 x <16 x i8>]. The CHECK lines above expect all
// three elements to be extracted and passed to @llvm.aarch64.neon.st1x3.v16i8.p0.
vst1q_mf8_x3(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst1q_f64_x3(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
Expand Down Expand Up @@ -16687,6 +16822,19 @@ void test_vst1q_p64_x3(poly64_t *a, poly64x2x3_t b) {
vst1q_p64_x3(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst1_mf8_x3(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
// CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], ptr [[A]])
// CHECK-NEXT: ret void
//
void test_vst1_mf8_x3(mfloat8_t *a, mfloat8x8x3_t b) {
// `b` arrives coerced to [3 x <8 x i8>]. The CHECK lines above expect all
// three elements to be extracted and passed to @llvm.aarch64.neon.st1x3.v8i8.p0.
vst1_mf8_x3(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst1_f64_x3(
// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
Expand Down Expand Up @@ -16731,6 +16879,20 @@ void test_vst1_p64_x3(poly64_t *a, poly64x1x3_t b) {
vst1_p64_x3(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst1q_mf8_x4(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
// CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]])
// CHECK-NEXT: ret void
//
void test_vst1q_mf8_x4(mfloat8_t *a, mfloat8x16x4_t b) {
// `b` arrives coerced to [4 x <16 x i8>]. The CHECK lines above expect all
// four elements to be extracted and passed to @llvm.aarch64.neon.st1x4.v16i8.p0.
vst1q_mf8_x4(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst1q_f64_x4(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
Expand Down Expand Up @@ -16779,6 +16941,20 @@ void test_vst1q_p64_x4(poly64_t *a, poly64x2x4_t b) {
vst1q_p64_x4(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst1_mf8_x4(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
// CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]])
// CHECK-NEXT: ret void
//
void test_vst1_mf8_x4(mfloat8_t *a, mfloat8x8x4_t b) {
// `b` arrives coerced to [4 x <8 x i8>]. The CHECK lines above expect all
// four elements to be extracted and passed to @llvm.aarch64.neon.st1x4.v8i8.p0.
vst1_mf8_x4(a, b);
}

// CHECK-LABEL: define dso_local void @test_vst1_f64_x4(
// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
Expand Down
Loading