Skip to content

Commit ab3607c

Browse files
committed
[AArch64][SVE] Add missing load/store patterns for unpacked bfloat vectors.
Reviewed By: c-rhodes
Differential Revision: https://reviews.llvm.org/D110063
1 parent 0205806 commit ab3607c

File tree

4 files changed

+138
-46
lines changed

4 files changed

+138
-46
lines changed

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 54 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -2000,25 +2000,27 @@ let Predicates = [HasSVEorStreamingSVE] in {
20002000
}
20012001

20022002
// 2-element contiguous loads
2003-
defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D, LD1B_D_IMM, am_sve_regreg_lsl0>;
2004-
defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D, LD1SB_D_IMM, am_sve_regreg_lsl0>;
2005-
defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
2006-
defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D, LD1SH_D_IMM, am_sve_regreg_lsl1>;
2007-
defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
2008-
defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D, LD1SW_D_IMM, am_sve_regreg_lsl2>;
2009-
defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
2010-
defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
2011-
defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
2012-
defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
2003+
defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D, LD1B_D_IMM, am_sve_regreg_lsl0>;
2004+
defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D, LD1SB_D_IMM, am_sve_regreg_lsl0>;
2005+
defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
2006+
defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D, LD1SH_D_IMM, am_sve_regreg_lsl1>;
2007+
defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
2008+
defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D, LD1SW_D_IMM, am_sve_regreg_lsl2>;
2009+
defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
2010+
defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
2011+
defm : pred_load<nxv2bf16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
2012+
defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
2013+
defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
20132014

20142015
// 4-element contiguous loads
2015-
defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S, LD1B_S_IMM, am_sve_regreg_lsl0>;
2016-
defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S, LD1SB_S_IMM, am_sve_regreg_lsl0>;
2017-
defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
2018-
defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S, LD1SH_S_IMM, am_sve_regreg_lsl1>;
2019-
defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
2020-
defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
2021-
defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
2016+
defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S, LD1B_S_IMM, am_sve_regreg_lsl0>;
2017+
defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S, LD1SB_S_IMM, am_sve_regreg_lsl0>;
2018+
defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
2019+
defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S, LD1SH_S_IMM, am_sve_regreg_lsl1>;
2020+
defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
2021+
defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
2022+
defm : pred_load<nxv4bf16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
2023+
defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
20222024

20232025
// 8-element contiguous loads
20242026
defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>;
@@ -2045,20 +2047,22 @@ let Predicates = [HasSVEorStreamingSVE] in {
20452047
}
20462048

20472049
// 2-element contiguous stores
2048-
defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D, ST1B_D_IMM, am_sve_regreg_lsl0>;
2049-
defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
2050-
defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
2051-
defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
2052-
defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
2053-
defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
2054-
defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
2050+
defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D, ST1B_D_IMM, am_sve_regreg_lsl0>;
2051+
defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
2052+
defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
2053+
defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
2054+
defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
2055+
defm : pred_store<nxv2bf16, nxv2i1, nontrunc_masked_store, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
2056+
defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
2057+
defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
20552058

20562059
// 4-element contiguous stores
2057-
defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S, ST1B_S_IMM, am_sve_regreg_lsl0>;
2058-
defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
2059-
defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
2060-
defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
2061-
defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
2060+
defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S, ST1B_S_IMM, am_sve_regreg_lsl0>;
2061+
defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
2062+
defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
2063+
defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
2064+
defm : pred_store<nxv4bf16, nxv4i1, nontrunc_masked_store, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
2065+
defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
20622066

20632067
// 8-element contiguous stores
20642068
defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>;
@@ -2099,23 +2103,25 @@ let Predicates = [HasSVEorStreamingSVE] in {
20992103
(RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
21002104
}
21012105

2102-
defm : unpred_store< store, nxv16i8, ST1B, ST1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
2103-
defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H, ST1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
2104-
defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S, ST1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
2105-
defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D, ST1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
2106-
defm : unpred_store< store, nxv8i16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
2107-
defm : unpred_store<truncstorevi16, nxv4i32, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
2108-
defm : unpred_store<truncstorevi16, nxv2i64, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
2109-
defm : unpred_store< store, nxv4i32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
2110-
defm : unpred_store<truncstorevi32, nxv2i64, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
2111-
defm : unpred_store< store, nxv2i64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
2112-
defm : unpred_store< store, nxv8f16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
2113-
defm : unpred_store< store, nxv8bf16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
2114-
defm : unpred_store< store, nxv4f16, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
2115-
defm : unpred_store< store, nxv2f16, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
2116-
defm : unpred_store< store, nxv4f32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
2117-
defm : unpred_store< store, nxv2f32, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
2118-
defm : unpred_store< store, nxv2f64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
2106+
defm : unpred_store< store, nxv16i8, ST1B, ST1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
2107+
defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H, ST1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
2108+
defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S, ST1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
2109+
defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D, ST1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>;
2110+
defm : unpred_store< store, nxv8i16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
2111+
defm : unpred_store<truncstorevi16, nxv4i32, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
2112+
defm : unpred_store<truncstorevi16, nxv2i64, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
2113+
defm : unpred_store< store, nxv4i32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
2114+
defm : unpred_store<truncstorevi32, nxv2i64, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
2115+
defm : unpred_store< store, nxv2i64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
2116+
defm : unpred_store< store, nxv8f16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
2117+
defm : unpred_store< store, nxv8bf16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
2118+
defm : unpred_store< store, nxv4f16, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
2119+
defm : unpred_store< store, nxv4bf16, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
2120+
defm : unpred_store< store, nxv2f16, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
2121+
defm : unpred_store< store, nxv2bf16, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
2122+
defm : unpred_store< store, nxv4f32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
2123+
defm : unpred_store< store, nxv2f32, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
2124+
defm : unpred_store< store, nxv2f64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
21192125

21202126
multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegRegInst,
21212127
Instruction RegImmInst, Instruction PTrue,
@@ -2162,7 +2168,9 @@ let Predicates = [HasSVEorStreamingSVE] in {
21622168
defm : unpred_load< load, nxv8f16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
21632169
defm : unpred_load< load, nxv8bf16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>;
21642170
defm : unpred_load< load, nxv4f16, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
2171+
defm : unpred_load< load, nxv4bf16, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>;
21652172
defm : unpred_load< load, nxv2f16, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
2173+
defm : unpred_load< load, nxv2bf16, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>;
21662174
defm : unpred_load< load, nxv4f32, LD1W, LD1W_IMM, PTRUE_S, am_sve_regreg_lsl2>;
21672175
defm : unpred_load< load, nxv2f32, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
21682176
defm : unpred_load< load, nxv2f64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;

llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-reg.ll

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,18 @@ define <vscale x 4 x half> @ld1_nxv4f16(half* %addr, i64 %off) {
231231
ret <vscale x 4 x half> %val
232232
}
233233

234+
define <vscale x 4 x bfloat> @ld1_nxv4bf16(bfloat* %addr, i64 %off) {
235+
; CHECK-LABEL: ld1_nxv4bf16:
236+
; CHECK: // %bb.0:
237+
; CHECK-NEXT: ptrue p0.s
238+
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
239+
; CHECK-NEXT: ret
240+
%ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %off
241+
%ptrcast = bitcast bfloat* %ptr to <vscale x 4 x bfloat>*
242+
%val = load volatile <vscale x 4 x bfloat>, <vscale x 4 x bfloat>* %ptrcast
243+
ret <vscale x 4 x bfloat> %val
244+
}
245+
234246
define <vscale x 2 x half> @ld1_nxv2f16(half* %addr, i64 %off) {
235247
; CHECK-LABEL: ld1_nxv2f16:
236248
; CHECK: // %bb.0:
@@ -243,6 +255,18 @@ define <vscale x 2 x half> @ld1_nxv2f16(half* %addr, i64 %off) {
243255
ret <vscale x 2 x half> %val
244256
}
245257

258+
define <vscale x 2 x bfloat> @ld1_nxv2bf16(bfloat* %addr, i64 %off) {
259+
; CHECK-LABEL: ld1_nxv2bf16:
260+
; CHECK: // %bb.0:
261+
; CHECK-NEXT: ptrue p0.d
262+
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1]
263+
; CHECK-NEXT: ret
264+
%ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %off
265+
%ptrcast = bitcast bfloat* %ptr to <vscale x 2 x bfloat>*
266+
%val = load volatile <vscale x 2 x bfloat>, <vscale x 2 x bfloat>* %ptrcast
267+
ret <vscale x 2 x bfloat> %val
268+
}
269+
246270
; LD1W
247271

248272
define <vscale x 4 x i32> @ld1_nxv4i32(i32* %addr, i64 %off) {

llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,14 @@ define <vscale x 2 x half> @masked_load_nxv2f16(<vscale x 2 x half> *%a, <vscale
6060
ret <vscale x 2 x half> %load
6161
}
6262

63+
define <vscale x 2 x bfloat> @masked_load_nxv2bf16(<vscale x 2 x bfloat> *%a, <vscale x 2 x i1> %mask) nounwind #0 {
64+
; CHECK-LABEL: masked_load_nxv2bf16:
65+
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
66+
; CHECK-NEXT: ret
67+
%load = call <vscale x 2 x bfloat> @llvm.masked.load.nxv2bf16(<vscale x 2 x bfloat> *%a, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x bfloat> undef)
68+
ret <vscale x 2 x bfloat> %load
69+
}
70+
6371
define <vscale x 4 x float> @masked_load_nxv4f32(<vscale x 4 x float> *%a, <vscale x 4 x i1> %mask) nounwind {
6472
; CHECK-LABEL: masked_load_nxv4f32:
6573
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
@@ -76,6 +84,14 @@ define <vscale x 4 x half> @masked_load_nxv4f16(<vscale x 4 x half> *%a, <vscale
7684
ret <vscale x 4 x half> %load
7785
}
7886

87+
define <vscale x 4 x bfloat> @masked_load_nxv4bf16(<vscale x 4 x bfloat> *%a, <vscale x 4 x i1> %mask) nounwind #0 {
88+
; CHECK-LABEL: masked_load_nxv4bf16:
89+
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
90+
; CHECK-NEXT: ret
91+
%load = call <vscale x 4 x bfloat> @llvm.masked.load.nxv4bf16(<vscale x 4 x bfloat> *%a, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x bfloat> undef)
92+
ret <vscale x 4 x bfloat> %load
93+
}
94+
7995
define <vscale x 8 x half> @masked_load_nxv8f16(<vscale x 8 x half> *%a, <vscale x 8 x i1> %mask) nounwind {
8096
; CHECK-LABEL: masked_load_nxv8f16:
8197
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
@@ -185,6 +201,22 @@ define void @masked_store_nxv8f16(<vscale x 8 x half> *%a, <vscale x 8 x half> %
185201
ret void
186202
}
187203

204+
define void @masked_store_nxv2bf16(<vscale x 2 x bfloat> *%a, <vscale x 2 x bfloat> %val, <vscale x 2 x i1> %mask) nounwind #0 {
205+
; CHECK-LABEL: masked_store_nxv2bf16:
206+
; CHECK-NEXT: st1h { z0.d }, p0, [x0]
207+
; CHECK-NEXT: ret
208+
call void @llvm.masked.store.nxv2bf16(<vscale x 2 x bfloat> %val, <vscale x 2 x bfloat> *%a, i32 2, <vscale x 2 x i1> %mask)
209+
ret void
210+
}
211+
212+
define void @masked_store_nxv4bf16(<vscale x 4 x bfloat> *%a, <vscale x 4 x bfloat> %val, <vscale x 4 x i1> %mask) nounwind #0 {
213+
; CHECK-LABEL: masked_store_nxv4bf16:
214+
; CHECK-NEXT: st1h { z0.s }, p0, [x0]
215+
; CHECK-NEXT: ret
216+
call void @llvm.masked.store.nxv4bf16(<vscale x 4 x bfloat> %val, <vscale x 4 x bfloat> *%a, i32 2, <vscale x 4 x i1> %mask)
217+
ret void
218+
}
219+
188220
define void @masked_store_nxv8bf16(<vscale x 8 x bfloat> *%a, <vscale x 8 x bfloat> %val, <vscale x 8 x i1> %mask) nounwind #0 {
189221
; CHECK-LABEL: masked_store_nxv8bf16:
190222
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
@@ -292,6 +324,8 @@ declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>*, i32,
292324
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
293325
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
294326
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
327+
declare <vscale x 2 x bfloat> @llvm.masked.load.nxv2bf16(<vscale x 2 x bfloat>*, i32, <vscale x 2 x i1>, <vscale x 2 x bfloat>)
328+
declare <vscale x 4 x bfloat> @llvm.masked.load.nxv4bf16(<vscale x 4 x bfloat>*, i32, <vscale x 4 x i1>, <vscale x 4 x bfloat>)
295329
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)
296330

297331
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>*, i32, <vscale x 2 x i1>)
@@ -305,6 +339,8 @@ declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>
305339
declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>*, i32, <vscale x 4 x i1>)
306340
declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>*, i32, <vscale x 4 x i1>)
307341
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
342+
declare void @llvm.masked.store.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>*, i32, <vscale x 2 x i1>)
343+
declare void @llvm.masked.store.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>*, i32, <vscale x 4 x i1>)
308344
declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>)
309345

310346
declare <vscale x 2 x i8*> @llvm.masked.load.nxv2p0i8.p0nxv2p0i8(<vscale x 2 x i8*>*, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i8*>)

llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-reg.ll

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,18 @@ define void @st1_nxv4f16(half* %addr, i64 %off, <vscale x 4 x half> %val) {
166166
ret void
167167
}
168168

169+
define void @st1_nxv4bf16(bfloat* %addr, i64 %off, <vscale x 4 x bfloat> %val) {
170+
; CHECK-LABEL: st1_nxv4bf16:
171+
; CHECK: // %bb.0:
172+
; CHECK-NEXT: ptrue p0.s
173+
; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1]
174+
; CHECK-NEXT: ret
175+
%ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %off
176+
%ptrcast = bitcast bfloat* %ptr to <vscale x 4 x bfloat>*
177+
store <vscale x 4 x bfloat> %val, <vscale x 4 x bfloat>* %ptrcast
178+
ret void
179+
}
180+
169181
define void @st1_nxv2f16(half* %addr, i64 %off, <vscale x 2 x half> %val) {
170182
; CHECK-LABEL: st1_nxv2f16:
171183
; CHECK: // %bb.0:
@@ -178,6 +190,18 @@ define void @st1_nxv2f16(half* %addr, i64 %off, <vscale x 2 x half> %val) {
178190
ret void
179191
}
180192

193+
define void @st1_nxv2bf16(bfloat* %addr, i64 %off, <vscale x 2 x bfloat> %val) {
194+
; CHECK-LABEL: st1_nxv2bf16:
195+
; CHECK: // %bb.0:
196+
; CHECK-NEXT: ptrue p0.d
197+
; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1]
198+
; CHECK-NEXT: ret
199+
%ptr = getelementptr inbounds bfloat, bfloat* %addr, i64 %off
200+
%ptrcast = bitcast bfloat* %ptr to <vscale x 2 x bfloat>*
201+
store <vscale x 2 x bfloat> %val, <vscale x 2 x bfloat>* %ptrcast
202+
ret void
203+
}
204+
181205
; ST1W
182206

183207
define void @st1_nxv4i32(i32* %addr, i64 %off, <vscale x 4 x i32> %val) {

0 commit comments

Comments
 (0)