Skip to content

Commit 9249f60

Browse files
committed
[AArch64][SVE] Add intrinsics for non-temporal gather-loads/scatter-stores
Summary: This patch adds the following LLVM IR intrinsics for SVE: 1. non-temporal gather loads * @llvm.aarch64.sve.ldnt1.gather * @llvm.aarch64.sve.ldnt1.gather.uxtw * @llvm.aarch64.sve.ldnt1.gather.scalar.offset 2. non-temporal scatter stores * @llvm.aarch64.sve.stnt1.scatter * @llvm.aarch64.sve.stnt1.scatter.uxtw * @llvm.aarch64.sve.stnt1.scatter.scalar.offset These intrinsics are mapped to the corresponding SVE instructions (example for half-words, zero-extending): * ldnt1h { z0.s }, p0/z, [z0.s, x0] * stnt1h { z0.s }, p0, [z0.s, x0] Note that for non-temporal gathers/scatters, the SVE spec defines only one instruction type: "vector + scalar". For this reason, we swap the arguments when processing intrinsics that implement the "scalar + vector" addressing mode: * @llvm.aarch64.sve.ldnt1.gather * @llvm.aarch64.sve.ldnt1.gather.uxtw * @llvm.aarch64.sve.stnt1.scatter * @llvm.aarch64.sve.stnt1.scatter.uxtw In other words, all intrinsics for gather-loads and scatter-stores implemented in this patch are mapped to the same load and store instruction, respectively. The sve2_mem_gldnt_vs multiclass (and its counterpart for scatter stores) from SVEInstrFormats.td was split into: * sve2_mem_gldnt_vs_32_ptrs (32bit wide base addresses) * sve2_mem_gldnt_vs_64_ptrs (64bit wide base addresses) This is consistent with what we did for @llvm.aarch64.sve.ld1.scalar.offset and highlights the actual split in the spec and the implementation. Reviewed by: sdesmalen Differential Revision: https://reviews.llvm.org/D74858
1 parent 1a8cbfa commit 9249f60

11 files changed

+826
-34
lines changed

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1760,6 +1760,22 @@ def int_aarch64_sve_ldff1_gather_uxtw_index : AdvSIMD_GatherLoad_SV_32b_Offsets_
17601760
def int_aarch64_sve_ldff1_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic;
17611761

17621762

1763+
//
1764+
// Non-temporal gather loads: scalar base + vector offsets
1765+
//
1766+
1767+
// 64 bit unscaled offsets
1768+
def int_aarch64_sve_ldnt1_gather : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic;
1769+
1770+
// 32 bit unscaled offsets, zero (zxtw) extended to 64 bits
1771+
def int_aarch64_sve_ldnt1_gather_uxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;
1772+
1773+
//
1774+
// Non-temporal gather loads: vector base + scalar offset
1775+
//
1776+
1777+
def int_aarch64_sve_ldnt1_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic;
1778+
17631779
//
17641780
// Scatter stores: scalar base + vector offsets
17651781
//
@@ -1791,6 +1807,22 @@ def int_aarch64_sve_st1_scatter_uxtw_index
17911807

17921808
def int_aarch64_sve_st1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsic;
17931809

1810+
//
1811+
// Non-temporal scatter stores: scalar base + vector offsets
1812+
//
1813+
1814+
// 64 bit unscaled offsets
1815+
def int_aarch64_sve_stnt1_scatter : AdvSIMD_ScatterStore_SV_64b_Offsets_Intrinsic;
1816+
1817+
// 32 bit unscaled offsets, zero (zxtw) extended to 64 bits
1818+
def int_aarch64_sve_stnt1_scatter_uxtw : AdvSIMD_ScatterStore_SV_32b_Offsets_Intrinsic;
1819+
1820+
//
1821+
// Non-temporal scatter stores: vector base + scalar offset
1822+
//
1823+
1824+
def int_aarch64_sve_stnt1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsic;
1825+
17941826
//
17951827
// SVE2 - Uniform DSP operations
17961828
//

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1438,13 +1438,20 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
14381438
case AArch64ISD::GLDFF1S_UXTW_SCALED:
14391439
return "AArch64ISD::GLDFF1S_UXTW_SCALED";
14401440
case AArch64ISD::GLDFF1S_IMM: return "AArch64ISD::GLDFF1S_IMM";
1441+
1442+
case AArch64ISD::GLDNT1: return "AArch64ISD::GLDNT1";
1443+
case AArch64ISD::GLDNT1S: return "AArch64ISD::GLDNT1S";
1444+
14411445
case AArch64ISD::SST1: return "AArch64ISD::SST1";
14421446
case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED";
14431447
case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW";
14441448
case AArch64ISD::SST1_UXTW: return "AArch64ISD::SST1_UXTW";
14451449
case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED";
14461450
case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED";
14471451
case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM";
1452+
1453+
case AArch64ISD::SSTNT1: return "AArch64ISD::SSTNT1";
1454+
14481455
case AArch64ISD::LDP: return "AArch64ISD::LDP";
14491456
case AArch64ISD::STP: return "AArch64ISD::STP";
14501457
case AArch64ISD::STNP: return "AArch64ISD::STNP";
@@ -10457,6 +10464,7 @@ static SDValue performSVEAndCombine(SDNode *N,
1045710464
case AArch64ISD::GLDFF1_UXTW:
1045810465
case AArch64ISD::GLDFF1_UXTW_SCALED:
1045910466
case AArch64ISD::GLDFF1_IMM:
10467+
case AArch64ISD::GLDNT1:
1046010468
MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
1046110469
break;
1046210470
default:
@@ -12644,6 +12652,14 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
1264412652
// vector of offsets (that fits into one register)
1264512653
SDValue Offset = N->getOperand(5);
1264612654

12655+
// In the case of non-temporal scatter stores there's only one SVE instruction
12656+
// per data-size: "scalar + vector", i.e.
12657+
// * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
12658+
// Since we do have intrinsics that allow the arguments to be in a different
12659+
// order, we may need to swap them to match the spec.
12660+
if (Opcode == AArch64ISD::SSTNT1 && Offset.getValueType().isVector())
12661+
std::swap(Base, Offset);
12662+
1264712663
// SST1_IMM requires that the offset is an immediate that is:
1264812664
// * a multiple of #SizeInBytes,
1264912665
// * in the range [0, 31 x #SizeInBytes],
@@ -12730,6 +12746,14 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
1273012746
// vector of offsets (that fits into one register)
1273112747
SDValue Offset = N->getOperand(4);
1273212748

12749+
// In the case of non-temporal gather loads there's only one SVE instruction
12750+
// per data-size: "scalar + vector", i.e.
12751+
// * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
12752+
// Since we do have intrinsics that allow the arguments to be in a different
12753+
// order, we may need to swap them to match the spec.
12754+
if (Opcode == AArch64ISD::GLDNT1 && Offset.getValueType().isVector())
12755+
std::swap(Base, Offset);
12756+
1273312757
// GLD{FF}1_IMM requires that the offset is an immediate that is:
1273412758
// * a multiple of #SizeInBytes,
1273512759
// * in the range [0, 31 x #SizeInBytes],
@@ -12859,6 +12883,9 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
1285912883
case AArch64ISD::GLDFF1_IMM:
1286012884
NewOpc = AArch64ISD::GLDFF1S_IMM;
1286112885
break;
12886+
case AArch64ISD::GLDNT1:
12887+
NewOpc = AArch64ISD::GLDNT1S;
12888+
break;
1286212889
default:
1286312890
return SDValue();
1286412891
}
@@ -12972,12 +12999,24 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
1297212999
return performNEONPostLDSTCombine(N, DCI, DAG);
1297313000
case Intrinsic::aarch64_sve_ldnt1:
1297413001
return performLDNT1Combine(N, DAG);
13002+
case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
13003+
return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
13004+
case Intrinsic::aarch64_sve_ldnt1_gather:
13005+
return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
13006+
case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
13007+
return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
1297513008
case Intrinsic::aarch64_sve_ldnf1:
1297613009
return performLDNF1Combine(N, DAG, AArch64ISD::LDNF1);
1297713010
case Intrinsic::aarch64_sve_ldff1:
1297813011
return performLDNF1Combine(N, DAG, AArch64ISD::LDFF1);
1297913012
case Intrinsic::aarch64_sve_stnt1:
1298013013
return performSTNT1Combine(N, DAG);
13014+
case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
13015+
return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1);
13016+
case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
13017+
return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1);
13018+
case Intrinsic::aarch64_sve_stnt1_scatter:
13019+
return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1);
1298113020
case Intrinsic::aarch64_sve_ld1_gather:
1298213021
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1);
1298313022
case Intrinsic::aarch64_sve_ld1_gather_index:

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,10 @@ enum NodeType : unsigned {
261261
GLDFF1S_SXTW_SCALED,
262262
GLDFF1S_IMM,
263263

264+
// Non-temporal gather loads
265+
GLDNT1,
266+
GLDNT1S,
267+
264268
// Scatter store
265269
SST1,
266270
SST1_SCALED,
@@ -270,6 +274,9 @@ enum NodeType : unsigned {
270274
SST1_SXTW_SCALED,
271275
SST1_IMM,
272276

277+
// Non-temporal scatter store
278+
SSTNT1,
279+
273280
// Strict (exception-raising) floating point comparison
274281
STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
275282
STRICT_FCMPE,

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ def AArch64ldff1s_gather_uxtw_scaled : SDNode<"AArch64ISD::GLDFF1S_UXTW_SCALED",
6969
def AArch64ldff1s_gather_sxtw_scaled : SDNode<"AArch64ISD::GLDFF1S_SXTW_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
7070
def AArch64ldff1s_gather_imm : SDNode<"AArch64ISD::GLDFF1S_IMM", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
7171

72+
def AArch64ldnt1_gather : SDNode<"AArch64ISD::GLDNT1", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
73+
def AArch64ldnt1s_gather : SDNode<"AArch64ISD::GLDNT1S", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
74+
7275
// Scatter stores - node definitions
7376
//
7477
def SDT_AArch64_SCATTER_SV : SDTypeProfile<0, 5, [
@@ -89,6 +92,8 @@ def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED", SDT_
8992
def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
9093
def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
9194

95+
def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>;
96+
9297
// AArch64 SVE/SVE2 - the remaining node definitions
9398
//
9499

@@ -1909,32 +1914,32 @@ let Predicates = [HasSVE2] in {
19091914
def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;
19101915

19111916
// SVE2 non-temporal gather loads
1912-
defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>;
1913-
defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>;
1914-
defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>;
1915-
defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>;
1916-
defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>;
1917-
1918-
defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>;
1919-
defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>;
1920-
defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>;
1921-
defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>;
1922-
defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>;
1923-
defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>;
1924-
defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>;
1917+
defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00000, "ldnt1sb", AArch64ldnt1s_gather, nxv4i8>;
1918+
defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00001, "ldnt1b", AArch64ldnt1_gather, nxv4i8>;
1919+
defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00100, "ldnt1sh", AArch64ldnt1s_gather, nxv4i16>;
1920+
defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00101, "ldnt1h", AArch64ldnt1_gather, nxv4i16>;
1921+
defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b01001, "ldnt1w", AArch64ldnt1_gather, nxv4i32>;
1922+
1923+
defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10000, "ldnt1sb", AArch64ldnt1s_gather, nxv2i8>;
1924+
defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10010, "ldnt1b", AArch64ldnt1_gather, nxv2i8>;
1925+
defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10100, "ldnt1sh", AArch64ldnt1s_gather, nxv2i16>;
1926+
defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10110, "ldnt1h", AArch64ldnt1_gather, nxv2i16>;
1927+
defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11000, "ldnt1sw", AArch64ldnt1s_gather, nxv2i32>;
1928+
defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11010, "ldnt1w", AArch64ldnt1_gather, nxv2i32>;
1929+
defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather, nxv2i64>;
19251930

19261931
// SVE2 vector splice (constructive)
19271932
defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;
19281933

19291934
// SVE2 non-temporal scatter stores
1930-
defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;
1931-
defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>;
1932-
defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>;
1933-
1934-
defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>;
1935-
defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>;
1936-
defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>;
1937-
defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>;
1935+
defm STNT1B_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b001, "stnt1b", AArch64stnt1_scatter, nxv4i8>;
1936+
defm STNT1H_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b011, "stnt1h", AArch64stnt1_scatter, nxv4i16>;
1937+
defm STNT1W_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b101, "stnt1w", AArch64stnt1_scatter, nxv4i32>;
1938+
1939+
defm STNT1B_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b000, "stnt1b", AArch64stnt1_scatter, nxv2i8>;
1940+
defm STNT1H_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b010, "stnt1h", AArch64stnt1_scatter, nxv2i16>;
1941+
defm STNT1W_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b100, "stnt1w", AArch64stnt1_scatter, nxv2i32>;
1942+
defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>;
19381943

19391944
// SVE2 table lookup (three sources)
19401945
defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>;

llvm/lib/Target/AArch64/SVEInstrFormats.td

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5071,16 +5071,36 @@ class sve2_mem_sstnt_vs_base<bits<3> opc, string asm,
50715071
let mayStore = 1;
50725072
}
50735073

5074-
multiclass sve2_mem_sstnt_vs<bits<3> opc, string asm,
5075-
RegisterOperand listty, ZPRRegOp zprty> {
5076-
def _REAL : sve2_mem_sstnt_vs_base<opc, asm, listty, zprty>;
5074+
multiclass sve2_mem_sstnt_vs_32_ptrs<bits<3> opc, string asm,
5075+
SDPatternOperator op,
5076+
ValueType vt> {
5077+
def _REAL : sve2_mem_sstnt_vs_base<opc, asm, Z_s, ZPR32>;
50775078

50785079
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
5079-
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
5080+
(!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>;
50805081
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
5081-
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
5082+
(!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>;
50825083
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
5083-
(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
5084+
(!cast<Instruction>(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>;
5085+
5086+
def : Pat <(op (nxv4i32 ZPR32:$Zt), (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zn), (i64 GPR64:$Rm), vt),
5087+
(!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm)>;
5088+
}
5089+
5090+
multiclass sve2_mem_sstnt_vs_64_ptrs<bits<3> opc, string asm,
5091+
SDPatternOperator op,
5092+
ValueType vt> {
5093+
def _REAL : sve2_mem_sstnt_vs_base<opc, asm, Z_d, ZPR64>;
5094+
5095+
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
5096+
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>;
5097+
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
5098+
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>;
5099+
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
5100+
(!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>;
5101+
5102+
def : Pat <(op (nxv2i64 ZPR64:$Zt), (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64:$Rm), vt),
5103+
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>;
50845104
}
50855105

50865106
class sve_mem_sst_sv<bits<3> opc, bit xs, bit scaled, string asm,
@@ -6529,17 +6549,38 @@ class sve2_mem_gldnt_vs_base<bits<5> opc, dag iops, string asm,
65296549
let mayLoad = 1;
65306550
}
65316551

6532-
multiclass sve2_mem_gldnt_vs<bits<5> opc, string asm,
6533-
RegisterOperand listty, ZPRRegOp zprty> {
6534-
def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
6535-
asm, listty>;
6552+
multiclass sve2_mem_gldnt_vs_32_ptrs<bits<5> opc, string asm,
6553+
SDPatternOperator op,
6554+
ValueType vt> {
6555+
def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm),
6556+
asm, Z_s>;
6557+
6558+
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
6559+
(!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>;
6560+
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
6561+
(!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>;
6562+
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
6563+
(!cast<Instruction>(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>;
6564+
6565+
def : Pat <(nxv4i32 (op (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zd), (i64 GPR64:$Rm), vt)),
6566+
(!cast<Instruction>(NAME # _REAL) PPR3bAny:$Pg, ZPR32:$Zd, GPR64:$Rm)>;
6567+
}
6568+
6569+
multiclass sve2_mem_gldnt_vs_64_ptrs<bits<5> opc, string asm,
6570+
SDPatternOperator op,
6571+
ValueType vt> {
6572+
def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm),
6573+
asm, Z_d>;
65366574

65376575
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
6538-
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
6576+
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>;
65396577
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
6540-
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
6578+
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>;
65416579
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
6542-
(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
6580+
(!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>;
6581+
6582+
def : Pat <(nxv2i64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zd), (i64 GPR64:$Rm), vt)),
6583+
(!cast<Instruction>(NAME # _REAL) PPR3bAny:$Pg, ZPR64:$Zd, GPR64:$Rm)>;
65436584
}
65446585

65456586
//===----------------------------------------------------------------------===//

0 commit comments

Comments
 (0)