Skip to content

[AArch64] Add SchedReadAdvance to Neoverse-V1 scheduling model. #111538

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 155 additions & 52 deletions llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,89 @@ def V1Write_11c_9L01_9S_9V : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01,
V1UnitV, V1UnitV, V1UnitV,
V1UnitV, V1UnitV, V1UnitV]>;

//===----------------------------------------------------------------------===//
// Define forwarded types
//
// Each V1Wr_* write defined in this section has a companion V1Rd_*
// SchedReadAdvance that names it as a producer. A read advance of N lets the
// operand using that read see the listed writes N cycles before their nominal
// Latency has elapsed.

// NOTE: SOG, p. 20, n. 2: Accumulator forwarding is not supported for
// consumers of 64-bit multiply high operations? (Open question, unconfirmed.)

// Integer multiply write issued on V1UnitM, 2-cycle latency.
def V1Wr_IM : SchedWriteRes<[V1UnitM]> { let Latency = 2; }
// Integer multiply(-accumulate) write issued on V1UnitM0, 2-cycle latency.
def V1Wr_IMA : SchedWriteRes<[V1UnitM0]> { let Latency = 2; }
// Selects V1Wr_IM when NeoverseMULIdiomPred holds and V1Wr_IMA otherwise.
// NOTE(review): presumably the predicate matches the plain-multiply idiom of
// MADD/MSUB; confirm against its definition.
def V1WriteIM : SchedWriteVariant<
[SchedVar<NeoverseMULIdiomPred, [V1Wr_IM]>,
SchedVar<NoSchedPred, [V1Wr_IMA]>]>;
// Read that is advanced by 1 cycle, but only when the producer is V1Wr_IMA.
def V1Rd_IMA : SchedReadAdvance<1, [V1Wr_IMA]>;

// FP multiply accumulate: 4-cycle write on V1UnitV. The read is advanced by
// 2 cycles when fed by either WriteFMul or another V1Wr_FMA.
def V1Wr_FMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
def V1Rd_FMA : SchedReadAdvance<2, [WriteFMul, V1Wr_FMA]>;

// ASIMD absolute diff accum / pairwise add and accumulate long:
// 4-cycle write on V1UnitV13, read advanced by 3 cycles from itself.
def V1Wr_ADA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; }
def V1Rd_ADA : SchedReadAdvance<3, [V1Wr_ADA]>;

// ASIMD dot product: 3-cycle write on V1UnitV, read advanced by 2 cycles.
// NOTE(review): the InstRW this supersedes used V1Write_2c_1V; confirm the
// 3-cycle latency against the SOG.
def V1Wr_VDOT : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
def V1Rd_VDOT : SchedReadAdvance<2, [V1Wr_VDOT]>;

// ASIMD matrix multiply-accumulate: 3-cycle write on V1UnitV, read advanced
// by 2 cycles.
def V1Wr_VMMA : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
def V1Rd_VMMA : SchedReadAdvance<2, [V1Wr_VMMA]>;

// ASIMD multiply accumulate: 4-cycle write on V1UnitV02, read advanced by
// 3 cycles.
def V1Wr_VMA : SchedWriteRes<[V1UnitV02]> { let Latency = 4; }
def V1Rd_VMA : SchedReadAdvance<3, [V1Wr_VMA]>;

// ASIMD multiply accumulate long: 4-cycle write on V1UnitV02, read advanced
// by 3 cycles.
def V1Wr_VMAL : SchedWriteRes<[V1UnitV02]> { let Latency = 4; }
def V1Rd_VMAL : SchedReadAdvance<3, [V1Wr_VMAL]>;

// ASIMD shift accumulate: 4-cycle write on V1UnitV13, read advanced by
// 3 cycles.
def V1Wr_VSA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; }
def V1Rd_VSA : SchedReadAdvance<3, [V1Wr_VSA]>;

// ASIMD FP complex multiply add: 4-cycle write on V1UnitV, read advanced by
// 2 cycles.
def V1Wr_FCMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
def V1Rd_FCMA : SchedReadAdvance<2, [V1Wr_FCMA]>;

// ASIMD FP multiply (V1Wr_FPM, 3 cycles) and FP multiply accumulate
// (V1Wr_FPMA, 4 cycles), both on V1UnitV. The shared read is advanced by
// 2 cycles when fed by either of the two writes.
def V1Wr_FPM : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
def V1Wr_FPMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
def V1Rd_FPMA : SchedReadAdvance<2, [V1Wr_FPM, V1Wr_FPMA]>;

// ASIMD FP multiply accumulate long: 5-cycle write on V1UnitV, read advanced
// by 3 cycles.
def V1Wr_FPMAL : SchedWriteRes<[V1UnitV]> { let Latency = 5; }
def V1Rd_FPMAL : SchedReadAdvance<3, [V1Wr_FPMAL]>;

// ASIMD BFloat16 dot product: 4-cycle write on V1UnitV, read advanced by
// 2 cycles.
def V1Wr_BFD : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
def V1Rd_BFD : SchedReadAdvance<2, [V1Wr_BFD]>;

// ASIMD BFloat16 matrix multiply accumulate: 5-cycle write on V1UnitV, read
// advanced by 2 cycles.
def V1Wr_BFMMA : SchedWriteRes<[V1UnitV]> { let Latency = 5; }
def V1Rd_BFMMA : SchedReadAdvance<2, [V1Wr_BFMMA]>;

// ASIMD BFloat16 multiply accumulate long: 4-cycle write on V1UnitV, read
// advanced by 2 cycles.
def V1Wr_BFMLA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
def V1Rd_BFMLA : SchedReadAdvance<2, [V1Wr_BFMLA]>;

// CRC checksum ops: 2-cycle write on V1UnitM0, read advanced by 1 cycle.
def V1Wr_CRC : SchedWriteRes<[V1UnitM0]> { let Latency = 2; }
def V1Rd_CRC : SchedReadAdvance<1, [V1Wr_CRC]>;

// SVE dot product, 8 bit: 3-cycle write on V1UnitV01, read advanced by
// 2 cycles.
def V1Wr_ZDOTB : SchedWriteRes<[V1UnitV01]> { let Latency = 3; }
def V1Rd_ZDOTB : SchedReadAdvance<2, [V1Wr_ZDOTB]>;

// SVE dot product, 8 bit, using signed and unsigned integers: 3-cycle write
// on V1UnitV, read advanced by 2 cycles.
def V1Wr_ZUDOTB : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
def V1Rd_ZUDOTB : SchedReadAdvance<2, [V1Wr_ZUDOTB]>;

// SVE dot product, 16 bit: 4-cycle write on V1UnitV0, read advanced by
// 3 cycles.
def V1Wr_ZDOTH : SchedWriteRes<[V1UnitV0]> { let Latency = 4; }
def V1Rd_ZDOTH : SchedReadAdvance<3, [V1Wr_ZDOTH]>;

// SVE matrix multiply-accumulate: 3-cycle write on V1UnitV01, read advanced
// by 2 cycles.
def V1Wr_ZMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 3; }
def V1Rd_ZMMA : SchedReadAdvance<2, [V1Wr_ZMMA]>;

// SVE multiply accumulate, D element size: two micro-ops, each occupying
// V1UnitV0, with a 5-cycle latency; read advanced by 3 cycles.
let Latency = 5, NumMicroOps = 2 in
def V1Wr_ZMAD : SchedWriteRes<[V1UnitV0, V1UnitV0]>;
def V1Rd_ZMAD : SchedReadAdvance<3, [V1Wr_ZMAD]>;

// SVE floating point complex multiply add: 5-cycle write on V1UnitV01, read
// advanced by 3 cycles.
def V1Wr_ZFCMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; }
def V1Rd_ZFCMA : SchedReadAdvance<3, [V1Wr_ZFCMA]>;

// SVE floating point multiply accumulate: 4-cycle write on V1UnitV01, read
// advanced by 2 cycles.
def V1Wr_ZFMA : SchedWriteRes<[V1UnitV01]> { let Latency = 4; }
def V1Rd_ZFMA : SchedReadAdvance<2, [V1Wr_ZFMA]>;

// SVE BFloat16 dot product (4 cycles, advance 2), matrix multiply accumulate
// (5 cycles, advance 2) and multiply accumulate long (5 cycles, advance 3),
// all on V1UnitV01.
def V1Wr_ZBFDOT : SchedWriteRes<[V1UnitV01]> { let Latency = 4; }
def V1Rd_ZBFDOT : SchedReadAdvance<2, [V1Wr_ZBFDOT]>;
def V1Wr_ZBFMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; }
def V1Rd_ZBFMMA : SchedReadAdvance<2, [V1Wr_ZBFMMA]>;
def V1Wr_ZBFMAL : SchedWriteRes<[V1UnitV01]> { let Latency = 5; }
def V1Rd_ZBFMAL : SchedReadAdvance<3, [V1Wr_ZBFMAL]>;

// Miscellaneous Instructions
// -----------------------------------------------------------------------------
Expand Down Expand Up @@ -553,16 +636,19 @@ def : InstRW<[V1Write_1c_1J], (instrs SETF8, SETF16, RMIF, CFINV)>;
def : SchedAlias<WriteID32, V1Write_12c5_1M0>;
def : SchedAlias<WriteID64, V1Write_20c5_1M0>;

def : SchedAlias<WriteIM32, V1Write_2c_1M>;
def : SchedAlias<WriteIM64, V1Write_2c_1M>;

// Multiply
// Multiply accumulate
// Multiply accumulate, long
// Multiply long
def V1WriteIM : SchedWriteVariant<
[SchedVar<NeoverseMULIdiomPred, [V1Write_2c_1M]>,
SchedVar<NoSchedPred, [V1Write_2c_1M0]>]>;
def : SchedAlias<WriteIM32, V1WriteIM>;
def : SchedAlias<WriteIM64, V1WriteIM>;
// Multiply accumulate, W-form
// Multiply accumulate, X-form
def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_IMA],
(instregex "^M(ADD|SUB)[WX]rrr$")>;

// Multiply accumulate long
// Multiply long
def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_IMA],
(instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
// Multiply high
def : InstRW<[V1Write_3c_1M, ReadIM, ReadIM], (instrs SMULHrr, UMULHrr)>;

Expand Down Expand Up @@ -680,10 +766,11 @@ def : InstRW<[V1Write_15c7_1V02], (instrs FDIVDrr)>;
def : InstRW<[V1Write_16c7_1V02], (instrs FSQRTDr)>;

// FP multiply
def : SchedAlias<WriteFMul, V1Write_3c_1V>;
def : WriteRes<WriteFMul, [V1UnitV]> { let Latency = 3; }

// FP multiply accumulate
def : InstRW<[V1Write_4c_1V], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
def : InstRW<[V1Wr_FMA, ReadDefault, ReadDefault, V1Rd_FMA],
(instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;

// FP round to integral
def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ][HSD]r$",
Expand Down Expand Up @@ -824,7 +911,7 @@ def : SchedAlias<WriteVq, V1Write_2c_1V>;
// ASIMD absolute diff accum
// ASIMD absolute diff accum long
// ASIMD pairwise add and accumulate long
def : InstRW<[V1Write_4c_1V13], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>;
def : InstRW<[V1Wr_ADA, V1Rd_ADA], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>;

// ASIMD arith, reduce, 4H/4S
// ASIMD max/min, reduce, 4H/4S
Expand All @@ -843,23 +930,26 @@ def : InstRW<[V1Write_4c_2V13], (instregex "^(ADD|[SU]ADDL)Vv16i8v$",

// ASIMD dot product
// ASIMD dot product using signed and unsigned integers
def : InstRW<[V1Write_2c_1V], (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>;
def : InstRW<[V1Wr_VDOT, V1Rd_VDOT],
(instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>;

// ASIMD matrix multiply- accumulate
def : InstRW<[V1Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>;
// ASIMD matrix multiply-accumulate
def : InstRW<[V1Wr_VMMA, V1Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;

// ASIMD multiply
def : InstRW<[V1Write_4c_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>;

// ASIMD multiply accumulate
def : InstRW<[V1Wr_VMA, V1Rd_VMA], (instregex "^MLAv", "^MLSv")>;

// ASIMD multiply accumulate long
def : InstRW<[V1Wr_VMAL, V1Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;

// ASIMD multiply accumulate high
def : InstRW<[V1Write_4c_1V02], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;

// ASIMD multiply accumulate saturating long
def : InstRW<[V1Write_4c_1V02],
(instregex "^MUL(v[148]i16|v[124]i32)$",
"^SQR?DMULH(v[48]i16|v[24]i32)$",
"^ML[AS](v[148]i16|v[124]i32)$",
"^[SU]ML[AS]Lv",
"^SQRDML[AS]H(v[148]i16|v[124]i32)$",
"^SQDML[AS]Lv")>;
def : InstRW<[V1Write_4c_1V02], (instregex "^SQDML[AS]L[iv]")>;

// ASIMD multiply/multiply long (8x8) polynomial
def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>;
Expand All @@ -868,11 +958,12 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>;
def : InstRW<[V1Write_3c_1V02], (instregex "^([SU]|SQD)MULLv")>;

// ASIMD shift accumulate
def : InstRW<[V1Wr_VSA, V1Rd_VSA], (instregex "^[SU]SRAv", "^[SU]RSRAv")>;

// ASIMD shift by immed, complex
// ASIMD shift by register, complex
def : InstRW<[V1Write_4c_1V13],
(instregex "^[SU]R?SRAv",
"^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$",
(instregex "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$",
"^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$",
"^SQSHU?RNv", "^[SU]RSHRv", "^UQR?SHRNv",
"^[SU]Q?RSHLv", "^[SU]QSHLv")>;
Expand All @@ -890,16 +981,25 @@ def : InstRW<[V1Write_2c_1V13], (instregex "^SHLL?v", "^SHRNv", "^[SU]SHLLv",
// ASIMD FP absolute value/difference
// ASIMD FP arith, normal
// ASIMD FP compare
// ASIMD FP complex add
// ASIMD FP max/min, normal
// ASIMD FP max/min, pairwise
// ASIMD FP negate
// Covered by "SchedAlias (WriteV[dq]...)" above

// ASIMD FP complex add
def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$")>;

// ASIMD FP complex multiply add
def : InstRW<[V1Wr_FCMA, V1Rd_FCMA], (instregex "^FCMLAv")>;

// ASIMD FP multiply
def : InstRW<[V1Wr_FPM], (instregex "^FMULX?v")>;

// ASIMD FP multiply accumulate
def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$",
"^FML[AS]v")>;
def : InstRW<[V1Wr_FPMA, V1Rd_FPMA], (instregex "^FML[AS]v")>;

// ASIMD FP multiply accumulate long
def : InstRW<[V1Wr_FPMAL, V1Rd_FPMAL], (instregex "^FML[AS]L2?v")>;

// ASIMD FP convert, long (F16 to F32)
def : InstRW<[V1Write_4c_2V02], (instregex "^FCVTLv[48]i16$")>;
Expand Down Expand Up @@ -953,12 +1053,6 @@ def : InstRW<[V1Write_4c_2V], (instregex "^F(MAX|MIN)(NM)?Vv4(i16|i32)v$")>;
// ASIMD FP max/min, reduce, Q-form F16
def : InstRW<[V1Write_6c_3V], (instregex "^F(MAX|MIN)(NM)?Vv8i16v$")>;

// ASIMD FP multiply
def : InstRW<[V1Write_3c_1V], (instregex "^FMULX?v")>;

// ASIMD FP multiply accumulate long
def : InstRW<[V1Write_5c_1V], (instregex "^FML[AS]L2?v")>;

// ASIMD FP round, D-form F32 and Q-form F64
def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ]v2f(32|64)$")>;

Expand All @@ -976,13 +1070,13 @@ def : InstRW<[V1Write_6c_4V02], (instregex "^FRINT[AIMNPXZ]v8f16$")>;
def : InstRW<[V1Write_4c_1V02], (instrs BFCVTN, BFCVTN2)>;

// ASIMD dot product
def : InstRW<[V1Write_4c_1V], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>;
def : InstRW<[V1Wr_BFD, V1Rd_BFD], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>;

// ASIMD matrix multiply accumulate
def : InstRW<[V1Write_5c_1V], (instrs BFMMLA)>;
def : InstRW<[V1Wr_BFMMA, V1Rd_BFMMA], (instrs BFMMLA)>;

// ASIMD multiply accumulate long
def : InstRW<[V1Write_4c_1V], (instregex "^BFMLAL[BT](Idx)?$")>;
def : InstRW<[V1Wr_BFMLA, V1Rd_BFMLA], (instregex "^BFMLAL[BT](Idx)?$")>;

// Scalar convert, F32 to BF16
def : InstRW<[V1Write_3c_1V02], (instrs BFCVT)>;
Expand Down Expand Up @@ -1300,7 +1394,7 @@ def : InstRW<[V1Write_2c_1V0], (instrs BCAX, EOR3, RAX1, XAR)>;
// -----------------------------------------------------------------------------

// CRC checksum ops
def : InstRW<[V1Write_2c_1M0], (instregex "^CRC32C?[BHWX]rr$")>;
def : InstRW<[V1Wr_CRC, V1Rd_CRC], (instregex "^CRC32C?[BHWX]rr$")>;


// SVE Predicate instructions
Expand Down Expand Up @@ -1440,13 +1534,14 @@ def : InstRW<[V1Write_20c7_1V0], (instregex "^[SU]DIVR?_ZPmZ_D",
"^[SU]DIV_ZPZZ_D")>;

// Dot product, 8 bit
def : InstRW<[V1Write_3c_1V01], (instregex "^[SU]DOT_ZZZI?_S$")>;
def : InstRW<[V1Wr_ZDOTB, V1Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_S$")>;

// Dot product, 8 bit, using signed and unsigned integers
def : InstRW<[V1Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>;
def : InstRW<[V1Wr_ZUDOTB, V1Rd_ZUDOTB],
(instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>;

// Dot product, 16 bit
def : InstRW<[V1Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_D$")>;
def : InstRW<[V1Wr_ZDOTH, V1Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_D$")>;

// Duplicate, immediate and indexed form
def : InstRW<[V1Write_2c_1V01], (instregex "^DUP_ZI_[BHSD]$",
Expand Down Expand Up @@ -1488,7 +1583,7 @@ def : InstRW<[V1Write_2c_1V01], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$",
"^MOVPRFX_ZZ$")>;

// Matrix multiply-accumulate
def : InstRW<[V1Write_3c_1V01], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
def : InstRW<[V1Wr_ZMMA, V1Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;

// Multiply, B, H, S element size
def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]",
Expand All @@ -1497,12 +1592,16 @@ def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]",
"^[SU]MULH_ZPZZ_[BHS]")>;

// Multiply, D element size
// Multiply accumulate, D element size
def : InstRW<[V1Write_5c_2V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D",
"^MUL_ZPZZ_D",
"^[SU]MULH_(ZPmZ|ZZZ)_D",
"^[SU]MULH_ZPZZ_D",
"^(MLA|MLS|MAD|MSB)_(ZPmZZ|ZPZZZ)_D")>;
"^[SU]MULH_ZPZZ_D")>;

// Multiply accumulate, D element size
def : InstRW<[V1Wr_ZMAD, V1Rd_ZMAD],
(instregex "^ML[AS]_ZPZZZ_D")>;
def : InstRW<[V1Wr_ZMAD, ReadDefault, V1Rd_ZMAD],
(instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>;

// Multiply accumulate, B, H, S element size
// NOTE: This is not specified in the SOG.
Expand Down Expand Up @@ -1583,8 +1682,8 @@ def : InstRW<[V1Write_2c_1V0], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$",
def : InstRW<[V1Write_3c_1V01], (instregex "^FCADD_ZPmZ_[HSD]$")>;

// Floating point complex multiply add
def : InstRW<[V1Write_5c_1V01], (instregex "^FCMLA_ZPmZZ_[HSD]$",
"^FCMLA_ZZZI_[HS]$")>;
def : InstRW<[V1Wr_ZFCMA, ReadDefault, V1Rd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>;
def : InstRW<[V1Wr_ZFCMA, V1Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>;

// Floating point convert, long or narrow (F16 to F32 or F32 to F16)
// Floating point convert to integer, F32
Expand Down Expand Up @@ -1623,11 +1722,15 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]",
"^FMUL_ZPZ[IZ]_[HSD]")>;

// Floating point multiply accumulate
def : InstRW<[V1Wr_ZFMA, ReadDefault, V1Rd_ZFMA],
(instregex "^FN?ML[AS]_ZPmZZ_[HSD]",
"^FN?(MAD|MSB)_ZPmZZ_[HSD]")>;
def : InstRW<[V1Wr_ZFMA, V1Rd_ZFMA],
(instregex "^FML[AS]_ZZZI_[HSD]",
"^FN?ML[AS]_ZPZZZ_[HSD]")>;

// Floating point reciprocal step
def : InstRW<[V1Write_4c_1V01], (instregex "^F(N?M(AD|SB)|N?ML[AS])_ZPmZZ_[HSD]$",
"^FN?ML[AS]_ZPZZZ_[HSD]",
"^FML[AS]_ZZZI_[HSD]$",
"^F(RECPS|RSQRTS)_ZZZ_[HSD]$")>;
def : InstRW<[V1Write_4c_1V01], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>;

// Floating point reciprocal estimate, F16
def : InstRW<[V1Write_6c_4V0], (instrs FRECPE_ZZ_H, FRSQRTE_ZZ_H)>;
Expand Down Expand Up @@ -1681,13 +1784,13 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^FEXPA_ZZ_[HSD]$",
def : InstRW<[V1Write_4c_1V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;

// Dot product
def : InstRW<[V1Write_4c_1V01], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
def : InstRW<[V1Wr_ZBFDOT, V1Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;

// Matrix multiply accumulate
def : InstRW<[V1Write_5c_1V01], (instrs BFMMLA_ZZZ)>;
def : InstRW<[V1Wr_ZBFMMA, V1Rd_ZBFMMA], (instrs BFMMLA_ZZZ)>;

// Multiply accumulate long
def : InstRW<[V1Write_5c_1V01], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
def : InstRW<[V1Wr_ZBFMAL, V1Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;


// SVE Load instructions
Expand Down
Loading
Loading