Skip to content

[AArch64] Fix scheduling information for arithmetic and logical instructions. #113542

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ def V1UnitV0 : ProcResource<1>; // FP/ASIMD 0
// FP/ASIMD resources, one unit each. The ProcResource<N> argument is the
// number of units the resource provides.
def V1UnitV1 : ProcResource<1>; // FP/ASIMD 1
def V1UnitV2 : ProcResource<1>; // FP/ASIMD 2
def V1UnitV3 : ProcResource<1>; // FP/ASIMD 3
// Flags resource, declared with three units. The *_1Flg write resources list
// it alongside an integer unit group.
def V1UnitFlg : ProcResource<3>; // Flags

// Resource groups: each ProcResGroup gathers the listed units under one name
// so that a write resource can refer to the set as a whole.
def V1UnitI : ProcResGroup<[V1UnitS,
V1UnitM0, V1UnitM1]>; // Integer units
def V1UnitJ : ProcResGroup<[V1UnitS, V1UnitM0]>; // Integer 0-2 units
def V1UnitM : ProcResGroup<[V1UnitM0, V1UnitM1]>; // Integer multicycle units
def V1UnitL : ProcResGroup<[V1UnitL01, V1UnitL2]>; // Load units
def V1UnitV : ProcResGroup<[V1UnitV0, V1UnitV1,
Expand Down Expand Up @@ -98,13 +98,14 @@ def V1Write_0c_0Z : SchedWriteRes<[]>;

// Write resources for single-micro-op instructions.
// Naming scheme: V1Write_<latency>c_1<Unit>[_1<Unit>...]
//   <latency>  - value assigned to Latency in the definition body.
//   1<Unit>    - one entry per resource in the SchedWriteRes list, e.g.
//                "1I" for V1UnitI, "1L01" for V1UnitL01, "1Flg" for V1UnitFlg.
// The *_1Flg variants consume V1UnitFlg in addition to the integer unit group
// and keep the same latency as their non-Flg counterpart.
def V1Write_1c_1B : SchedWriteRes<[V1UnitB]> { let Latency = 1; }
def V1Write_1c_1I : SchedWriteRes<[V1UnitI]> { let Latency = 1; }
def V1Write_1c_1J : SchedWriteRes<[V1UnitJ]> { let Latency = 1; }
def V1Write_1c_1I_1Flg : SchedWriteRes<[V1UnitI, V1UnitFlg]> { let Latency = 1; }
def V1Write_4c_1L : SchedWriteRes<[V1UnitL]> { let Latency = 4; }
def V1Write_6c_1L : SchedWriteRes<[V1UnitL]> { let Latency = 6; }
def V1Write_1c_1L01 : SchedWriteRes<[V1UnitL01]> { let Latency = 1; }
def V1Write_4c_1L01 : SchedWriteRes<[V1UnitL01]> { let Latency = 4; }
def V1Write_6c_1L01 : SchedWriteRes<[V1UnitL01]> { let Latency = 6; }
def V1Write_2c_1M : SchedWriteRes<[V1UnitM]> { let Latency = 2; }
def V1Write_2c_1M_1Flg : SchedWriteRes<[V1UnitM, V1UnitFlg]> { let Latency = 2; }
def V1Write_3c_1M : SchedWriteRes<[V1UnitM]> { let Latency = 3; }
def V1Write_4c_1M : SchedWriteRes<[V1UnitM]> { let Latency = 4; }
def V1Write_1c_1M0 : SchedWriteRes<[V1UnitM0]> { let Latency = 1; }
Expand Down Expand Up @@ -595,7 +596,7 @@ def : InstRW<[V1Write_1c_1B], (instregex "^[CT]BN?Z[XW]$")>;
def : SchedAlias<WriteI, V1Write_1c_1I>;

// ALU, basic, flagset
def : InstRW<[V1Write_1c_1J],
def : InstRW<[V1Write_1c_1I_1Flg],
(instregex "^(ADD|SUB)S[WX]r[ir]$",
"^(ADC|SBC)S[WX]r$",
"^ANDS[WX]ri$",
Expand All @@ -614,19 +615,19 @@ def : SchedAlias<WriteISReg, V1WriteISReg>;
// Arithmetic, flagset, LSL shift, shift <= 4
// Arithmetic, flagset, LSR/ASR/ROR shift or LSL shift > 4
def V1WriteISRegS : SchedWriteVariant<
[SchedVar<IsCheapLSL, [V1Write_1c_1J]>,
SchedVar<NoSchedPred, [V1Write_2c_1M]>]>;
[SchedVar<IsCheapLSL, [V1Write_1c_1I_1Flg]>,
SchedVar<NoSchedPred, [V1Write_2c_1M_1Flg]>]>;
def : InstRW<[V1WriteISRegS],
(instregex "^(ADD|SUB)S(([WX]r[sx])|Xrx64)$")>;

// Logical, shift, no flagset
def : InstRW<[V1Write_1c_1I], (instregex "^(AND|BIC|EON|EOR|ORN|ORR)[WX]rs$")>;

// Logical, shift, flagset
def : InstRW<[V1Write_2c_1M], (instregex "^(AND|BIC)S[WX]rs$")>;
def : InstRW<[V1Write_2c_1M_1Flg], (instregex "^(AND|BIC)S[WX]rs$")>;

// Flag manipulation instructions
def : InstRW<[V1Write_1c_1J], (instrs SETF8, SETF16, RMIF, CFINV)>;
def : InstRW<[V1Write_1c_1I_1Flg], (instrs SETF8, SETF16, RMIF, CFINV)>;


// Divide and multiply instructions
Expand Down
30 changes: 19 additions & 11 deletions llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def V2UnitV3 : ProcResource<1>; // FP/ASIMD 3
// Load/store, store-data and flags resources. The ProcResource<N> argument is
// the number of units the resource provides.
def V2UnitL01 : ProcResource<2>; // Load/Store 0/1
def V2UnitL2 : ProcResource<1>; // Load 2
def V2UnitD : ProcResource<2>; // Store data 0/1
// Flags resource, declared with three units. The *_1Flg write resources list
// it alongside an integer unit group.
def V2UnitFlg : ProcResource<3>; // Flags

// Resource groups: each ProcResGroup gathers the listed units under one name.
def V2UnitR : ProcResGroup<[V2UnitS0, V2UnitS1]>; // Integer single-cycle 0/1
def V2UnitS : ProcResGroup<[V2UnitS0, V2UnitS1, V2UnitS2, V2UnitS3]>; // Integer single-cycle 0/1/2/3
Expand Down Expand Up @@ -97,11 +98,13 @@ def V2Write_0c : SchedWriteRes<[]> { let Latency = 0; }

// Write resources for single-micro-op instructions.
// Naming scheme: V2Write_<latency>c_1<Unit>[_1<Unit>...]
//   <latency>  - value assigned to Latency in the definition body.
//   1<Unit>    - one entry per resource in the SchedWriteRes list, e.g.
//                "1F" for V2UnitF, "1M0" for V2UnitM0, "1Flg" for V2UnitFlg.
// The *_1Flg variants consume V2UnitFlg in addition to the integer unit group
// and keep the same latency as their non-Flg counterpart.
def V2Write_1c_1B : SchedWriteRes<[V2UnitB]> { let Latency = 1; }
def V2Write_1c_1F : SchedWriteRes<[V2UnitF]> { let Latency = 1; }
def V2Write_1c_1F_1Flg : SchedWriteRes<[V2UnitF, V2UnitFlg]> { let Latency = 1; }
def V2Write_1c_1I : SchedWriteRes<[V2UnitI]> { let Latency = 1; }
def V2Write_1c_1M : SchedWriteRes<[V2UnitM]> { let Latency = 1; }
def V2Write_1c_1M0 : SchedWriteRes<[V2UnitM0]> { let Latency = 1; }
def V2Write_1c_1L01 : SchedWriteRes<[V2UnitL01]> { let Latency = 1; }
def V2Write_2c_1M : SchedWriteRes<[V2UnitM]> { let Latency = 2; }
def V2Write_2c_1M_1Flg : SchedWriteRes<[V2UnitM, V2UnitFlg]> { let Latency = 2; }
def V2Write_3c_1M : SchedWriteRes<[V2UnitM]> { let Latency = 3; }
def V2Write_2c_1M0 : SchedWriteRes<[V2UnitM0]> { let Latency = 2; }
def V2Write_3c_1M0 : SchedWriteRes<[V2UnitM0]> { let Latency = 3; }
Expand Down Expand Up @@ -886,12 +889,12 @@ def V2Write_ArithI : SchedWriteVariant<[
SchedVar<NoSchedPred, [V2Write_2c_1M]>]>;

def V2Write_ArithF : SchedWriteVariant<[
SchedVar<IsCheapLSL, [V2Write_1c_1F]>,
SchedVar<NoSchedPred, [V2Write_2c_1M]>]>;
SchedVar<IsCheapLSL, [V2Write_1c_1F_1Flg]>,
SchedVar<NoSchedPred, [V2Write_2c_1M_1Flg]>]>;

def V2Write_Logical : SchedWriteVariant<[
SchedVar<NeoverseNoLSL, [V2Write_1c_1F]>,
SchedVar<NoSchedPred, [V2Write_2c_1M]>]>;
SchedVar<NeoverseNoLSL, [V2Write_1c_1F_1Flg]>,
SchedVar<NoSchedPred, [V2Write_2c_1M_1Flg]>]>;

def V2Write_Extr : SchedWriteVariant<[
SchedVar<IsRORImmIdiomPred, [V2Write_1c_1I]>,
Expand Down Expand Up @@ -1106,19 +1109,19 @@ def : InstRW<[V2Write_1c_1B_1R], (instrs BL, BLR)>;
// -----------------------------------------------------------------------------

// ALU, basic
// ALU, basic, flagset
def : SchedAlias<WriteI, V2Write_1c_1I>;
def : InstRW<[V2Write_1c_1F], (instregex "^(ADD|SUB)S[WX]r[ir]$",

// ALU, basic, flagset
def : InstRW<[V2Write_1c_1F_1Flg],
(instregex "^(ADD|SUB)S[WX]r[ir]$",
"^(ADC|SBC)S[WX]r$",
"^ANDS[WX]ri$")>;
"^ANDS[WX]ri$",
"^(AND|BIC)S[WX]rr$")>;
def : InstRW<[V2Write_0or1c_1I], (instregex "^MOVZ[WX]i$")>;

// ALU, extend and shift
def : SchedAlias<WriteIEReg, V2Write_2c_1M>;

// Conditional compare
def : InstRW<[V2Write_1c_1F], (instregex "^CCM[NP][WX][ir]")>;

// Arithmetic, LSL shift, shift <= 4
// Arithmetic, flagset, LSL shift, shift <= 4
// Arithmetic, LSR/ASR/ROR shift or LSL shift > 4
Expand All @@ -1129,6 +1132,9 @@ def : InstRW<[V2Write_ArithF],
// Arithmetic, immediate to logical address tag
def : InstRW<[V2Write_2c_1M], (instrs ADDG, SUBG)>;

// Conditional compare
def : InstRW<[V2Write_1c_1F_1Flg], (instregex "^CCM[NP][WX][ir]")>;

// Convert floating-point condition flags
// Flag manipulation instructions
def : WriteRes<WriteSys, []> { let Latency = 1; }
Expand All @@ -1138,8 +1144,10 @@ def : InstRW<[V2Write_2c_1M], (instrs IRG, IRGstack)>;

// Insert Tag Mask
// Subtract Pointer
def : InstRW<[V2Write_1c_1I], (instrs GMI, SUBP)>;

// Subtract Pointer, flagset
def : InstRW<[V2Write_1c_1I], (instrs GMI, SUBP, SUBPS)>;
def : InstRW<[V2Write_1c_1F_1Flg], (instrs SUBPS)>;

// Logical, shift, no flagset
def : InstRW<[V2Write_1c_1I], (instregex "^(AND|BIC|EON|EOR|ORN)[WX]rs$")>;
Expand Down
33 changes: 18 additions & 15 deletions llvm/test/tools/llvm-mca/AArch64/Neoverse/512tvb-sve-instructions.s
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,25 @@ abs z0.b, p0/m, z0.b
# CHECK-NEXT: [0.1] - V1UnitB
# CHECK-NEXT: [1.0] - V1UnitD
# CHECK-NEXT: [1.1] - V1UnitD
# CHECK-NEXT: [2] - V1UnitL2
# CHECK-NEXT: [3.0] - V1UnitL01
# CHECK-NEXT: [3.1] - V1UnitL01
# CHECK-NEXT: [4] - V1UnitM0
# CHECK-NEXT: [5] - V1UnitM1
# CHECK-NEXT: [6.0] - V1UnitS
# CHECK-NEXT: [6.1] - V1UnitS
# CHECK-NEXT: [7] - V1UnitV0
# CHECK-NEXT: [8] - V1UnitV1
# CHECK-NEXT: [9] - V1UnitV2
# CHECK-NEXT: [10] - V1UnitV3
# CHECK-NEXT: [2.0] - V1UnitFlg
# CHECK-NEXT: [2.1] - V1UnitFlg
# CHECK-NEXT: [2.2] - V1UnitFlg
# CHECK-NEXT: [3] - V1UnitL2
# CHECK-NEXT: [4.0] - V1UnitL01
# CHECK-NEXT: [4.1] - V1UnitL01
# CHECK-NEXT: [5] - V1UnitM0
# CHECK-NEXT: [6] - V1UnitM1
# CHECK-NEXT: [7.0] - V1UnitS
# CHECK-NEXT: [7.1] - V1UnitS
# CHECK-NEXT: [8] - V1UnitV0
# CHECK-NEXT: [9] - V1UnitV1
# CHECK-NEXT: [10] - V1UnitV2
# CHECK-NEXT: [11] - V1UnitV3

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 - -
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7.0] [7.1] [8] [9] [10] [11]
# CHECK-NEXT: - - - - - - - - - - - - - - 0.50 0.50 - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 - - abs z0.b, p0/m, z0.b
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7.0] [7.1] [8] [9] [10] [11] Instructions:
# CHECK-NEXT: - - - - - - - - - - - - - - 0.50 0.50 - - abs z0.b, p0/m, z0.b
Loading
Loading