Skip to content

Commit 12b824f

Browse files
committed
[AMDGPU][MCA] Scheduler updates for pseudo scalar transcendental instructions
1 parent c36718f commit 12b824f

File tree

4 files changed

+142
-3
lines changed

4 files changed

+142
-3
lines changed

llvm/lib/Target/AMDGPU/GCNProcessors.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -284,10 +284,10 @@ def : ProcessorModel<"gfx1151", GFX11SpeedModel,
284284
// GCN GFX12.
285285
//===----------------------------------------------------------------------===//
286286

287-
def : ProcessorModel<"gfx1200", GFX11SpeedModel,
287+
def : ProcessorModel<"gfx1200", GFX12SpeedModel,
288288
FeatureISAVersion12.Features
289289
>;
290290

291-
def : ProcessorModel<"gfx1201", GFX11SpeedModel,
291+
def : ProcessorModel<"gfx1201", GFX12SpeedModel,
292292
FeatureISAVersion12.Features
293293
>;

llvm/lib/Target/AMDGPU/SISchedule.td

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ def Write8PassDGEMM : SchedWrite;
6868
// Scalar float instructions
6969
def WriteSFPU : SchedWrite;
7070

71+
// F16 or F32 pseudo scalar transcendental instructions
// (attached via SchedRW to the V_S_* pseudo scalar opcodes, e.g. V_S_EXP_F32
// and V_S_LOG_F16; every speed model has to either map this write to
// resources or mark it with UnsupportedWriteRes).
72+
def WritePseudoScalarTrans : SchedWrite;
73+
7174
// FIXME: Should there be a class for instructions which are VALU
7275
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
7376
// instructions)
@@ -93,6 +96,7 @@ def SIDPFullSpeedModel : SISchedMachineModel;
9396
def SIDPGFX940FullSpeedModel : SISchedMachineModel;
9497
def GFX10SpeedModel : SISchedMachineModel;
9598
def GFX11SpeedModel : SISchedMachineModel;
99+
def GFX12SpeedModel : SISchedMachineModel;
96100

97101
// XXX: Are the resource counts correct?
98102
def HWBranch : ProcResource<1> {
@@ -174,6 +178,7 @@ multiclass SICommonWriteRes {
174178
def : HWWriteRes<Write16PassMAI, [HWXDL], 16>;
175179

176180
def : UnsupportedWriteRes<WriteSFPU>;
181+
def : UnsupportedWriteRes<WritePseudoScalarTrans>;
177182
} // End RetireOOO = 1
178183

179184
def : ReadAdvance<MIVGPRRead, -2>;
@@ -318,6 +323,7 @@ def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
318323
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
319324

320325
def : UnsupportedWriteRes<WriteSFPU>;
326+
def : UnsupportedWriteRes<WritePseudoScalarTrans>;
321327
} // End RetireOOO = 1
322328

323329
def : InstRW<[WriteCopy], (instrs COPY)>;
@@ -351,6 +357,36 @@ def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
351357
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
352358
} // End RetireOOO = 1
353359

360+
def : UnsupportedWriteRes<WritePseudoScalarTrans>;
361+
354362
def : InstRW<[WriteCopy], (instrs COPY)>;
355363

356364
} // End SchedModel = GFX11SpeedModel
365+
366+
let SchedModel = GFX12SpeedModel in {
// Write-resource table for the GFX12 scheduling model. Each entry names a
// SchedWrite class, the processor resources it occupies, and a cycle count.
// NOTE(review): HWWriteRes is defined outside this view; the trailing integer
// is presumably the latency in cycles (llvm-mca reports latency 7 for the
// v_s_* instructions tagged WritePseudoScalarTrans) -- confirm.
// NOTE(review): the other speed models wrap their HWWriteRes entries in
// `let RetireOOO = 1 in { ... }`; this block does not -- confirm that the
// omission is intentional.
367+
368+
// Writes that occupy the VALU (plus HWRC).
def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
369+
def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
370+
def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>;
371+
def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>;
372+
def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>;
373+
def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>;
374+
def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 38>;
375+
def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 38>;
376+
def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 38>;
377+
def : HWWriteRes<WriteIntMul, [HWVALU, HWRC], 8>;
378+
def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 40>;
379+
// GFX12 is the only model here that gives the pseudo scalar transcendental
// write real resources; the other models mark it UnsupportedWriteRes.
def : HWWriteRes<WritePseudoScalarTrans, [HWVALU, HWRC], 7>;
380+
381+
// Branch, export, LDS/scalar-memory, SALU, vector-memory and barrier writes.
def : HWWriteRes<WriteBranch, [HWBranch], 32>;
382+
def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>;
383+
def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>;
384+
def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>;
385+
// Scalar float instructions run on the SALU in this model.
def : HWWriteRes<WriteSFPU, [HWSALU, HWRC], 4>;
386+
def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
387+
def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
388+
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
389+
390+
// COPY is scheduled with the WriteCopy class.
def : InstRW<[WriteCopy], (instrs COPY)>;
391+
392+
} // End SchedModel = GFX12SpeedModel

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -854,7 +854,7 @@ def VOP_Pseudo_Scalar_F32 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f32, f32>;
854854
def VOP_Pseudo_Scalar_F16 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f16, f32, f16>;
855855

856856
let SubtargetPredicate = HasPseudoScalarTrans, TRANS = 1,
857-
isReMaterializable = 1 in {
857+
isReMaterializable = 1, SchedRW = [WritePseudoScalarTrans] in {
858858
defm V_S_EXP_F32 : VOP3PseudoScalarInst<"v_s_exp_f32", VOP_Pseudo_Scalar_F32>;
859859
defm V_S_EXP_F16 : VOP3PseudoScalarInst<"v_s_exp_f16", VOP_Pseudo_Scalar_F16>;
860860
defm V_S_LOG_F32 : VOP3PseudoScalarInst<"v_s_log_f32", VOP_Pseudo_Scalar_F32>;
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx1200 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s
3+
4+
v_s_exp_f32 s0, s0
5+
v_s_log_f32 s0, s0
6+
v_s_rcp_f32 s1, s1
7+
v_s_rsq_f32 s1, s0
8+
v_s_sqrt_f32 s2, s1
9+
v_s_exp_f16 s3, s1
10+
v_s_log_f16 s4, s1
11+
v_s_rcp_f16 s5, s2
12+
v_s_rsq_f16 s5, s4
13+
v_s_sqrt_f16 s5, s5
14+
15+
# CHECK: Iterations: 1
16+
# CHECK-NEXT: Instructions: 10
17+
# CHECK-NEXT: Total Cycles: 45
18+
# CHECK-NEXT: Total uOps: 10
19+
20+
# CHECK: Dispatch Width: 1
21+
# CHECK-NEXT: uOps Per Cycle: 0.22
22+
# CHECK-NEXT: IPC: 0.22
23+
# CHECK-NEXT: Block RThroughput: 10.0
24+
25+
# CHECK: Instruction Info:
26+
# CHECK-NEXT: [1]: #uOps
27+
# CHECK-NEXT: [2]: Latency
28+
# CHECK-NEXT: [3]: RThroughput
29+
# CHECK-NEXT: [4]: MayLoad
30+
# CHECK-NEXT: [5]: MayStore
31+
# CHECK-NEXT: [6]: HasSideEffects (U)
32+
33+
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
34+
# CHECK-NEXT: 1 7 1.00 U v_s_exp_f32 s0, s0
35+
# CHECK-NEXT: 1 7 1.00 U v_s_log_f32 s0, s0
36+
# CHECK-NEXT: 1 7 1.00 U v_s_rcp_f32 s1, s1
37+
# CHECK-NEXT: 1 7 1.00 U v_s_rsq_f32 s1, s0
38+
# CHECK-NEXT: 1 7 1.00 U v_s_sqrt_f32 s2, s1
39+
# CHECK-NEXT: 1 7 1.00 U v_s_exp_f16 s3, s1
40+
# CHECK-NEXT: 1 7 1.00 U v_s_log_f16 s4, s1
41+
# CHECK-NEXT: 1 7 1.00 U v_s_rcp_f16 s5, s2
42+
# CHECK-NEXT: 1 7 1.00 U v_s_rsq_f16 s5, s4
43+
# CHECK-NEXT: 1 7 1.00 U v_s_sqrt_f16 s5, s5
44+
45+
# CHECK: Resources:
46+
# CHECK-NEXT: [0] - HWBranch
47+
# CHECK-NEXT: [1] - HWExport
48+
# CHECK-NEXT: [2] - HWLGKM
49+
# CHECK-NEXT: [3] - HWRC
50+
# CHECK-NEXT: [4] - HWSALU
51+
# CHECK-NEXT: [5] - HWVALU
52+
# CHECK-NEXT: [6] - HWVMEM
53+
54+
# CHECK: Resource pressure per iteration:
55+
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6]
56+
# CHECK-NEXT: - - - 10.00 - 10.00 -
57+
58+
# CHECK: Resource pressure by instruction:
59+
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] Instructions:
60+
# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_exp_f32 s0, s0
61+
# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_log_f32 s0, s0
62+
# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_rcp_f32 s1, s1
63+
# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_rsq_f32 s1, s0
64+
# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_sqrt_f32 s2, s1
65+
# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_exp_f16 s3, s1
66+
# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_log_f16 s4, s1
67+
# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_rcp_f16 s5, s2
68+
# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_rsq_f16 s5, s4
69+
# CHECK-NEXT: - - - 1.00 - 1.00 - v_s_sqrt_f16 s5, s5
70+
71+
# CHECK: Timeline view:
72+
# CHECK-NEXT: 0123456789 0123456789
73+
# CHECK-NEXT: Index 0123456789 0123456789 01234
74+
75+
# CHECK: [0,0] DeeeeeeE . . . . . . . . v_s_exp_f32 s0, s0
76+
# CHECK-NEXT: [0,1] . . DeeeeeeE. . . . . . . v_s_log_f32 s0, s0
77+
# CHECK-NEXT: [0,2] . . DeeeeeeE . . . . . . v_s_rcp_f32 s1, s1
78+
# CHECK-NEXT: [0,3] . . . DeeeeeeE . . . . . v_s_rsq_f32 s1, s0
79+
# CHECK-NEXT: [0,4] . . . . .DeeeeeeE . . . . v_s_sqrt_f32 s2, s1
80+
# CHECK-NEXT: [0,5] . . . . . DeeeeeeE. . . . v_s_exp_f16 s3, s1
81+
# CHECK-NEXT: [0,6] . . . . . DeeeeeeE . . . v_s_log_f16 s4, s1
82+
# CHECK-NEXT: [0,7] . . . . . . DeeeeeeE . . v_s_rcp_f16 s5, s2
83+
# CHECK-NEXT: [0,8] . . . . . . DeeeeeeE . . v_s_rsq_f16 s5, s4
84+
# CHECK-NEXT: [0,9] . . . . . . . . DeeeeeeE v_s_sqrt_f16 s5, s5
85+
86+
# CHECK: Average Wait times (based on the timeline view):
87+
# CHECK-NEXT: [0]: Executions
88+
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
89+
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
90+
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
91+
92+
# CHECK: [0] [1] [2] [3]
93+
# CHECK-NEXT: 0. 1 0.0 0.0 0.0 v_s_exp_f32 s0, s0
94+
# CHECK-NEXT: 1. 1 0.0 0.0 0.0 v_s_log_f32 s0, s0
95+
# CHECK-NEXT: 2. 1 0.0 0.0 0.0 v_s_rcp_f32 s1, s1
96+
# CHECK-NEXT: 3. 1 0.0 0.0 0.0 v_s_rsq_f32 s1, s0
97+
# CHECK-NEXT: 4. 1 0.0 0.0 0.0 v_s_sqrt_f32 s2, s1
98+
# CHECK-NEXT: 5. 1 0.0 0.0 0.0 v_s_exp_f16 s3, s1
99+
# CHECK-NEXT: 6. 1 0.0 0.0 0.0 v_s_log_f16 s4, s1
100+
# CHECK-NEXT: 7. 1 0.0 0.0 0.0 v_s_rcp_f16 s5, s2
101+
# CHECK-NEXT: 8. 1 0.0 0.0 0.0 v_s_rsq_f16 s5, s4
102+
# CHECK-NEXT: 9. 1 0.0 0.0 0.0 v_s_sqrt_f16 s5, s5
103+
# CHECK-NEXT: 1 0.0 0.0 0.0 <total>

0 commit comments

Comments
 (0)