Skip to content

Commit 36d516b

Browse files
committed
drm/i915/gt: Switch to manual evaluation of RPS
As with the realisation for soft-rc6, we respond to idling the engines within microseconds, far faster than the response times for HW RC6 and RPS. Furthermore, our fast parking upon idle prevents HW RPS from running for many desktop workloads, as the RPS evaluation intervals are on the order of tens of milliseconds while the typical workload is just a couple of milliseconds; yet we still need to determine the best frequency for user latency versus power. Recognising that the HW evaluation intervals are a poor fit, and that they were deprecated [in bspec at least] from gen10, start to wean ourselves off them and replace the EI with a timer and our accurate busy-stats. The principal benefit of manually evaluating RPS intervals is that we can be more responsive for better performance and powersaving for both spiky workloads and steady-state. Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1698 Fixes: 98479ad ("drm/i915/gt: Treat idling as a RPS downclock event") Signed-off-by: Chris Wilson <[email protected]> Cc: Mika Kuoppala <[email protected]> Cc: Andi Shyti <[email protected]> Reviewed-by: Andi Shyti <[email protected]> Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
1 parent 8e99299 commit 36d516b

File tree

4 files changed

+147
-16
lines changed

4 files changed

+147
-16
lines changed

drivers/gpu/drm/i915/gt/intel_engine_types.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,11 @@ struct intel_engine_cs {
555555
* Idle is defined as active == 0, active is active > 0.
556556
*/
557557
ktime_t start;
558+
559+
/**
560+
* @rps: Utilisation at last RPS sampling.
561+
*/
562+
ktime_t rps;
558563
} stats;
559564

560565
struct {

drivers/gpu/drm/i915/gt/intel_rps.c

Lines changed: 122 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
#include "intel_sideband.h"
1616
#include "../../../platform/x86/intel_ips.h"
1717

18+
#define BUSY_MAX_EI 20u /* ms */
19+
1820
/*
1921
* Lock protecting IPS related data structures
2022
*/
@@ -45,6 +47,100 @@ static inline void set(struct intel_uncore *uncore, i915_reg_t reg, u32 val)
4547
intel_uncore_write_fw(uncore, reg, val);
4648
}
4749

50+
/*
 * rps_timer - periodic software evaluation of GPU busyness.
 *
 * Timer callback for rps->timer. For every engine on the GT it reads the
 * cumulative busy time, takes the delta against the value stored in
 * engine->stats.rps at the previous sample, and keeps the three largest
 * deltas in max_busy[] (largest first).
 *
 * If RPS is still active, the weighted busy time is compared against the
 * up/down percentage thresholds; crossing one sets the matching bit in
 * rps->pm_iir and schedules rps->work. The timer is then re-armed with an
 * interval that doubles on each quiet pass up to BUSY_MAX_EI ms. If RPS is
 * not active the timer is left un-armed.
 */
static void rps_timer(struct timer_list *t)
51+
{
52+
struct intel_rps *rps = from_timer(rps, t, timer);
53+
struct intel_engine_cs *engine;
54+
enum intel_engine_id id;
55+
/* Top three per-engine busy deltas (ns), kept sorted largest first. */
s64 max_busy[3] = {};
56+
/* dt/last are reused: first for per-engine busy time, then wall clock. */
ktime_t dt, last;
57+
58+
for_each_engine(engine, rps_to_gt(rps), id) {
59+
s64 busy;
60+
int i;
61+
62+
/* Exchange the saved sample for the current cumulative busy time. */
dt = intel_engine_get_busy_time(engine);
63+
last = engine->stats.rps;
64+
engine->stats.rps = dt;
65+
66+
busy = ktime_to_ns(ktime_sub(dt, last));
67+
/*
 * Insert into the sorted list: whenever the carried value beats a
 * slot, swap() stores it and carries the displaced value onwards.
 */
for (i = 0; i < ARRAY_SIZE(max_busy); i++) {
68+
if (busy > max_busy[i])
69+
swap(busy, max_busy[i]);
70+
}
71+
}
72+
73+
/* Timestamp this sample; the previous timestamp bounds the interval. */
dt = ktime_get();
74+
last = rps->pm_timestamp;
75+
rps->pm_timestamp = dt;
76+
77+
if (intel_rps_is_active(rps)) {
78+
s64 busy;
79+
int i;
80+
81+
/* From here on dt is the wall-clock length of the interval. */
dt = ktime_sub(dt, last);
82+
83+
/*
84+
* Our goal is to evaluate each engine independently, so we run
85+
* at the lowest clocks required to sustain the heaviest
86+
* workload. However, a task may be split into sequential
87+
* dependent operations across a set of engines, such that
88+
* the independent contributions do not account for high load,
89+
* but overall the task is GPU bound. For example, consider
90+
* video decode on vcs followed by colour post-processing
91+
* on vecs, followed by general post-processing on rcs.
92+
* Since multi-engines being active does imply a single
93+
* continuous workload across all engines, we hedge our
94+
* bets by only contributing a factor of the distributed
95+
* load into our busyness calculation.
96+
*/
97+
busy = max_busy[0];
98+
for (i = 1; i < ARRAY_SIZE(max_busy); i++) {
99+
/* The list is sorted, so a zero entry means nothing further. */
if (!max_busy[i])
100+
break;
101+
102+
/* 2nd busiest engine adds 1/2 of its time, 3rd adds 1/4. */
busy += div_u64(max_busy[i], 1 << i);
103+
}
104+
GT_TRACE(rps_to_gt(rps),
105+
"busy:%lld [%d%%], max:[%lld, %lld, %lld], interval:%d\n",
106+
busy, (int)div64_u64(100 * busy, dt),
107+
max_busy[0], max_busy[1], max_busy[2],
108+
rps->pm_interval);
109+
110+
/*
 * Thresholds are percentages: compare 100 * busy against
 * threshold * dt so that no division is needed. On a crossing,
 * drop the interval back to 1 ms and hand off to the worker.
 */
if (100 * busy > rps->power.up_threshold * dt &&
111+
rps->cur_freq < rps->max_freq_softlimit) {
112+
rps->pm_iir |= GEN6_PM_RP_UP_THRESHOLD;
113+
rps->pm_interval = 1;
114+
schedule_work(&rps->work);
115+
} else if (100 * busy < rps->power.down_threshold * dt &&
116+
rps->cur_freq > rps->min_freq_softlimit) {
117+
rps->pm_iir |= GEN6_PM_RP_DOWN_THRESHOLD;
118+
rps->pm_interval = 1;
119+
schedule_work(&rps->work);
120+
} else {
121+
/* NOTE(review): presumably clears the worker's step history - confirm in rps_work(). */
rps->last_adj = 0;
122+
}
123+
124+
/*
 * Re-arm with the current interval (ms), then double it for the
 * following pass, capped at BUSY_MAX_EI.
 */
mod_timer(&rps->timer,
125+
jiffies + msecs_to_jiffies(rps->pm_interval));
126+
rps->pm_interval = min(rps->pm_interval * 2, BUSY_MAX_EI);
127+
}
128+
}
129+
130+
/*
 * rps_start_timer - begin periodic busyness evaluation.
 *
 * Resets the evaluation interval to 1 ms and arms rps->timer to fire on the
 * next jiffy.
 */
static void rps_start_timer(struct intel_rps *rps)
131+
{
132+
/*
 * pm_timestamp becomes (now - pm_timestamp), the same expression that
 * rps_stop_timer() applies when stopping.
 * NOTE(review): presumably this rebases the elapsed time saved at stop
 * onto the current clock - confirm.
 */
rps->pm_timestamp = ktime_sub(ktime_get(), rps->pm_timestamp);
133+
rps->pm_interval = 1;
134+
mod_timer(&rps->timer, jiffies + 1);
135+
}
136+
137+
/*
 * rps_stop_timer - halt periodic busyness evaluation.
 *
 * Deletes rps->timer synchronously, converts pm_timestamp into
 * (now - pm_timestamp), and then cancels rps->work synchronously.
 */
static void rps_stop_timer(struct intel_rps *rps)
138+
{
139+
/* Stop the timer first: rps_timer() is what schedules rps->work. */
del_timer_sync(&rps->timer);
140+
rps->pm_timestamp = ktime_sub(ktime_get(), rps->pm_timestamp);
141+
cancel_work_sync(&rps->work);
142+
}
143+
48144
static u32 rps_pm_mask(struct intel_rps *rps, u8 val)
49145
{
50146
u32 mask = 0;
@@ -535,36 +631,24 @@ static void rps_set_power(struct intel_rps *rps, int new_power)
535631
if (new_power == rps->power.mode)
536632
return;
537633

634+
threshold_up = 95;
635+
threshold_down = 85;
636+
538637
/* Note the units here are not exactly 1us, but 1280ns. */
539638
switch (new_power) {
540639
case LOW_POWER:
541-
/* Upclock if more than 95% busy over 16ms */
542640
ei_up = 16000;
543-
threshold_up = 95;
544-
545-
/* Downclock if less than 85% busy over 32ms */
546641
ei_down = 32000;
547-
threshold_down = 85;
548642
break;
549643

550644
case BETWEEN:
551-
/* Upclock if more than 90% busy over 13ms */
552645
ei_up = 13000;
553-
threshold_up = 90;
554-
555-
/* Downclock if less than 75% busy over 32ms */
556646
ei_down = 32000;
557-
threshold_down = 75;
558647
break;
559648

560649
case HIGH_POWER:
561-
/* Upclock if more than 85% busy over 10ms */
562650
ei_up = 10000;
563-
threshold_up = 85;
564-
565-
/* Downclock if less than 60% busy over 32ms */
566651
ei_down = 32000;
567-
threshold_down = 60;
568652
break;
569653
}
570654

@@ -742,8 +826,11 @@ void intel_rps_unpark(struct intel_rps *rps)
742826

743827
mutex_unlock(&rps->lock);
744828

829+
rps->pm_iir = 0;
745830
if (intel_rps_has_interrupts(rps))
746831
rps_enable_interrupts(rps);
832+
if (intel_rps_uses_timer(rps))
833+
rps_start_timer(rps);
747834

748835
if (IS_GEN(rps_to_i915(rps), 5))
749836
gen5_rps_update(rps);
@@ -754,6 +841,8 @@ void intel_rps_park(struct intel_rps *rps)
754841
if (!intel_rps_clear_active(rps))
755842
return;
756843

844+
if (intel_rps_uses_timer(rps))
845+
rps_stop_timer(rps);
757846
if (intel_rps_has_interrupts(rps))
758847
rps_disable_interrupts(rps);
759848

@@ -1211,6 +1300,19 @@ static unsigned long __ips_gfx_val(struct intel_ips *ips)
12111300
return ips->gfx_power + state2;
12121301
}
12131302

1303+
/*
 * has_busy_stats - check whether every engine on the GT supports busy stats.
 *
 * Returns false as soon as one engine fails intel_engine_supports_stats(),
 * true otherwise (including when the loop visits no engines).
 */
static bool has_busy_stats(struct intel_rps *rps)
1304+
{
1305+
struct intel_engine_cs *engine;
1306+
enum intel_engine_id id;
1307+
1308+
for_each_engine(engine, rps_to_gt(rps), id) {
1309+
if (!intel_engine_supports_stats(engine))
1310+
return false;
1311+
}
1312+
1313+
return true;
1314+
}
1315+
12141316
void intel_rps_enable(struct intel_rps *rps)
12151317
{
12161318
struct drm_i915_private *i915 = rps_to_i915(rps);
@@ -1255,7 +1357,9 @@ void intel_rps_enable(struct intel_rps *rps)
12551357
GEM_BUG_ON(rps->efficient_freq < rps->min_freq);
12561358
GEM_BUG_ON(rps->efficient_freq > rps->max_freq);
12571359

1258-
if (INTEL_GEN(i915) >= 6)
1360+
if (has_busy_stats(rps))
1361+
intel_rps_set_timer(rps);
1362+
else if (INTEL_GEN(i915) >= 6)
12591363
intel_rps_set_interrupts(rps);
12601364
else
12611365
/* Ironlake currently uses intel_ips.ko */ {}
@@ -1274,6 +1378,7 @@ void intel_rps_disable(struct intel_rps *rps)
12741378

12751379
intel_rps_clear_enabled(rps);
12761380
intel_rps_clear_interrupts(rps);
1381+
intel_rps_clear_timer(rps);
12771382

12781383
if (INTEL_GEN(i915) >= 6)
12791384
gen6_rps_disable(rps);
@@ -1689,6 +1794,7 @@ void intel_rps_init_early(struct intel_rps *rps)
16891794
mutex_init(&rps->power.mutex);
16901795

16911796
INIT_WORK(&rps->work, rps_work);
1797+
timer_setup(&rps->timer, rps_timer, 0);
16921798

16931799
atomic_set(&rps->num_waiters, 0);
16941800
}

drivers/gpu/drm/i915/gt/intel_rps.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,4 +81,19 @@ static inline void intel_rps_clear_interrupts(struct intel_rps *rps)
8181
clear_bit(INTEL_RPS_INTERRUPTS, &rps->flags);
8282
}
8383

84+
/* Returns true if the INTEL_RPS_TIMER bit is set in rps->flags. */
static inline bool intel_rps_uses_timer(const struct intel_rps *rps)
85+
{
86+
return test_bit(INTEL_RPS_TIMER, &rps->flags);
87+
}
88+
89+
/* Sets the INTEL_RPS_TIMER bit in rps->flags. */
static inline void intel_rps_set_timer(struct intel_rps *rps)
90+
{
91+
set_bit(INTEL_RPS_TIMER, &rps->flags);
92+
}
93+
94+
/* Clears the INTEL_RPS_TIMER bit in rps->flags. */
static inline void intel_rps_clear_timer(struct intel_rps *rps)
95+
{
96+
clear_bit(INTEL_RPS_TIMER, &rps->flags);
97+
}
98+
8499
#endif /* INTEL_RPS_H */

drivers/gpu/drm/i915/gt/intel_rps_types.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ enum {
3535
INTEL_RPS_ENABLED = 0,
3636
INTEL_RPS_ACTIVE,
3737
INTEL_RPS_INTERRUPTS,
38+
INTEL_RPS_TIMER,
3839
};
3940

4041
struct intel_rps {
@@ -44,8 +45,12 @@ struct intel_rps {
4445
* work, interrupts_enabled and pm_iir are protected by
4546
* dev_priv->irq_lock
4647
*/
48+
struct timer_list timer;
4749
struct work_struct work;
4850
unsigned long flags;
51+
52+
ktime_t pm_timestamp;
53+
u32 pm_interval;
4954
u32 pm_iir;
5055

5156
/* PM interrupt bits that should never be masked */

0 commit comments

Comments
 (0)