Commit 5861b01

drm/i915: Do a synchronous switch-to-kernel-context on idling
When the system idles, we switch to the kernel context as a defensive
measure (no users are harmed if the kernel context is lost). Currently,
we issue a switch to the kernel context and then come back later to see
if the kernel context is still current and the system is idle. However,
if we are no longer privy to the runqueue ordering, then we have to
relax our assumptions about the logical state of the GPU, and the only
way to ensure that the kernel context is currently loaded is by issuing
a request to run after all others, and waiting for it to complete, all
while preventing anyone else from issuing their own requests.

v2: Pull wedging into switch_to_kernel_context_sync() but only after
waiting (though only for the same short delay) for the active context
to finish.

Signed-off-by: Chris Wilson <[email protected]>
Cc: Tvrtko Ursulin <[email protected]>
Reviewed-by: Tvrtko Ursulin <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
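In short: queue one final switch to the kernel context behind every other request, wait for it synchronously, and wedge the GPU if that fails. A condensed sketch of that flow, distilled from switch_to_kernel_context_sync() added in the i915_gem.c hunk below — the helper name here is invented, and locking, the asserts, and the final retire are omitted for brevity:

/*
 * Condensed sketch of the synchronous idle flow; see
 * switch_to_kernel_context_sync() in the i915_gem.c diff below for the
 * real implementation (this omits struct_mutex handling and the
 * retire/assert steps).
 */
static bool kernel_context_idle_sync(struct drm_i915_private *i915)
{
	bool result = true;

	/* Queue a switch to the kernel context behind all other requests. */
	if (i915_gem_switch_to_kernel_context(i915))
		result = false;

	/* Synchronously wait for that request, and hence all before it. */
	if (i915_gem_wait_for_idle(i915,
				   I915_WAIT_LOCKED |
				   I915_WAIT_FOR_IDLE_BOOST,
				   I915_GEM_IDLE_TIMEOUT))
		result = false;

	/* v2: wedge only after giving the active context a chance to finish. */
	if (!result)
		i915_gem_set_wedged(i915);

	return result;
}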
1 parent: 3123ada

File tree

5 files changed: +73 −110 lines


drivers/gpu/drm/i915/i915_drv.c

Lines changed: 4 additions & 10 deletions

@@ -714,8 +714,7 @@ static int i915_load_modeset_init(struct drm_device *dev)
 	return 0;

 cleanup_gem:
-	if (i915_gem_suspend(dev_priv))
-		DRM_ERROR("failed to idle hardware; continuing to unload!\n");
+	i915_gem_suspend(dev_priv);
 	i915_gem_fini(dev_priv);
 cleanup_modeset:
 	intel_modeset_cleanup(dev);
@@ -1933,8 +1932,7 @@ void i915_driver_unload(struct drm_device *dev)
 	/* Flush any external code that still may be under the RCU lock */
 	synchronize_rcu();

-	if (i915_gem_suspend(dev_priv))
-		DRM_ERROR("failed to idle hardware; continuing to unload!\n");
+	i915_gem_suspend(dev_priv);

 	drm_atomic_helper_shutdown(dev);

@@ -2042,20 +2040,16 @@ static bool suspend_to_idle(struct drm_i915_private *dev_priv)
 static int i915_drm_prepare(struct drm_device *dev)
 {
 	struct drm_i915_private *i915 = to_i915(dev);
-	int err;

 	/*
 	 * NB intel_display_suspend() may issue new requests after we've
 	 * ostensibly marked the GPU as ready-to-sleep here. We need to
 	 * split out that work and pull it forward so that after point,
 	 * the GPU is not woken again.
 	 */
-	err = i915_gem_suspend(i915);
-	if (err)
-		dev_err(&i915->drm.pdev->dev,
-			"GEM idle failed, suspend/resume might fail\n");
+	i915_gem_suspend(i915);

-	return err;
+	return 0;
 }

 static int i915_drm_suspend(struct drm_device *dev)

drivers/gpu/drm/i915/i915_drv.h

Lines changed: 1 addition & 1 deletion

@@ -3032,7 +3032,7 @@ void i915_gem_fini(struct drm_i915_private *dev_priv);
 void i915_gem_cleanup_engines(struct drm_i915_private *dev_priv);
 int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv,
 			   unsigned int flags, long timeout);
-int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv);
+void i915_gem_suspend(struct drm_i915_private *dev_priv);
 void i915_gem_suspend_late(struct drm_i915_private *dev_priv);
 void i915_gem_resume(struct drm_i915_private *dev_priv);
 vm_fault_t i915_gem_fault(struct vm_fault *vmf);

drivers/gpu/drm/i915/i915_gem.c

Lines changed: 62 additions & 92 deletions

@@ -2828,13 +2828,6 @@ i915_gem_retire_work_handler(struct work_struct *work)
 			   round_jiffies_up_relative(HZ));
 }

-static inline bool
-new_requests_since_last_retire(const struct drm_i915_private *i915)
-{
-	return (READ_ONCE(i915->gt.active_requests) ||
-		work_pending(&i915->gt.idle_work.work));
-}
-
 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
 {
 	struct intel_engine_cs *engine;
@@ -2843,85 +2836,95 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915)
 	if (i915_reset_failed(i915))
 		return;

-	GEM_BUG_ON(i915->gt.active_requests);
+	i915_retire_requests(i915);
+
 	for_each_engine(engine, i915, id) {
 		GEM_BUG_ON(__i915_active_request_peek(&engine->timeline.last_request));
 		GEM_BUG_ON(engine->last_retired_context !=
 			   to_intel_context(i915->kernel_context, engine));
 	}
 }

+static bool switch_to_kernel_context_sync(struct drm_i915_private *i915)
+{
+	bool result = true;
+
+	/*
+	 * Even if we fail to switch, give whatever is running a small chance
+	 * to save itself before we report the failure. Yes, this may be a
+	 * false positive due to e.g. ENOMEM, caveat emptor!
+	 */
+	if (i915_gem_switch_to_kernel_context(i915))
+		result = false;
+
+	if (i915_gem_wait_for_idle(i915,
+				   I915_WAIT_LOCKED |
+				   I915_WAIT_FOR_IDLE_BOOST,
+				   I915_GEM_IDLE_TIMEOUT))
+		result = false;
+
+	if (result) {
+		assert_kernel_context_is_current(i915);
+	} else {
+		/* Forcibly cancel outstanding work and leave the gpu quiet. */
+		dev_err(i915->drm.dev,
+			"Failed to idle engines, declaring wedged!\n");
+		GEM_TRACE_DUMP();
+		i915_gem_set_wedged(i915);
+	}
+
+	i915_retire_requests(i915); /* ensure we flush after wedging */
+	return result;
+}
+
 static void
 i915_gem_idle_work_handler(struct work_struct *work)
 {
-	struct drm_i915_private *dev_priv =
-		container_of(work, typeof(*dev_priv), gt.idle_work.work);
+	struct drm_i915_private *i915 =
+		container_of(work, typeof(*i915), gt.idle_work.work);
 	bool rearm_hangcheck;

-	if (!READ_ONCE(dev_priv->gt.awake))
+	if (!READ_ONCE(i915->gt.awake))
 		return;

-	if (READ_ONCE(dev_priv->gt.active_requests))
+	if (READ_ONCE(i915->gt.active_requests))
 		return;

-	/*
-	 * Flush out the last user context, leaving only the pinned
-	 * kernel context resident. When we are idling on the kernel_context,
-	 * no more new requests (with a context switch) are emitted and we
-	 * can finally rest. A consequence is that the idle work handler is
-	 * always called at least twice before idling (and if the system is
-	 * idle that implies a round trip through the retire worker).
-	 */
-	mutex_lock(&dev_priv->drm.struct_mutex);
-	i915_gem_switch_to_kernel_context(dev_priv);
-	mutex_unlock(&dev_priv->drm.struct_mutex);
-
-	GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n",
-		  READ_ONCE(dev_priv->gt.active_requests));
-
-	/*
-	 * Wait for last execlists context complete, but bail out in case a
-	 * new request is submitted. As we don't trust the hardware, we
-	 * continue on if the wait times out. This is necessary to allow
-	 * the machine to suspend even if the hardware dies, and we will
-	 * try to recover in resume (after depriving the hardware of power,
-	 * it may be in a better mmod).
-	 */
-	__wait_for(if (new_requests_since_last_retire(dev_priv)) return,
-		   intel_engines_are_idle(dev_priv),
-		   I915_IDLE_ENGINES_TIMEOUT * 1000,
-		   10, 500);
-
 	rearm_hangcheck =
-		cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
+		cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);

-	if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
+	if (!mutex_trylock(&i915->drm.struct_mutex)) {
 		/* Currently busy, come back later */
-		mod_delayed_work(dev_priv->wq,
-				 &dev_priv->gt.idle_work,
+		mod_delayed_work(i915->wq,
+				 &i915->gt.idle_work,
 				 msecs_to_jiffies(50));
 		goto out_rearm;
 	}

 	/*
-	 * New request retired after this work handler started, extend active
-	 * period until next instance of the work.
+	 * Flush out the last user context, leaving only the pinned
+	 * kernel context resident. Should anything unfortunate happen
+	 * while we are idle (such as the GPU being power cycled), no users
+	 * will be harmed.
 	 */
-	if (new_requests_since_last_retire(dev_priv))
-		goto out_unlock;
+	if (!work_pending(&i915->gt.idle_work.work) &&
+	    !i915->gt.active_requests) {
+		++i915->gt.active_requests; /* don't requeue idle */

-	__i915_gem_park(dev_priv);
+		switch_to_kernel_context_sync(i915);

-	assert_kernel_context_is_current(dev_priv);
+		if (!--i915->gt.active_requests) {
+			__i915_gem_park(i915);
+			rearm_hangcheck = false;
+		}
+	}

-	rearm_hangcheck = false;
-out_unlock:
-	mutex_unlock(&dev_priv->drm.struct_mutex);
+	mutex_unlock(&i915->drm.struct_mutex);

 out_rearm:
 	if (rearm_hangcheck) {
-		GEM_BUG_ON(!dev_priv->gt.awake);
-		i915_queue_hangcheck(dev_priv);
+		GEM_BUG_ON(!i915->gt.awake);
+		i915_queue_hangcheck(i915);
 	}
 }

@@ -3128,7 +3131,6 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915,
 			return err;

 		i915_retire_requests(i915);
-		GEM_BUG_ON(i915->gt.active_requests);
 	}

 	return 0;
@@ -4340,10 +4342,9 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
 	mutex_unlock(&i915->drm.struct_mutex);
 }

-int i915_gem_suspend(struct drm_i915_private *i915)
+void i915_gem_suspend(struct drm_i915_private *i915)
 {
 	intel_wakeref_t wakeref;
-	int ret;

 	GEM_TRACE("\n");

@@ -4363,23 +4364,7 @@ int i915_gem_suspend(struct drm_i915_private *i915)
 	 * state. Fortunately, the kernel_context is disposable and we do
 	 * not rely on its state.
 	 */
-	if (!i915_reset_failed(i915)) {
-		ret = i915_gem_switch_to_kernel_context(i915);
-		if (ret)
-			goto err_unlock;
-
-		ret = i915_gem_wait_for_idle(i915,
-					     I915_WAIT_INTERRUPTIBLE |
-					     I915_WAIT_LOCKED |
-					     I915_WAIT_FOR_IDLE_BOOST,
-					     I915_GEM_IDLE_TIMEOUT);
-		if (ret == -EINTR)
-			goto err_unlock;
-
-		/* Forcibly cancel outstanding work and leave the gpu quiet. */
-		i915_gem_set_wedged(i915);
-	}
-	i915_retire_requests(i915); /* ensure we flush after wedging */
+	switch_to_kernel_context_sync(i915);

 	mutex_unlock(&i915->drm.struct_mutex);
 	i915_reset_flush(i915);
@@ -4399,12 +4384,6 @@ int i915_gem_suspend(struct drm_i915_private *i915)
 	GEM_BUG_ON(i915->gt.awake);

 	intel_runtime_pm_put(i915, wakeref);
-	return 0;
-
-err_unlock:
-	mutex_unlock(&i915->drm.struct_mutex);
-	intel_runtime_pm_put(i915, wakeref);
-	return ret;
 }

 void i915_gem_suspend_late(struct drm_i915_private *i915)
@@ -4670,20 +4649,11 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
 			goto err_active;
 	}

-	err = i915_gem_switch_to_kernel_context(i915);
-	if (err)
-		goto err_active;
-
-	if (i915_gem_wait_for_idle(i915,
-				   I915_WAIT_LOCKED,
-				   I915_GEM_IDLE_TIMEOUT)) {
-		i915_gem_set_wedged(i915);
+	if (!switch_to_kernel_context_sync(i915)) {
 		err = -EIO; /* Caller will declare us wedged */
 		goto err_active;
 	}

-	assert_kernel_context_is_current(i915);
-
 	/*
 	 * Immediately park the GPU so that we enable powersaving and
 	 * treat it as idle. The next time we issue a request, we will
@@ -4927,7 +4897,7 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
 err_init_hw:
 	mutex_unlock(&dev_priv->drm.struct_mutex);

-	WARN_ON(i915_gem_suspend(dev_priv));
+	i915_gem_suspend(dev_priv);
 	i915_gem_suspend_late(dev_priv);

 	i915_gem_drain_workqueue(dev_priv);

drivers/gpu/drm/i915/i915_gem_context.c

Lines changed: 4 additions & 0 deletions

@@ -767,6 +767,10 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915)
 	lockdep_assert_held(&i915->drm.struct_mutex);
 	GEM_BUG_ON(!i915->kernel_context);

+	/* Inoperable, so presume the GPU is safely pointing into the void! */
+	if (i915_terminally_wedged(i915))
+		return 0;
+
 	i915_retire_requests(i915);

 	for_each_engine(engine, i915, id) {

drivers/gpu/drm/i915/selftests/i915_gem.c

Lines changed: 2 additions & 7 deletions

@@ -84,14 +84,9 @@ static void simulate_hibernate(struct drm_i915_private *i915)

 static int pm_prepare(struct drm_i915_private *i915)
 {
-	int err = 0;
-
-	if (i915_gem_suspend(i915)) {
-		pr_err("i915_gem_suspend failed\n");
-		err = -EINVAL;
-	}
+	i915_gem_suspend(i915);

-	return err;
+	return 0;
 }

 static void pm_suspend(struct drm_i915_private *i915)
