Skip to content

Commit 821ed7d

Browse files
committed
drm/i915: Update reset path to fix incomplete requests
Update reset path in preparation for engine reset which requires identification of incomplete requests and associated context and fixing their state so that the engine can resume correctly after reset. The request that caused the hang will be skipped and head is reset to the start of the breadcrumb. This allows us to resume from where we left off. Since this request didn't complete normally we also need to clean up the ELSP queue manually. This is vital if we employ nonblocking request submission where we may have a web of dependencies upon the hung request and so advancing the seqno manually is no longer trivial. ABI: gem_reset_stats / DRM_IOCTL_I915_GET_RESET_STATS We change the way we count pending batches. Only the active context involved in the reset is marked as either innocent or guilty, rather than marking the entire world as pending. By inspection this only affects igt/gem_reset_stats (which assumes implementation details) and not piglit. ARB_robustness gives this guide on how we expect the user of this interface to behave: * Provide a mechanism for an OpenGL application to learn about graphics resets that affect the context. When a graphics reset occurs, the OpenGL context becomes unusable and the application must create a new context to continue operation. Detecting a graphics reset happens through an inexpensive query. And with regards to the actual meaning of the reset values: Certain events can result in a reset of the GL context. Such a reset causes all context state to be lost. Recovery from such events requires recreation of all objects in the affected context. The current status of the graphics reset state is returned by enum GetGraphicsResetStatusARB(); The symbolic constant returned indicates if the GL context has been in a reset state at any point since the last call to GetGraphicsResetStatusARB. NO_ERROR indicates that the GL context has not been in a reset state since the last call. 
GUILTY_CONTEXT_RESET_ARB indicates that a reset has been detected that is attributable to the current GL context. INNOCENT_CONTEXT_RESET_ARB indicates a reset has been detected that is not attributable to the current GL context. UNKNOWN_CONTEXT_RESET_ARB indicates a detected graphics reset whose cause is unknown. The language here is explicit in that we must mark up the guilty batch, but is loose enough for us to relax the innocent (i.e. pending) accounting as only the active batches are involved with the reset. In the future, we are looking towards single engine resetting (with minimal locking), where it seems inappropriate to mark the entire world as innocent since the reset occurred on a different engine. Reducing the information available means we only have to encounter the pain once, and also reduces the information leaking from one context to another. v2: Legacy ringbuffer submission required a reset following hibernation, or else we would restore stale values to the RING_HEAD and walk over stolen garbage. v3: GuC requires replaying the requests after a reset. v4: Restore engine IRQ after reset (so waiters will be woken!) Rearm hangcheck if resetting with a waiter. Cc: Tvrtko Ursulin <[email protected]> Cc: Mika Kuoppala <[email protected]> Cc: Arun Siluvery <[email protected]> Signed-off-by: Chris Wilson <[email protected]> Reviewed-by: Mika Kuoppala <[email protected]> Link: http://patchwork.freedesktop.org/patch/msgid/[email protected]
1 parent 780f262 commit 821ed7d

File tree

10 files changed

+185
-99
lines changed

10 files changed

+185
-99
lines changed

drivers/gpu/drm/i915/i915_drv.c

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -559,7 +559,6 @@ static void i915_gem_fini(struct drm_device *dev)
559559
}
560560

561561
mutex_lock(&dev->struct_mutex);
562-
i915_gem_reset(dev);
563562
i915_gem_cleanup_engines(dev);
564563
i915_gem_context_fini(dev);
565564
mutex_unlock(&dev->struct_mutex);
@@ -1579,7 +1578,7 @@ static int i915_drm_resume(struct drm_device *dev)
15791578
mutex_lock(&dev->struct_mutex);
15801579
if (i915_gem_init_hw(dev)) {
15811580
DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
1582-
set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
1581+
i915_gem_set_wedged(dev_priv);
15831582
}
15841583
mutex_unlock(&dev->struct_mutex);
15851584

@@ -1755,9 +1754,6 @@ void i915_reset(struct drm_i915_private *dev_priv)
17551754
error->reset_count++;
17561755

17571756
pr_notice("drm/i915: Resetting chip after gpu hang\n");
1758-
1759-
i915_gem_reset(dev);
1760-
17611757
ret = intel_gpu_reset(dev_priv, ALL_ENGINES);
17621758
if (ret) {
17631759
if (ret != -ENODEV)
@@ -1767,6 +1763,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
17671763
goto error;
17681764
}
17691765

1766+
i915_gem_reset(dev_priv);
17701767
intel_overlay_reset(dev_priv);
17711768

17721769
/* Ok, now get things going again... */
@@ -1803,7 +1800,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
18031800
return;
18041801

18051802
error:
1806-
set_bit(I915_WEDGED, &error->flags);
1803+
i915_gem_set_wedged(dev_priv);
18071804
goto wakeup;
18081805
}
18091806

drivers/gpu/drm/i915/i915_drv.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2042,6 +2042,7 @@ struct drm_i915_private {
20422042

20432043
/* Abstract the submission mechanism (legacy ringbuffer or execlists) away */
20442044
struct {
2045+
void (*resume)(struct drm_i915_private *);
20452046
void (*cleanup_engine)(struct intel_engine_cs *engine);
20462047

20472048
/**
@@ -3263,7 +3264,8 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
32633264
return READ_ONCE(error->reset_count);
32643265
}
32653266

3266-
void i915_gem_reset(struct drm_device *dev);
3267+
void i915_gem_reset(struct drm_i915_private *dev_priv);
3268+
void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
32673269
bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
32683270
int __must_check i915_gem_init(struct drm_device *dev);
32693271
int __must_check i915_gem_init_hw(struct drm_device *dev);
@@ -3392,7 +3394,6 @@ void i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj);
33923394
int __must_check i915_gem_context_init(struct drm_device *dev);
33933395
void i915_gem_context_lost(struct drm_i915_private *dev_priv);
33943396
void i915_gem_context_fini(struct drm_device *dev);
3395-
void i915_gem_context_reset(struct drm_device *dev);
33963397
int i915_gem_context_open(struct drm_device *dev, struct drm_file *file);
33973398
void i915_gem_context_close(struct drm_device *dev, struct drm_file *file);
33983399
int i915_switch_context(struct drm_i915_gem_request *req);

drivers/gpu/drm/i915/i915_gem.c

Lines changed: 75 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -2555,29 +2555,85 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
25552555
return NULL;
25562556
}
25572557

2558-
static void i915_gem_reset_engine_status(struct intel_engine_cs *engine)
2558+
static void reset_request(struct drm_i915_gem_request *request)
2559+
{
2560+
void *vaddr = request->ring->vaddr;
2561+
u32 head;
2562+
2563+
/* As this request likely depends on state from the lost
2564+
* context, clear out all the user operations leaving the
2565+
* breadcrumb at the end (so we get the fence notifications).
2566+
*/
2567+
head = request->head;
2568+
if (request->postfix < head) {
2569+
memset(vaddr + head, 0, request->ring->size - head);
2570+
head = 0;
2571+
}
2572+
memset(vaddr + head, 0, request->postfix - head);
2573+
}
2574+
2575+
static void i915_gem_reset_engine(struct intel_engine_cs *engine)
25592576
{
25602577
struct drm_i915_gem_request *request;
2578+
struct i915_gem_context *incomplete_ctx;
25612579
bool ring_hung;
25622580

2581+
/* Ensure irq handler finishes, and not run again. */
2582+
tasklet_kill(&engine->irq_tasklet);
2583+
if (engine->irq_seqno_barrier)
2584+
engine->irq_seqno_barrier(engine);
2585+
25632586
request = i915_gem_find_active_request(engine);
2564-
if (request == NULL)
2587+
if (!request)
25652588
return;
25662589

25672590
ring_hung = engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG;
2568-
25692591
i915_set_reset_status(request->ctx, ring_hung);
2592+
if (!ring_hung)
2593+
return;
2594+
2595+
DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
2596+
engine->name, request->fence.seqno);
2597+
2598+
/* Setup the CS to resume from the breadcrumb of the hung request */
2599+
engine->reset_hw(engine, request);
2600+
2601+
/* Users of the default context do not rely on logical state
2602+
* preserved between batches. They have to emit full state on
2603+
* every batch and so it is safe to execute queued requests following
2604+
* the hang.
2605+
*
2606+
* Other contexts preserve state, now corrupt. We want to skip all
2607+
* queued requests that reference the corrupt context.
2608+
*/
2609+
incomplete_ctx = request->ctx;
2610+
if (i915_gem_context_is_default(incomplete_ctx))
2611+
return;
2612+
25702613
list_for_each_entry_continue(request, &engine->request_list, link)
2571-
i915_set_reset_status(request->ctx, false);
2614+
if (request->ctx == incomplete_ctx)
2615+
reset_request(request);
25722616
}
25732617

2574-
static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
2618+
void i915_gem_reset(struct drm_i915_private *dev_priv)
25752619
{
2576-
struct drm_i915_gem_request *request;
2577-
struct intel_ring *ring;
2620+
struct intel_engine_cs *engine;
25782621

2579-
/* Ensure irq handler finishes, and not run again. */
2580-
tasklet_kill(&engine->irq_tasklet);
2622+
i915_gem_retire_requests(dev_priv);
2623+
2624+
for_each_engine(engine, dev_priv)
2625+
i915_gem_reset_engine(engine);
2626+
2627+
i915_gem_restore_fences(&dev_priv->drm);
2628+
}
2629+
2630+
static void nop_submit_request(struct drm_i915_gem_request *request)
2631+
{
2632+
}
2633+
2634+
static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
2635+
{
2636+
engine->submit_request = nop_submit_request;
25812637

25822638
/* Mark all pending requests as complete so that any concurrent
25832639
* (lockless) lookup doesn't try and wait upon the request as we
@@ -2600,54 +2656,22 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
26002656
spin_unlock(&engine->execlist_lock);
26012657
}
26022658

2603-
/*
2604-
* We must free the requests after all the corresponding objects have
2605-
* been moved off active lists. Which is the same order as the normal
2606-
* retire_requests function does. This is important if object hold
2607-
* implicit references on things like e.g. ppgtt address spaces through
2608-
* the request.
2609-
*/
2610-
request = i915_gem_active_raw(&engine->last_request,
2611-
&engine->i915->drm.struct_mutex);
2612-
if (request)
2613-
i915_gem_request_retire_upto(request);
2614-
GEM_BUG_ON(intel_engine_is_active(engine));
2615-
2616-
/* Having flushed all requests from all queues, we know that all
2617-
* ringbuffers must now be empty. However, since we do not reclaim
2618-
* all space when retiring the request (to prevent HEADs colliding
2619-
* with rapid ringbuffer wraparound) the amount of available space
2620-
* upon reset is less than when we start. Do one more pass over
2621-
* all the ringbuffers to reset last_retired_head.
2622-
*/
2623-
list_for_each_entry(ring, &engine->buffers, link) {
2624-
ring->last_retired_head = ring->tail;
2625-
intel_ring_update_space(ring);
2626-
}
2627-
26282659
engine->i915->gt.active_engines &= ~intel_engine_flag(engine);
26292660
}
26302661

2631-
void i915_gem_reset(struct drm_device *dev)
2662+
void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
26322663
{
2633-
struct drm_i915_private *dev_priv = to_i915(dev);
26342664
struct intel_engine_cs *engine;
26352665

2636-
/*
2637-
* Before we free the objects from the requests, we need to inspect
2638-
* them for finding the guilty party. As the requests only borrow
2639-
* their reference to the objects, the inspection must be done first.
2640-
*/
2641-
for_each_engine(engine, dev_priv)
2642-
i915_gem_reset_engine_status(engine);
2666+
lockdep_assert_held(&dev_priv->drm.struct_mutex);
2667+
set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
26432668

2669+
i915_gem_context_lost(dev_priv);
26442670
for_each_engine(engine, dev_priv)
2645-
i915_gem_reset_engine_cleanup(engine);
2671+
i915_gem_cleanup_engine(engine);
26462672
mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
26472673

2648-
i915_gem_context_reset(dev);
2649-
2650-
i915_gem_restore_fences(dev);
2674+
i915_gem_retire_requests(dev_priv);
26512675
}
26522676

26532677
static void
@@ -4343,8 +4367,7 @@ void i915_gem_resume(struct drm_device *dev)
43434367
* guarantee that the context image is complete. So let's just reset
43444368
* it and start again.
43454369
*/
4346-
if (i915.enable_execlists)
4347-
intel_lr_context_reset(dev_priv, dev_priv->kernel_context);
4370+
dev_priv->gt.resume(dev_priv);
43484371

43494372
mutex_unlock(&dev->struct_mutex);
43504373
}
@@ -4496,8 +4519,10 @@ int i915_gem_init(struct drm_device *dev)
44964519
mutex_lock(&dev->struct_mutex);
44974520

44984521
if (!i915.enable_execlists) {
4522+
dev_priv->gt.resume = intel_legacy_submission_resume;
44994523
dev_priv->gt.cleanup_engine = intel_engine_cleanup;
45004524
} else {
4525+
dev_priv->gt.resume = intel_lr_context_resume;
45014526
dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
45024527
}
45034528

@@ -4530,7 +4555,7 @@ int i915_gem_init(struct drm_device *dev)
45304555
* for all other failure, such as an allocation failure, bail.
45314556
*/
45324557
DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
4533-
set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
4558+
i915_gem_set_wedged(dev_priv);
45344559
ret = 0;
45354560
}
45364561

drivers/gpu/drm/i915/i915_gem_context.c

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -420,22 +420,6 @@ static void i915_gem_context_unpin(struct i915_gem_context *ctx,
420420
}
421421
}
422422

423-
void i915_gem_context_reset(struct drm_device *dev)
424-
{
425-
struct drm_i915_private *dev_priv = to_i915(dev);
426-
427-
lockdep_assert_held(&dev->struct_mutex);
428-
429-
if (i915.enable_execlists) {
430-
struct i915_gem_context *ctx;
431-
432-
list_for_each_entry(ctx, &dev_priv->context_list, link)
433-
intel_lr_context_reset(dev_priv, ctx);
434-
}
435-
436-
i915_gem_context_lost(dev_priv);
437-
}
438-
439423
int i915_gem_context_init(struct drm_device *dev)
440424
{
441425
struct drm_i915_private *dev_priv = to_i915(dev);

drivers/gpu/drm/i915/i915_guc_submission.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -994,6 +994,7 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
994994
struct intel_guc *guc = &dev_priv->guc;
995995
struct i915_guc_client *client;
996996
struct intel_engine_cs *engine;
997+
struct drm_i915_gem_request *request;
997998

998999
/* client for execbuf submission */
9991000
client = guc_client_alloc(dev_priv,
@@ -1010,9 +1011,14 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
10101011
guc_init_doorbell_hw(guc);
10111012

10121013
/* Take over from manual control of ELSP (execlists) */
1013-
for_each_engine(engine, dev_priv)
1014+
for_each_engine(engine, dev_priv) {
10141015
engine->submit_request = i915_guc_submit;
10151016

1017+
/* Replay the current set of previously submitted requests */
1018+
list_for_each_entry(request, &engine->request_list, link)
1019+
i915_guc_submit(request);
1020+
}
1021+
10161022
return 0;
10171023
}
10181024

drivers/gpu/drm/i915/intel_engine_cs.c

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,8 @@ void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
211211
{
212212
memset(&engine->hangcheck, 0, sizeof(engine->hangcheck));
213213
clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
214+
if (intel_engine_has_waiter(engine))
215+
i915_queue_hangcheck(engine->i915);
214216
}
215217

216218
static void intel_engine_init_requests(struct intel_engine_cs *engine)
@@ -230,7 +232,6 @@ static void intel_engine_init_requests(struct intel_engine_cs *engine)
230232
*/
231233
void intel_engine_setup_common(struct intel_engine_cs *engine)
232234
{
233-
INIT_LIST_HEAD(&engine->buffers);
234235
INIT_LIST_HEAD(&engine->execlist_queue);
235236
spin_lock_init(&engine->execlist_lock);
236237

@@ -306,6 +307,18 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
306307
return 0;
307308
}
308309

310+
void intel_engine_reset_irq(struct intel_engine_cs *engine)
311+
{
312+
struct drm_i915_private *dev_priv = engine->i915;
313+
314+
spin_lock_irq(&dev_priv->irq_lock);
315+
if (intel_engine_has_waiter(engine))
316+
engine->irq_enable(engine);
317+
else
318+
engine->irq_disable(engine);
319+
spin_unlock_irq(&dev_priv->irq_lock);
320+
}
321+
309322
/**
310323
* intel_engines_cleanup_common - cleans up the engine state created by
311324
* the common initiailizers.

0 commit comments

Comments
 (0)