Commit c0dcb20

drm/i915: Restore context and pd for ringbuffer submission after reset
Following a reset, the context and page directory registers are lost.
However, the queue of requests that we resubmit after the reset may
depend upon them - the registers are restored from a context image, but
that restore may be inhibited and may simply be absent from the request
if it was in the middle of a sequence using the same context. If we
prime the CCID/PD registers with the first request in the queue (even
for the hung request), we prevent invalid memory access for the
following requests (and continually hung engines).

v2: Magic BIT(8), reserved for future use but still appears unused.
v3: Some commentary on handling innocent vs guilty requests
v4: Add a wait for PD_BASE fetch. The reload appears to be instant on
my Ivybridge, but this bit probably exists for a reason.

Fixes: 821ed7d ("drm/i915: Update reset path to fix incomplete requests")
Signed-off-by: Chris Wilson <[email protected]>
Cc: Tvrtko Ursulin <[email protected]>
Cc: Mika Kuoppala <[email protected]>
Link: http://patchwork.freedesktop.org/patch/msgid/[email protected]
Reviewed-by: Mika Kuoppala <[email protected]>
1 parent 6248017 commit c0dcb20
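In short: the legacy (ringbuffer) reset path now re-primes the last-loaded
context and the ppgtt page directory from the first request left in the
queue before the engine restarts. A condensed sketch of the
reset_ring_common() hunk in intel_ringbuffer.c below (identifiers as in
that hunk; the NULL-request case and error handling are omitted):

	/* Condensed sketch, not the full patch: re-prime CCID and the
	 * page directory so resubmitted requests see valid state.
	 */
	if (ce->state)	/* logical context image to reload? */
		I915_WRITE(CCID,
			   i915_ggtt_offset(ce->state) |
			   BIT(8) /* reserved, but must be set */ |
			   CCID_EXTENDED_STATE_SAVE |
			   CCID_EXTENDED_STATE_RESTORE |
			   CCID_EN);

	I915_WRITE(RING_PP_DIR_DCLV(engine), PP_DIR_DCLV_2G);
	I915_WRITE(RING_PP_DIR_BASE(engine),
		   ppgtt->pd.base.ggtt_offset << 10);

	/* v4: wait for the PD fetch to complete before restarting */
	intel_wait_for_register(dev_priv, RING_PP_DIR_BASE(engine),
				BIT(0), 0, 10);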

4 files changed, +81 -17 lines changed

drivers/gpu/drm/i915/i915_gem.c

Lines changed: 7 additions & 11 deletions
@@ -2745,21 +2745,17 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
 	engine->irq_seqno_barrier(engine);
 
 	request = i915_gem_find_active_request(engine);
-	if (!request)
-		return;
-
-	if (!i915_gem_reset_request(request))
-		return;
+	if (request && i915_gem_reset_request(request)) {
+		DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
+				 engine->name, request->global_seqno);
 
-	DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
-			 engine->name, request->global_seqno);
+		/* If this context is now banned, skip all pending requests. */
+		if (i915_gem_context_is_banned(request->ctx))
+			engine_skip_context(request);
+	}
 
 	/* Setup the CS to resume from the breadcrumb of the hung request */
 	engine->reset_hw(engine, request);
-
-	/* If this context is now banned, skip all of its pending requests. */
-	if (i915_gem_context_is_banned(request->ctx))
-		engine_skip_context(request);
 }
 
 void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
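Design note: before this change, a missing or innocent request caused an
early return, so engine->reset_hw() was never reached; it is now called
unconditionally (request may be NULL), which is what lets the per-engine
reset hooks changed below restore context and PD state on every reset.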

drivers/gpu/drm/i915/i915_reg.h

Lines changed: 4 additions & 2 deletions
@@ -3307,8 +3307,10 @@ enum skl_disp_power_wells {
 /*
  * Logical Context regs
  */
-#define CCID			_MMIO(0x2180)
-#define CCID_EN			(1<<0)
+#define CCID				_MMIO(0x2180)
+#define   CCID_EN			BIT(0)
+#define   CCID_EXTENDED_STATE_RESTORE	BIT(2)
+#define   CCID_EXTENDED_STATE_SAVE	BIT(3)
 /*
  * Notes on SNB/IVB/VLV context size:
  * - Power context is saved elsewhere (LLC or stolen)

drivers/gpu/drm/i915/intel_lrc.c

Lines changed: 15 additions & 1 deletion
@@ -1352,7 +1352,20 @@ static void reset_common_ring(struct intel_engine_cs *engine,
 			      struct drm_i915_gem_request *request)
 {
 	struct execlist_port *port = engine->execlist_port;
-	struct intel_context *ce = &request->ctx->engine[engine->id];
+	struct intel_context *ce;
+
+	/* If the request was innocent, we leave the request in the ELSP
+	 * and will try to replay it on restarting. The context image may
+	 * have been corrupted by the reset, in which case we may have
+	 * to service a new GPU hang, but more likely we can continue on
+	 * without impact.
+	 *
+	 * If the request was guilty, we presume the context is corrupt
+	 * and have to at least restore the RING register in the context
+	 * image back to the expected values to skip over the guilty request.
+	 */
+	if (!request || request->fence.error != -EIO)
+		return;
 
 	/* We want a simple context + ring to execute the breadcrumb update.
 	 * We cannot rely on the context being intact across the GPU hang,
@@ -1361,6 +1374,7 @@ static void reset_common_ring(struct intel_engine_cs *engine,
 	 * future request will be after userspace has had the opportunity
 	 * to recreate its own state.
 	 */
+	ce = &request->ctx->engine[engine->id];
 	execlists_init_reg_state(ce->lrc_reg_state,
 				 request->ctx, engine, ce->ring);
 
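The early return above encodes the innocent/guilty split for execlists.
A hypothetical helper (not in the patch) expressing the same policy:

	/* Hypothetical helper, not part of the patch: does this reset
	 * need to rewrite the context image?
	 */
	static bool
	reset_needs_ctx_restore(const struct drm_i915_gem_request *request)
	{
		/* No request, or an innocent one: keep the image and let
		 * the ELSP replay the request.
		 */
		if (!request || request->fence.error != -EIO)
			return false;

		/* Guilty: the image is presumed corrupt; restore at least
		 * the RING registers so the CS skips past the hung batch.
		 */
		return true;
	}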

drivers/gpu/drm/i915/intel_ringbuffer.c

Lines changed: 55 additions & 3 deletions
@@ -599,10 +599,62 @@ static int init_ring_common(struct intel_engine_cs *engine)
 static void reset_ring_common(struct intel_engine_cs *engine,
 			      struct drm_i915_gem_request *request)
 {
-	struct intel_ring *ring = request->ring;
+	/* Try to restore the logical GPU state to match the continuation
+	 * of the request queue. If we skip the context/PD restore, then
+	 * the next request may try to execute assuming that its context
+	 * is valid and loaded on the GPU and so may try to access invalid
+	 * memory, prompting repeated GPU hangs.
+	 *
+	 * If the request was guilty, we still restore the logical state
+	 * in case the next request requires it (e.g. the aliasing ppgtt),
+	 * but skip over the hung batch.
+	 *
+	 * If the request was innocent, we try to replay the request with
+	 * the restored context.
+	 */
+	if (request) {
+		struct drm_i915_private *dev_priv = request->i915;
+		struct intel_context *ce = &request->ctx->engine[engine->id];
+		struct i915_hw_ppgtt *ppgtt;
+
+		/* FIXME consider gen8 reset */
+
+		if (ce->state) {
+			I915_WRITE(CCID,
+				   i915_ggtt_offset(ce->state) |
+				   BIT(8) /* must be set! */ |
+				   CCID_EXTENDED_STATE_SAVE |
+				   CCID_EXTENDED_STATE_RESTORE |
+				   CCID_EN);
+		}
 
-	ring->head = request->postfix;
-	ring->last_retired_head = -1;
+		ppgtt = request->ctx->ppgtt ?: engine->i915->mm.aliasing_ppgtt;
+		if (ppgtt) {
+			u32 pd_offset = ppgtt->pd.base.ggtt_offset << 10;
+
+			I915_WRITE(RING_PP_DIR_DCLV(engine), PP_DIR_DCLV_2G);
+			I915_WRITE(RING_PP_DIR_BASE(engine), pd_offset);
+
+			/* Wait for the PD reload to complete */
+			if (intel_wait_for_register(dev_priv,
+						    RING_PP_DIR_BASE(engine),
+						    BIT(0), 0,
+						    10))
+				DRM_ERROR("Wait for reload of ppgtt page-directory timed out\n");
+
+			ppgtt->pd_dirty_rings &= ~intel_engine_flag(engine);
+		}
+
+		/* If the rq hung, jump to its breadcrumb and skip the batch */
+		if (request->fence.error == -EIO) {
+			struct intel_ring *ring = request->ring;
+
+			ring->head = request->postfix;
+			ring->last_retired_head = -1;
+		}
+	} else {
+		engine->legacy_active_context = NULL;
+	}
 }
 
 static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
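The v4 addition is the intel_wait_for_register() call: it polls
RING_PP_DIR_BASE until the masked bit (BIT(0)) reads back as 0,
returning -ETIMEDOUT after the 10ms timeout. Open-coded, the contract
is roughly the following sketch (assuming dev_priv and engine in scope;
the real helper also manages forcewake):

	/* Rough equivalent of intel_wait_for_register(dev_priv,
	 * RING_PP_DIR_BASE(engine), BIT(0), 0, 10): poll until
	 * (read & mask) == value, or give up after 10ms.
	 */
	const unsigned long timeout = jiffies + msecs_to_jiffies(10);
	int err = 0;

	while ((I915_READ(RING_PP_DIR_BASE(engine)) & BIT(0)) != 0) {
		if (time_after(jiffies, timeout)) {
			err = -ETIMEDOUT;
			break;
		}
		cpu_relax();
	}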
