
Commit 42fb60d

drm/i915/gem: Don't leak non-persistent requests on changing engines
If we have a set of active engines marked as being non-persistent, we lose track of them if the user replaces those engines with I915_CONTEXT_PARAM_ENGINES. Part of our uABI contract is that non-persistent requests are terminated once they are no longer being tracked by the user's context (to prevent a lost request causing an untracked, and so unstoppable, GPU hang), so we need to apply the same cancellation upon changing engines.

v2: Track stale engines[] so we only reap at context closure.
v3: Tvrtko spotted races between closing contexts and set-engines, so add a veneer of kill-everything paranoia to clean up after losing a race.

Fixes: a0e0471 ("drm/i915/gem: Make context persistence optional")
Testcase: igt/gem_ctx_persistence/replace
Signed-off-by: Chris Wilson <[email protected]>
Cc: Tvrtko Ursulin <[email protected]>
Reviewed-by: Tvrtko Ursulin <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
1 parent 0b02f97 commit 42fb60d
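To illustrate the uABI contract described above, here is a hypothetical userspace sketch of the affected sequence. It is not part of this commit; it assumes only the stock i915 uAPI from <drm/i915_drm.h>, and error handling is elided.

/*
 * Hypothetical sketch of the leak scenario: a context opts out of
 * persistence, keeps requests in flight, then replaces its engines[]
 * via I915_CONTEXT_PARAM_ENGINES. Before this commit, the old,
 * still-busy engine set was no longer tracked and never cancelled.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static void replace_engines_nonpersistent(int fd, uint32_t ctx_id)
{
	struct drm_i915_gem_context_param p = {
		.ctx_id = ctx_id,
		.param = I915_CONTEXT_PARAM_PERSISTENCE,
		.value = 0, /* requests must not outlive their engines */
	};
	ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);

	/* ... submit a long-running batch on the current engines ... */

	/* Replace engines[]; the old set must now be reaped, not leaked. */
	I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 1) = {
		.engines = { { I915_ENGINE_CLASS_RENDER, 0 } },
	};
	memset(&p, 0, sizeof(p));
	p.ctx_id = ctx_id;
	p.param = I915_CONTEXT_PARAM_ENGINES;
	p.value = (uintptr_t)&engines;
	p.size = sizeof(engines);
	ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
}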

File tree

4 files changed (+141, -13 lines)


drivers/gpu/drm/i915/gem/i915_gem_context.c

Lines changed: 114 additions & 8 deletions
@@ -270,7 +270,8 @@ static struct i915_gem_engines *default_engines(struct i915_gem_context *ctx)
 	if (!e)
 		return ERR_PTR(-ENOMEM);
 
-	init_rcu_head(&e->rcu);
+	e->ctx = ctx;
+
 	for_each_engine(engine, gt, id) {
 		struct intel_context *ce;
 
@@ -450,7 +451,7 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce)
 	return engine;
 }
 
-static void kill_context(struct i915_gem_context *ctx)
+static void kill_engines(struct i915_gem_engines *engines)
 {
 	struct i915_gem_engines_iter it;
 	struct intel_context *ce;
@@ -462,7 +463,7 @@ static void kill_context(struct i915_gem_context *ctx)
 	 * However, we only care about pending requests, so only include
 	 * engines on which there are incomplete requests.
 	 */
-	for_each_gem_engine(ce, __context_engines_static(ctx), it) {
+	for_each_gem_engine(ce, engines, it) {
 		struct intel_engine_cs *engine;
 
 		if (intel_context_set_banned(ce))
@@ -484,8 +485,37 @@ static void kill_context(struct i915_gem_context *ctx)
 		 * the context from the GPU, we have to resort to a full
 		 * reset. We hope the collateral damage is worth it.
 		 */
-		__reset_context(ctx, engine);
+		__reset_context(engines->ctx, engine);
+	}
+}
+
+static void kill_stale_engines(struct i915_gem_context *ctx)
+{
+	struct i915_gem_engines *pos, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->stale.lock, flags);
+	list_for_each_entry_safe(pos, next, &ctx->stale.engines, link) {
+		if (!i915_sw_fence_await(&pos->fence))
+			continue;
+
+		spin_unlock_irqrestore(&ctx->stale.lock, flags);
+
+		kill_engines(pos);
+
+		spin_lock_irqsave(&ctx->stale.lock, flags);
+		list_safe_reset_next(pos, next, link);
+		list_del_init(&pos->link); /* decouple from FENCE_COMPLETE */
+
+		i915_sw_fence_complete(&pos->fence);
 	}
+	spin_unlock_irqrestore(&ctx->stale.lock, flags);
+}
+
+static void kill_context(struct i915_gem_context *ctx)
+{
+	kill_stale_engines(ctx);
+	kill_engines(__context_engines_static(ctx));
 }
 
 static void set_closed_name(struct i915_gem_context *ctx)
@@ -602,6 +632,9 @@ __create_context(struct drm_i915_private *i915)
 	ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
 	mutex_init(&ctx->mutex);
 
+	spin_lock_init(&ctx->stale.lock);
+	INIT_LIST_HEAD(&ctx->stale.engines);
+
 	mutex_init(&ctx->engines_mutex);
 	e = default_engines(ctx);
 	if (IS_ERR(e)) {
@@ -1529,6 +1562,77 @@ static const i915_user_extension_fn set_engines__extensions[] = {
 	[I915_CONTEXT_ENGINES_EXT_BOND] = set_engines__bond,
 };
 
+static int engines_notify(struct i915_sw_fence *fence,
+			  enum i915_sw_fence_notify state)
+{
+	struct i915_gem_engines *engines =
+		container_of(fence, typeof(*engines), fence);
+
+	switch (state) {
+	case FENCE_COMPLETE:
+		if (!list_empty(&engines->link)) {
+			struct i915_gem_context *ctx = engines->ctx;
+			unsigned long flags;
+
+			spin_lock_irqsave(&ctx->stale.lock, flags);
+			list_del(&engines->link);
+			spin_unlock_irqrestore(&ctx->stale.lock, flags);
+		}
+		break;
+
+	case FENCE_FREE:
+		init_rcu_head(&engines->rcu);
+		call_rcu(&engines->rcu, free_engines_rcu);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static void engines_idle_release(struct i915_gem_engines *engines)
+{
+	struct i915_gem_engines_iter it;
+	struct intel_context *ce;
+	unsigned long flags;
+
+	GEM_BUG_ON(!engines);
+	i915_sw_fence_init(&engines->fence, engines_notify);
+
+	INIT_LIST_HEAD(&engines->link);
+	spin_lock_irqsave(&engines->ctx->stale.lock, flags);
+	if (!i915_gem_context_is_closed(engines->ctx))
+		list_add(&engines->link, &engines->ctx->stale.engines);
+	spin_unlock_irqrestore(&engines->ctx->stale.lock, flags);
+	if (list_empty(&engines->link)) /* raced, already closed */
+		goto kill;
+
+	for_each_gem_engine(ce, engines, it) {
+		struct dma_fence *fence;
+		int err;
+
+		if (!ce->timeline)
+			continue;
+
+		fence = i915_active_fence_get(&ce->timeline->last_request);
+		if (!fence)
+			continue;
+
+		err = i915_sw_fence_await_dma_fence(&engines->fence,
+						    fence, 0,
+						    GFP_KERNEL);
+
+		dma_fence_put(fence);
+		if (err < 0)
+			goto kill;
+	}
+	goto out;
+
+kill:
+	kill_engines(engines);
+out:
+	i915_sw_fence_commit(&engines->fence);
+}
+
 static int
 set_engines(struct i915_gem_context *ctx,
 	    const struct drm_i915_gem_context_param *args)
@@ -1571,7 +1675,8 @@ set_engines(struct i915_gem_context *ctx,
 	if (!set.engines)
 		return -ENOMEM;
 
-	init_rcu_head(&set.engines->rcu);
+	set.engines->ctx = ctx;
+
 	for (n = 0; n < num_engines; n++) {
 		struct i915_engine_class_instance ci;
 		struct intel_engine_cs *engine;
@@ -1631,7 +1736,8 @@ set_engines(struct i915_gem_context *ctx,
 	set.engines = rcu_replace_pointer(ctx->engines, set.engines, 1);
 	mutex_unlock(&ctx->engines_mutex);
 
-	call_rcu(&set.engines->rcu, free_engines_rcu);
+	/* Keep track of old engine sets for kill_context() */
+	engines_idle_release(set.engines);
 
 	return 0;
 }
@@ -1646,7 +1752,6 @@ __copy_engines(struct i915_gem_engines *e)
 	if (!copy)
 		return ERR_PTR(-ENOMEM);
 
-	init_rcu_head(&copy->rcu);
 	for (n = 0; n < e->num_engines; n++) {
 		if (e->engines[n])
 			copy->engines[n] = intel_context_get(e->engines[n]);
@@ -1890,7 +1995,8 @@ static int clone_engines(struct i915_gem_context *dst,
 	if (!clone)
 		goto err_unlock;
 
-	init_rcu_head(&clone->rcu);
+	clone->ctx = dst;
+
 	for (n = 0; n < e->num_engines; n++) {
 		struct intel_engine_cs *engine;

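The subtle part of the new kill_stale_engines() above is the lock/reference dance. Restated here with explanatory comments added; the statements themselves are unchanged from the hunk above, and pos, next, ctx and flags are as declared there.

	spin_lock_irqsave(&ctx->stale.lock, flags);
	list_for_each_entry_safe(pos, next, &ctx->stale.engines, link) {
		/* Take an extra "pending" reference on the sw fence; this
		 * fails iff the fence has already signaled, in which case
		 * engines_notify() owns pos and we must not touch it. */
		if (!i915_sw_fence_await(&pos->fence))
			continue;

		/* Safe to drop the lock: our await keeps pos alive. */
		spin_unlock_irqrestore(&ctx->stale.lock, flags);
		kill_engines(pos);
		spin_lock_irqsave(&ctx->stale.lock, flags);

		/* The list may have changed while unlocked: re-resolve next,
		 * and unhook pos so FENCE_COMPLETE skips its list_del(). */
		list_safe_reset_next(pos, next, link);
		list_del_init(&pos->link);

		i915_sw_fence_complete(&pos->fence); /* drop our await */
	}
	spin_unlock_irqrestore(&ctx->stale.lock, flags);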
drivers/gpu/drm/i915/gem/i915_gem_context_types.h

Lines changed: 12 additions & 1 deletion
@@ -20,6 +20,7 @@
 #include "gt/intel_context_types.h"
 
 #include "i915_scheduler.h"
+#include "i915_sw_fence.h"
 
 struct pid;
 
@@ -30,7 +31,12 @@ struct intel_timeline;
 struct intel_ring;
 
 struct i915_gem_engines {
-	struct rcu_head rcu;
+	union {
+		struct list_head link;
+		struct rcu_head rcu;
+	};
+	struct i915_sw_fence fence;
+	struct i915_gem_context *ctx;
 	unsigned int num_engines;
 	struct intel_context *engines[];
 };
@@ -173,6 +179,11 @@ struct i915_gem_context {
 	 * context in messages.
 	 */
 	char name[TASK_COMM_LEN + 8];
+
+	struct {
+		struct spinlock lock;
+		struct list_head engines;
+	} stale;
 };
 
 #endif /* __I915_GEM_CONTEXT_TYPES_H__ */

drivers/gpu/drm/i915/i915_sw_fence.c

Lines changed: 14 additions & 3 deletions
@@ -211,10 +211,21 @@ void i915_sw_fence_complete(struct i915_sw_fence *fence)
 	__i915_sw_fence_complete(fence, NULL);
 }
 
-void i915_sw_fence_await(struct i915_sw_fence *fence)
+bool i915_sw_fence_await(struct i915_sw_fence *fence)
 {
-	debug_fence_assert(fence);
-	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
+	int pending;
+
+	/*
+	 * It is only safe to add a new await to the fence while it has
+	 * not yet been signaled (i.e. there are still existing signalers).
+	 */
+	pending = atomic_read(&fence->pending);
+	do {
+		if (pending < 1)
+			return false;
+	} while (!atomic_try_cmpxchg(&fence->pending, &pending, pending + 1));
+
+	return true;
 }
 
 void __i915_sw_fence_init(struct i915_sw_fence *fence,
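The reworked i915_sw_fence_await() turns an unconditional reference grab into increment-unless-signaled, which is why its return type changes to bool in the header below. As a standalone C11 sketch of the same pattern (illustrative only; the kernel code above uses atomic_try_cmpxchg()):

#include <stdatomic.h>
#include <stdbool.h>

/* Atomically take a reference on `pending` only while it is still
 * positive, i.e. while the fence has not yet signaled. On failure,
 * atomic_compare_exchange_weak() reloads `old` and we retry. */
static bool fence_await(atomic_int *pending)
{
	int old = atomic_load(pending);

	do {
		if (old < 1)
			return false; /* already signaled; too late to await */
	} while (!atomic_compare_exchange_weak(pending, &old, old + 1));

	return true;
}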

drivers/gpu/drm/i915/i915_sw_fence.h

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
 				    unsigned long timeout,
 				    gfp_t gfp);
 
-void i915_sw_fence_await(struct i915_sw_fence *fence);
+bool i915_sw_fence_await(struct i915_sw_fence *fence);
 void i915_sw_fence_complete(struct i915_sw_fence *fence);
 
 static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)
