Skip to content

Commit 8ed9aaa

Browse files
committed
drm/xe: Force wedged state and block GT reset upon any GPU hang
In many validation situations when debugging GPU Hangs, it is useful to preserve the GT situation from the moment that the timeout occurred. This patch introduces a module parameter that could be used on situations like this. If xe.wedged module parameter is set to 2, Xe will be declared wedged on every single execution timeout (a.k.a. GPU hang) right after devcoredump snapshot capture and without attempting any kind of GT reset and blocking entirely any kind of execution. v2: Really block gt_reset from guc side. (Lucas) s/wedged/busted (Lucas) v3: - s/busted/wedged - Really use global_flags (Dafna) - More robust timeout handling when wedging it. v4: A really robust clean exit done by Matt Brost. No more kernel warns on unbind. v5: Simplify error message (Lucas) Cc: Matthew Brost <[email protected]> Cc: Dafna Hirschfeld <[email protected]> Cc: Lucas De Marchi <[email protected]> Cc: Alan Previn <[email protected]> Cc: Himanshu Somaiya <[email protected]> Reviewed-by: Lucas De Marchi <[email protected]> Link: https://patchwork.freedesktop.org/patch/msgid/[email protected] Signed-off-by: Rodrigo Vivi <[email protected]>
1 parent 6928186 commit 8ed9aaa

File tree

8 files changed

+129
-31
lines changed

8 files changed

+129
-31
lines changed

drivers/gpu/drm/xe/xe_device.c

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -764,3 +764,32 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
764764
{
765765
return address & GENMASK_ULL(xe->info.va_bits - 1, 0);
766766
}
767+
768+
/**
 * xe_device_declare_wedged - Declare device wedged
 * @xe: xe device instance
 *
 * This is a final state that can only be cleared with a module
 * re-probe (unbind + bind).
 * In this state every IOCTL will be blocked so the GT cannot be used.
 * In general it will be called upon any critical error such as gt reset
 * failure or guc loading failure.
 * If xe.wedged module parameter is set to 2, this function will be called
 * on every single execution timeout (a.k.a. GPU hang) right after devcoredump
 * snapshot capture. In this mode, GT reset won't be attempted so the state of
 * the issue is preserved for further debugging.
 */
void xe_device_declare_wedged(struct xe_device *xe)
{
	/* wedged_mode == 0: user opted out of the wedged mechanism entirely */
	if (xe_modparam.wedged_mode == 0)
		return;

	/*
	 * atomic_xchg() returns the previous value, so only the first caller
	 * to wedge the device performs the transition and logs the error.
	 */
	if (!atomic_xchg(&xe->wedged, 1)) {
		xe->needs_flr_on_fini = true;
		drm_err(&xe->drm,
			"CRITICAL: Xe has declared device %s as wedged.\n"
			"IOCTLs and executions are blocked. Only a rebind may clear the failure\n"
			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
			dev_name(xe->drm.dev));
	}
}

drivers/gpu/drm/xe/xe_device.h

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -172,19 +172,6 @@ static inline bool xe_device_wedged(struct xe_device *xe)
172172
return atomic_read(&xe->wedged);
173173
}
174174

175-
static inline void xe_device_declare_wedged(struct xe_device *xe)
176-
{
177-
if (!atomic_xchg(&xe->wedged, 1)) {
178-
xe->needs_flr_on_fini = true;
179-
drm_err(&xe->drm,
180-
"CRITICAL: Xe has declared device %s as wedged.\n"
181-
"IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n"
182-
"echo '%s' > /sys/bus/pci/drivers/xe/unbind\n"
183-
"echo '%s' > /sys/bus/pci/drivers/xe/bind\n"
184-
"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
185-
dev_name(xe->drm.dev), dev_name(xe->drm.dev),
186-
dev_name(xe->drm.dev));
187-
}
188-
}
175+
void xe_device_declare_wedged(struct xe_device *xe);
189176

190177
#endif

drivers/gpu/drm/xe/xe_exec_queue.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,15 @@ void xe_exec_queue_fini(struct xe_exec_queue *q);
2626
void xe_exec_queue_destroy(struct kref *ref);
2727
void xe_exec_queue_assign_name(struct xe_exec_queue *q, u32 instance);
2828

29+
/*
 * Take a reference on @q only if it is still alive (refcount > 0).
 * Returns @q with an extra reference held, or NULL if the queue is
 * already on its way to destruction.  Needed so the wedging path can
 * safely pin queues it walks in the lookup xarray.
 */
static inline struct xe_exec_queue *
xe_exec_queue_get_unless_zero(struct xe_exec_queue *q)
{
	if (kref_get_unless_zero(&q->refcount))
		return q;

	return NULL;
}
37+
2938
struct xe_exec_queue *xe_exec_queue_lookup(struct xe_file *xef, u32 id);
3039

3140
static inline struct xe_exec_queue *xe_exec_queue_get(struct xe_exec_queue *q)

drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt)
245245
return seqno;
246246

247247
xe_gt_tlb_invalidation_wait(gt, seqno);
248-
} else if (xe_device_uc_enabled(xe)) {
248+
} else if (xe_device_uc_enabled(xe) && !xe_device_wedged(xe)) {
249249
xe_gt_WARN_ON(gt, xe_force_wake_get(gt_to_fw(gt), XE_FW_GT));
250250
if (xe->info.platform == XE_PVC || GRAPHICS_VER(xe) >= 20) {
251251
xe_mmio_write32(gt, PVC_GUC_TLB_INV_DESC1,

drivers/gpu/drm/xe/xe_guc_ads.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "xe_lrc.h"
2121
#include "xe_map.h"
2222
#include "xe_mmio.h"
23+
#include "xe_module.h"
2324
#include "xe_platform_types.h"
2425
#include "xe_wa.h"
2526

@@ -440,11 +441,17 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads)
440441

441442
static void guc_policies_init(struct xe_guc_ads *ads)
{
	u32 global_flags = 0;

	ads_blob_write(ads, policies.dpc_promote_time,
		       GLOBAL_POLICY_DEFAULT_DPC_PROMOTE_TIME_US);
	ads_blob_write(ads, policies.max_num_work_items,
		       GLOBAL_POLICY_MAX_NUM_WI);

	/*
	 * wedged_mode == 2 preserves the GT state at the moment of a hang
	 * for debugging: tell GuC not to attempt engine resets so nothing
	 * disturbs the hung state.
	 */
	if (xe_modparam.wedged_mode == 2)
		global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;

	ads_blob_write(ads, policies.global_flags, global_flags);
	ads_blob_write(ads, policies.is_valid, 1);
}
450457

drivers/gpu/drm/xe/xe_guc_submit.c

Lines changed: 75 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include "xe_macros.h"
3636
#include "xe_map.h"
3737
#include "xe_mocs.h"
38+
#include "xe_module.h"
3839
#include "xe_ring_ops_types.h"
3940
#include "xe_sched_job.h"
4041
#include "xe_trace.h"
@@ -59,6 +60,7 @@ exec_queue_to_guc(struct xe_exec_queue *q)
5960
#define ENGINE_STATE_SUSPENDED (1 << 5)
6061
#define EXEC_QUEUE_STATE_RESET (1 << 6)
6162
#define ENGINE_STATE_KILLED (1 << 7)
63+
#define EXEC_QUEUE_STATE_WEDGED (1 << 8)
6264

6365
static bool exec_queue_registered(struct xe_exec_queue *q)
6466
{
@@ -175,9 +177,20 @@ static void set_exec_queue_killed(struct xe_exec_queue *q)
175177
atomic_or(ENGINE_STATE_KILLED, &q->guc->state);
176178
}
177179

178-
/* True once the queue was marked wedged; the state is never cleared. */
static bool exec_queue_wedged(struct xe_exec_queue *q)
{
	return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_WEDGED;
}

static void set_exec_queue_wedged(struct xe_exec_queue *q)
{
	atomic_or(EXEC_QUEUE_STATE_WEDGED, &q->guc->state);
}

/*
 * Any of banned / killed / wedged means the queue must not accept new
 * work or state changes.  Killed and wedged are checked in one atomic
 * read since both live in the same state word.
 */
static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
{
	return exec_queue_banned(q) || (atomic_read(&q->guc->state) &
		(EXEC_QUEUE_STATE_WEDGED | ENGINE_STATE_KILLED));
}
182195

183196
#ifdef CONFIG_PROVE_LOCKING
@@ -240,6 +253,17 @@ static void guc_submit_fini(struct drm_device *drm, void *arg)
240253
free_submit_wq(guc);
241254
}
242255

256+
/*
 * drmm (managed) release action registered by guc_submit_wedged(): drop
 * the extra reference taken on every queue that was marked wedged, so
 * queues can be freed cleanly on driver unbind.
 */
static void guc_submit_wedged_fini(struct drm_device *drm, void *arg)
{
	struct xe_guc *guc = arg;
	struct xe_exec_queue *q;
	unsigned long index;

	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
		if (exec_queue_wedged(q))
			xe_exec_queue_put(q);
}
266+
243267
static const struct xe_exec_queue_ops guc_exec_queue_ops;
244268

245269
static void primelockdep(struct xe_guc *guc)
@@ -708,7 +732,7 @@ guc_exec_queue_run_job(struct drm_sched_job *drm_job)
708732

709733
trace_xe_sched_job_run(job);
710734

711-
if (!exec_queue_killed_or_banned(q) && !xe_sched_job_is_error(job)) {
735+
if (!exec_queue_killed_or_banned_or_wedged(q) && !xe_sched_job_is_error(job)) {
712736
if (!exec_queue_registered(q))
713737
register_engine(q);
714738
if (!lr) /* LR jobs are emitted in the exec IOCTL */
@@ -844,6 +868,28 @@ static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
844868
xe_sched_tdr_queue_imm(&q->guc->sched);
845869
}
846870

871+
/*
 * Wedge the whole GuC submission backend: declare the device wedged,
 * stop submission and CT communication, then pin and flag every live
 * exec queue so no further scheduling state changes are possible.
 * Called from the LR-cleanup and TDR paths when wedged_mode == 2.
 */
static void guc_submit_wedged(struct xe_guc *guc)
{
	struct xe_exec_queue *q;
	unsigned long index;
	int err;

	xe_device_declare_wedged(guc_to_xe(guc));
	xe_guc_submit_reset_prepare(guc);
	xe_guc_ct_stop(&guc->ct);

	/*
	 * Register a managed cleanup to drop the per-queue references taken
	 * below on unbind; on registration failure the action already ran,
	 * so don't take the references at all.
	 */
	err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
				       guc_submit_wedged_fini, guc);
	if (err)
		return;

	mutex_lock(&guc->submission_state.lock);
	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
		if (xe_exec_queue_get_unless_zero(q))
			set_exec_queue_wedged(q);
	mutex_unlock(&guc->submission_state.lock);
}
892+
847893
static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
848894
{
849895
struct xe_guc_exec_queue *ge =
@@ -852,10 +898,16 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
852898
struct xe_guc *guc = exec_queue_to_guc(q);
853899
struct xe_device *xe = guc_to_xe(guc);
854900
struct xe_gpu_scheduler *sched = &ge->sched;
901+
bool wedged = xe_device_wedged(xe);
855902

856903
xe_assert(xe, xe_exec_queue_is_lr(q));
857904
trace_xe_exec_queue_lr_cleanup(q);
858905

906+
if (!wedged && xe_modparam.wedged_mode == 2) {
907+
guc_submit_wedged(exec_queue_to_guc(q));
908+
wedged = true;
909+
}
910+
859911
/* Kill the run_job / process_msg entry points */
860912
xe_sched_submission_stop(sched);
861913

@@ -870,7 +922,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
870922
* xe_guc_deregister_done_handler() which treats it as an unexpected
871923
* state.
872924
*/
873-
if (exec_queue_registered(q) && !exec_queue_destroyed(q)) {
925+
if (!wedged && exec_queue_registered(q) && !exec_queue_destroyed(q)) {
874926
struct xe_guc *guc = exec_queue_to_guc(q);
875927
int ret;
876928

@@ -905,6 +957,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
905957
struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
906958
int err = -ETIME;
907959
int i = 0;
960+
bool wedged = xe_device_wedged(xe);
908961

909962
/*
910963
* TDR has fired before free job worker. Common if exec queue
@@ -928,15 +981,20 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
928981

929982
trace_xe_sched_job_timedout(job);
930983

984+
if (!wedged && xe_modparam.wedged_mode == 2) {
985+
guc_submit_wedged(exec_queue_to_guc(q));
986+
wedged = true;
987+
}
988+
931989
/* Kill the run_job entry point */
932990
xe_sched_submission_stop(sched);
933991

934992
/*
935993
* Kernel jobs should never fail, nor should VM jobs if they do
936994
* somethings has gone wrong and the GT needs a reset
937995
*/
938-
if (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
939-
(q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q))) {
996+
if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
997+
(q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
940998
if (!xe_sched_invalidate_job(job, 2)) {
941999
xe_sched_add_pending_job(sched, job);
9421000
xe_sched_submission_start(sched);
@@ -946,7 +1004,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
9461004
}
9471005

9481006
/* Engine state now stable, disable scheduling if needed */
949-
if (exec_queue_registered(q)) {
1007+
if (!wedged && exec_queue_registered(q)) {
9501008
struct xe_guc *guc = exec_queue_to_guc(q);
9511009
int ret;
9521010

@@ -989,6 +1047,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
9891047
*/
9901048
xe_sched_add_pending_job(sched, job);
9911049
xe_sched_submission_start(sched);
1050+
9921051
xe_guc_exec_queue_trigger_cleanup(q);
9931052

9941053
/* Mark all outstanding jobs as bad, thus completing them */
@@ -1028,7 +1087,7 @@ static void guc_exec_queue_fini_async(struct xe_exec_queue *q)
10281087
INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async);
10291088

10301089
/* We must block on kernel engines so slabs are empty on driver unload */
1031-
if (q->flags & EXEC_QUEUE_FLAG_PERMANENT)
1090+
if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q))
10321091
__guc_exec_queue_fini_async(&q->guc->fini_async);
10331092
else
10341093
queue_work(system_wq, &q->guc->fini_async);
@@ -1063,7 +1122,7 @@ static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg)
10631122

10641123
static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q)
10651124
{
1066-
return !exec_queue_killed_or_banned(q) && exec_queue_registered(q);
1125+
return !exec_queue_killed_or_banned_or_wedged(q) && exec_queue_registered(q);
10671126
}
10681127

10691128
static void __guc_exec_queue_process_msg_set_sched_props(struct xe_sched_msg *msg)
@@ -1274,7 +1333,7 @@ static void guc_exec_queue_fini(struct xe_exec_queue *q)
12741333
{
12751334
struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP;
12761335

1277-
if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT))
1336+
if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && !exec_queue_wedged(q))
12781337
guc_exec_queue_add_msg(q, msg, CLEANUP);
12791338
else
12801339
__guc_exec_queue_fini(exec_queue_to_guc(q), q);
@@ -1285,7 +1344,8 @@ static int guc_exec_queue_set_priority(struct xe_exec_queue *q,
12851344
{
12861345
struct xe_sched_msg *msg;
12871346

1288-
if (q->sched_props.priority == priority || exec_queue_killed_or_banned(q))
1347+
if (q->sched_props.priority == priority ||
1348+
exec_queue_killed_or_banned_or_wedged(q))
12891349
return 0;
12901350

12911351
msg = kmalloc(sizeof(*msg), GFP_KERNEL);
@@ -1303,7 +1363,7 @@ static int guc_exec_queue_set_timeslice(struct xe_exec_queue *q, u32 timeslice_u
13031363
struct xe_sched_msg *msg;
13041364

13051365
if (q->sched_props.timeslice_us == timeslice_us ||
1306-
exec_queue_killed_or_banned(q))
1366+
exec_queue_killed_or_banned_or_wedged(q))
13071367
return 0;
13081368

13091369
msg = kmalloc(sizeof(*msg), GFP_KERNEL);
@@ -1322,7 +1382,7 @@ static int guc_exec_queue_set_preempt_timeout(struct xe_exec_queue *q,
13221382
struct xe_sched_msg *msg;
13231383

13241384
if (q->sched_props.preempt_timeout_us == preempt_timeout_us ||
1325-
exec_queue_killed_or_banned(q))
1385+
exec_queue_killed_or_banned_or_wedged(q))
13261386
return 0;
13271387

13281388
msg = kmalloc(sizeof(*msg), GFP_KERNEL);
@@ -1339,7 +1399,7 @@ static int guc_exec_queue_suspend(struct xe_exec_queue *q)
13391399
{
13401400
struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_SUSPEND;
13411401

1342-
if (exec_queue_killed_or_banned(q) || q->guc->suspend_pending)
1402+
if (exec_queue_killed_or_banned_or_wedged(q) || q->guc->suspend_pending)
13431403
return -EINVAL;
13441404

13451405
q->guc->suspend_pending = true;
@@ -1485,7 +1545,7 @@ static void guc_exec_queue_start(struct xe_exec_queue *q)
14851545
{
14861546
struct xe_gpu_scheduler *sched = &q->guc->sched;
14871547

1488-
if (!exec_queue_killed_or_banned(q)) {
1548+
if (!exec_queue_killed_or_banned_or_wedged(q)) {
14891549
int i;
14901550

14911551
trace_xe_exec_queue_resubmit(q);

drivers/gpu/drm/xe/xe_module.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ struct xe_modparam xe_modparam = {
1717
.enable_display = true,
1818
.guc_log_level = 5,
1919
.force_probe = CONFIG_DRM_XE_FORCE_PROBE,
20+
.wedged_mode = 1,
2021
/* the rest are 0 by default */
2122
};
2223

@@ -55,6 +56,10 @@ MODULE_PARM_DESC(max_vfs,
5556
"(0 = no VFs [default]; N = allow up to N VFs)");
5657
#endif
5758

59+
module_param_named_unsafe(wedged_mode, xe_modparam.wedged_mode, int, 0600);
60+
MODULE_PARM_DESC(wedged_mode,
61+
"Module's default policy for the wedged mode - 0=never, 1=upon-critical-errors[default], 2=upon-any-hang");
62+
5863
struct init_funcs {
5964
int (*init)(void);
6065
void (*exit)(void);

drivers/gpu/drm/xe/xe_module.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ struct xe_modparam {
2121
#ifdef CONFIG_PCI_IOV
2222
unsigned int max_vfs;
2323
#endif
24+
int wedged_mode;
2425
};
2526

2627
extern struct xe_modparam xe_modparam;

0 commit comments

Comments
 (0)