 #include "xe_macros.h"
 #include "xe_map.h"
 #include "xe_mocs.h"
+#include "xe_module.h"
 #include "xe_ring_ops_types.h"
 #include "xe_sched_job.h"
 #include "xe_trace.h"
@@ -59,6 +60,7 @@ exec_queue_to_guc(struct xe_exec_queue *q)
 #define ENGINE_STATE_SUSPENDED		(1 << 5)
 #define EXEC_QUEUE_STATE_RESET		(1 << 6)
 #define ENGINE_STATE_KILLED		(1 << 7)
+#define EXEC_QUEUE_STATE_WEDGED		(1 << 8)
 
 static bool exec_queue_registered(struct xe_exec_queue *q)
 {
@@ -175,9 +177,20 @@ static void set_exec_queue_killed(struct xe_exec_queue *q)
 	atomic_or(ENGINE_STATE_KILLED, &q->guc->state);
 }
 
-static bool exec_queue_killed_or_banned(struct xe_exec_queue *q)
+static bool exec_queue_wedged(struct xe_exec_queue *q)
 {
-	return exec_queue_killed(q) || exec_queue_banned(q);
+	return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_WEDGED;
+}
+
+static void set_exec_queue_wedged(struct xe_exec_queue *q)
+{
+	atomic_or(EXEC_QUEUE_STATE_WEDGED, &q->guc->state);
+}
+
+static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
+{
+	return exec_queue_banned(q) || (atomic_read(&q->guc->state) &
+		(EXEC_QUEUE_STATE_WEDGED | ENGINE_STATE_KILLED));
 }
 
 #ifdef CONFIG_PROVE_LOCKING
@@ -240,6 +253,17 @@ static void guc_submit_fini(struct drm_device *drm, void *arg)
 	free_submit_wq(guc);
 }
 
+static void guc_submit_wedged_fini(struct drm_device *drm, void *arg)
+{
+	struct xe_guc *guc = arg;
+	struct xe_exec_queue *q;
+	unsigned long index;
+
+	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+		if (exec_queue_wedged(q))
+			xe_exec_queue_put(q);
+}
+
 static const struct xe_exec_queue_ops guc_exec_queue_ops;
 
 static void primelockdep(struct xe_guc *guc)
@@ -708,7 +732,7 @@ guc_exec_queue_run_job(struct drm_sched_job *drm_job)
 
 	trace_xe_sched_job_run(job);
 
-	if (!exec_queue_killed_or_banned(q) && !xe_sched_job_is_error(job)) {
+	if (!exec_queue_killed_or_banned_or_wedged(q) && !xe_sched_job_is_error(job)) {
 		if (!exec_queue_registered(q))
 			register_engine(q);
 		if (!lr)	/* LR jobs are emitted in the exec IOCTL */
@@ -844,6 +868,28 @@ static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
 		xe_sched_tdr_queue_imm(&q->guc->sched);
 }
 
+static void guc_submit_wedged(struct xe_guc *guc)
+{
+	struct xe_exec_queue *q;
+	unsigned long index;
+	int err;
+
+	xe_device_declare_wedged(guc_to_xe(guc));
+	xe_guc_submit_reset_prepare(guc);
+	xe_guc_ct_stop(&guc->ct);
+
+	err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
+				       guc_submit_wedged_fini, guc);
+	if (err)
+		return;
+
+	mutex_lock(&guc->submission_state.lock);
+	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+		if (xe_exec_queue_get_unless_zero(q))
+			set_exec_queue_wedged(q);
+	mutex_unlock(&guc->submission_state.lock);
+}
+
 static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
 {
 	struct xe_guc_exec_queue *ge =
@@ -852,10 +898,16 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
 	struct xe_guc *guc = exec_queue_to_guc(q);
 	struct xe_device *xe = guc_to_xe(guc);
 	struct xe_gpu_scheduler *sched = &ge->sched;
+	bool wedged = xe_device_wedged(xe);
 
 	xe_assert(xe, xe_exec_queue_is_lr(q));
 	trace_xe_exec_queue_lr_cleanup(q);
 
+	if (!wedged && xe_modparam.wedged_mode == 2) {
+		guc_submit_wedged(exec_queue_to_guc(q));
+		wedged = true;
+	}
+
 	/* Kill the run_job / process_msg entry points */
 	xe_sched_submission_stop(sched);
 
@@ -870,7 +922,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
 	 * xe_guc_deregister_done_handler() which treats it as an unexpected
 	 * state.
 	 */
-	if (exec_queue_registered(q) && !exec_queue_destroyed(q)) {
+	if (!wedged && exec_queue_registered(q) && !exec_queue_destroyed(q)) {
 		struct xe_guc *guc = exec_queue_to_guc(q);
 		int ret;
 
@@ -905,6 +957,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 	struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
 	int err = -ETIME;
 	int i = 0;
+	bool wedged = xe_device_wedged(xe);
 
 	/*
 	 * TDR has fired before free job worker. Common if exec queue
@@ -928,15 +981,20 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 
 	trace_xe_sched_job_timedout(job);
 
+	if (!wedged && xe_modparam.wedged_mode == 2) {
+		guc_submit_wedged(exec_queue_to_guc(q));
+		wedged = true;
+	}
+
 	/* Kill the run_job entry point */
 	xe_sched_submission_stop(sched);
 
 	/*
 	 * Kernel jobs should never fail, nor should VM jobs if they do
 	 * somethings has gone wrong and the GT needs a reset
 	 */
-	if (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
-	    (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q))) {
+	if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
+	    (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
 		if (!xe_sched_invalidate_job(job, 2)) {
 			xe_sched_add_pending_job(sched, job);
 			xe_sched_submission_start(sched);
@@ -946,7 +1004,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 	}
 
 	/* Engine state now stable, disable scheduling if needed */
-	if (exec_queue_registered(q)) {
+	if (!wedged && exec_queue_registered(q)) {
 		struct xe_guc *guc = exec_queue_to_guc(q);
 		int ret;
 
@@ -989,6 +1047,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 	 */
 	xe_sched_add_pending_job(sched, job);
 	xe_sched_submission_start(sched);
+
 	xe_guc_exec_queue_trigger_cleanup(q);
 
 	/* Mark all outstanding jobs as bad, thus completing them */
@@ -1028,7 +1087,7 @@ static void guc_exec_queue_fini_async(struct xe_exec_queue *q)
 	INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async);
 
 	/* We must block on kernel engines so slabs are empty on driver unload */
-	if (q->flags & EXEC_QUEUE_FLAG_PERMANENT)
+	if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q))
 		__guc_exec_queue_fini_async(&q->guc->fini_async);
 	else
 		queue_work(system_wq, &q->guc->fini_async);
@@ -1063,7 +1122,7 @@ static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg)
 
 static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q)
 {
-	return !exec_queue_killed_or_banned(q) && exec_queue_registered(q);
+	return !exec_queue_killed_or_banned_or_wedged(q) && exec_queue_registered(q);
 }
 
 static void __guc_exec_queue_process_msg_set_sched_props(struct xe_sched_msg *msg)
@@ -1274,7 +1333,7 @@ static void guc_exec_queue_fini(struct xe_exec_queue *q)
 {
 	struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP;
 
-	if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT))
+	if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && !exec_queue_wedged(q))
 		guc_exec_queue_add_msg(q, msg, CLEANUP);
 	else
 		__guc_exec_queue_fini(exec_queue_to_guc(q), q);
@@ -1285,7 +1344,8 @@ static int guc_exec_queue_set_priority(struct xe_exec_queue *q,
 {
 	struct xe_sched_msg *msg;
 
-	if (q->sched_props.priority == priority || exec_queue_killed_or_banned(q))
+	if (q->sched_props.priority == priority ||
+	    exec_queue_killed_or_banned_or_wedged(q))
 		return 0;
 
 	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
@@ -1303,7 +1363,7 @@ static int guc_exec_queue_set_timeslice(struct xe_exec_queue *q, u32 timeslice_u
 	struct xe_sched_msg *msg;
 
 	if (q->sched_props.timeslice_us == timeslice_us ||
-	    exec_queue_killed_or_banned(q))
+	    exec_queue_killed_or_banned_or_wedged(q))
 		return 0;
 
 	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
@@ -1322,7 +1382,7 @@ static int guc_exec_queue_set_preempt_timeout(struct xe_exec_queue *q,
 	struct xe_sched_msg *msg;
 
 	if (q->sched_props.preempt_timeout_us == preempt_timeout_us ||
-	    exec_queue_killed_or_banned(q))
+	    exec_queue_killed_or_banned_or_wedged(q))
 		return 0;
 
 	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
@@ -1339,7 +1399,7 @@ static int guc_exec_queue_suspend(struct xe_exec_queue *q)
 {
 	struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_SUSPEND;
 
-	if (exec_queue_killed_or_banned(q) || q->guc->suspend_pending)
+	if (exec_queue_killed_or_banned_or_wedged(q) || q->guc->suspend_pending)
 		return -EINVAL;
 
 	q->guc->suspend_pending = true;
@@ -1485,7 +1545,7 @@ static void guc_exec_queue_start(struct xe_exec_queue *q)
 {
 	struct xe_gpu_scheduler *sched = &q->guc->sched;
 
-	if (!exec_queue_killed_or_banned(q)) {
+	if (!exec_queue_killed_or_banned_or_wedged(q)) {
 		int i;
 
 		trace_xe_exec_queue_resubmit(q);