Skip to content

Commit 6d7a20c

Browse files
committed
drm/etnaviv: replace hangcheck with scheduler timeout
This replaces the etnaviv internal hangcheck logic with the job timeout handling provided by the DRM scheduler. This simplifies the driver further and allows jobs to be replayed after a GPU reset, so only minimal state is lost. This introduces a user-visible change: jobs are no longer allowed to run indefinitely as long as they make progress, as this causes quality-of-service issues when multiple processes are using the GPU. Userspace is now responsible for flushing jobs in a way that they finish in a reasonable time, where reasonable is currently defined as less than 500ms. Signed-off-by: Lucas Stach <[email protected]>
1 parent e058025 commit 6d7a20c

File tree

5 files changed

+51
-114
lines changed

5 files changed

+51
-114
lines changed

drivers/gpu/drm/etnaviv/etnaviv_dump.c

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,13 @@
2020
#include "etnaviv_gem.h"
2121
#include "etnaviv_gpu.h"
2222
#include "etnaviv_mmu.h"
23+
#include "etnaviv_sched.h"
2324
#include "state.xml.h"
2425
#include "state_hi.xml.h"
2526

27+
static bool etnaviv_dump_core = true;
28+
module_param_named(dump_core, etnaviv_dump_core, bool, 0600);
29+
2630
struct core_dump_iterator {
2731
void *start;
2832
struct etnaviv_dump_object_header *hdr;
@@ -121,10 +125,16 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
121125
struct etnaviv_vram_mapping *vram;
122126
struct etnaviv_gem_object *obj;
123127
struct etnaviv_gem_submit *submit;
128+
struct drm_sched_job *s_job;
124129
unsigned int n_obj, n_bomap_pages;
125130
size_t file_size, mmu_size;
126131
__le64 *bomap, *bomap_start;
127132

133+
/* Only catch the first event, or when manually re-armed */
134+
if (!etnaviv_dump_core)
135+
return;
136+
etnaviv_dump_core = false;
137+
128138
mmu_size = etnaviv_iommu_dump_size(gpu->mmu);
129139

130140
/* We always dump registers, mmu, ring and end marker */
@@ -135,10 +145,13 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
135145
mmu_size + gpu->buffer.size;
136146

137147
/* Add in the active command buffers */
138-
list_for_each_entry(submit, &gpu->active_submit_list, node) {
148+
spin_lock(&gpu->sched.job_list_lock);
149+
list_for_each_entry(s_job, &gpu->sched.ring_mirror_list, node) {
150+
submit = to_etnaviv_submit(s_job);
139151
file_size += submit->cmdbuf.size;
140152
n_obj++;
141153
}
154+
spin_unlock(&gpu->sched.job_list_lock);
142155

143156
/* Add in the active buffer objects */
144157
list_for_each_entry(vram, &gpu->mmu->mappings, mmu_node) {
@@ -180,10 +193,14 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
180193
gpu->buffer.size,
181194
etnaviv_cmdbuf_get_va(&gpu->buffer));
182195

183-
list_for_each_entry(submit, &gpu->active_submit_list, node)
196+
spin_lock(&gpu->sched.job_list_lock);
197+
list_for_each_entry(s_job, &gpu->sched.ring_mirror_list, node) {
198+
submit = to_etnaviv_submit(s_job);
184199
etnaviv_core_dump_mem(&iter, ETDUMP_BUF_CMD,
185200
submit->cmdbuf.vaddr, submit->cmdbuf.size,
186201
etnaviv_cmdbuf_get_va(&submit->cmdbuf));
202+
}
203+
spin_unlock(&gpu->sched.job_list_lock);
187204

188205
/* Reserve space for the bomap */
189206
if (n_bomap_pages) {

drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -542,7 +542,6 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data,
542542
goto err_submit_objects;
543543

544544
memcpy(submit->cmdbuf.vaddr, stream, args->stream_size);
545-
submit->cmdbuf.user_size = ALIGN(args->stream_size, 8);
546545

547546
ret = submit_lock_objects(submit, &ticket);
548547
if (ret)

drivers/gpu/drm/etnaviv/etnaviv_gpu.c

Lines changed: 10 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,6 @@ static const struct platform_device_id gpu_ids[] = {
4141
{ },
4242
};
4343

44-
static bool etnaviv_dump_core = true;
45-
module_param_named(dump_core, etnaviv_dump_core, bool, 0600);
46-
4744
/*
4845
* Driver functions:
4946
*/
@@ -919,38 +916,24 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)
919916
}
920917
#endif
921918

922-
/*
923-
* Hangcheck detection for locked gpu:
924-
*/
925-
static void recover_worker(struct work_struct *work)
919+
void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu)
926920
{
927-
struct etnaviv_gpu *gpu = container_of(work, struct etnaviv_gpu,
928-
recover_work);
929921
unsigned long flags;
930922
unsigned int i = 0;
931923

932-
dev_err(gpu->dev, "hangcheck recover!\n");
924+
dev_err(gpu->dev, "recover hung GPU!\n");
933925

934926
if (pm_runtime_get_sync(gpu->dev) < 0)
935927
return;
936928

937929
mutex_lock(&gpu->lock);
938930

939-
/* Only catch the first event, or when manually re-armed */
940-
if (etnaviv_dump_core) {
941-
etnaviv_core_dump(gpu);
942-
etnaviv_dump_core = false;
943-
}
944-
945931
etnaviv_hw_reset(gpu);
946932

947933
/* complete all events, the GPU won't do it after the reset */
948934
spin_lock_irqsave(&gpu->event_spinlock, flags);
949-
for_each_set_bit_from(i, gpu->event_bitmap, ETNA_NR_EVENTS) {
950-
dma_fence_signal(gpu->event[i].fence);
951-
gpu->event[i].fence = NULL;
935+
for_each_set_bit_from(i, gpu->event_bitmap, ETNA_NR_EVENTS)
952936
complete(&gpu->event_free);
953-
}
954937
bitmap_zero(gpu->event_bitmap, ETNA_NR_EVENTS);
955938
spin_unlock_irqrestore(&gpu->event_spinlock, flags);
956939
gpu->completed_fence = gpu->active_fence;
@@ -964,53 +947,6 @@ static void recover_worker(struct work_struct *work)
964947
pm_runtime_put_autosuspend(gpu->dev);
965948
}
966949

967-
static void hangcheck_timer_reset(struct etnaviv_gpu *gpu)
968-
{
969-
DBG("%s", dev_name(gpu->dev));
970-
mod_timer(&gpu->hangcheck_timer,
971-
round_jiffies_up(jiffies + DRM_ETNAVIV_HANGCHECK_JIFFIES));
972-
}
973-
974-
static void hangcheck_handler(struct timer_list *t)
975-
{
976-
struct etnaviv_gpu *gpu = from_timer(gpu, t, hangcheck_timer);
977-
u32 fence = gpu->completed_fence;
978-
bool progress = false;
979-
980-
if (fence != gpu->hangcheck_fence) {
981-
gpu->hangcheck_fence = fence;
982-
progress = true;
983-
}
984-
985-
if (!progress) {
986-
u32 dma_addr = gpu_read(gpu, VIVS_FE_DMA_ADDRESS);
987-
int change = dma_addr - gpu->hangcheck_dma_addr;
988-
989-
if (change < 0 || change > 16) {
990-
gpu->hangcheck_dma_addr = dma_addr;
991-
progress = true;
992-
}
993-
}
994-
995-
if (!progress && fence_after(gpu->active_fence, fence)) {
996-
dev_err(gpu->dev, "hangcheck detected gpu lockup!\n");
997-
dev_err(gpu->dev, " completed fence: %u\n", fence);
998-
dev_err(gpu->dev, " active fence: %u\n",
999-
gpu->active_fence);
1000-
queue_work(gpu->wq, &gpu->recover_work);
1001-
}
1002-
1003-
/* if still more pending work, reset the hangcheck timer: */
1004-
if (fence_after(gpu->active_fence, gpu->hangcheck_fence))
1005-
hangcheck_timer_reset(gpu);
1006-
}
1007-
1008-
static void hangcheck_disable(struct etnaviv_gpu *gpu)
1009-
{
1010-
del_timer_sync(&gpu->hangcheck_timer);
1011-
cancel_work_sync(&gpu->recover_work);
1012-
}
1013-
1014950
/* fence object management */
1015951
struct etnaviv_fence {
1016952
struct etnaviv_gpu *gpu;
@@ -1286,10 +1222,12 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)
12861222
unsigned int i, nr_events = 1, event[3];
12871223
int ret;
12881224

1289-
ret = pm_runtime_get_sync(gpu->dev);
1290-
if (ret < 0)
1291-
return NULL;
1292-
submit->runtime_resumed = true;
1225+
if (!submit->runtime_resumed) {
1226+
ret = pm_runtime_get_sync(gpu->dev);
1227+
if (ret < 0)
1228+
return NULL;
1229+
submit->runtime_resumed = true;
1230+
}
12931231

12941232
/*
12951233
* if there are performance monitor requests we need to have
@@ -1327,6 +1265,7 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)
13271265
}
13281266

13291267
gpu->event[event[0]].fence = gpu_fence;
1268+
submit->cmdbuf.user_size = submit->cmdbuf.size - 8;
13301269
etnaviv_buffer_queue(gpu, submit->exec_state, event[0],
13311270
&submit->cmdbuf);
13321271

@@ -1337,8 +1276,6 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)
13371276
etnaviv_sync_point_queue(gpu, event[2]);
13381277
}
13391278

1340-
hangcheck_timer_reset(gpu);
1341-
13421279
out_unlock:
13431280
mutex_unlock(&gpu->lock);
13441281

@@ -1626,13 +1563,9 @@ static int etnaviv_gpu_bind(struct device *dev, struct device *master,
16261563
idr_init(&gpu->fence_idr);
16271564
spin_lock_init(&gpu->fence_spinlock);
16281565

1629-
INIT_LIST_HEAD(&gpu->active_submit_list);
16301566
INIT_WORK(&gpu->sync_point_work, sync_point_worker);
1631-
INIT_WORK(&gpu->recover_work, recover_worker);
16321567
init_waitqueue_head(&gpu->fence_event);
16331568

1634-
timer_setup(&gpu->hangcheck_timer, hangcheck_handler, TIMER_DEFERRABLE);
1635-
16361569
priv->gpu[priv->num_gpus++] = gpu;
16371570

16381571
pm_runtime_mark_last_busy(gpu->dev);
@@ -1660,8 +1593,6 @@ static void etnaviv_gpu_unbind(struct device *dev, struct device *master,
16601593

16611594
DBG("%s", dev_name(gpu->dev));
16621595

1663-
hangcheck_disable(gpu);
1664-
16651596
flush_workqueue(gpu->wq);
16661597
destroy_workqueue(gpu->wq);
16671598

drivers/gpu/drm/etnaviv/etnaviv_gpu.h

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,6 @@ struct etnaviv_gpu {
123123
struct completion event_free;
124124
spinlock_t event_spinlock;
125125

126-
/* list of currently in-flight command buffers */
127-
struct list_head active_submit_list;
128-
129126
u32 idle_mask;
130127

131128
/* Fencing support */
@@ -153,13 +150,6 @@ struct etnaviv_gpu {
153150
struct clk *clk_core;
154151
struct clk *clk_shader;
155152

156-
/* Hang Detction: */
157-
#define DRM_ETNAVIV_HANGCHECK_PERIOD 500 /* in ms */
158-
#define DRM_ETNAVIV_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_ETNAVIV_HANGCHECK_PERIOD)
159-
struct timer_list hangcheck_timer;
160-
u32 hangcheck_fence;
161-
u32 hangcheck_dma_addr;
162-
struct work_struct recover_work;
163153
unsigned int freq_scale;
164154
unsigned long base_rate_core;
165155
unsigned long base_rate_shader;
@@ -188,6 +178,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu);
188178
int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m);
189179
#endif
190180

181+
void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu);
191182
void etnaviv_gpu_retire(struct etnaviv_gpu *gpu);
192183
int etnaviv_gpu_wait_fence_interruptible(struct etnaviv_gpu *gpu,
193184
u32 fence, struct timespec *timeout);

drivers/gpu/drm/etnaviv/etnaviv_sched.c

Lines changed: 21 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -14,24 +14,19 @@
1414
* this program. If not, see <http://www.gnu.org/licenses/>.
1515
*/
1616

17-
#include <drm/gpu_scheduler.h>
1817
#include <linux/kthread.h>
1918

2019
#include "etnaviv_drv.h"
20+
#include "etnaviv_dump.h"
2121
#include "etnaviv_gem.h"
2222
#include "etnaviv_gpu.h"
23+
#include "etnaviv_sched.h"
2324

2425
static int etnaviv_job_hang_limit = 0;
2526
module_param_named(job_hang_limit, etnaviv_job_hang_limit, int , 0444);
2627
static int etnaviv_hw_jobs_limit = 2;
2728
module_param_named(hw_job_limit, etnaviv_hw_jobs_limit, int , 0444);
2829

29-
static inline
30-
struct etnaviv_gem_submit *to_etnaviv_submit(struct drm_sched_job *sched_job)
31-
{
32-
return container_of(sched_job, struct etnaviv_gem_submit, sched_job);
33-
}
34-
3530
struct dma_fence *etnaviv_sched_dependency(struct drm_sched_job *sched_job,
3631
struct drm_sched_entity *entity)
3732
{
@@ -86,34 +81,38 @@ struct dma_fence *etnaviv_sched_dependency(struct drm_sched_job *sched_job,
8681
struct dma_fence *etnaviv_sched_run_job(struct drm_sched_job *sched_job)
8782
{
8883
struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
89-
struct dma_fence *fence;
90-
91-
mutex_lock(&submit->gpu->lock);
92-
list_add_tail(&submit->node, &submit->gpu->active_submit_list);
93-
mutex_unlock(&submit->gpu->lock);
84+
struct dma_fence *fence = NULL;
9485

95-
fence = etnaviv_gpu_submit(submit);
96-
if (!fence) {
97-
etnaviv_submit_put(submit);
98-
return NULL;
99-
}
86+
if (likely(!sched_job->s_fence->finished.error))
87+
fence = etnaviv_gpu_submit(submit);
88+
else
89+
dev_dbg(submit->gpu->dev, "skipping bad job\n");
10090

10191
return fence;
10292
}
10393

10494
static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)
10595
{
106-
/* this replaces the hangcheck */
96+
struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
97+
struct etnaviv_gpu *gpu = submit->gpu;
98+
99+
/* block scheduler */
100+
kthread_park(gpu->sched.thread);
101+
drm_sched_hw_job_reset(&gpu->sched, sched_job);
102+
103+
/* get the GPU back into the init state */
104+
etnaviv_core_dump(gpu);
105+
etnaviv_gpu_recover_hang(gpu);
106+
107+
/* restart scheduler after GPU is usable again */
108+
drm_sched_job_recovery(&gpu->sched);
109+
kthread_unpark(gpu->sched.thread);
107110
}
108111

109112
static void etnaviv_sched_free_job(struct drm_sched_job *sched_job)
110113
{
111114
struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
112115

113-
mutex_lock(&submit->gpu->lock);
114-
list_del(&submit->node);
115-
mutex_unlock(&submit->gpu->lock);
116-
117116
etnaviv_submit_put(submit);
118117
}
119118

0 commit comments

Comments
 (0)