Skip to content

Commit 728e7e0

Browse files
Jiange Zhaoalexdeucher
authored andcommitted
drm/amdgpu: Add autodump debugfs node for gpu reset v8
When a GPU job times out, amdgpu notifies an interested party so that it has an opportunity to dump info before the actual GPU reset. A usermode app opens the 'autodump' node under the debugfs system and poll()s it for readable/writable. When a GPU reset is due, amdgpu notifies the usermode app through a wait_queue_head and gives it 10 minutes to dump info. After the usermode app has done its work, it closes the 'autodump' node. On node closure, amdgpu learns that the dump is done through the completion that is triggered in release(). There is no write or read callback because the necessary info can be obtained through dmesg and umr; messages back and forth between the usermode app and amdgpu are unnecessary. v2: (1) changed 'registered' to 'app_listening'; (2) added a mutex in open() to prevent a race condition. v3 (chk): grab the reset lock to avoid a race in autodump_open, rename the debugfs file to amdgpu_autodump, provide autodump_read as well, style and code cleanups. v4: add 'bool app_listening' to differentiate situations, so that the node can be reopened; also, there is no need to wait for completion when no app is waiting for a dump. v5: change 'bool app_listening' to 'enum amdgpu_autodump_state'; add 'app_state_mutex' for race conditions: (1) only one user can open this file node; (2) wait_dump() can only take effect after poll() has executed; (3) eliminated the race condition between release() and wait_dump(). v6: removed 'enum amdgpu_autodump_state' and 'app_state_mutex'; removed state checking in amdgpu_debugfs_wait_dump; improved on top of version 3 so that the node can be reopened. v7: move reinit_completion into open() so that only one user can open it. v8: remove complete_all() from amdgpu_debugfs_wait_dump(). Signed-off-by: Jiange Zhao <[email protected]> Reviewed-by: Christian König <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent b7f0656 commit 728e7e0

File tree

4 files changed

+87
-1
lines changed

4 files changed

+87
-1
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -989,6 +989,8 @@ struct amdgpu_device {
989989
char product_number[16];
990990
char product_name[32];
991991
char serial[16];
992+
993+
struct amdgpu_autodump autodump;
992994
};
993995

994996
static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)

drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
#include <linux/pci.h>
2828
#include <linux/uaccess.h>
2929
#include <linux/pm_runtime.h>
30-
30+
#include <linux/poll.h>
3131
#include <drm/drm_debugfs.h>
3232

3333
#include "amdgpu.h"
@@ -74,8 +74,82 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
7474
return 0;
7575
}
7676

77+
/**
 * amdgpu_debugfs_wait_dump - let a listening usermode app dump before reset
 * @adev: amdgpu device whose autodump state is used
 *
 * Wakes every sleeper on autodump.gpu_hang and then waits on the
 * autodump.dumping completion for at most 600 seconds. The completion is
 * left in the completed state by amdgpu_debugfs_autodump_init() and by
 * release() of the debugfs node, and is only re-armed by a successful open(),
 * so the wait returns immediately when nobody holds the node open.
 *
 * When CONFIG_DEBUG_FS is not set the function does nothing.
 *
 * Return: -ETIMEDOUT if the wait timed out; 0 in every other case. Note that
 * a negative result from the interruptible wait is not propagated and also
 * yields 0.
 */
int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
{
#if defined(CONFIG_DEBUG_FS)
	unsigned long timeout = 600 * HZ;
	/* wait_for_completion_interruptible_timeout() returns long, not int */
	long ret;

	wake_up_interruptible(&adev->autodump.gpu_hang);

	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
	if (ret == 0) {
		pr_err("autodump: timeout, move on to gpu recovery\n");
		return -ETIMEDOUT;
	}
#endif
	return 0;
}
93+
7794
#if defined(CONFIG_DEBUG_FS)
7895

96+
static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
97+
{
98+
struct amdgpu_device *adev = inode->i_private;
99+
int ret;
100+
101+
file->private_data = adev;
102+
103+
mutex_lock(&adev->lock_reset);
104+
if (adev->autodump.dumping.done) {
105+
reinit_completion(&adev->autodump.dumping);
106+
ret = 0;
107+
} else {
108+
ret = -EBUSY;
109+
}
110+
mutex_unlock(&adev->lock_reset);
111+
112+
return ret;
113+
}
114+
115+
static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
116+
{
117+
struct amdgpu_device *adev = file->private_data;
118+
119+
complete_all(&adev->autodump.dumping);
120+
return 0;
121+
}
122+
123+
static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
124+
{
125+
struct amdgpu_device *adev = file->private_data;
126+
127+
poll_wait(file, &adev->autodump.gpu_hang, poll_table);
128+
129+
if (adev->in_gpu_reset)
130+
return POLLIN | POLLRDNORM | POLLWRNORM;
131+
132+
return 0;
133+
}
134+
135+
static const struct file_operations autodump_debug_fops = {
136+
.owner = THIS_MODULE,
137+
.open = amdgpu_debugfs_autodump_open,
138+
.poll = amdgpu_debugfs_autodump_poll,
139+
.release = amdgpu_debugfs_autodump_release,
140+
};
141+
142+
static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
143+
{
144+
init_completion(&adev->autodump.dumping);
145+
complete_all(&adev->autodump.dumping);
146+
init_waitqueue_head(&adev->autodump.gpu_hang);
147+
148+
debugfs_create_file("amdgpu_autodump", 0600,
149+
adev->ddev->primary->debugfs_root,
150+
adev, &autodump_debug_fops);
151+
}
152+
79153
/**
80154
* amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
81155
*
@@ -1434,6 +1508,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
14341508

14351509
amdgpu_ras_debugfs_create_all(adev);
14361510

1511+
amdgpu_debugfs_autodump_init(adev);
1512+
14371513
return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
14381514
ARRAY_SIZE(amdgpu_debugfs_list));
14391515
}

drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ struct amdgpu_debugfs {
3131
unsigned num_files;
3232
};
3333

34+
struct amdgpu_autodump {
35+
struct completion dumping;
36+
struct wait_queue_head gpu_hang;
37+
};
38+
3439
int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
3540
int amdgpu_debugfs_init(struct amdgpu_device *adev);
3641
void amdgpu_debugfs_fini(struct amdgpu_device *adev);
@@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
4045
int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
4146
int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
4247
int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
48+
int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
39273927
int i, r = 0;
39283928
bool need_full_reset = *need_full_reset_arg;
39293929

3930+
amdgpu_debugfs_wait_dump(adev);
3931+
39303932
/* block all schedulers and reset given job's ring */
39313933
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
39323934
struct amdgpu_ring *ring = adev->rings[i];

0 commit comments

Comments
 (0)