Skip to content

Commit 5edfd7d

Browse files
committed
Merge tag 'amd-drm-next-6.8-2023-12-01' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.8-2023-12-01: amdgpu: - Add new 64 bit sequence number infrastructure. This will ultimately be used for user queue synchronization. - GPUVM updates - Misc code cleanups - RAS updates - DCN 3.5 updates - Rework PCIe link speed handling - Document GPU reset types - DMUB fixes - eDP fixes - NBIO 7.9 updates - NBIO 7.11 updates - SubVP updates - DCN 3.1.4 fixes - ABM fixes - AGP aperture fix - DCN 3.1.5 fix - Fix some potential error path memory leaks - Enable PCIe PMEs - Add XGMI, PCIe state dumping for aqua vanjaram - GFX11 golden register updates - Misc display fixes amdkfd: - Migrate TLB flushing logic to amdgpu - Trap handler fixes - Fix restore workers handling on suspend and reset - Fix possible memory leak in pqm_uninit() radeon: - Fix some possible overflows in command buffer checking - Check for errors in ring_lock From: Alex Deucher <[email protected]> Link: https://patchwork.freedesktop.org/patch/msgid/[email protected] Signed-off-by: Dave Airlie <[email protected]>
2 parents a13fee3 + b719a9c commit 5edfd7d

File tree

222 files changed

+3434
-1524
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

222 files changed

+3434
-1524
lines changed

Documentation/gpu/amdgpu/display/dc-debug.rst

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,3 +75,44 @@ change in real-time by using something like::
7575

7676
When reporting a bug related to DC, consider attaching this log before and
7777
after you reproduce the bug.
78+
79+
DMUB Firmware Debug
80+
===================
81+
82+
Sometimes, dmesg logs aren't enough. This is especially true if a feature is
83+
implemented primarily in DMUB firmware. In such cases, all we see in dmesg when
84+
an issue arises is some generic timeout error. So, to get more relevant
85+
information, we can trace DMUB commands by enabling the relevant bits in
86+
`amdgpu_dm_dmub_trace_mask`.
87+
88+
Currently, we support the tracing of the following groups:
89+
90+
Trace Groups
91+
------------
92+
93+
.. csv-table::
94+
:header-rows: 1
95+
:widths: 1, 1
96+
:file: ./trace-groups-table.csv
97+
98+
**Note: Not all ASICs support all of the listed trace groups**
99+
100+
So, to enable just PSR tracing (the PSR and PSR STATE groups, 0x20 | 0x8000), you can use the following command::
101+
102+
# echo 0x8020 > /sys/kernel/debug/dri/0/amdgpu_dm_dmub_trace_mask
103+
104+
Then, you need to enable logging trace events to the buffer, which you can do
105+
using the following::
106+
107+
# echo 1 > /sys/kernel/debug/dri/0/amdgpu_dm_dmcub_trace_event_en
108+
109+
Lastly, after you are able to reproduce the issue you are trying to debug,
110+
you can disable tracing and read the trace log by using the following::
111+
112+
# echo 0 > /sys/kernel/debug/dri/0/amdgpu_dm_dmcub_trace_event_en
113+
# cat /sys/kernel/debug/dri/0/amdgpu_dm_dmub_tracebuffer
114+
115+
So, when reporting bugs related to features such as PSR and ABM, consider
116+
enabling the relevant bits in the mask before reproducing the issue and
117+
attaching the log that you obtain from the trace buffer to any bug reports that you
118+
create.
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
Name, Mask Value
2+
INFO, 0x1
3+
IRQ SVC, 0x2
4+
VBIOS, 0x4
5+
REGISTER, 0x8
6+
PHY DBG, 0x10
7+
PSR, 0x20
8+
AUX, 0x40
9+
SMU, 0x80
10+
MALL, 0x100
11+
ABM, 0x200
12+
ALPM, 0x400
13+
TIMER, 0x800
14+
HW LOCK MGR, 0x1000
15+
INBOX1, 0x2000
16+
PHY SEQ, 0x4000
17+
PSR STATE, 0x8000
18+
ZSTATE, 0x10000
19+
TRANSMITTER CTL, 0x20000
20+
PANEL CNTL, 0x40000
21+
FAMS, 0x80000
22+
DPIA, 0x100000
23+
SUBVP, 0x200000
24+
INBOX0, 0x400000
25+
SDP, 0x4000000
26+
REPLAY, 0x8000000
27+
REPLAY RESIDENCY, 0x20000000
28+
CURSOR INFO, 0x80000000
29+
IPS, 0x100000000

drivers/gpu/drm/amd/amdgpu/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
8080
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
8181
amdgpu_fw_attestation.o amdgpu_securedisplay.o \
8282
amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
83-
amdgpu_ring_mux.o amdgpu_xcp.o
83+
amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o
8484

8585
amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
8686

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,8 @@
109109
#include "amdgpu_mca.h"
110110
#include "amdgpu_ras.h"
111111
#include "amdgpu_xcp.h"
112+
#include "amdgpu_seq64.h"
113+
#include "amdgpu_reg_state.h"
112114

113115
#define MAX_GPU_INSTANCE 64
114116

@@ -468,6 +470,7 @@ struct amdgpu_fpriv {
468470
struct amdgpu_vm vm;
469471
struct amdgpu_bo_va *prt_va;
470472
struct amdgpu_bo_va *csa_va;
473+
struct amdgpu_bo_va *seq64_va;
471474
struct mutex bo_list_lock;
472475
struct idr bo_list_handles;
473476
struct amdgpu_ctx_mgr ctx_mgr;
@@ -506,6 +509,31 @@ struct amdgpu_allowed_register_entry {
506509
bool grbm_indexed;
507510
};
508511

512+
/**
513+
* enum amd_reset_method - Methods for resetting AMD GPU devices
514+
*
515+
* @AMD_RESET_METHOD_NONE: The device will not be reset.
516+
* @AMD_RESET_METHOD_LEGACY: Method reserved for SI, CIK and VI ASICs.
517+
* @AMD_RESET_MODE0: Reset the entire ASIC. Not currently available for
518+
* any device.
519+
* @AMD_RESET_MODE1: Resets all IP blocks on the ASIC (SDMA, GFX, VCN, etc.)
520+
* individually. Suitable only for some discrete GPUs, not
521+
* available for all ASICs.
522+
* @AMD_RESET_MODE2: Resets a lesser level of IPs compared to MODE1. Which IPs
523+
* are reset depends on the ASIC. Notably doesn't reset IPs
524+
* shared with the CPU on APUs or the memory controllers (so
525+
* VRAM is not lost). Not available on all ASICs.
526+
* @AMD_RESET_BACO: BACO (Bus Alive, Chip Off) method powers off and on the card
527+
* but without powering off the PCI bus. Suitable only for
528+
* discrete GPUs.
529+
* @AMD_RESET_PCI: Does a full bus reset using core Linux subsystem PCI reset
530+
* and does a secondary bus reset or FLR, depending on what the
531+
* underlying hardware supports.
532+
*
533+
* Methods available for AMD GPU driver for resetting the device. Not all
534+
* methods are suitable for every device. User can override the method using
535+
* module parameter `reset_method`.
536+
*/
509537
enum amd_reset_method {
510538
AMD_RESET_METHOD_NONE = -1,
511539
AMD_RESET_METHOD_LEGACY = 0,
@@ -585,6 +613,10 @@ struct amdgpu_asic_funcs {
585613
const struct amdgpu_video_codecs **codecs);
586614
/* encode "> 32bits" smn addressing */
587615
u64 (*encode_ext_smn_addressing)(int ext_id);
616+
617+
ssize_t (*get_reg_state)(struct amdgpu_device *adev,
618+
enum amdgpu_reg_state reg_state, void *buf,
619+
size_t max_size);
588620
};
589621

590622
/*
@@ -986,6 +1018,9 @@ struct amdgpu_device {
9861018
/* GDS */
9871019
struct amdgpu_gds gds;
9881020

1021+
/* for userq and VM fences */
1022+
struct amdgpu_seq64 seq64;
1023+
9891024
/* KFD */
9901025
struct amdgpu_kfd_dev kfd;
9911026

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

Lines changed: 1 addition & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -547,7 +547,7 @@ int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct amdgpu_device *dst,
547547
struct amdgpu_device *adev = dst, *peer_adev;
548548
int num_links;
549549

550-
if (adev->asic_type != CHIP_ALDEBARAN)
550+
if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 4, 2))
551551
return 0;
552552

553553
if (src)
@@ -710,35 +710,6 @@ bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid)
710710
return false;
711711
}
712712

713-
int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct amdgpu_device *adev,
714-
uint16_t vmid)
715-
{
716-
if (adev->family == AMDGPU_FAMILY_AI) {
717-
int i;
718-
719-
for_each_set_bit(i, adev->vmhubs_mask, AMDGPU_MAX_VMHUBS)
720-
amdgpu_gmc_flush_gpu_tlb(adev, vmid, i, 0);
721-
} else {
722-
amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB(0), 0);
723-
}
724-
725-
return 0;
726-
}
727-
728-
int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
729-
uint16_t pasid,
730-
enum TLB_FLUSH_TYPE flush_type,
731-
uint32_t inst)
732-
{
733-
bool all_hub = false;
734-
735-
if (adev->family == AMDGPU_FAMILY_AI ||
736-
adev->family == AMDGPU_FAMILY_RV)
737-
all_hub = true;
738-
739-
return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub, inst);
740-
}
741-
742713
bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
743714
{
744715
return adev->have_atomics_support;

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -162,11 +162,6 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
162162
uint32_t *ib_cmd, uint32_t ib_len);
163163
void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, bool idle);
164164
bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev);
165-
int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct amdgpu_device *adev,
166-
uint16_t vmid);
167-
int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
168-
uint16_t pasid, enum TLB_FLUSH_TYPE flush_type,
169-
uint32_t inst);
170165

171166
bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);
172167

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ int kgd_arcturus_hqd_sdma_dump(struct amdgpu_device *adev,
200200
#undef HQD_N_REGS
201201
#define HQD_N_REGS (19+6+7+10)
202202

203-
*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
203+
*dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL);
204204
if (*dump == NULL)
205205
return -ENOMEM;
206206

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ static int kgd_gfx_v9_4_3_hqd_sdma_dump(struct amdgpu_device *adev,
141141
(*dump)[i++][1] = RREG32(addr); \
142142
} while (0)
143143

144-
*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
144+
*dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL);
145145
if (*dump == NULL)
146146
return -ENOMEM;
147147

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ static int kgd_hqd_dump(struct amdgpu_device *adev,
214214
(*dump)[i++][1] = RREG32(addr); \
215215
} while (0)
216216

217-
*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
217+
*dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL);
218218
if (*dump == NULL)
219219
return -ENOMEM;
220220

@@ -301,7 +301,7 @@ static int kgd_hqd_sdma_dump(struct amdgpu_device *adev,
301301
#undef HQD_N_REGS
302302
#define HQD_N_REGS (19+4)
303303

304-
*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
304+
*dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL);
305305
if (*dump == NULL)
306306
return -ENOMEM;
307307

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ static int kgd_hqd_dump(struct amdgpu_device *adev,
238238
(*dump)[i++][1] = RREG32(addr); \
239239
} while (0)
240240

241-
*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
241+
*dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL);
242242
if (*dump == NULL)
243243
return -ENOMEM;
244244

@@ -324,7 +324,7 @@ static int kgd_hqd_sdma_dump(struct amdgpu_device *adev,
324324
#undef HQD_N_REGS
325325
#define HQD_N_REGS (19+4+2+3+7)
326326

327-
*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
327+
*dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL);
328328
if (*dump == NULL)
329329
return -ENOMEM;
330330

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,7 @@ int kgd_gfx_v9_hqd_dump(struct amdgpu_device *adev,
363363
(*dump)[i++][1] = RREG32(addr); \
364364
} while (0)
365365

366-
*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
366+
*dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL);
367367
if (*dump == NULL)
368368
return -ENOMEM;
369369

@@ -460,7 +460,7 @@ static int kgd_hqd_sdma_dump(struct amdgpu_device *adev,
460460
#undef HQD_N_REGS
461461
#define HQD_N_REGS (19+6+7+10)
462462

463-
*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
463+
*dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL);
464464
if (*dump == NULL)
465465
return -ENOMEM;
466466

0 commit comments

Comments
 (0)