Skip to content

Commit 9ccde17

Browse files
committed
Merge tag 'amd-drm-next-6.7-2023-11-03' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.7-2023-11-03:

amdgpu:
- Fix RAS support check
- RAS fixes
- MES fixes
- SMU13 fixes
- Contiguous memory allocation fix
- BACO fixes
- GPU reset fixes
- Min power limit fixes
- GFX11 fixes
- USB4/TB hotplug fixes
- ARM regression fix
- GFX9.4.3 fixes
- KASAN/KCSAN stack size check fixes
- SR-IOV fixes
- SMU14 fixes
- PSP13 fixes
- Display blend fixes
- Flexible array size fixes

amdkfd:
- GPUVM fix

radeon:
- Flexible array size fixes

Signed-off-by: Dave Airlie <[email protected]>
From: Alex Deucher <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
2 parents f056cb9 + 6d5e003 commit 9ccde17

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+853
-527
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -363,9 +363,6 @@ struct amdgpu_ip_block_version {
363363
const struct amd_ip_funcs *funcs;
364364
};
365365

366-
#define HW_REV(_Major, _Minor, _Rev) \
367-
((((uint32_t) (_Major)) << 16) | ((uint32_t) (_Minor) << 8) | ((uint32_t) (_Rev)))
368-
369366
struct amdgpu_ip_block {
370367
struct amdgpu_ip_block_status status;
371368
const struct amdgpu_ip_block_version *version;

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

Lines changed: 48 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,32 @@ static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
425425
return ret;
426426
}
427427

428+
/*
 * amdgpu_amdkfd_bo_validate_and_fence - validate a BO and attach a fence to it
 *
 * @bo: buffer object to operate on; it is reserved for the duration of the call
 * @domain: domain passed through to amdgpu_amdkfd_bo_validate()
 * @fence: fence added to the BO's reservation object
 *
 * Reserves @bo, validates it, reserves one fence slot on bo->tbo.base.resv and
 * adds @fence there with DMA_RESV_USAGE_BOOKKEEP usage. Once the reservation
 * has succeeded, @bo is unreserved again on every exit path.
 *
 * Returns 0 on success, otherwise the error returned by the first step that
 * failed (reserve, validate, or fence-slot reservation).
 */
static int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
429+
uint32_t domain,
430+
struct dma_fence *fence)
431+
{
432+
int ret = amdgpu_bo_reserve(bo, false);
433+
434+
if (ret)
435+
return ret;
436+
437+
/* NOTE(review): third argument presumably requests a wait - confirm against amdgpu_amdkfd_bo_validate() */
ret = amdgpu_amdkfd_bo_validate(bo, domain, true);
438+
if (ret)
439+
goto unreserve_out;
440+
441+
/* Reserve room for exactly one fence before adding it below. */
ret = dma_resv_reserve_fences(bo->tbo.base.resv, 1);
442+
if (ret)
443+
goto unreserve_out;
444+
445+
dma_resv_add_fence(bo->tbo.base.resv, fence,
446+
DMA_RESV_USAGE_BOOKKEEP);
447+
448+
unreserve_out:
449+
amdgpu_bo_unreserve(bo);
450+
451+
return ret;
452+
}
453+
428454
static int amdgpu_amdkfd_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)
429455
{
430456
return amdgpu_amdkfd_bo_validate(bo, bo->allowed_domains, false);
@@ -1784,6 +1810,15 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
17841810
}
17851811
bo->allowed_domains = AMDGPU_GEM_DOMAIN_GTT;
17861812
bo->preferred_domains = AMDGPU_GEM_DOMAIN_GTT;
1813+
} else {
1814+
mutex_lock(&avm->process_info->lock);
1815+
if (avm->process_info->eviction_fence &&
1816+
!dma_fence_is_signaled(&avm->process_info->eviction_fence->base))
1817+
ret = amdgpu_amdkfd_bo_validate_and_fence(bo, domain,
1818+
&avm->process_info->eviction_fence->base);
1819+
mutex_unlock(&avm->process_info->lock);
1820+
if (ret)
1821+
goto err_validate_bo;
17871822
}
17881823

17891824
if (offset)
@@ -1793,6 +1828,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
17931828

17941829
allocate_init_user_pages_failed:
17951830
err_pin_bo:
1831+
err_validate_bo:
17961832
remove_kgd_mem_from_kfd_bo_list(*mem, avm->process_info);
17971833
drm_vma_node_revoke(&gobj->vma_node, drm_priv);
17981834
err_node_allow:
@@ -1866,10 +1902,6 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
18661902
if (unlikely(ret))
18671903
return ret;
18681904

1869-
/* The eviction fence should be removed by the last unmap.
1870-
* TODO: Log an error condition if the bo still has the eviction fence
1871-
* attached
1872-
*/
18731905
amdgpu_amdkfd_remove_eviction_fence(mem->bo,
18741906
process_info->eviction_fence);
18751907
pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va,
@@ -1998,19 +2030,6 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
19982030
if (unlikely(ret))
19992031
goto out_unreserve;
20002032

2001-
if (mem->mapped_to_gpu_memory == 0 &&
2002-
!amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
2003-
/* Validate BO only once. The eviction fence gets added to BO
2004-
* the first time it is mapped. Validate will wait for all
2005-
* background evictions to complete.
2006-
*/
2007-
ret = amdgpu_amdkfd_bo_validate(bo, domain, true);
2008-
if (ret) {
2009-
pr_debug("Validate failed\n");
2010-
goto out_unreserve;
2011-
}
2012-
}
2013-
20142033
list_for_each_entry(entry, &mem->attachments, list) {
20152034
if (entry->bo_va->base.vm != avm || entry->is_mapped)
20162035
continue;
@@ -2037,10 +2056,6 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
20372056
mem->mapped_to_gpu_memory);
20382057
}
20392058

2040-
if (!amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) && !bo->tbo.pin_count)
2041-
dma_resv_add_fence(bo->tbo.base.resv,
2042-
&avm->process_info->eviction_fence->base,
2043-
DMA_RESV_USAGE_BOOKKEEP);
20442059
ret = unreserve_bo_and_vms(&ctx, false, false);
20452060

20462061
goto out;
@@ -2074,7 +2089,6 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
20742089
struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv)
20752090
{
20762091
struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);
2077-
struct amdkfd_process_info *process_info = avm->process_info;
20782092
unsigned long bo_size = mem->bo->tbo.base.size;
20792093
struct kfd_mem_attachment *entry;
20802094
struct bo_vm_reservation_context ctx;
@@ -2115,15 +2129,6 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
21152129
mem->mapped_to_gpu_memory);
21162130
}
21172131

2118-
/* If BO is unmapped from all VMs, unfence it. It can be evicted if
2119-
* required.
2120-
*/
2121-
if (mem->mapped_to_gpu_memory == 0 &&
2122-
!amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm) &&
2123-
!mem->bo->tbo.pin_count)
2124-
amdgpu_amdkfd_remove_eviction_fence(mem->bo,
2125-
process_info->eviction_fence);
2126-
21272132
unreserve_out:
21282133
unreserve_bo_and_vms(&ctx, false, false);
21292134
out:
@@ -2351,8 +2356,20 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev,
23512356
amdgpu_sync_create(&(*mem)->sync);
23522357
(*mem)->is_imported = true;
23532358

2359+
mutex_lock(&avm->process_info->lock);
2360+
if (avm->process_info->eviction_fence &&
2361+
!dma_fence_is_signaled(&avm->process_info->eviction_fence->base))
2362+
ret = amdgpu_amdkfd_bo_validate_and_fence(bo, (*mem)->domain,
2363+
&avm->process_info->eviction_fence->base);
2364+
mutex_unlock(&avm->process_info->lock);
2365+
if (ret)
2366+
goto err_remove_mem;
2367+
23542368
return 0;
23552369

2370+
err_remove_mem:
2371+
remove_kgd_mem_from_kfd_bo_list(*mem, avm->process_info);
2372+
drm_vma_node_revoke(&obj->vma_node, drm_priv);
23562373
err_free_mem:
23572374
kfree(*mem);
23582375
err_put_obj:

drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "amdgpu.h"
3030
#include "atom.h"
3131

32+
#include <linux/device.h>
3233
#include <linux/pci.h>
3334
#include <linux/slab.h>
3435
#include <linux/acpi.h>
@@ -287,6 +288,10 @@ static bool amdgpu_atrm_get_bios(struct amdgpu_device *adev)
287288
if (adev->flags & AMD_IS_APU)
288289
return false;
289290

291+
/* ATRM is for on-platform devices only */
292+
if (dev_is_removable(&adev->pdev->dev))
293+
return false;
294+
290295
while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, pdev)) != NULL) {
291296
dhandle = ACPI_HANDLE(&pdev->dev);
292297
if (!dhandle)

drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1116,6 +1116,11 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
11161116
return r;
11171117
}
11181118

1119+
/* FIXME: In theory this loop shouldn't be needed any more when
1120+
* amdgpu_vm_handle_moved handles all moved BOs that are reserved
1121+
* with p->ticket. But removing it caused test regressions, so I'm
1122+
* leaving it here for now.
1123+
*/
11191124
amdgpu_bo_list_for_each_entry(e, p->bo_list) {
11201125
bo_va = e->bo_va;
11211126
if (bo_va == NULL)
@@ -1130,7 +1135,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
11301135
return r;
11311136
}
11321137

1133-
r = amdgpu_vm_handle_moved(adev, vm);
1138+
r = amdgpu_vm_handle_moved(adev, vm, &p->exec.ticket);
11341139
if (r)
11351140
return r;
11361141

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
#include <drm/drm_fb_helper.h>
4242
#include <drm/drm_probe_helper.h>
4343
#include <drm/amdgpu_drm.h>
44+
#include <linux/device.h>
4445
#include <linux/vgaarb.h>
4546
#include <linux/vga_switcheroo.h>
4647
#include <linux/efi.h>
@@ -1073,6 +1074,8 @@ static int amdgpu_device_asic_init(struct amdgpu_device *adev)
10731074
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
10741075
amdgpu_psp_wait_for_bootloader(adev);
10751076
ret = amdgpu_atomfirmware_asic_init(adev, true);
1077+
/* TODO: check the return val and stop device initialization if boot fails */
1078+
amdgpu_psp_query_boot_status(adev);
10761079
return ret;
10771080
} else {
10781081
return amdgpu_atom_asic_init(adev->mode_info.atom_context);
@@ -2223,7 +2226,6 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
22232226
*/
22242227
static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
22252228
{
2226-
struct drm_device *dev = adev_to_drm(adev);
22272229
struct pci_dev *parent;
22282230
int i, r;
22292231
bool total;
@@ -2294,7 +2296,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
22942296
(amdgpu_is_atpx_hybrid() ||
22952297
amdgpu_has_atpx_dgpu_power_cntl()) &&
22962298
((adev->flags & AMD_IS_APU) == 0) &&
2297-
!pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2299+
!dev_is_removable(&adev->pdev->dev))
22982300
adev->flags |= AMD_IS_PX;
22992301

23002302
if (!(adev->flags & AMD_IS_APU)) {
@@ -3962,13 +3964,23 @@ int amdgpu_device_init(struct amdgpu_device *adev,
39623964
}
39633965
}
39643966
} else {
3965-
tmp = amdgpu_reset_method;
3966-
/* It should do a default reset when loading or reloading the driver,
3967-
* regardless of the module parameter reset_method.
3968-
*/
3969-
amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3970-
r = amdgpu_asic_reset(adev);
3971-
amdgpu_reset_method = tmp;
3967+
switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
3968+
case IP_VERSION(13, 0, 0):
3969+
case IP_VERSION(13, 0, 7):
3970+
case IP_VERSION(13, 0, 10):
3971+
r = psp_gpu_reset(adev);
3972+
break;
3973+
default:
3974+
tmp = amdgpu_reset_method;
3975+
/* It should do a default reset when loading or reloading the driver,
3976+
* regardless of the module parameter reset_method.
3977+
*/
3978+
amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3979+
r = amdgpu_asic_reset(adev);
3980+
amdgpu_reset_method = tmp;
3981+
break;
3982+
}
3983+
39723984
if (r) {
39733985
dev_err(adev->dev, "asic reset on init failed\n");
39743986
goto failed;
@@ -4132,7 +4144,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
41324144

41334145
px = amdgpu_device_supports_px(ddev);
41344146

4135-
if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
4147+
if (px || (!dev_is_removable(&adev->pdev->dev) &&
41364148
apple_gmux_detect(NULL, NULL)))
41374149
vga_switcheroo_register_client(adev->pdev,
41384150
&amdgpu_switcheroo_ops, px);
@@ -4282,7 +4294,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
42824294

42834295
px = amdgpu_device_supports_px(adev_to_drm(adev));
42844296

4285-
if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
4297+
if (px || (!dev_is_removable(&adev->pdev->dev) &&
42864298
apple_gmux_detect(NULL, NULL)))
42874299
vga_switcheroo_unregister_client(adev->pdev);
42884300

@@ -5566,10 +5578,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
55665578
drm_sched_start(&ring->sched, true);
55675579
}
55685580

5569-
if (adev->enable_mes &&
5570-
amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(11, 0, 3))
5571-
amdgpu_mes_self_test(tmp_adev);
5572-
55735581
if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
55745582
drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
55755583

drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@
9999
MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
100100

101101
#define mmRCC_CONFIG_MEMSIZE 0xde3
102+
#define mmMP0_SMN_C2PMSG_33 0x16061
102103
#define mmMM_INDEX 0x0
103104
#define mmMM_INDEX_HI 0x6
104105
#define mmMM_DATA 0x1
@@ -239,8 +240,26 @@ static int amdgpu_discovery_read_binary_from_sysmem(struct amdgpu_device *adev,
239240
static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev,
240241
uint8_t *binary)
241242
{
242-
uint64_t vram_size = (uint64_t)RREG32(mmRCC_CONFIG_MEMSIZE) << 20;
243-
int ret = 0;
243+
uint64_t vram_size;
244+
u32 msg;
245+
int i, ret = 0;
246+
247+
/* It can take up to a second for IFWI init to complete on some dGPUs,
248+
* but generally it should be in the 60-100ms range. Normally this starts
249+
* as soon as the device gets power so by the time the OS loads this has long
250+
* completed. However, when a card is hotplugged via e.g., USB4, we need to
251+
* wait for this to complete. Once the C2PMSG is updated, we can
252+
* continue.
253+
*/
254+
if (dev_is_removable(&adev->pdev->dev)) {
255+
for (i = 0; i < 1000; i++) {
256+
msg = RREG32(mmMP0_SMN_C2PMSG_33);
257+
if (msg & 0x80000000)
258+
break;
259+
msleep(1);
260+
}
261+
}
262+
vram_size = (uint64_t)RREG32(mmRCC_CONFIG_MEMSIZE) << 20;
244263

245264
if (vram_size) {
246265
uint64_t pos = vram_size - DISCOVERY_TMR_OFFSET;
@@ -2449,6 +2468,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
24492468
if (amdgpu_ip_version(adev, XGMI_HWIP, 0) == IP_VERSION(4, 8, 0))
24502469
adev->gmc.xgmi.supported = true;
24512470

2471+
if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3))
2472+
adev->ip_versions[XGMI_HWIP][0] = IP_VERSION(6, 4, 0);
2473+
24522474
/* set NBIO version */
24532475
switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) {
24542476
case IP_VERSION(6, 1, 0):

drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,7 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach)
409409
if (!r)
410410
r = amdgpu_vm_clear_freed(adev, vm, NULL);
411411
if (!r)
412-
r = amdgpu_vm_handle_moved(adev, vm);
412+
r = amdgpu_vm_handle_moved(adev, vm, ticket);
413413

414414
if (r && r != -EBUSY)
415415
DRM_ERROR("Failed to invalidate VM page tables (%d))\n",

drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2041,6 +2041,14 @@ static const struct pci_device_id pciidlist[] = {
20412041

20422042
MODULE_DEVICE_TABLE(pci, pciidlist);
20432043

2044+
/*
 * Table of PCI device/revision pairs whose ASIC type must be overridden.
 * amdgpu_fix_asic_type() matches entries on their .device and .revision
 * fields and applies .type to the probe flags.
 * NOTE(review): initializers are positional, presumably in the order
 * {device, revision, type} - confirm against struct amdgpu_asic_type_quirk.
 */
static const struct amdgpu_asic_type_quirk asic_type_quirks[] = {
2045+
/* differentiate between P10 and P11 asics with the same DID */
2046+
{0x67FF, 0xE3, CHIP_POLARIS10},
2047+
{0x67FF, 0xE7, CHIP_POLARIS10},
2048+
{0x67FF, 0xF3, CHIP_POLARIS10},
2049+
{0x67FF, 0xF7, CHIP_POLARIS10},
2050+
};
2051+
20442052
static const struct drm_driver amdgpu_kms_driver;
20452053

20462054
static void amdgpu_get_secondary_funcs(struct amdgpu_device *adev)
@@ -2083,6 +2091,22 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
20832091
}
20842092
}
20852093

2094+
/*
 * amdgpu_fix_asic_type - apply ASIC type quirks to the probe flags
 *
 * @pdev: PCI device being probed; its device and revision IDs are compared
 *        against the asic_type_quirks table
 * @flags: driver flags whose ASIC type bits may be replaced
 *
 * Returns @flags unchanged when no quirk entry matches. On the first match,
 * the bits covered by AMD_ASIC_MASK are cleared and the entry's type is
 * OR-ed in.
 */
static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags)
2095+
{
2096+
int i;
2097+
2098+
for (i = 0; i < ARRAY_SIZE(asic_type_quirks); i++) {
2099+
if (pdev->device == asic_type_quirks[i].device &&
2100+
pdev->revision == asic_type_quirks[i].revision) {
2101+
/* Replace only the ASIC type bits; all other flag bits are kept. */
flags &= ~AMD_ASIC_MASK;
2102+
flags |= asic_type_quirks[i].type;
2103+
/* First matching entry wins. */
break;
2104+
}
2105+
}
2106+
2107+
return flags;
2108+
}
2109+
20862110
static int amdgpu_pci_probe(struct pci_dev *pdev,
20872111
const struct pci_device_id *ent)
20882112
{
@@ -2110,15 +2134,8 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
21102134
"See modparam exp_hw_support\n");
21112135
return -ENODEV;
21122136
}
2113-
/* differentiate between P10 and P11 asics with the same DID */
2114-
if (pdev->device == 0x67FF &&
2115-
(pdev->revision == 0xE3 ||
2116-
pdev->revision == 0xE7 ||
2117-
pdev->revision == 0xF3 ||
2118-
pdev->revision == 0xF7)) {
2119-
flags &= ~AMD_ASIC_MASK;
2120-
flags |= CHIP_POLARIS10;
2121-
}
2137+
2138+
flags = amdgpu_fix_asic_type(pdev, flags);
21222139

21232140
/* Due to hardware bugs, S/G Display on raven requires a 1:1 IOMMU mapping,
21242141
* however, SME requires an indirect IOMMU mapping because the encryption

0 commit comments

Comments
 (0)