Skip to content

Commit d85f69d

Browse files
ankita-nv and awilliam
authored and committed
vfio/nvgrace-gpu: Check the HBM training and C2C link status
In contrast to Grace Hopper systems, the HBM training has been moved out of the UEFI on the Grace Blackwell systems. This reduces the system bootup time significantly. The onus of checking whether the HBM training has completed thus falls on the module. The HBM training status can be determined from a BAR0 register. Similarly, another BAR0 register exposes the status of the CPU-GPU chip-to-chip (C2C) cache coherent interconnect. Based on testing, 30s is determined to be sufficient to ensure initialization completion on all the Grace based systems. Thus poll these registers for up to 30s. If the HBM training is not complete or if the C2C link is not ready, fail the probe. While the wait is not required on Grace Hopper systems, it is beneficial to make the check to ensure the device is in an expected state. Hence the check is kept common to both generations. Ensure that the BAR0 is enabled before accessing the registers. CC: Alex Williamson <[email protected]> CC: Kevin Tian <[email protected]> CC: Jason Gunthorpe <[email protected]> Signed-off-by: Ankit Agrawal <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Alex Williamson <[email protected]>
1 parent 6a9eb2d commit d85f69d

File tree

1 file changed

+72
-0
lines changed
  • drivers/vfio/pci/nvgrace-gpu

1 file changed

+72
-0
lines changed

drivers/vfio/pci/nvgrace-gpu/main.c

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
#include <linux/sizes.h>
77
#include <linux/vfio_pci_core.h>
8+
#include <linux/delay.h>
9+
#include <linux/jiffies.h>
810

911
/*
1012
* The device memory usable to the workloads running in the VM is cached
@@ -25,6 +27,13 @@
2527

2628
#define GPU_CAP_DVSEC_REGISTER 3
2729

30+
/* Offset within BAR0 of the register polled for the C2C link status. */
#define C2C_LINK_BAR0_OFFSET 0x1498
31+
/* Offset within BAR0 of the register polled for the HBM training status. */
#define HBM_TRAINING_BAR0_OFFSET 0x200BC
32+
/* Value both status registers must read back for the device to be ready. */
#define STATUS_READY 0xFF
33+
34+
/* Sleep between two consecutive polls of the status registers. */
#define POLL_QUANTUM_MS 1000
35+
/* Upper bound on the total time spent polling before giving up. */
#define POLL_TIMEOUT_MS (30 * 1000)
36+
2837
/*
2938
* The state of the two device memory region - resmem and usemem - is
3039
* saved as struct mem_region.
@@ -861,6 +870,65 @@ static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
861870
return true;
862871
}
863872

873+
/*
874+
* To reduce the system bootup time, the HBM training has
875+
* been moved out of the UEFI on the Grace-Blackwell systems.
876+
*
877+
* The onus of checking whether the HBM training has completed
878+
* thus falls on the module. The HBM training status can be
879+
* determined from a BAR0 register.
880+
*
881+
* Similarly, another BAR0 register exposes the status of the
882+
* CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
883+
*
884+
* Poll these registers for up to 30s. If the HBM training is
885+
* not complete or if the C2C link is not ready, fail the probe.
886+
*
887+
* While the wait is not required on Grace Hopper systems, it
888+
* is beneficial to make the check to ensure the device is in an
889+
* expected state.
890+
*
891+
* Ensure that the BAR0 region is enabled before accessing the
892+
* registers.
893+
*/
894+
static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
895+
{
896+
unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
897+
void __iomem *io;
898+
int ret = -ETIME;
899+
900+
ret = pci_enable_device(pdev);
901+
if (ret)
902+
return ret;
903+
904+
ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME);
905+
if (ret)
906+
goto request_region_exit;
907+
908+
io = pci_iomap(pdev, 0, 0);
909+
if (!io) {
910+
ret = -ENOMEM;
911+
goto iomap_exit;
912+
}
913+
914+
do {
915+
if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
916+
(ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
917+
ret = 0;
918+
goto reg_check_exit;
919+
}
920+
msleep(POLL_QUANTUM_MS);
921+
} while (!time_after(jiffies, timeout));
922+
923+
reg_check_exit:
924+
pci_iounmap(pdev, io);
925+
iomap_exit:
926+
pci_release_selected_regions(pdev, 1 << 0);
927+
request_region_exit:
928+
pci_disable_device(pdev);
929+
return ret;
930+
}
931+
864932
static int nvgrace_gpu_probe(struct pci_dev *pdev,
865933
const struct pci_device_id *id)
866934
{
@@ -869,6 +937,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
869937
u64 memphys, memlength;
870938
int ret;
871939

940+
ret = nvgrace_gpu_wait_device_ready(pdev);
941+
if (ret)
942+
return ret;
943+
872944
ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
873945
if (!ret)
874946
ops = &nvgrace_gpu_pci_ops;

0 commit comments

Comments
 (0)