Skip to content

Commit 3673f5b

Browse files
committed
Merge tag 'vfio-v6.14-rc1' of https://github.com/awilliam/linux-vfio
Pull vfio updates from Alex Williamson: - Extend vfio-pci 8-byte read/write support to include archs defining CONFIG_GENERIC_IOMAP, such as x86, and remove now extraneous #ifdefs around 64-bit accessors (Ramesh Thomas) - Update vfio-pci shadow ROM handling and allow cached ROM from setup data to be exposed as a functional ROM BAR region when available (Yunxiang Li) - Update nvgrace-gpu vfio-pci variant driver for new Grace Blackwell hardware, conditionalizing the uncached BAR workaround for previous generation hardware based on the presence of a flag in a new DVSEC capability, and include a delay during probe for link training to complete, a new requirement for GB devices (Ankit Agrawal) * tag 'vfio-v6.14-rc1' of https://github.com/awilliam/linux-vfio: vfio/nvgrace-gpu: Add GB200 SKU to the devid table vfio/nvgrace-gpu: Check the HBM training and C2C link status vfio/nvgrace-gpu: Expose the blackwell device PF BAR1 to the VM vfio/nvgrace-gpu: Read dvsec register to determine need for uncached resmem vfio/platform: check the bounds of read/write syscalls vfio/pci: Expose setup ROM at ROM bar when needed vfio/pci: Remove shadow ROM specific code paths vfio/pci: Remove #ifdef iowrite64 and #ifdef ioread64 vfio/pci: Enable iowrite64 and ioread64 for vfio pci
2 parents 2ab002c + 2bb4475 commit 3673f5b

File tree

5 files changed

+196
-69
lines changed

5 files changed

+196
-69
lines changed

drivers/vfio/pci/nvgrace-gpu/main.c

Lines changed: 147 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
#include <linux/sizes.h>
77
#include <linux/vfio_pci_core.h>
8+
#include <linux/delay.h>
9+
#include <linux/jiffies.h>
810

911
/*
1012
* The device memory usable to the workloads running in the VM is cached
@@ -17,12 +19,21 @@
1719
#define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
1820
#define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX
1921

20-
/* Memory size expected as non cached and reserved by the VM driver */
21-
#define RESMEM_SIZE SZ_1G
22-
2322
/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
2423
#define MEMBLK_SIZE SZ_512M
2524

25+
#define DVSEC_BITMAP_OFFSET 0xA
26+
#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0)
27+
28+
#define GPU_CAP_DVSEC_REGISTER 3
29+
30+
#define C2C_LINK_BAR0_OFFSET 0x1498
31+
#define HBM_TRAINING_BAR0_OFFSET 0x200BC
32+
#define STATUS_READY 0xFF
33+
34+
#define POLL_QUANTUM_MS 1000
35+
#define POLL_TIMEOUT_MS (30 * 1000)
36+
2637
/*
2738
* The state of the two device memory regions - resmem and usemem - is
2839
* saved as struct mem_region.
@@ -46,6 +57,7 @@ struct nvgrace_gpu_pci_core_device {
4657
struct mem_region resmem;
4758
/* Lock to control device memory kernel mapping */
4859
struct mutex remap_lock;
60+
bool has_mig_hw_bug;
4961
};
5062

5163
static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
@@ -66,7 +78,7 @@ nvgrace_gpu_memregion(int index,
6678
if (index == USEMEM_REGION_INDEX)
6779
return &nvdev->usemem;
6880

69-
if (index == RESMEM_REGION_INDEX)
81+
if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
7082
return &nvdev->resmem;
7183

7284
return NULL;
@@ -751,40 +763,67 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
751763
u64 memphys, u64 memlength)
752764
{
753765
int ret = 0;
766+
u64 resmem_size = 0;
754767

755768
/*
756-
* The VM GPU device driver needs a non-cacheable region to support
757-
* the MIG feature. Since the device memory is mapped as NORMAL cached,
758-
* carve out a region from the end with a different NORMAL_NC
759-
* property (called as reserved memory and represented as resmem). This
760-
* region then is exposed as a 64b BAR (region 2 and 3) to the VM, while
761-
* exposing the rest (termed as usable memory and represented using usemem)
762-
* as cacheable 64b BAR (region 4 and 5).
769+
* On Grace Hopper systems, the VM GPU device driver needs a non-cacheable
770+
* region to support the MIG feature owing to a hardware bug. Since the
771+
* device memory is mapped as NORMAL cached, carve out a region from the end
772+
* with a different NORMAL_NC property (called as reserved memory and
773+
* represented as resmem). This region then is exposed as a 64b BAR
774+
* (region 2 and 3) to the VM, while exposing the rest (termed as usable
775+
* memory and represented using usemem) as cacheable 64b BAR (region 4 and 5).
763776
*
764777
* devmem (memlength)
765778
* |-------------------------------------------------|
766779
* | |
767780
* usemem.memphys resmem.memphys
781+
*
782+
* This hardware bug is fixed on the Grace Blackwell platforms and the
783+
* presence of the bug can be determined through nvdev->has_mig_hw_bug.
784+
* Thus on systems with the hardware fix, there is no need to partition
785+
* the GPU device memory and the entire memory is usable and mapped as
786+
* NORMAL cached (i.e. resmem size is 0).
768787
*/
788+
if (nvdev->has_mig_hw_bug)
789+
resmem_size = SZ_1G;
790+
769791
nvdev->usemem.memphys = memphys;
770792

771793
/*
772794
* The device memory exposed to the VM is added to the kernel by the
773-
* VM driver module in chunks of memory block size. Only the usable
774-
* memory (usemem) is added to the kernel for usage by the VM
775-
* workloads. Make the usable memory size memblock aligned.
795+
* VM driver module in chunks of memory block size. Note that only the
796+
* usable memory (usemem) is added to the kernel for usage by the VM
797+
* workloads.
776798
*/
777-
if (check_sub_overflow(memlength, RESMEM_SIZE,
799+
if (check_sub_overflow(memlength, resmem_size,
778800
&nvdev->usemem.memlength)) {
779801
ret = -EOVERFLOW;
780802
goto done;
781803
}
782804

783805
/*
784-
* The USEMEM part of the device memory has to be MEMBLK_SIZE
785-
* aligned. This is a hardwired ABI value between the GPU FW and
786-
* VFIO driver. The VM device driver is also aware of it and make
787-
* use of the value for its calculation to determine USEMEM size.
806+
* The usemem region is exposed as a 64b BAR composed of regions 4 and 5.
807+
* Calculate and save the BAR size for the region.
808+
*/
809+
nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
810+
811+
/*
812+
* If the hardware has the fix for MIG, there is no requirement
813+
* for splitting the device memory to create RESMEM. The entire
814+
* device memory is usable and will be USEMEM. Return here for
815+
* such case.
816+
*/
817+
if (!nvdev->has_mig_hw_bug)
818+
goto done;
819+
820+
/*
821+
* When the device memory is split to work around the MIG bug on
822+
* Grace Hopper, the USEMEM part of the device memory has to be
823+
* MEMBLK_SIZE aligned. This is a hardwired ABI value between the
824+
* GPU FW and VFIO driver. The VM device driver is also aware of it
825+
* and makes use of the value for its calculation to determine USEMEM
826+
* size. Note that the device memory may not be 512M aligned.
788827
*/
789828
nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
790829
MEMBLK_SIZE);
@@ -803,15 +842,93 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
803842
}
804843

805844
/*
806-
* The memory regions are exposed as BARs. Calculate and save
807-
* the BAR size for them.
845+
* The resmem region is exposed as a 64b BAR composed of region 2 and 3
846+
* for Grace Hopper. Calculate and save the BAR size for the region.
808847
*/
809-
nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
810848
nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
811849
done:
812850
return ret;
813851
}
814852

853+
static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
854+
{
855+
int pcie_dvsec;
856+
u16 dvsec_ctrl16;
857+
858+
pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA,
859+
GPU_CAP_DVSEC_REGISTER);
860+
861+
if (pcie_dvsec) {
862+
pci_read_config_word(pdev,
863+
pcie_dvsec + DVSEC_BITMAP_OFFSET,
864+
&dvsec_ctrl16);
865+
866+
if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM)
867+
return false;
868+
}
869+
870+
return true;
871+
}
872+
873+
/*
874+
* To reduce the system bootup time, the HBM training has
875+
* been moved out of the UEFI on the Grace-Blackwell systems.
876+
*
877+
* The onus of checking whether the HBM training has completed
878+
* thus falls on the module. The HBM training status can be
879+
* determined from a BAR0 register.
880+
*
881+
* Similarly, another BAR0 register exposes the status of the
882+
* CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
883+
*
884+
* Poll these register and check for 30s. If the HBM training is
885+
* not complete or if the C2C link is not ready, fail the probe.
886+
*
887+
* While the wait is not required on Grace Hopper systems, it
888+
* is beneficial to make the check to ensure the device is in an
889+
* expected state.
890+
*
891+
* Ensure that the BAR0 region is enabled before accessing the
892+
* registers.
893+
*/
894+
static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
895+
{
896+
unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
897+
void __iomem *io;
898+
int ret = -ETIME;
899+
900+
ret = pci_enable_device(pdev);
901+
if (ret)
902+
return ret;
903+
904+
ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME);
905+
if (ret)
906+
goto request_region_exit;
907+
908+
io = pci_iomap(pdev, 0, 0);
909+
if (!io) {
910+
ret = -ENOMEM;
911+
goto iomap_exit;
912+
}
913+
914+
do {
915+
if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
916+
(ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
917+
ret = 0;
918+
goto reg_check_exit;
919+
}
920+
msleep(POLL_QUANTUM_MS);
921+
} while (!time_after(jiffies, timeout));
922+
923+
reg_check_exit:
924+
pci_iounmap(pdev, io);
925+
iomap_exit:
926+
pci_release_selected_regions(pdev, 1 << 0);
927+
request_region_exit:
928+
pci_disable_device(pdev);
929+
return ret;
930+
}
931+
815932
static int nvgrace_gpu_probe(struct pci_dev *pdev,
816933
const struct pci_device_id *id)
817934
{
@@ -820,6 +937,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
820937
u64 memphys, memlength;
821938
int ret;
822939

940+
ret = nvgrace_gpu_wait_device_ready(pdev);
941+
if (ret)
942+
return ret;
943+
823944
ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
824945
if (!ret)
825946
ops = &nvgrace_gpu_pci_ops;
@@ -832,6 +953,8 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
832953
dev_set_drvdata(&pdev->dev, &nvdev->core_device);
833954

834955
if (ops == &nvgrace_gpu_pci_ops) {
956+
nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);
957+
835958
/*
836959
* Device memory properties are identified in the host ACPI
837960
* table. Set the nvgrace_gpu_pci_core_device structure.
@@ -868,6 +991,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
868991
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
869992
/* GH200 SKU */
870993
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) },
994+
/* GB200 SKU */
995+
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) },
871996
{}
872997
};
873998

drivers/vfio/pci/vfio_pci_config.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -511,13 +511,13 @@ static void vfio_bar_fixup(struct vfio_pci_core_device *vdev)
511511
mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
512512
mask |= PCI_ROM_ADDRESS_ENABLE;
513513
*vbar &= cpu_to_le32((u32)mask);
514-
} else if (pdev->resource[PCI_ROM_RESOURCE].flags &
515-
IORESOURCE_ROM_SHADOW) {
516-
mask = ~(0x20000 - 1);
514+
} else if (pdev->rom && pdev->romlen) {
515+
mask = ~(roundup_pow_of_two(pdev->romlen) - 1);
517516
mask |= PCI_ROM_ADDRESS_ENABLE;
518517
*vbar &= cpu_to_le32((u32)mask);
519-
} else
518+
} else {
520519
*vbar = 0;
520+
}
521521

522522
vdev->bardirty = false;
523523
}

drivers/vfio/pci/vfio_pci_core.c

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1054,31 +1054,27 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
10541054

10551055
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
10561056
info.flags = 0;
1057+
info.size = 0;
10571058

1058-
/* Report the BAR size, not the ROM size */
1059-
info.size = pci_resource_len(pdev, info.index);
1060-
if (!info.size) {
1061-
/* Shadow ROMs appear as PCI option ROMs */
1062-
if (pdev->resource[PCI_ROM_RESOURCE].flags &
1063-
IORESOURCE_ROM_SHADOW)
1064-
info.size = 0x20000;
1065-
else
1066-
break;
1067-
}
1068-
1069-
/*
1070-
* Is it really there? Enable memory decode for implicit access
1071-
* in pci_map_rom().
1072-
*/
1073-
cmd = vfio_pci_memory_lock_and_enable(vdev);
1074-
io = pci_map_rom(pdev, &size);
1075-
if (io) {
1059+
if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
1060+
/*
1061+
* Check ROM content is valid. Need to enable memory
1062+
* decode for ROM access in pci_map_rom().
1063+
*/
1064+
cmd = vfio_pci_memory_lock_and_enable(vdev);
1065+
io = pci_map_rom(pdev, &size);
1066+
if (io) {
1067+
info.flags = VFIO_REGION_INFO_FLAG_READ;
1068+
/* Report the BAR size, not the ROM size. */
1069+
info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE);
1070+
pci_unmap_rom(pdev, io);
1071+
}
1072+
vfio_pci_memory_unlock_and_restore(vdev, cmd);
1073+
} else if (pdev->rom && pdev->romlen) {
10761074
info.flags = VFIO_REGION_INFO_FLAG_READ;
1077-
pci_unmap_rom(pdev, io);
1078-
} else {
1079-
info.size = 0;
1075+
/* Report BAR size as power of two. */
1076+
info.size = roundup_pow_of_two(pdev->romlen);
10801077
}
1081-
vfio_pci_memory_unlock_and_restore(vdev, cmd);
10821078

10831079
break;
10841080
}

0 commit comments

Comments
 (0)