5
5
6
6
#include <linux/sizes.h>
7
7
#include <linux/vfio_pci_core.h>
8
+ #include <linux/delay.h>
9
+ #include <linux/jiffies.h>
8
10
9
11
/*
10
12
* The device memory usable to the workloads running in the VM is cached
17
19
#define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
18
20
#define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX
19
21
20
- /* Memory size expected as non cached and reserved by the VM driver */
21
- #define RESMEM_SIZE SZ_1G
22
-
23
22
/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
24
23
#define MEMBLK_SIZE SZ_512M
25
24
25
+ #define DVSEC_BITMAP_OFFSET 0xA
26
+ #define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0)
27
+
28
+ #define GPU_CAP_DVSEC_REGISTER 3
29
+
30
+ #define C2C_LINK_BAR0_OFFSET 0x1498
31
+ #define HBM_TRAINING_BAR0_OFFSET 0x200BC
32
+ #define STATUS_READY 0xFF
33
+
34
+ #define POLL_QUANTUM_MS 1000
35
+ #define POLL_TIMEOUT_MS (30 * 1000)
36
+
26
37
/*
27
38
* The state of the two device memory region - resmem and usemem - is
28
39
* saved as struct mem_region.
@@ -46,6 +57,7 @@ struct nvgrace_gpu_pci_core_device {
46
57
struct mem_region resmem ;
47
58
/* Lock to control device memory kernel mapping */
48
59
struct mutex remap_lock ;
60
+ bool has_mig_hw_bug ;
49
61
};
50
62
51
63
static void nvgrace_gpu_init_fake_bar_emu_regs (struct vfio_device * core_vdev )
@@ -66,7 +78,7 @@ nvgrace_gpu_memregion(int index,
66
78
if (index == USEMEM_REGION_INDEX )
67
79
return & nvdev -> usemem ;
68
80
69
- if (index == RESMEM_REGION_INDEX )
81
+ if (nvdev -> resmem . memlength && index == RESMEM_REGION_INDEX )
70
82
return & nvdev -> resmem ;
71
83
72
84
return NULL ;
@@ -751,40 +763,67 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
751
763
u64 memphys , u64 memlength )
752
764
{
753
765
int ret = 0 ;
766
+ u64 resmem_size = 0 ;
754
767
755
768
/*
756
- * The VM GPU device driver needs a non-cacheable region to support
757
- * the MIG feature. Since the device memory is mapped as NORMAL cached,
758
- * carve out a region from the end with a different NORMAL_NC
759
- * property (called as reserved memory and represented as resmem). This
760
- * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while
761
- * exposing the rest (termed as usable memory and represented using usemem)
762
- * as cacheable 64b BAR (region 4 and 5).
769
+ * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable
770
+ * region to support the MIG feature owing to a hardware bug. Since the
771
+ * device memory is mapped as NORMAL cached, carve out a region from the end
772
+ * with a different NORMAL_NC property (called as reserved memory and
773
+ * represented as resmem). This region then is exposed as a 64b BAR
774
+ * (region 2 and 3) to the VM, while exposing the rest (termed as usable
775
+ * memory and represented using usemem) as cacheable 64b BAR (region 4 and 5).
763
776
*
764
777
* devmem (memlength)
765
778
* |-------------------------------------------------|
766
779
* | |
767
780
* usemem.memphys resmem.memphys
781
+ *
782
+ * This hardware bug is fixed on the Grace Blackwell platforms and the
783
+ * presence of the bug can be determined through nvdev->has_mig_hw_bug.
784
+ * Thus on systems with the hardware fix, there is no need to partition
785
+ * the GPU device memory and the entire memory is usable and mapped as
786
+ * NORMAL cached (i.e. resmem size is 0).
768
787
*/
788
+ if (nvdev -> has_mig_hw_bug )
789
+ resmem_size = SZ_1G ;
790
+
769
791
nvdev -> usemem .memphys = memphys ;
770
792
771
793
/*
772
794
* The device memory exposed to the VM is added to the kernel by the
773
- * VM driver module in chunks of memory block size. Only the usable
774
- * memory (usemem) is added to the kernel for usage by the VM
775
- * workloads. Make the usable memory size memblock aligned.
795
+ * VM driver module in chunks of memory block size. Note that only the
796
+ * usable memory (usemem) is added to the kernel for usage by the VM
797
+ * workloads.
776
798
*/
777
- if (check_sub_overflow (memlength , RESMEM_SIZE ,
799
+ if (check_sub_overflow (memlength , resmem_size ,
778
800
& nvdev -> usemem .memlength )) {
779
801
ret = - EOVERFLOW ;
780
802
goto done ;
781
803
}
782
804
783
805
/*
784
- * The USEMEM part of the device memory has to be MEMBLK_SIZE
785
- * aligned. This is a hardwired ABI value between the GPU FW and
786
- * VFIO driver. The VM device driver is also aware of it and make
787
- * use of the value for its calculation to determine USEMEM size.
806
+ * The usemem region is exposed as a 64b BAR composed of region 4 and 5.
807
+ * Calculate and save the BAR size for the region.
808
+ */
809
+ nvdev -> usemem .bar_size = roundup_pow_of_two (nvdev -> usemem .memlength );
810
+
811
+ /*
812
+ * If the hardware has the fix for MIG, there is no requirement
813
+ * for splitting the device memory to create RESMEM. The entire
814
+ * device memory is usable and will be USEMEM. Return here for
815
+ * such case.
816
+ */
817
+ if (!nvdev -> has_mig_hw_bug )
818
+ goto done ;
819
+
820
+ /*
821
+ * When the device memory is split to workaround the MIG bug on
822
+ * Grace Hopper, the USEMEM part of the device memory has to be
823
+ * MEMBLK_SIZE aligned. This is a hardwired ABI value between the
824
+ * GPU FW and VFIO driver. The VM device driver is also aware of it
825
+ * and make use of the value for its calculation to determine USEMEM
826
+ * size. Note that the device memory may not be 512M aligned.
788
827
*/
789
828
nvdev -> usemem .memlength = round_down (nvdev -> usemem .memlength ,
790
829
MEMBLK_SIZE );
@@ -803,15 +842,93 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
803
842
}
804
843
805
844
/*
806
- * The memory regions are exposed as BARs. Calculate and save
807
- * the BAR size for them .
845
+ * The resmem region is exposed as a 64b BAR composed of region 2 and 3
846
+ * for Grace Hopper. Calculate and save the BAR size for the region .
808
847
*/
809
- nvdev -> usemem .bar_size = roundup_pow_of_two (nvdev -> usemem .memlength );
810
848
nvdev -> resmem .bar_size = roundup_pow_of_two (nvdev -> resmem .memlength );
811
849
done :
812
850
return ret ;
813
851
}
814
852
853
+ static bool nvgrace_gpu_has_mig_hw_bug (struct pci_dev * pdev )
854
+ {
855
+ int pcie_dvsec ;
856
+ u16 dvsec_ctrl16 ;
857
+
858
+ pcie_dvsec = pci_find_dvsec_capability (pdev , PCI_VENDOR_ID_NVIDIA ,
859
+ GPU_CAP_DVSEC_REGISTER );
860
+
861
+ if (pcie_dvsec ) {
862
+ pci_read_config_word (pdev ,
863
+ pcie_dvsec + DVSEC_BITMAP_OFFSET ,
864
+ & dvsec_ctrl16 );
865
+
866
+ if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM )
867
+ return false;
868
+ }
869
+
870
+ return true;
871
+ }
872
+
873
+ /*
874
+ * To reduce the system bootup time, the HBM training has
875
+ * been moved out of the UEFI on the Grace-Blackwell systems.
876
+ *
877
+ * The onus of checking whether the HBM training has completed
878
+ * thus falls on the module. The HBM training status can be
879
+ * determined from a BAR0 register.
880
+ *
881
+ * Similarly, another BAR0 register exposes the status of the
882
+ * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
883
+ *
884
+ * Poll these register and check for 30s. If the HBM training is
885
+ * not complete or if the C2C link is not ready, fail the probe.
886
+ *
887
+ * While the wait is not required on Grace Hopper systems, it
888
+ * is beneficial to make the check to ensure the device is in an
889
+ * expected state.
890
+ *
891
+ * Ensure that the BAR0 region is enabled before accessing the
892
+ * registers.
893
+ */
894
+ static int nvgrace_gpu_wait_device_ready (struct pci_dev * pdev )
895
+ {
896
+ unsigned long timeout = jiffies + msecs_to_jiffies (POLL_TIMEOUT_MS );
897
+ void __iomem * io ;
898
+ int ret = - ETIME ;
899
+
900
+ ret = pci_enable_device (pdev );
901
+ if (ret )
902
+ return ret ;
903
+
904
+ ret = pci_request_selected_regions (pdev , 1 << 0 , KBUILD_MODNAME );
905
+ if (ret )
906
+ goto request_region_exit ;
907
+
908
+ io = pci_iomap (pdev , 0 , 0 );
909
+ if (!io ) {
910
+ ret = - ENOMEM ;
911
+ goto iomap_exit ;
912
+ }
913
+
914
+ do {
915
+ if ((ioread32 (io + C2C_LINK_BAR0_OFFSET ) == STATUS_READY ) &&
916
+ (ioread32 (io + HBM_TRAINING_BAR0_OFFSET ) == STATUS_READY )) {
917
+ ret = 0 ;
918
+ goto reg_check_exit ;
919
+ }
920
+ msleep (POLL_QUANTUM_MS );
921
+ } while (!time_after (jiffies , timeout ));
922
+
923
+ reg_check_exit :
924
+ pci_iounmap (pdev , io );
925
+ iomap_exit :
926
+ pci_release_selected_regions (pdev , 1 << 0 );
927
+ request_region_exit :
928
+ pci_disable_device (pdev );
929
+ return ret ;
930
+ }
931
+
815
932
static int nvgrace_gpu_probe (struct pci_dev * pdev ,
816
933
const struct pci_device_id * id )
817
934
{
@@ -820,6 +937,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
820
937
u64 memphys , memlength ;
821
938
int ret ;
822
939
940
+ ret = nvgrace_gpu_wait_device_ready (pdev );
941
+ if (ret )
942
+ return ret ;
943
+
823
944
ret = nvgrace_gpu_fetch_memory_property (pdev , & memphys , & memlength );
824
945
if (!ret )
825
946
ops = & nvgrace_gpu_pci_ops ;
@@ -832,6 +953,8 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
832
953
dev_set_drvdata (& pdev -> dev , & nvdev -> core_device );
833
954
834
955
if (ops == & nvgrace_gpu_pci_ops ) {
956
+ nvdev -> has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug (pdev );
957
+
835
958
/*
836
959
* Device memory properties are identified in the host ACPI
837
960
* table. Set the nvgrace_gpu_pci_core_device structure.
@@ -868,6 +991,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
868
991
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO (PCI_VENDOR_ID_NVIDIA , 0x2345 ) },
869
992
/* GH200 SKU */
870
993
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO (PCI_VENDOR_ID_NVIDIA , 0x2348 ) },
994
+ /* GB200 SKU */
995
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO (PCI_VENDOR_ID_NVIDIA , 0x2941 ) },
871
996
{}
872
997
};
873
998
0 commit comments