@@ -239,6 +239,137 @@ group and can access them as follows::
239
239
/* Gratuitous device reset and go... */
240
240
ioctl(device, VFIO_DEVICE_RESET);
241
241
242
+ IOMMUFD and vfio_iommu_type1
243
+ ----------------------------
244
+
245
+ IOMMUFD is the new user API to manage I/O page tables from userspace.
246
+ It intends to be the portal of delivering advanced userspace DMA
247
+ features (nested translation [5]_, PASID [6]_, etc.) while also providing
248
+ a backwards compatibility interface for existing VFIO_TYPE1v2_IOMMU use
249
+ cases. Eventually the vfio_iommu_type1 driver, as well as the legacy
250
+ vfio container and group model is intended to be deprecated.
251
+
252
+ The IOMMUFD backwards compatibility interface can be enabled two ways.
253
+ In the first method, the kernel can be configured with
254
+ CONFIG_IOMMUFD_VFIO_CONTAINER, in which case the IOMMUFD subsystem
255
+ transparently provides the entire infrastructure for the VFIO
256
+ container and IOMMU backend interfaces. The compatibility mode can
257
+ also be accessed if the VFIO container interface, i.e. /dev/vfio/vfio, is
258
+ simply symlink'd to /dev/iommu. Note that at the time of writing, the
259
+ compatibility mode is not entirely feature complete relative to
260
+ VFIO_TYPE1v2_IOMMU (e.g. DMA mapping MMIO) and does not attempt to
261
+ provide compatibility to the VFIO_SPAPR_TCE_IOMMU interface. Therefore
262
+ it is not generally advisable at this time to switch from native VFIO
263
+ implementations to the IOMMUFD compatibility interfaces.
264
+
265
+ Long term, VFIO users should migrate to device access through the cdev
266
+ interface described below, and native access through the IOMMUFD
267
+ provided interfaces.
268
+
269
+ VFIO Device cdev
270
+ ----------------
271
+
272
+ Traditionally, the user acquires a device fd via VFIO_GROUP_GET_DEVICE_FD
273
+ in a VFIO group.
274
+
275
+ With CONFIG_VFIO_DEVICE_CDEV=y the user can now acquire a device fd
276
+ by directly opening a character device /dev/vfio/devices/vfioX where
277
+ "X" is the number allocated uniquely by VFIO for registered devices.
278
+ The cdev interface does not support noiommu devices, so the user should use
279
+ the legacy group interface if noiommu is wanted.
280
+
281
+ The cdev only works with IOMMUFD. Both VFIO drivers and applications
282
+ must adapt to the new cdev security model which requires using
283
+ VFIO_DEVICE_BIND_IOMMUFD to claim DMA ownership before starting to
284
+ actually use the device. Once BIND succeeds, a VFIO device can
285
+ be fully accessed by the user.
286
+
287
+ VFIO device cdev doesn't rely on VFIO group/container/iommu drivers.
288
+ Hence those modules can be fully compiled out in an environment
289
+ where no legacy VFIO application exists.
290
+
291
+ SPAPR does not support IOMMUFD yet, so it cannot support device
292
+ cdev either.
293
+
294
+ VFIO device cdev access is still bound by IOMMU group semantics, i.e. there
295
+ can be only one DMA owner for the group. Devices belonging to the same
296
+ group cannot be bound to multiple iommufd_ctx or shared between native
297
+ kernel and vfio bus driver or other driver supporting the driver_managed_dma
298
+ flag. A violation of this ownership requirement will fail at the
299
+ VFIO_DEVICE_BIND_IOMMUFD ioctl, which gates full device access.
300
+
301
+ Device cdev Example
302
+ -------------------
303
+
304
+ Assume the user wants to access PCI device 0000:6a:01.0::
305
+
306
+ $ ls /sys/bus/pci/devices/0000:6a:01.0/vfio-dev/
307
+ vfio0
308
+
309
+ This device is therefore represented as vfio0. The user can verify
310
+ its existence::
311
+
312
+ $ ls -l /dev/vfio/devices/vfio0
313
+ crw------- 1 root root 511, 0 Feb 16 01:22 /dev/vfio/devices/vfio0
314
+ $ cat /sys/bus/pci/devices/0000:6a:01.0/vfio-dev/vfio0/dev
315
+ 511:0
316
+ $ ls -l /dev/char/511\:0
317
+ lrwxrwxrwx 1 root root 21 Feb 16 01:22 /dev/char/511:0 -> ../vfio/devices/vfio0
318
+
319
+ Then provide the user with access to the device if unprivileged
320
+ operation is desired::
321
+
322
+ $ chown user:user /dev/vfio/devices/vfio0
323
+
324
+ Finally, the user can obtain the cdev fd with::
325
+
326
+ cdev_fd = open("/dev/vfio/devices/vfio0", O_RDWR);
327
+
328
+ An opened cdev_fd doesn't give the user any permission to access
329
+ the device except binding the cdev_fd to an iommufd. After that point
330
+ then the device is fully accessible including attaching it to an
331
+ IOMMUFD IOAS/HWPT to enable userspace DMA::
332
+
333
+ struct vfio_device_bind_iommufd bind = {
334
+ .argsz = sizeof(bind),
335
+ .flags = 0,
336
+ };
337
+ struct iommu_ioas_alloc alloc_data = {
338
+ .size = sizeof(alloc_data),
339
+ .flags = 0,
340
+ };
341
+ struct vfio_device_attach_iommufd_pt attach_data = {
342
+ .argsz = sizeof(attach_data),
343
+ .flags = 0,
344
+ };
345
+ struct iommu_ioas_map map = {
346
+ .size = sizeof(map),
347
+ .flags = IOMMU_IOAS_MAP_READABLE |
348
+ IOMMU_IOAS_MAP_WRITEABLE |
349
+ IOMMU_IOAS_MAP_FIXED_IOVA,
350
+ .__reserved = 0,
351
+ };
352
+
353
+ iommufd = open("/dev/iommu", O_RDWR);
354
+
355
+ bind.iommufd = iommufd;
356
+ ioctl(cdev_fd, VFIO_DEVICE_BIND_IOMMUFD, &bind);
357
+
358
+ ioctl(iommufd, IOMMU_IOAS_ALLOC, &alloc_data);
359
+ attach_data.pt_id = alloc_data.out_ioas_id;
360
+ ioctl(cdev_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data);
361
+
362
+ /* Allocate some space and setup a DMA mapping */
363
+ map.user_va = (int64_t)mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE,
364
+ MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
365
+ map.iova = 0; /* 1MB starting at 0x0 from device view */
366
+ map.length = 1024 * 1024;
367
+ map.ioas_id = alloc_data.out_ioas_id;
368
+
369
+ ioctl(iommufd, IOMMU_IOAS_MAP, &map);
370
+
371
+ /* Other device operations as stated in "VFIO Usage Example" */
372
+
242
373
VFIO User API
243
374
-------------------------------------------------------------------------------
244
375
@@ -566,3 +697,11 @@ This implementation has some specifics:
566
697
\- 0d.1
567
698
568
699
00:1e.0 PCI bridge: Intel Corporation 82801 PCI Bridge (rev 90)
700
+
701
+ .. [5] Nested translation is an IOMMU feature which supports two stage
702
+ address translations. This improves the address translation efficiency
703
+ in IOMMU virtualization.
704
+
705
+ .. [6] PASID stands for Process Address Space ID, introduced by PCI
706
+ Express. It is a prerequisite for Shared Virtual Addressing (SVA)
707
+ and Scalable I/O Virtualization (Scalable IOV).
0 commit comments