@@ -305,6 +305,7 @@ struct vfio_region_info_cap_type {
305
305
#define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff)
306
306
#define VFIO_REGION_TYPE_GFX (1)
307
307
#define VFIO_REGION_TYPE_CCW (2)
308
+ #define VFIO_REGION_TYPE_MIGRATION (3)
308
309
309
310
/* sub-types for VFIO_REGION_TYPE_PCI_* */
310
311
@@ -379,6 +380,233 @@ struct vfio_region_gfx_edid {
379
380
/* sub-types for VFIO_REGION_TYPE_CCW */
380
381
#define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1)
381
382
383
+ /* sub-types for VFIO_REGION_TYPE_MIGRATION */
384
+ #define VFIO_REGION_SUBTYPE_MIGRATION (1)
385
+
386
+ /*
387
+ * The structure vfio_device_migration_info is placed at the 0th offset of
388
+ * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related
389
+ * migration information. Field accesses from this structure are only supported
390
+ * at their native width and alignment. Otherwise, the result is undefined and
391
+ * vendor drivers should return an error.
392
+ *
393
+ * device_state: (read/write)
394
+ * - The user application writes to this field to inform the vendor driver
395
+ * about the device state to be transitioned to.
396
+ * - The vendor driver should take the necessary actions to change the
397
+ * device state. After successful transition to a given state, the
398
+ * vendor driver should return success on write(device_state, state)
399
+ * system call. If the device state transition fails, the vendor driver
400
+ * should return an appropriate -errno for the fault condition.
401
+ * - On the user application side, if the device state transition fails,
402
+ * that is, if write(device_state, state) returns an error, read
403
+ * device_state again to determine the current state of the device from
404
+ * the vendor driver.
405
+ * - The vendor driver should return the previous state of the device unless
406
+ * the vendor driver has encountered an internal error, in which case
407
+ * the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR.
408
+ * - The user application must use the device reset ioctl to recover the
409
+ * device from VFIO_DEVICE_STATE_ERROR state. If the device is
410
+ * indicated to be in a valid device state by reading device_state, the
411
+ * user application may attempt to transition the device to any valid
412
+ * state reachable from the current state or terminate itself.
413
+ *
414
+ * device_state consists of 3 bits:
415
+ * - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear,
416
+ * it indicates the _STOP state. When the device state is changed to
417
+ * _STOP, driver should stop the device before write() returns.
418
+ * - If bit 1 is set, it indicates the _SAVING state, which means that the
419
+ * driver should start gathering device state information that will be
420
+ * provided to the VFIO user application to save the device's state.
421
+ * - If bit 2 is set, it indicates the _RESUMING state, which means that
422
+ * the driver should prepare to resume the device. Data provided through
423
+ * the migration region should be used to resume the device.
424
+ * Bits 3 - 31 are reserved for future use. To preserve them, the user
425
+ * application should perform a read-modify-write operation on this
426
+ * field when modifying the specified bits.
427
+ *
428
+ * +------- _RESUMING
429
+ * |+------ _SAVING
430
+ * ||+----- _RUNNING
431
+ * |||
432
+ * 000b => Device Stopped, not saving or resuming
433
+ * 001b => Device running, which is the default state
434
+ * 010b => Stop the device & save the device state, stop-and-copy state
435
+ * 011b => Device running and save the device state, pre-copy state
436
+ * 100b => Device stopped and the device state is resuming
437
+ * 101b => Invalid state
438
+ * 110b => Error state
439
+ * 111b => Invalid state
440
+ *
441
+ * State transitions:
442
+ *
443
+ *     _RESUMING  _RUNNING     Pre-copy    Stop-and-copy   _STOP
444
+ *      (100b)     (001b)       (011b)        (010b)      (000b)
445
+ * 0. Running or default state
446
+ *                    |
447
+ *
448
+ * 1. Normal Shutdown (optional)
449
+ *                    |------------------------------------->|
450
+ *
451
+ * 2. Save the state or suspend
452
+ *                    |------------------------->|---------->|
453
+ *
454
+ * 3. Save the state during live migration
455
+ *                    |----------->|------------>|---------->|
456
+ *
457
+ * 4. Resuming
458
+ *         |<---------|
459
+ *
460
+ * 5. Resumed
461
+ *         |--------->|
462
+ *
463
+ * 0. Default state of VFIO device is _RUNNING when the user application starts.
464
+ * 1. During normal shutdown of the user application, the user application may
465
+ * optionally change the VFIO device state from _RUNNING to _STOP. This
466
+ * transition is optional. The vendor driver must support this transition but
467
+ * must not require it.
468
+ * 2. When the user application saves state or suspends the application, the
469
+ * device state transitions from _RUNNING to stop-and-copy and then to _STOP.
470
+ * On state transition from _RUNNING to stop-and-copy, driver must stop the
471
+ * device, save the device state and send it to the application through the
472
+ * migration region. The sequence to be followed for such transition is given
473
+ * below.
474
+ * 3. In live migration of user application, the state transitions from _RUNNING
475
+ * to pre-copy, to stop-and-copy, and to _STOP.
476
+ * On state transition from _RUNNING to pre-copy, the driver should start
477
+ * gathering the device state while the application is still running and send
478
+ * the device state data to application through the migration region.
479
+ * On state transition from pre-copy to stop-and-copy, the driver must stop
480
+ * the device, save the device state and send it to the user application
481
+ * through the migration region.
482
+ * Vendor drivers must support the pre-copy state even for implementations
483
+ * where no data is provided to the user before the stop-and-copy state. The
484
+ * user must not be required to consume all migration data before the device
485
+ * transitions to a new state, including the stop-and-copy state.
486
+ * The sequence to be followed for above two transitions is given below.
487
+ * 4. To start the resuming phase, the device state should be transitioned from
488
+ * the _RUNNING to the _RESUMING state.
489
+ * In the _RESUMING state, the driver should use the device state data
490
+ * received through the migration region to resume the device.
491
+ * 5. After providing saved device data to the driver, the application should
492
+ * change the state from _RESUMING to _RUNNING.
493
+ *
494
+ * reserved:
495
+ * Reads on this field return zero and writes are ignored.
496
+ *
497
+ * pending_bytes: (read only)
498
+ * The number of pending bytes still to be migrated from the vendor driver.
499
+ *
500
+ * data_offset: (read only)
501
+ * The user application should read data_offset field from the migration
502
+ * region. The user application should read the device data from this
503
+ * offset within the migration region during the _SAVING state or write
504
+ * the device data during the _RESUMING state. See below for details of
505
+ * sequence to be followed.
506
+ *
507
+ * data_size: (read/write)
508
+ * The user application should read data_size to get the size in bytes of
509
+ * the data copied in the migration region during the _SAVING state and
510
+ * write the size in bytes of the data copied in the migration region
511
+ * during the _RESUMING state.
512
+ *
513
+ * The format of the migration region is as follows:
514
+ * ------------------------------------------------------------------
515
+ * |vfio_device_migration_info|    data section                      |
516
+ * |                          |     /////////////////////////////// |
517
+ * ------------------------------------------------------------------
518
+ * ^                              ^
519
+ * offset 0-trapped part          data_offset
520
+ *
521
+ * The structure vfio_device_migration_info is always followed by the data
522
+ * section in the region, so data_offset will always be nonzero. The offset
523
+ * from where the data is copied is decided by the kernel driver. The data
524
+ * section can be trapped, mmapped, or partitioned, depending on how the kernel
525
+ * driver defines the data section. The data section partition can be defined
526
+ * as mapped by the sparse mmap capability. If mmapped, data_offset must be
527
+ * page aligned, whereas initial section which contains the
528
+ * vfio_device_migration_info structure, might not end at the offset, which is
529
+ * page aligned. The user is not required to access through mmap regardless
530
+ * of the capabilities of the region mmap.
531
+ * The vendor driver should determine whether and how to partition the data
532
+ * section. The vendor driver should return data_offset accordingly.
533
+ *
534
+ * The sequence to be followed while in pre-copy state and stop-and-copy state
535
+ * is as follows:
536
+ * a. Read pending_bytes, indicating the start of a new iteration to get device
537
+ * data. Repeated read on pending_bytes at this stage should have no side
538
+ * effects.
539
+ * If pending_bytes == 0, the user application should not iterate to get data
540
+ * for that device.
541
+ * If pending_bytes > 0, perform the following steps.
542
+ * b. Read data_offset, indicating that the vendor driver should make data
543
+ * available through the data section. The vendor driver should return this
544
+ * read operation only after data is available from (region + data_offset)
545
+ * to (region + data_offset + data_size).
546
+ * c. Read data_size, which is the amount of data in bytes available through
547
+ * the migration region.
548
+ * Read on data_offset and data_size should return the offset and size of
549
+ * the current buffer if the user application reads data_offset and
550
+ * data_size more than once here.
551
+ * d. Read data_size bytes of data from (region + data_offset) from the
552
+ * migration region.
553
+ * e. Process the data.
554
+ * f. Read pending_bytes, which indicates that the data from the previous
555
+ * iteration has been read. If pending_bytes > 0, go to step b.
556
+ *
557
+ * The user application can transition from the _SAVING|_RUNNING
558
+ * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the
559
+ * number of pending bytes. The user application should iterate in _SAVING
560
+ * (stop-and-copy) until pending_bytes is 0.
561
+ *
562
+ * The sequence to be followed while _RESUMING device state is as follows:
563
+ * While data for this device is available, repeat the following steps:
564
+ * a. Read data_offset from where the user application should write data.
565
+ * b. Write migration data starting at the migration region + data_offset for
566
+ * the length determined by data_size from the migration source.
567
+ * c. Write data_size, which indicates to the vendor driver that data is
568
+ * written in the migration region. Vendor driver must return this write
569
+ * operation on consuming data. Vendor driver should apply the
570
+ * user-provided migration region data to the device resume state.
571
+ *
572
+ * If an error occurs during the above sequences, the vendor driver can return
573
+ * an error code for next read() or write() operation, which will terminate the
574
+ * loop. The user application should then take the next necessary action, for
575
+ * example, failing migration or terminating the user application.
576
+ *
577
+ * For the user application, data is opaque. The user application should write
578
+ * data in the same order as the data is received and the data should be of
579
+ * the same transaction size as at the source.
580
+ */
581
+
582
/*
 * struct vfio_device_migration_info - header placed at offset 0 of the
 * VFIO_REGION_SUBTYPE_MIGRATION region.
 *
 * @device_state:  bitmask built from the VFIO_DEVICE_STATE_* bits (read/write).
 * @reserved:      reads return zero and writes are ignored.
 * @pending_bytes: number of bytes still to be migrated from the vendor driver
 *                 (read only).
 * @data_offset:   offset within the migration region at which device data is
 *                 read (_SAVING) or written (_RESUMING) (read only).
 * @data_size:     size in bytes of the data copied in the migration region
 *                 (read/write).
 *
 * The helper macros below accept any integer expression.  Each use of the
 * argument is parenthesized so that compound expressions such as (a | b) are
 * handled correctly, but the argument may be evaluated more than once and
 * must therefore be free of side effects.
 */
struct vfio_device_migration_info {
	__u32 device_state;         /* VFIO device state */
#define VFIO_DEVICE_STATE_STOP      (0)
#define VFIO_DEVICE_STATE_RUNNING   (1 << 0)
#define VFIO_DEVICE_STATE_SAVING    (1 << 1)
#define VFIO_DEVICE_STATE_RESUMING  (1 << 2)
#define VFIO_DEVICE_STATE_MASK      (VFIO_DEVICE_STATE_RUNNING | \
				     VFIO_DEVICE_STATE_SAVING |  \
				     VFIO_DEVICE_STATE_RESUMING)

/*
 * Non-zero when the state is a legal combination: if _RESUMING is set, no
 * other state bit may be set alongside it.
 */
#define VFIO_DEVICE_STATE_VALID(state) \
	((state) & VFIO_DEVICE_STATE_RESUMING ? \
	 ((state) & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)

/* Non-zero when the state bits encode the error state (_SAVING | _RESUMING). */
#define VFIO_DEVICE_STATE_IS_ERROR(state) \
	(((state) & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \
						VFIO_DEVICE_STATE_RESUMING))

/*
 * Yields the state with the three state bits replaced by the error encoding;
 * bits outside VFIO_DEVICE_STATE_MASK are preserved.
 */
#define VFIO_DEVICE_STATE_SET_ERROR(state) \
	(((state) & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_STATE_SAVING | \
					       VFIO_DEVICE_STATE_RESUMING)

	__u32 reserved;
	__u64 pending_bytes;
	__u64 data_offset;
	__u64 data_size;
};
609
+
382
610
/*
383
611
* The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
384
612
* which allows direct access to non-MSIX registers which happened to be within
0 commit comments