Skip to content

Commit 99781d6

Browse files
Zuul openstack-gerrit
authored and committed
Merge "Cyborg evacuate support"
2 parents 940dfce + 1356ef5 commit 99781d6

File tree

17 files changed

+414
-191
lines changed

17 files changed

+414
-191
lines changed

api-guide/source/accelerator-support.rst

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,8 @@ appropriate privileges) must do the following:
2929
3030
openstack server create --flavor $myflavor --image $myimage $servername
3131
32-
As of 21.0.0 (Ussuri), nova supports only specific operations for instances
33-
with accelerators. The lists of supported and unsupported operations are as
34-
below:
32+
Nova supports only specific operations for instances with accelerators.
33+
The supported and unsupported operations are listed below:
3534

3635
* Supported operations.
3736

@@ -42,17 +41,21 @@ below:
4241
* Take a snapshot.
4342
* Backup.
4443
* Rescue and unrescue.
44+
* Rebuild.
45+
* Evacuate.
4546

4647
* Unsupported operations
4748

48-
* Rebuild.
4949
* Resize.
50-
* Evacuate.
5150
* Suspend and resume.
5251
* Shelve and unshelve.
5352
* Cold migration.
5453
* Live migration.
5554

55+
.. versionchanged:: 22.0.0 (Victoria)
56+
57+
Added support for rebuild and evacuate operations.
58+
5659
Some operations, such as lock and unlock, work as they are effectively
5760
no-ops for accelerators.
5861

nova/accelerator/cyborg.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,3 +302,12 @@ def delete_arqs_by_uuid(self, arq_uuids):
302302
if err_msg:
303303
# No point raising an exception.
304304
LOG.error('Failed to delete ARQs %s', arq_uuid_str)
305+
306+
def get_arq_uuids_for_instance(self, instance):
307+
"""Get ARQ UUIDs for the instance.
308+
309+
:param instance: Instance Object
310+
:return: A list of ARQ UUIDs.
311+
"""
312+
return [arq['uuid']
313+
for arq in self.get_arqs_for_instance(instance.uuid)]

nova/compute/api.py

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,8 @@
109109
# TODO(huaqiang): Remove in Wallaby
110110
MIN_VER_NOVA_COMPUTE_MIXED_POLICY = 52
111111

112+
SUPPORT_ACCELERATOR_SERVICE_FOR_REBUILD = 53
113+
112114
# FIXME(danms): Keep a global cache of the cells we find the
113115
# first time we look. This needs to be refreshed on a timer or
114116
# trigger.
@@ -307,14 +309,27 @@ def _get_image_meta_obj(image_meta_dict):
307309
return image_meta
308310

309311

310-
def block_accelerators(func):
311-
@functools.wraps(func)
312-
def wrapper(self, context, instance, *args, **kwargs):
313-
dp_name = instance.flavor.extra_specs.get('accel:device_profile')
314-
if dp_name:
315-
raise exception.ForbiddenWithAccelerators()
316-
return func(self, context, instance, *args, **kwargs)
317-
return wrapper
312+
def block_accelerators(until_service=None):
313+
def inner(func):
314+
@functools.wraps(func)
315+
def wrapper(self, context, instance, *args, **kwargs):
316+
# NOTE(brinzhang): Reject the operation for an instance whose flavor
317+
# requests accelerators, unless until_service is set and all
318+
# nova-compute services are at or above that service version.
319+
dp_name = instance.flavor.extra_specs.get('accel:device_profile')
320+
service_support = False
321+
if not dp_name:
322+
service_support = True
323+
elif until_service:
324+
min_version = objects.service.get_minimum_version_all_cells(
325+
nova_context.get_admin_context(), ['nova-compute'])
326+
if min_version >= until_service:
327+
service_support = True
328+
if not service_support:
329+
raise exception.ForbiddenWithAccelerators()
330+
return func(self, context, instance, *args, **kwargs)
331+
return wrapper
332+
return inner
318333

319334

320335
@profiler.trace_cls("compute_api")
@@ -3393,7 +3408,7 @@ def _check_image_arch(self, image=None):
33933408
fields_obj.Architecture.canonicalize(img_arch)
33943409

33953410
@reject_vtpm_instances(instance_actions.REBUILD)
3396-
@block_accelerators
3411+
@block_accelerators(until_service=SUPPORT_ACCELERATOR_SERVICE_FOR_REBUILD)
33973412
# TODO(stephenfin): We should expand kwargs out to named args
33983413
@check_instance_lock
33993414
@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED,
@@ -3930,7 +3945,7 @@ def _validate_host_for_cold_migrate(
39303945
# finally split resize and cold migration into separate code paths
39313946
# TODO(stephenfin): The 'block_accelerators' decorator doesn't take into
39323947
# account the accelerators requested in the new flavor
3933-
@block_accelerators
3948+
@block_accelerators()
39343949
@check_instance_lock
39353950
@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED])
39363951
@check_instance_host(check_is_up=True)
@@ -4159,7 +4174,7 @@ def _allow_resize_to_same_host(self, cold_migrate, instance):
41594174
return allow_same_host
41604175

41614176
@reject_vtpm_instances(instance_actions.SHELVE)
4162-
@block_accelerators
4177+
@block_accelerators()
41634178
@check_instance_lock
41644179
@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED,
41654180
vm_states.PAUSED, vm_states.SUSPENDED])
@@ -4324,7 +4339,7 @@ def get_instance_diagnostics(self, context, instance):
43244339
return self.compute_rpcapi.get_instance_diagnostics(context,
43254340
instance=instance)
43264341

4327-
@block_accelerators
4342+
@block_accelerators()
43284343
@reject_sev_instances(instance_actions.SUSPEND)
43294344
@check_instance_lock
43304345
@check_instance_state(vm_state=[vm_states.ACTIVE])
@@ -5028,7 +5043,7 @@ def update_instance_metadata(self, context, instance,
50285043
diff=diff)
50295044
return _metadata
50305045

5031-
@block_accelerators
5046+
@block_accelerators()
50325047
@reject_vtpm_instances(instance_actions.LIVE_MIGRATION)
50335048
@reject_sev_instances(instance_actions.LIVE_MIGRATION)
50345049
@check_instance_lock
@@ -5160,7 +5175,7 @@ def live_migrate_abort(self, context, instance, migration_id,
51605175
instance, migration.id)
51615176

51625177
@reject_vtpm_instances(instance_actions.EVACUATE)
5163-
@block_accelerators
5178+
@block_accelerators(until_service=SUPPORT_ACCELERATOR_SERVICE_FOR_REBUILD)
51645179
@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED,
51655180
vm_states.ERROR])
51665181
def evacuate(self, context, instance, host, on_shared_storage,

nova/compute/manager.py

Lines changed: 43 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -530,7 +530,7 @@ def update_compute_provider_status(self, context, rp_uuid, enabled):
530530
class ComputeManager(manager.Manager):
531531
"""Manages the running instances from creation to destruction."""
532532

533-
target = messaging.Target(version='5.11')
533+
target = messaging.Target(version='5.12')
534534

535535
def __init__(self, compute_driver=None, *args, **kwargs):
536536
"""Load configuration options and connect to the hypervisor."""
@@ -3256,25 +3256,44 @@ def _set_migration_status(migration, status):
32563256
migration.status = status
32573257
migration.save()
32583258

3259-
def _rebuild_default_impl(self, context, instance, image_meta,
3260-
injected_files, admin_password, allocations,
3261-
bdms, detach_block_devices, attach_block_devices,
3262-
network_info=None,
3263-
evacuate=False, block_device_info=None,
3264-
preserve_ephemeral=False):
3259+
def _rebuild_default_impl(
3260+
self, context, instance, image_meta, injected_files,
3261+
admin_password, allocations, bdms, detach_block_devices,
3262+
attach_block_devices, network_info=None, evacuate=False,
3263+
block_device_info=None, preserve_ephemeral=False,
3264+
accel_uuids=None):
32653265
if preserve_ephemeral:
32663266
# The default code path does not support preserving ephemeral
32673267
# partitions.
32683268
raise exception.PreserveEphemeralNotSupported()
32693269

3270+
accel_info = []
32703271
if evacuate:
3272+
if instance.flavor.extra_specs.get('accel:device_profile'):
3273+
try:
3274+
accel_info = self._get_bound_arq_resources(
3275+
context, instance, accel_uuids or [])
3276+
except (Exception, eventlet.timeout.Timeout) as exc:
3277+
LOG.exception(exc)
3278+
self._build_resources_cleanup(instance, network_info)
3279+
msg = _('Failure getting accelerator resources.')
3280+
raise exception.BuildAbortException(
3281+
instance_uuid=instance.uuid, reason=msg)
32713282
detach_block_devices(context, bdms)
32723283
else:
32733284
self._power_off_instance(instance, clean_shutdown=True)
32743285
detach_block_devices(context, bdms)
32753286
self.driver.destroy(context, instance,
32763287
network_info=network_info,
32773288
block_device_info=block_device_info)
3289+
try:
3290+
accel_info = self._get_accel_info(context, instance)
3291+
except Exception as exc:
3292+
LOG.exception(exc)
3293+
self._build_resources_cleanup(instance, network_info)
3294+
msg = _('Failure getting accelerator resources.')
3295+
raise exception.BuildAbortException(
3296+
instance_uuid=instance.uuid, reason=msg)
32783297

32793298
instance.task_state = task_states.REBUILD_BLOCK_DEVICE_MAPPING
32803299
instance.save(expected_task_state=[task_states.REBUILDING])
@@ -3289,7 +3308,8 @@ def _rebuild_default_impl(self, context, instance, image_meta,
32893308
self.driver.spawn(context, instance, image_meta, injected_files,
32903309
admin_password, allocations,
32913310
network_info=network_info,
3292-
block_device_info=new_block_device_info)
3311+
block_device_info=new_block_device_info,
3312+
accel_info=accel_info)
32933313

32943314
def _notify_instance_rebuild_error(self, context, instance, error, bdms):
32953315
self._notify_about_instance_usage(context, instance,
@@ -3298,7 +3318,8 @@ def _notify_instance_rebuild_error(self, context, instance, error, bdms):
32983318
context, instance, self.host,
32993319
phase=fields.NotificationPhase.ERROR, exception=error, bdms=bdms)
33003320

3301-
@messaging.expected_exceptions(exception.PreserveEphemeralNotSupported)
3321+
@messaging.expected_exceptions(exception.PreserveEphemeralNotSupported,
3322+
exception.BuildAbortException)
33023323
@wrap_exception()
33033324
@reverts_task_state
33043325
@wrap_instance_event(prefix='compute')
@@ -3307,7 +3328,7 @@ def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
33073328
injected_files, new_pass, orig_sys_metadata,
33083329
bdms, recreate, on_shared_storage,
33093330
preserve_ephemeral, migration,
3310-
scheduled_node, limits, request_spec):
3331+
scheduled_node, limits, request_spec, accel_uuids):
33113332
"""Destroy and re-make this instance.
33123333

33133334
A 'rebuild' effectively purges all existing data from the system and
@@ -3338,6 +3359,7 @@ def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
33383359
:param limits: Overcommit limits set by the scheduler. If a host was
33393360
specified by the user, this will be None
33403361
:param request_spec: a RequestSpec object used to schedule the instance
3362+
:param accel_uuids: a list of cyborg ARQ uuids.
33413363

33423364
"""
33433365
# recreate=True means the instance is being evacuated from a failed
@@ -3402,7 +3424,7 @@ def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
34023424
image_meta, injected_files, new_pass, orig_sys_metadata,
34033425
bdms, evacuate, on_shared_storage, preserve_ephemeral,
34043426
migration, request_spec, allocs, rebuild_claim,
3405-
scheduled_node, limits)
3427+
scheduled_node, limits, accel_uuids)
34063428
except (exception.ComputeResourcesUnavailable,
34073429
exception.RescheduledException) as e:
34083430
if isinstance(e, exception.ComputeResourcesUnavailable):
@@ -3469,7 +3491,7 @@ def _do_rebuild_instance_with_claim(
34693491
self, context, instance, orig_image_ref, image_meta,
34703492
injected_files, new_pass, orig_sys_metadata, bdms, evacuate,
34713493
on_shared_storage, preserve_ephemeral, migration, request_spec,
3472-
allocations, rebuild_claim, scheduled_node, limits):
3494+
allocations, rebuild_claim, scheduled_node, limits, accel_uuids):
34733495
"""Helper to avoid deep nesting in the top-level method."""
34743496

34753497
provider_mapping = None
@@ -3490,7 +3512,7 @@ def _do_rebuild_instance_with_claim(
34903512
context, instance, orig_image_ref, image_meta, injected_files,
34913513
new_pass, orig_sys_metadata, bdms, evacuate, on_shared_storage,
34923514
preserve_ephemeral, migration, request_spec, allocations,
3493-
provider_mapping)
3515+
provider_mapping, accel_uuids)
34943516

34953517
@staticmethod
34963518
def _get_image_name(image_meta):
@@ -3499,12 +3521,12 @@ def _get_image_name(image_meta):
34993521
else:
35003522
return ''
35013523

3502-
def _do_rebuild_instance(self, context, instance, orig_image_ref,
3503-
image_meta, injected_files, new_pass,
3504-
orig_sys_metadata, bdms, evacuate,
3505-
on_shared_storage, preserve_ephemeral,
3506-
migration, request_spec, allocations,
3507-
request_group_resource_providers_mapping):
3524+
def _do_rebuild_instance(
3525+
self, context, instance, orig_image_ref, image_meta,
3526+
injected_files, new_pass, orig_sys_metadata, bdms, evacuate,
3527+
on_shared_storage, preserve_ephemeral, migration, request_spec,
3528+
allocations, request_group_resource_providers_mapping,
3529+
accel_uuids):
35083530
orig_vm_state = instance.vm_state
35093531

35103532
if evacuate:
@@ -3645,7 +3667,8 @@ def detach_block_devices(context, bdms):
36453667
block_device_info=block_device_info,
36463668
network_info=network_info,
36473669
preserve_ephemeral=preserve_ephemeral,
3648-
evacuate=evacuate)
3670+
evacuate=evacuate,
3671+
accel_uuids=accel_uuids)
36493672
try:
36503673
with instance.mutated_migration_context():
36513674
self.driver.rebuild(**kwargs)

nova/compute/rpcapi.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,8 @@ class ComputeAPI(object):
378378
* 5.10 - Add finish_revert_snapshot_based_resize_at_source()
379379
* 5.11 - Add accel_uuids (accelerator requests) parameter to
380380
build_and_run_instance()
381+
* 5.12 - Add accel_uuids (accelerator requests) parameter to
382+
rebuild_instance()
381383
'''
382384

383385
VERSION_ALIASES = {
@@ -1056,20 +1058,29 @@ def reboot_instance(self, ctxt, instance, block_device_info,
10561058
block_device_info=block_device_info,
10571059
reboot_type=reboot_type)
10581060

1059-
def rebuild_instance(self, ctxt, instance, new_pass, injected_files,
1061+
def rebuild_instance(
1062+
self, ctxt, instance, new_pass, injected_files,
10601063
image_ref, orig_image_ref, orig_sys_metadata, bdms,
10611064
recreate, on_shared_storage, host, node,
1062-
preserve_ephemeral, migration, limits, request_spec):
1065+
preserve_ephemeral, migration, limits, request_spec, accel_uuids):
1066+
10631067
# NOTE(edleafe): compute nodes can only use the dict form of limits.
10641068
if isinstance(limits, objects.SchedulerLimits):
10651069
limits = limits.to_dict()
1066-
msg_args = {'preserve_ephemeral': preserve_ephemeral,
1067-
'migration': migration,
1068-
'scheduled_node': node,
1069-
'limits': limits,
1070-
'request_spec': request_spec}
1071-
version = '5.0'
1070+
1071+
msg_args = {
1072+
'preserve_ephemeral': preserve_ephemeral,
1073+
'migration': migration,
1074+
'scheduled_node': node,
1075+
'limits': limits,
1076+
'request_spec': request_spec,
1077+
'accel_uuids': accel_uuids
1078+
}
1079+
version = '5.12'
10721080
client = self.router.client(ctxt)
1081+
if not client.can_send_version(version):
1082+
del msg_args['accel_uuids']
1083+
version = '5.0'
10731084
cctxt = client.prepare(server=_compute_host(host, instance),
10741085
version=version)
10751086
cctxt.cast(ctxt, 'rebuild_instance',

0 commit comments

Comments
 (0)