Skip to content

Commit 1235dc3

Browse files
author
Balazs Gibizer
committed
[rt] Apply migration context for incoming migrations
There is a race condition between an incoming resize and an update_available_resource periodic in the resource tracker. The race window starts when the resize_instance RPC finishes and ends when the finish_resize compute RPC finally applies the migration context on the instance. In the race window, if the update_available_resource periodic is run on the destination node, then it will see the instance as being tracked on this host as the instance.node is already pointing to the dest. But the instance.numa_topology still points to the source host topology as the migration context is not applied yet. This leads to CPU pinning error if the source topology does not fit to the dest topology. Also it stops the periodic task and leaves the tracker in an inconsistent state. The inconsistent state only cleanup up after the periodic is run outside of the race window. This patch applies the migration context temporarily to the specific instances during the periodic to keep resource accounting correct. Change-Id: Icaad155e22c9e2d86e464a0deb741c73f0dfb28a Closes-Bug: #1953359 Closes-Bug: #1952915 (cherry picked from commit 32c1044)
1 parent 0411962 commit 1235dc3

File tree

2 files changed

+40
-24
lines changed

2 files changed

+40
-24
lines changed

nova/compute/resource_tracker.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -932,14 +932,41 @@ def _update_available_resource(self, context, resources, startup=False):
932932
'flavor', 'migration_context',
933933
'resources'])
934934

935-
# Now calculate usage based on instance utilization:
936-
instance_by_uuid = self._update_usage_from_instances(
937-
context, instances, nodename)
938-
939935
# Grab all in-progress migrations and error migrations:
940936
migrations = objects.MigrationList.get_in_progress_and_error(
941937
context, self.host, nodename)
942938

939+
# Check for tracked instances with in-progress, incoming, but not
940+
# finished migrations. For those instance the migration context
941+
# is not applied yet (it will be during finish_resize when the
942+
# migration goes to finished state). We need to manually and
943+
# temporary apply the migration context here when the resource usage is
944+
# updated. See bug 1953359 for more details.
945+
instance_by_uuid = {instance.uuid: instance for instance in instances}
946+
for migration in migrations:
947+
if (
948+
migration.instance_uuid in instance_by_uuid and
949+
migration.dest_compute == self.host and
950+
migration.dest_node == nodename
951+
):
952+
# we does not check for the 'post-migrating' migration status
953+
# as applying the migration context for an instance already
954+
# in finished migration status is a no-op anyhow.
955+
instance = instance_by_uuid[migration.instance_uuid]
956+
LOG.debug(
957+
'Applying migration context for instance %s as it has an '
958+
'incoming, in-progress migration %s. '
959+
'Migration status is %s',
960+
migration.instance_uuid, migration.uuid, migration.status
961+
)
962+
# It is OK not to revert the migration context at the end of
963+
# the periodic as the instance is not saved during the periodic
964+
instance.apply_migration_context()
965+
966+
# Now calculate usage based on instance utilization:
967+
instance_by_uuid = self._update_usage_from_instances(
968+
context, instances, nodename)
969+
943970
self._pair_instances_to_migrations(migrations, instance_by_uuid)
944971
self._update_usage_from_migrations(context, migrations, nodename)
945972

nova/tests/functional/libvirt/test_numa_servers.py

Lines changed: 9 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -990,11 +990,11 @@ def fake_finish_resize(*args, **kwargs):
990990
'vCPUs mapping: [(0, 1)]',
991991
log,
992992
)
993-
# But the periodic fails as it tries to apply the source topology
994-
# on the dest. This is bug 1953359.
993+
# We expect that the periodic not fails as bug 1953359 is fixed.
995994
log = self.stdlog.logger.output
996-
self.assertIn('Error updating resources for node compute2', log)
997-
self.assertIn(
995+
self.assertIn('Running periodic for compute (compute2)', log)
996+
self.assertNotIn('Error updating resources for node compute2', log)
997+
self.assertNotIn(
998998
'nova.exception.CPUPinningInvalid: CPU set to pin [0] must be '
999999
'a subset of free CPU set [1]',
10001000
log,
@@ -1012,27 +1012,16 @@ def fake_finish_resize(*args, **kwargs):
10121012
new=fake_finish_resize,
10131013
):
10141014
post = {'migrate': None}
1015-
# this is expected to succeed but logs are emitted
1016-
# from the racing periodic task. See fake_finish_resize
1017-
# for the asserts
1015+
# this is expected to succeed
10181016
self.admin_api.post_server_action(server['id'], post)
10191017

10201018
server = self._wait_for_state_change(server, 'VERIFY_RESIZE')
10211019

1022-
# as the periodic job raced and failed during the resize if we revert
1023-
# the instance now then it tries to unpin its cpus from the dest host
1024-
# but those was never pinned as the periodic failed. So the unpinning
1025-
# will fail too.
1020+
# As bug 1953359 is fixed the revert should succeed too
10261021
post = {'revertResize': {}}
1027-
ex = self.assertRaises(
1028-
client.OpenStackApiException,
1029-
self.admin_api.post_server_action, server['id'], post
1030-
)
1031-
# This is still bug 1953359.
1032-
self.assertEqual(500, ex.response.status_code)
1033-
server = self.api.get_server(server['id'])
1034-
self.assertEqual('ERROR', server['status'])
1035-
self.assertIn(
1022+
self.admin_api.post_server_action(server['id'], post)
1023+
self._wait_for_state_change(server, 'ACTIVE')
1024+
self.assertNotIn(
10361025
'nova.exception.CPUUnpinningInvalid: CPU set to unpin [1] must be '
10371026
'a subset of pinned CPU set [0]',
10381027
self.stdlog.logger.output,

0 commit comments

Comments
 (0)