
[stable-only][cve] Check VMDK create-type against an allowed list #21


Open: wants to merge 9 commits into stackhpc/ussuri
36 changes: 26 additions & 10 deletions nova/compute/manager.py
@@ -9896,6 +9896,8 @@ def update_available_resource(self, context, startup=False):
use_slave=True,
startup=startup)

self.rt.clean_compute_node_cache(compute_nodes_in_db)

# Delete orphan compute node not reported by driver but still in db
for cn in compute_nodes_in_db:
if cn.hypervisor_hostname not in nodenames:
@@ -9904,17 +9906,31 @@ def update_available_resource(self, context, startup=False):
"nodes are %(nodes)s",
{'id': cn.id, 'hh': cn.hypervisor_hostname,
'nodes': nodenames})
cn.destroy()
self.rt.remove_node(cn.hypervisor_hostname)
# Delete the corresponding resource provider in placement,
# along with any associated allocations.
try:
self.reportclient.delete_resource_provider(context, cn,
cascade=True)
except keystone_exception.ClientException as e:
LOG.error(
"Failed to delete compute node resource provider "
"for compute node %s: %s", cn.uuid, six.text_type(e))
try:
    cn.destroy()
except exception.ObjectActionError:
# NOTE(mgoddard): it's possible that another compute
# service took ownership of this compute node since we
# queried it due to a rebalance, and this will cause the
# deletion to fail. Ignore the error in that case.
LOG.info("Ignoring failure to delete orphan compute node "
"%(id)s on hypervisor host %(hh)s due to "
"possible node rebalance",
{'id': cn.id, 'hh': cn.hypervisor_hostname})
self.rt.remove_node(cn.hypervisor_hostname)
self.reportclient.invalidate_resource_provider(cn.uuid)
else:
self.rt.remove_node(cn.hypervisor_hostname)
# Delete the corresponding resource provider in placement,
# along with any associated allocations.
try:
self.reportclient.delete_resource_provider(
context, cn, cascade=True)
except keystone_exception.ClientException as e:
LOG.error(
"Failed to delete compute node resource provider "
"for compute node %s: %s", cn.uuid,
six.text_type(e))

for nodename in nodenames:
self._update_available_resource_for_node(context, nodename,
18 changes: 18 additions & 0 deletions nova/compute/resource_tracker.py
@@ -1831,3 +1831,21 @@ def finish_evacuation(self, instance, node, migration):
if migration:
migration.status = 'done'
migration.save()

@utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
def clean_compute_node_cache(self, compute_nodes_in_db):
"""Clean the compute node cache of any nodes that no longer exist.

:param compute_nodes_in_db: list of ComputeNode objects from the DB.
"""
compute_nodes_in_db_nodenames = {cn.hypervisor_hostname
for cn in compute_nodes_in_db}
stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames

for stale_cn in stale_cns:
# NOTE(mgoddard): we have found a node in the cache that has no
# compute node in the DB. This could be due to a node rebalance
# where another compute service took ownership of the node. Clean
# up the cache.
self.remove_node(stale_cn)
self.reportclient.invalidate_resource_provider(stale_cn)
9 changes: 9 additions & 0 deletions nova/conf/compute.py
@@ -959,6 +959,15 @@
* Any integer >= 1 represents the maximum allowed. A value of 0 will cause the
``nova-compute`` service to fail to start, as 0 disk devices is an invalid
configuration that would prevent instances from being able to boot.
"""),
cfg.ListOpt('vmdk_allowed_types',
default=['streamOptimized', 'monolithicSparse'],
help="""
A list of VMDK "create-type" subformats that will be allowed. It is
recommended to only include single-file-with-sparse-header variants to avoid
potential host file exposure due to processing named extents. If this list
is empty, then no form of VMDK image will be allowed.
"""),
]
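
The hunk above adds only the configuration option; the code that actually
enforces it against an image is not part of the diff shown here. As a rough,
hypothetical sketch of the kind of check such an option enables (the helper
names and the ValueError are placeholders, not Nova's code), the VMDK
"create-type" can be read from qemu-img's JSON output and compared against
the allowed list:

    import json
    import subprocess


    def vmdk_create_type(path):
        """Return the VMDK "create-type" subformat reported by qemu-img."""
        out = subprocess.check_output(
            ['qemu-img', 'info', '--output=json', path])
        info = json.loads(out)
        # qemu-img reports the subformat at format-specific/data/create-type.
        return (info.get('format-specific', {})
                    .get('data', {})
                    .get('create-type'))


    def check_vmdk_allowed(path, allowed_types):
        """Reject a VMDK whose create-type is not explicitly allowed.

        An empty allowed_types list rejects every VMDK image.
        """
        create_type = vmdk_create_type(path)
        if not allowed_types or create_type not in allowed_types:
            raise ValueError('VMDK create-type %r is not in the allowed '
                             'list %s' % (create_type, allowed_types))

    # Mirroring the option's default of sparse, single-file subformats:
    # check_vmdk_allowed('disk.vmdk', ['streamOptimized', 'monolithicSparse'])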

8 changes: 8 additions & 0 deletions nova/conf/workarounds.py
@@ -345,6 +345,14 @@
* :oslo.config:option:`DEFAULT.instances_path`
* :oslo.config:option:`image_cache.subdirectory_name`
* :oslo.config:option:`update_resources_interval`
"""),
cfg.BoolOpt('skip_cpu_compare_on_dest',
default=False,
help="""
With the libvirt driver, during live migration, skip comparing guest CPU
with the destination host. When using QEMU >= 2.9 and libvirt >=
4.4.0, libvirt will do the correct thing with respect to checking CPU
compatibility on the destination host during live migration.
"""),
]
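
Like the option above, this hunk adds only the flag; the libvirt driver
change that consults it is not shown here. The sketch below is a hedged
illustration of how such a workaround is typically applied in a
destination-side live migration pre-check; the function, the exception class
and the callable are stand-ins rather than the driver's real helpers:

    class MigrationPreCheckError(Exception):
        """Stand-in for Nova's migration pre-check failure exception."""


    def precheck_guest_cpu(compare_cpu, skip_cpu_compare_on_dest,
                           source_cpu_info):
        """Destination-side guest CPU pre-check, gated by the workaround.

        compare_cpu is a callable standing in for the driver's own CPU
        comparison helper; it should raise if the guest CPU cannot run on
        this host.
        """
        if skip_cpu_compare_on_dest:
            # Workaround enabled: rely on libvirt >= 4.4.0 / QEMU >= 2.9 to
            # reject an incompatible guest CPU when the migration starts.
            return
        try:
            compare_cpu(source_cpu_info)
        except Exception as exc:
            raise MigrationPreCheckError(str(exc))

    # Example: the comparison is skipped entirely when the flag is True.
    # precheck_guest_cpu(lambda info: None, True, '{"model": "Haswell"}')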

37 changes: 30 additions & 7 deletions nova/db/sqlalchemy/api.py
@@ -690,16 +690,29 @@ def compute_node_search_by_hypervisor(context, hypervisor_match):


@pick_context_manager_writer
def compute_node_create(context, values):
def _compute_node_create(context, values):
"""Creates a new ComputeNode and populates the capacity fields
with the most recent data.
"""
convert_objects_related_datetimes(values)

compute_node_ref = models.ComputeNode()
compute_node_ref.update(values)
compute_node_ref.save(context.session)
return compute_node_ref


# NOTE(mgoddard): We avoid decorating this with @pick_context_manager_writer,
# so that we get a separate transaction in the exception handler. This avoids
# an error message about inactive DB sessions during a transaction rollback.
# See https://bugs.launchpad.net/nova/+bug/1853159.
def compute_node_create(context, values):
"""Creates a new ComputeNode and populates the capacity fields
with the most recent data. Will restore a soft deleted compute node if a
UUID has been explicitly requested.
"""
try:
compute_node_ref.save(context.session)
compute_node_ref = _compute_node_create(context, values)
except db_exc.DBDuplicateEntry:
with excutils.save_and_reraise_exception(logger=LOG) as err_ctx:
# Check to see if we have a (soft) deleted ComputeNode with the
@@ -763,14 +776,24 @@ def compute_node_update(context, compute_id, values):


@pick_context_manager_writer
def compute_node_delete(context, compute_id):
def compute_node_delete(context, compute_id, constraint=None):
"""Delete a ComputeNode record."""
result = model_query(context, models.ComputeNode).\
filter_by(id=compute_id).\
soft_delete(synchronize_session=False)
query = model_query(context, models.ComputeNode).filter_by(id=compute_id)

if constraint is not None:
query = constraint.apply(models.ComputeNode, query)

result = query.soft_delete(synchronize_session=False)

if not result:
raise exception.ComputeHostNotFound(host=compute_id)
# The soft_delete could fail for one of two reasons:
# 1) The compute node no longer exists
# 2) The constraint, if specified, was not met
# Try to read the compute node and let it raise ComputeHostNotFound if
# 1) happened.
compute_node_get(context, compute_id)
# Else, raise ConstraintNotMet if 2) happened.
raise exception.ConstraintNotMet()


@pick_context_manager_reader
15 changes: 14 additions & 1 deletion nova/objects/compute_node.py
@@ -354,7 +354,20 @@ def save(self, prune_stats=False):

@base.remotable
def destroy(self):
db.compute_node_delete(self._context, self.id)
if self.obj_attr_is_set('host') and self.host:
# NOTE(melwitt): If our host is set, avoid a race between
# nova-computes during ironic driver node rebalances which can
# change node ownership.
constraint = db.constraint(host=db.equal_any(self.host))
else:
constraint = None

try:
sa_api.compute_node_delete(
self._context, self.id, constraint=constraint)
except exception.ConstraintNotMet:
raise exception.ObjectActionError(action='destroy',
reason='host changed')

def update_from_virt_driver(self, resources):
# NOTE(pmurray): the virt driver provides a dict of values that
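
For context, the host constraint added to destroy() above makes the delete
conditional on the compute node still belonging to this host. Nova's real
implementation goes through db.constraint()/equal_any() and the soft_delete
path shown in the sqlalchemy hunk, but conceptually it is a
compare-and-soft-delete; the sketch below (table and column names assumed)
shows the idea in plain SQLAlchemy:

    import datetime

    from sqlalchemy import update


    def soft_delete_if_host_unchanged(session, compute_nodes, node_id,
                                      expected_host):
        """Soft-delete a compute node row only if its host has not changed.

        compute_nodes is assumed to be a SQLAlchemy Table with id, host,
        deleted and deleted_at columns. A rowcount of 0 means the row is
        gone or its host changed, which the new code distinguishes by
        re-reading the record and raising ComputeHostNotFound or
        ConstraintNotMet accordingly.
        """
        stmt = (
            update(compute_nodes)
            .where(compute_nodes.c.id == node_id)
            .where(compute_nodes.c.host == expected_host)
            .values(deleted=node_id, deleted_at=datetime.datetime.utcnow())
        )
        return session.execute(stmt).rowcount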
17 changes: 12 additions & 5 deletions nova/scheduler/client/report.py
@@ -676,11 +676,7 @@ def _delete_provider(self, rp_uuid, global_request_id=None):
if resp:
LOG.info("Deleted resource provider %s", rp_uuid)
# clean the caches
try:
self._provider_tree.remove(rp_uuid)
except ValueError:
pass
self._association_refresh_time.pop(rp_uuid, None)
self.invalidate_resource_provider(rp_uuid)
return

msg = ("[%(placement_req_id)s] Failed to delete resource provider "
@@ -2161,6 +2157,17 @@ def delete_resource_provider(self, context, compute_node, cascade=False):
# for backward compatibility.
pass

def invalidate_resource_provider(self, name_or_uuid):
"""Invalidate the cache for a resource provider.

:param name_or_uuid: Name or UUID of the resource provider to look up.
"""
try:
self._provider_tree.remove(name_or_uuid)
except ValueError:
pass
self._association_refresh_time.pop(name_or_uuid, None)

def get_provider_by_name(self, context, name):
"""Queries the placement API for resource provider information matching
a supplied name.
171 changes: 171 additions & 0 deletions nova/tests/functional/regressions/test_bug_1853009.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import mock

from nova import context
from nova import objects
from nova.tests.functional import integrated_helpers


class NodeRebalanceDeletedComputeNodeRaceTestCase(
integrated_helpers.ProviderUsageBaseTestCase,
):
"""Regression test for bug 1853009 observed in Rocky & later.

When an ironic node re-balances from one host to another, there can be a
race where the old host deletes the orphan compute node after the new host
has taken ownership of it which results in the new host failing to create
the compute node and resource provider because the ResourceTracker does not
detect a change.
"""
# Make sure we're using the fake driver that has predictable uuids
# for each node.
compute_driver = 'fake.PredictableNodeUUIDDriver'

def _assert_hypervisor_api(self, nodename, expected_host):
# We should have one compute node shown by the API.
hypervisors = self.api.api_get(
'/os-hypervisors/detail'
).body['hypervisors']
self.assertEqual(1, len(hypervisors), hypervisors)
hypervisor = hypervisors[0]
self.assertEqual(nodename, hypervisor['hypervisor_hostname'])
self.assertEqual(expected_host, hypervisor['service']['host'])

def _start_compute(self, host):
host = self.start_service('compute', host)
# Ironic compute driver has rebalances_nodes = True.
host.manager.driver.rebalances_nodes = True
return host

def setUp(self):
super(NodeRebalanceDeletedComputeNodeRaceTestCase, self).setUp()

self.nodename = 'fake-node'
self.ctxt = context.get_admin_context()

def test_node_rebalance_deleted_compute_node_race(self):
# Simulate a service running and then stopping. host_a runs, creates
# fake-node, then is stopped. The fake-node compute node is destroyed.
# This leaves a soft-deleted node in the DB.
host_a = self._start_compute('host_a')
host_a.manager.driver._set_nodes([self.nodename])
host_a.manager.update_available_resource(self.ctxt)
host_a.stop()
cn = objects.ComputeNode.get_by_host_and_nodename(
self.ctxt, 'host_a', self.nodename,
)
cn.destroy()

self.assertEqual(0, len(objects.ComputeNodeList.get_all(self.ctxt)))

# Now we create a new compute service to manage our node.
host_b = self._start_compute('host_b')
host_b.manager.driver._set_nodes([self.nodename])

# When start_service runs, it will create a host_b ComputeNode. We want
# to delete that and inject our fake node into the driver, which will
# be re-balanced to another host later. First assert this actually
# exists.
self._assert_hypervisor_api('host_b', expected_host='host_b')

# Now run the update_available_resource periodic to register fake-node
# and have it managed by host_b. This will also detect the "host_b"
# node as orphaned and delete it along with its resource provider.

# host_b[1]: Finds no compute record in RT. Tries to create one
# (_init_compute_node).
host_b.manager.update_available_resource(self.ctxt)
self._assert_hypervisor_api(self.nodename, expected_host='host_b')
# There should only be one resource provider (fake-node).
original_rps = self._get_all_providers()
self.assertEqual(1, len(original_rps), original_rps)
self.assertEqual(self.nodename, original_rps[0]['name'])

# Simulate a re-balance by restarting host_a and make it manage
# fake-node. At this point both host_b and host_a think they own
# fake-node.
host_a = self._start_compute('host_a')
host_a.manager.driver._set_nodes([self.nodename])

# host_a[1]: Finds no compute record in RT, 'moves' existing node from
# host_b
host_a.manager.update_available_resource(self.ctxt)
# Assert that fake-node was re-balanced from host_b to host_a.
self.assertIn(
'ComputeNode fake-node moving from host_b to host_a',
self.stdlog.logger.output)
self._assert_hypervisor_api(self.nodename, expected_host='host_a')

# host_a[2]: Begins periodic update, queries compute nodes for this
# host, finds the fake-node.
cn = objects.ComputeNode.get_by_host_and_nodename(
self.ctxt, 'host_a', self.nodename,
)

# host_b[2]: Finds no compute record in RT, 'moves' existing node from
# host_a
host_b.manager.update_available_resource(self.ctxt)
# Assert that fake-node was re-balanced from host_a to host_b.
self.assertIn(
'ComputeNode fake-node moving from host_a to host_b',
self.stdlog.logger.output)
self._assert_hypervisor_api(self.nodename, expected_host='host_b')

# Complete rebalance, as host_a realises it does not own fake-node.
host_a.manager.driver._set_nodes([])

# host_a[2]: Deletes orphan compute node.
# Mock out the compute node query to simulate a race condition where
# the list includes an orphan compute node that is newly owned by
# host_b by the time host_a attempts to delete it.
with mock.patch(
'nova.compute.manager.ComputeManager._get_compute_nodes_in_db'
) as mock_get:
mock_get.return_value = [cn]
host_a.manager.update_available_resource(self.ctxt)

# Verify that the node was almost deleted, but was saved by the host
# check
self.assertIn(
'Deleting orphan compute node %s hypervisor host '
'is fake-node, nodes are' % cn.id,
self.stdlog.logger.output)
self.assertIn(
'Ignoring failure to delete orphan compute node %s on '
'hypervisor host fake-node' % cn.id,
self.stdlog.logger.output)
self._assert_hypervisor_api(self.nodename, 'host_b')
rps = self._get_all_providers()
self.assertEqual(1, len(rps), rps)
self.assertEqual(self.nodename, rps[0]['name'])

# Simulate deletion of an orphan by host_a. It shouldn't happen
# anymore, but let's assume it already did.
cn = objects.ComputeNode.get_by_host_and_nodename(
self.ctxt, 'host_b', self.nodename)
cn.destroy()
host_a.manager.rt.remove_node(cn.hypervisor_hostname)
host_a.manager.reportclient.delete_resource_provider(
self.ctxt, cn, cascade=True)

# host_b[3]: Should recreate compute node and resource provider.
host_b.manager.update_available_resource(self.ctxt)

# Verify that the node was recreated.
self._assert_hypervisor_api(self.nodename, 'host_b')

# The resource provider has now been created.
rps = self._get_all_providers()
self.assertEqual(1, len(rps), rps)
self.assertEqual(self.nodename, rps[0]['name'])