Skip to content

Commit 3f0605c

Browse files
committed
Sync COMPUTE_STATUS_DISABLED from API
This adds the os-services API change which will call the compute service when the service's disabled value changes to sync the COMPUTE_STATUS_DISABLED trait on the compute node resource providers managed by the updated compute service. If the compute service is down or not yet upgraded to the service version from change Ia95de2f23f12b002b2189c9294ec190569a628ab then the API will not call the service. In this case the change from I3005b46221ac3c0e559e1072131a7e4846c9867c will sync the trait when the compute service is restarted. Since the compute service could be running the ironic driver and managing hundreds or over 1000 compute nodes, the set_host_enabled RPC call now uses the long_rpc_timeout configuration option. A functional test is added which covers the 2.53+ PUT /os-services/{service_id} API and pre-2.53 os-services APIs for enabling/disabling and forcing down a service. The functional test also covers the sync-on-restart behavior from change I3005b46221ac3c0e559e1072131a7e4846c9867c. The scheduler pre-filter added in change I317cabbe49a337848325f96df79d478fd65811d9 is also tested as part of the functional test. Closes-Bug: #1805984 Implements blueprint pre-filter-disabled-computes Change-Id: If32bca070185937ef83f689b7163d965a89ec10a
1 parent 1c51443 commit 3f0605c

File tree

7 files changed

+258
-6
lines changed

7 files changed

+258
-6
lines changed

nova/compute/api.py

Lines changed: 56 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@
107107
MIN_COMPUTE_TRUSTED_CERTS = 31
108108
MIN_COMPUTE_ABORT_QUEUED_LIVE_MIGRATION = 34
109109
MIN_COMPUTE_VOLUME_TYPE = 36
110+
MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED = 38
110111

111112
# FIXME(danms): Keep a global cache of the cells we find the
112113
# first time we look. This needs to be refreshed on a timer or
@@ -5090,17 +5091,69 @@ def service_get_by_compute_host(self, context, host_name):
50905091
"""Get service entry for the given compute hostname."""
50915092
return objects.Service.get_by_compute_host(context, host_name)
50925093

5094+
def _update_compute_provider_status(self, context, service):
5095+
"""Calls the compute service to sync the COMPUTE_STATUS_DISABLED trait.
5096+
5097+
There are two cases where the API will not call the compute service:
5098+
5099+
* The compute service is down. In this case the trait is synchronized
5100+
when the compute service is restarted.
5101+
* The compute service is old. In this case the trait is synchronized
5102+
when the compute service is upgraded and restarted.
5103+
5104+
:param context: nova auth RequestContext
5105+
:param service: nova.objects.Service object which has been enabled
5106+
or disabled (see ``service_update``).
5107+
"""
5108+
# Make sure the service is up so we can make the RPC call.
5109+
if not self.servicegroup_api.service_is_up(service):
5110+
LOG.info('Compute service on host %s is down. The '
5111+
'COMPUTE_STATUS_DISABLED trait will be synchronized '
5112+
'when the service is restarted.', service.host)
5113+
return
5114+
5115+
# Make sure the compute service is new enough for the trait sync
5116+
# behavior.
5117+
# TODO(mriedem): Remove this compat check in the U release.
5118+
if service.version < MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED:
5119+
LOG.info('Compute service on host %s is too old to sync the '
5120+
'COMPUTE_STATUS_DISABLED trait in Placement. The '
5121+
'trait will be synchronized when the service is '
5122+
'upgraded and restarted.', service.host)
5123+
return
5124+
5125+
enabled = not service.disabled
5126+
# Avoid leaking errors out of the API.
5127+
try:
5128+
LOG.debug('Calling the compute service on host %s to sync the '
5129+
'COMPUTE_STATUS_DISABLED trait.', service.host)
5130+
self.rpcapi.set_host_enabled(context, service.host, enabled)
5131+
except Exception:
5132+
LOG.exception('An error occurred while updating host enabled '
5133+
'status to "%s" for compute host: %s',
5134+
'enabled' if enabled else 'disabled',
5135+
service.host)
5136+
50935137
def service_update(self, context, service):
50945138
"""Performs the actual service update operation.
50955139
5140+
If the "disabled" field is changed, potentially calls the compute
5141+
service to sync the COMPUTE_STATUS_DISABLED trait on the compute node
5142+
resource providers managed by this compute service.
5143+
50965144
:param context: nova auth RequestContext
50975145
:param service: nova.objects.Service object with changes already
50985146
set on the object
50995147
"""
5148+
# Before persisting changes and resetting the changed fields on the
5149+
# Service object, determine if the disabled field changed.
5150+
update_placement = 'disabled' in service.obj_what_changed()
5151+
# Persist the Service object changes to the database.
51005152
service.save()
5101-
# TODO(mriedem): Reflect COMPUTE_STATUS_DISABLED trait changes to the
5102-
# associated compute node resource providers if the service's disabled
5103-
# status changed.
5153+
# If the disabled field changed, potentially call the compute service
5154+
# to sync the COMPUTE_STATUS_DISABLED trait.
5155+
if update_placement:
5156+
self._update_compute_provider_status(context, service)
51045157
return service
51055158

51065159
@target_host_cell

nova/compute/rpcapi.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -913,7 +913,9 @@ def set_admin_password(self, ctxt, instance, new_pass):
913913
def set_host_enabled(self, ctxt, host, enabled):
914914
version = '5.0'
915915
cctxt = self.router.client(ctxt).prepare(
916-
server=host, version=version)
916+
server=host, version=version,
917+
call_monitor_timeout=CONF.rpc_response_timeout,
918+
timeout=CONF.long_rpc_timeout)
917919
return cctxt.call(ctxt, 'set_host_enabled', enabled=enabled)
918920

919921
def swap_volume(self, ctxt, instance, old_volume_id, new_volume_id,

nova/conf/rpc.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
2929
* live migration
3030
* scheduling
31+
* enabling/disabling a compute service
3132
3233
Related options:
3334

nova/tests/functional/api_sample_tests/test_services.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,12 @@ def setUp(self):
3737
test_services.fake_service_get_by_host_binary)
3838
self.stub_out("nova.db.api.service_update",
3939
test_services.fake_service_update)
40+
# If we are not using real services, we need to stub out
41+
# HostAPI._update_compute_provider_status so we don't actually
42+
# try to call a fake service over RPC.
43+
self.stub_out('nova.compute.api.HostAPI.'
44+
'_update_compute_provider_status',
45+
lambda *args, **kwargs: None)
4046
self.useFixture(utils_fixture.TimeFixture(test_services.fake_utcnow()))
4147

4248
def test_services_list(self):

nova/tests/functional/wsgi/test_services.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,16 @@
1111
# under the License.
1212

1313
import os_resource_classes as orc
14+
import os_traits
1415
import six
1516

1617
from nova import context as nova_context
1718
from nova import exception
1819
from nova import objects
1920
from nova.tests.functional.api import client as api_client
2021
from nova.tests.functional import integrated_helpers
22+
from nova.tests.unit.image import fake as fake_image
23+
from nova import utils
2124

2225

2326
class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
@@ -171,3 +174,132 @@ def test_evacuate_then_delete_compute_service(self):
171174
log_output = self.stdlog.logger.output
172175
self.assertIn('Error updating resources for node host1.', log_output)
173176
self.assertIn('Failed to create resource provider host1', log_output)
177+
178+
179+
class ComputeStatusFilterTest(integrated_helpers.ProviderUsageBaseTestCase):
180+
"""Tests the API, compute service and Placement interaction with the
181+
COMPUTE_STATUS_DISABLED trait when a compute service is enable/disabled.
182+
183+
This version of the test uses the 2.latest microversion for testing the
184+
2.53+ behavior of the PUT /os-services/{service_id} API.
185+
"""
186+
compute_driver = 'fake.SmallFakeDriver'
187+
188+
def _update_service(self, service, disabled, forced_down=None):
189+
"""Update the service using the 2.53 request schema.
190+
191+
:param service: dict representing the service resource in the API
192+
:param disabled: True if the service should be disabled, False if the
193+
service should be enabled
194+
:param forced_down: Optionally change the forced_down value.
195+
"""
196+
status = 'disabled' if disabled else 'enabled'
197+
req = {'status': status}
198+
if forced_down is not None:
199+
req['forced_down'] = forced_down
200+
self.admin_api.put_service(service['id'], req)
201+
202+
def test_compute_status_filter(self):
203+
"""Tests the compute_status_filter placement request filter"""
204+
# Start a compute service so a compute node and resource provider is
205+
# created.
206+
compute = self._start_compute('host1')
207+
# Get the UUID of the resource provider that was created.
208+
rp_uuid = self._get_provider_uuid_by_host('host1')
209+
# Get the service from the compute API.
210+
services = self.admin_api.get_services(binary='nova-compute',
211+
host='host1')
212+
self.assertEqual(1, len(services))
213+
service = services[0]
214+
215+
# At this point, the service should be enabled and the
216+
# COMPUTE_STATUS_DISABLED trait should not be set on the
217+
# resource provider in placement.
218+
self.assertEqual('enabled', service['status'])
219+
rp_traits = self._get_provider_traits(rp_uuid)
220+
trait = os_traits.COMPUTE_STATUS_DISABLED
221+
self.assertNotIn(trait, rp_traits)
222+
223+
# Now disable the compute service via the API.
224+
self._update_service(service, disabled=True)
225+
226+
# The update to placement should be synchronous so check the provider
227+
# traits and COMPUTE_STATUS_DISABLED should be set.
228+
rp_traits = self._get_provider_traits(rp_uuid)
229+
self.assertIn(trait, rp_traits)
230+
231+
# Try creating a server which should fail because nothing is available.
232+
networks = [{'port': self.neutron.port_1['id']}]
233+
server_req = self._build_minimal_create_server_request(
234+
self.api, 'test_compute_status_filter',
235+
image_uuid=fake_image.get_valid_image_id(), networks=networks)
236+
server = self.api.post_server({'server': server_req})
237+
server = self._wait_for_state_change(self.api, server, 'ERROR')
238+
# There should be a NoValidHost fault recorded.
239+
self.assertIn('fault', server)
240+
self.assertIn('No valid host', server['fault']['message'])
241+
242+
# Now enable the service and the trait should be gone.
243+
self._update_service(service, disabled=False)
244+
rp_traits = self._get_provider_traits(rp_uuid)
245+
self.assertNotIn(trait, rp_traits)
246+
247+
# Try creating another server and it should be OK.
248+
server = self.api.post_server({'server': server_req})
249+
self._wait_for_state_change(self.api, server, 'ACTIVE')
250+
251+
# Stop, force-down and disable the service so the API cannot call
252+
# the compute service to sync the trait.
253+
compute.stop()
254+
self._update_service(service, disabled=True, forced_down=True)
255+
# The API should have logged a message about the service being down.
256+
self.assertIn('Compute service on host host1 is down. The '
257+
'COMPUTE_STATUS_DISABLED trait will be synchronized '
258+
'when the service is restarted.',
259+
self.stdlog.logger.output)
260+
# The trait should not be on the provider even though the node is
261+
# disabled.
262+
rp_traits = self._get_provider_traits(rp_uuid)
263+
self.assertNotIn(trait, rp_traits)
264+
# Restart the compute service which should sync and set the trait on
265+
# the provider in placement.
266+
self.restart_compute_service(compute)
267+
rp_traits = self._get_provider_traits(rp_uuid)
268+
self.assertIn(trait, rp_traits)
269+
270+
271+
class ComputeStatusFilterTest211(ComputeStatusFilterTest):
272+
"""Extends ComputeStatusFilterTest and uses the 2.11 API for the
273+
legacy os-services disable/enable/force-down API behavior
274+
"""
275+
microversion = '2.11'
276+
277+
def _update_service(self, service, disabled, forced_down=None):
278+
"""Update the service using the 2.11 request schema.
279+
280+
:param service: dict representing the service resource in the API
281+
:param disabled: True if the service should be disabled, False if the
282+
service should be enabled
283+
:param forced_down: Optionally change the forced_down value.
284+
"""
285+
# Before 2.53 the service is uniquely identified by host and binary.
286+
body = {
287+
'host': service['host'],
288+
'binary': service['binary']
289+
}
290+
# Handle forced_down first if provided since the enable/disable
291+
# behavior in the API depends on it.
292+
if forced_down is not None:
293+
body['forced_down'] = forced_down
294+
self.admin_api.api_put('/os-services/force-down', body)
295+
296+
if disabled:
297+
self.admin_api.api_put('/os-services/disable', body)
298+
else:
299+
self.admin_api.api_put('/os-services/enable', body)
300+
301+
def _get_provider_uuid_by_host(self, host):
302+
# We have to temporarily mutate to 2.53 to get the hypervisor UUID.
303+
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
304+
return super(ComputeStatusFilterTest211,
305+
self)._get_provider_uuid_by_host(host)

nova/tests/unit/compute/test_host_api.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import fixtures
1818
import mock
19+
import oslo_messaging as messaging
1920
from oslo_utils.fixture import uuidsentinel as uuids
2021

2122
from nova.api.openstack.compute import services
@@ -327,18 +328,73 @@ def test_service_update_by_host_and_binary(self):
327328
service_id = 42
328329
expected_result = dict(test_service.fake_service, id=service_id)
329330

331+
@mock.patch.object(self.host_api, '_update_compute_provider_status')
330332
@mock.patch.object(self.host_api.db, 'service_get_by_host_and_binary')
331333
@mock.patch.object(self.host_api.db, 'service_update')
332-
def _do_test(mock_service_update, mock_service_get_by_host_and_binary):
334+
def _do_test(mock_service_update, mock_service_get_by_host_and_binary,
335+
mock_update_compute_provider_status):
333336
mock_service_get_by_host_and_binary.return_value = expected_result
334337
mock_service_update.return_value = expected_result
335338

336339
result = self.host_api.service_update_by_host_and_binary(
337340
self.ctxt, host_name, binary, params_to_update)
338341
self._compare_obj(result, expected_result)
342+
mock_update_compute_provider_status.assert_called_once_with(
343+
self.ctxt, test.MatchType(objects.Service))
339344

340345
_do_test()
341346

347+
@mock.patch('nova.compute.api.HostAPI._update_compute_provider_status',
348+
new_callable=mock.NonCallableMock)
349+
def test_service_update_no_update_provider_status(self, mock_ucps):
350+
"""Tests the scenario that the service is updated but the disabled
351+
field is not changed, for example the forced_down field is only
352+
updated. In this case _update_compute_provider_status should not be
353+
called.
354+
"""
355+
service = objects.Service(forced_down=True)
356+
self.assertIn('forced_down', service.obj_what_changed())
357+
with mock.patch.object(service, 'save') as mock_save:
358+
retval = self.host_api.service_update(self.ctxt, service)
359+
self.assertIs(retval, service)
360+
mock_save.assert_called_once_with()
361+
362+
@mock.patch('nova.compute.rpcapi.ComputeAPI.set_host_enabled',
363+
new_callable=mock.NonCallableMock)
364+
def test_update_compute_provider_status_service_too_old(self, mock_she):
365+
"""Tests the scenario that the service is up but is too old to sync the
366+
COMPUTE_STATUS_DISABLED trait.
367+
"""
368+
service = objects.Service(host='fake-host')
369+
service.version = compute.MIN_COMPUTE_SYNC_COMPUTE_STATUS_DISABLED - 1
370+
with mock.patch.object(
371+
self.host_api.servicegroup_api, 'service_is_up',
372+
return_value=True) as service_is_up:
373+
self.host_api._update_compute_provider_status(self.ctxt, service)
374+
service_is_up.assert_called_once_with(service)
375+
self.assertIn('Compute service on host fake-host is too old to sync '
376+
'the COMPUTE_STATUS_DISABLED trait in Placement.',
377+
self.stdlog.logger.output)
378+
379+
@mock.patch('nova.compute.rpcapi.ComputeAPI.set_host_enabled',
380+
side_effect=messaging.MessagingTimeout)
381+
def test_update_compute_provider_status_service_rpc_error(self, mock_she):
382+
"""Tests the scenario that the RPC call to the compute service raised
383+
some exception.
384+
"""
385+
service = objects.Service(host='fake-host', disabled=True)
386+
with mock.patch.object(
387+
self.host_api.servicegroup_api, 'service_is_up',
388+
return_value=True) as service_is_up:
389+
self.host_api._update_compute_provider_status(self.ctxt, service)
390+
service_is_up.assert_called_once_with(service)
391+
mock_she.assert_called_once_with(self.ctxt, 'fake-host', False)
392+
log_output = self.stdlog.logger.output
393+
self.assertIn('An error occurred while updating host enabled '
394+
'status to "disabled" for compute host: fake-host',
395+
log_output)
396+
self.assertIn('MessagingTimeout', log_output)
397+
342398
@mock.patch.object(objects.InstanceList, 'get_by_host',
343399
return_value = ['fake-responses'])
344400
def test_instance_get_all_by_host(self, mock_get):

nova/tests/unit/compute/test_rpcapi.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -468,8 +468,10 @@ def test_set_admin_password(self):
468468
version='5.0')
469469

470470
def test_set_host_enabled(self):
471+
self.flags(long_rpc_timeout=600, rpc_response_timeout=120)
471472
self._test_compute_api('set_host_enabled', 'call',
472-
enabled='enabled', host='host')
473+
enabled='enabled', host='host',
474+
call_monitor_timeout=120, timeout=600)
473475

474476
def test_get_host_uptime(self):
475477
self._test_compute_api('get_host_uptime', 'call', host='host')

0 commit comments

Comments
 (0)