Skip to content

Commit 5cc39fc

Browse files
committed
Add functional regression test for bug 1837955
This adds a functional regression recreate test for bug 1837955 which was introduced with change Iae904afb6cb4fcea8bb27741d774ffbe986a5fb4 in the Queens release. In this scenario, the primary (and potentially alternate) hosts for a server build fail and reschedule to conductor. Eventually all alternate hosts are exhausted and specifically trying to claim allocations against the alternates fails, probably because between the time of initial scheduling and rescheduling something else took up the spare capacity on the alternate host. When this happens, MaxRetriesExceeded is raised but the instance is stuck in BUILD status rather than set to ERROR status with a fault message. Change-Id: I4ca64dd60d883356880680fb1f04cee4136c2e00 Related-Bug: #1837955
1 parent f1426d1 commit 5cc39fc

File tree

1 file changed

+114
-0
lines changed

1 file changed

+114
-0
lines changed
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
2+
# not use this file except in compliance with the License. You may obtain
3+
# a copy of the License at
4+
#
5+
# http://www.apache.org/licenses/LICENSE-2.0
6+
#
7+
# Unless required by applicable law or agreed to in writing, software
8+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
9+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
10+
# License for the specific language governing permissions and limitations
11+
# under the License.
12+
13+
import time
14+
15+
from nova import exception
16+
from nova.tests.functional import integrated_helpers
17+
from nova.tests.unit import fake_notifier
18+
from nova.tests.unit.image import fake as fake_image
19+
20+
21+
class BuildRescheduleClaimFailsTestCase(
        integrated_helpers.ProviderUsageBaseTestCase):
    """Functional recreate test for bug 1837955.

    Scenario: the server build fails on the primary host and is rescheduled,
    but by then the alternate host has no spare capacity, so claiming the
    allocations in placement against the alternate fails and the build ends
    with MaxRetriesExceeded. The desired outcome is a server in ERROR status
    with a fault recorded; while the bug is open the server is left in BUILD
    status instead, which is what this test currently demonstrates.
    """
    compute_driver = 'fake.SmallFakeDriver'

    def _wait_for_unversioned_notification(self, event_type):
        """Poll the fake notifier for a notification of the given type.

        :param event_type: value to match against the ``event_type`` of each
            entry in ``fake_notifier.NOTIFICATIONS``
        :returns: the first matching notification

        Fails the test if nothing matches after 20 polls taken 0.5 seconds
        apart.
        """
        for _ in range(20):  # 20 polls * 0.5 seconds = up to 10 seconds
            found = next(
                (candidate for candidate in fake_notifier.NOTIFICATIONS
                 if candidate.event_type == event_type),
                None)
            if found is not None:
                return found
            time.sleep(.5)
        self.fail('Timed out waiting for unversioned notification %s. Got: %s'
                  % (event_type, fake_notifier.NOTIFICATIONS))

    def test_build_reschedule_alt_host_alloc_fails(self):
        # Two compute services give us a primary host plus exactly one
        # alternate. With cpu_allocation_ratio=1.0 the VCPU inventory total
        # is also the usable capacity, which keeps the placement math simple.
        self.flags(cpu_allocation_ratio=1.0)
        for index in range(2):
            self._start_compute('host%i' % index)

        def fake_instance_claim(_self, _context, _inst, nodename, *a, **kw):
            # The "other" host is the alternate the reschedule will try, so
            # fill it up before failing the claim on this one.
            if nodename == 'host1':
                alt_nodename = 'host0'
            else:
                alt_nodename = 'host1'
            rp_uuid = self._get_provider_uuid_by_host(alt_nodename)
            alt_inventory = self._get_provider_inventory(rp_uuid)
            # Pretend an unrelated consumer already holds every VCPU on the
            # alternate host. Because cpu_allocation_ratio is 1.0, the
            # inventory total is all the VCPU capacity there is.
            vcpu_total = alt_inventory['VCPU']['total']
            alt_consumer = '7d32d0bc-af16-44b2-8019-a24925d76152'
            vcpu_claim = {'resources': {'VCPU': vcpu_total}}
            body = {
                'allocations': {rp_uuid: vcpu_claim},
                'project_id': self.api.project_id,
                'user_id': self.api.project_id,
            }
            url = '/allocations/%s' % alt_consumer
            resp = self.placement_api.put(url, body, version='1.12')
            self.assertEqual(204, resp.status, resp.content)
            # Failing the claim is what sends the build back to conductor.
            raise exception.ComputeResourcesUnavailable(reason='overhead!')

        # Replace the instance claim on the resource tracker so that the
        # build is rescheduled no matter which host the scheduler chose as
        # the primary.
        self.stub_out(
            'nova.compute.manager.resource_tracker.ResourceTracker.'
            'instance_claim', fake_instance_claim)

        # With the stub installed, boot a server; it should eventually end
        # up in ERROR status.
        request = self._build_minimal_create_server_request(
            self.api, 'test_build_reschedule_alt_host_alloc_fails',
            image_uuid=fake_image.get_valid_image_id(),
            networks=[{'port': self.neutron.port_1['id']}])
        server = self.api.post_server({'server': request})
        # FIXME(mriedem): This is bug 1837955 where the status is stuck in
        # BUILD rather than the vm_state being set to error and the task_state
        # being set to None. Uncomment this when the bug is fixed.
        # server = self._wait_for_state_change(self.api, server, 'ERROR')

        # Wait for the MaxRetriesExceeded fault to be recorded.
        # set_vm_state_and_notify sets the vm_state to ERROR before the fault
        # is recorded but after the notification is sent. So wait for the
        # unversioned notification to show up and then get the fault.
        # FIXME(mriedem): Uncomment this when bug 1837955 is fixed.
        # self._wait_for_unversioned_notification(
        #     'compute_task.build_instances')
        # server = self.api.get_server(server['id'])
        # self.assertIn('fault', server)
        # self.assertIn('Exceeded maximum number of retries',
        #               server['fault']['message'])

        # TODO(mriedem): Remove this when the bug is fixed. Until then the
        # only observable evidence of the failure is in the logs, so poll
        # them (20 polls, 0.5 seconds apart) to have something to assert on.
        for _ in range(20):
            if 'MaxRetriesExceeded' in self.stdlog.logger.output:
                break
            time.sleep(.5)
        else:
            self.fail('Timed out waiting for MaxRetriesExceeded to show up '
                      'in the logs.')

0 commit comments

Comments
 (0)