Merge "Add functional regression test for bug 1837955"

Zuul · openstack-gerrit · commit 9cd0fcdf626e · 2019-07-30T02:12:22.000Z
diff --git a/nova/tests/functional/regressions/test_bug_1837955.py b/nova/tests/functional/regressions/test_bug_1837955.py
@@ -0,0 +1,114 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import time
+
+from nova import exception
+from nova.tests.functional import integrated_helpers
+from nova.tests.unit import fake_notifier
+from nova.tests.unit.image import fake as fake_image
+
+
+class BuildRescheduleClaimFailsTestCase(
+        integrated_helpers.ProviderUsageBaseTestCase):
+    """Regression test case for bug 1837955 where a server build fails on the
+    primary host and then attempting to allocate resources on the alternate
+    host, the alternate host is full and the allocations claim in placement
+    fails, resulting in the build failing due to MaxRetriesExceeded and the
+    server going to ERROR status.
+    """
+    compute_driver = 'fake.SmallFakeDriver'
+
+    def _wait_for_unversioned_notification(self, event_type):
+        for x in range(20):  # wait up to 10 seconds
+            for notification in fake_notifier.NOTIFICATIONS:
+                if notification.event_type == event_type:
+                    return notification
+            time.sleep(.5)
+        self.fail('Timed out waiting for unversioned notification %s. Got: %s'
+                  % (event_type, fake_notifier.NOTIFICATIONS))
+
+    def test_build_reschedule_alt_host_alloc_fails(self):
+        # Start two compute services so we have one alternate host.
+        # Set cpu_allocation_ratio=1.0 to make placement inventory
+        # and allocations for VCPU easier to manage.
+        self.flags(cpu_allocation_ratio=1.0)
+        for x in range(2):
+            self._start_compute('host%i' % x)
+
+        def fake_instance_claim(_self, _context, _inst, nodename, *a, **kw):
+            # Before triggering the reschedule to the other host, max out the
+            # capacity on the alternate host.
+            alt_nodename = 'host0' if nodename == 'host1' else 'host1'
+            rp_uuid = self._get_provider_uuid_by_host(alt_nodename)
+            inventories = self._get_provider_inventory(rp_uuid)
+            # Fake some other consumer taking all of the VCPU on the alt host.
+            # Since we set cpu_allocation_ratio=1.0 the total is the total
+            # capacity for VCPU on the host.
+            total_vcpu = inventories['VCPU']['total']
+            alt_consumer = '7d32d0bc-af16-44b2-8019-a24925d76152'
+            allocs = {
+                'allocations': {
+                    rp_uuid: {
+                        'resources': {
+                            'VCPU': total_vcpu
+                        }
+                    }
+                },
+                'project_id': self.api.project_id,
+                'user_id': self.api.project_id
+            }
+            resp = self.placement_api.put(
+                '/allocations/%s' % alt_consumer, allocs, version='1.12')
+            self.assertEqual(204, resp.status, resp.content)
+            raise exception.ComputeResourcesUnavailable(reason='overhead!')
+
+        # Stub out the instance claim (regardless of which host the scheduler
+        # picks as the primary) to trigger a reschedule.
+        self.stub_out('nova.compute.manager.resource_tracker.ResourceTracker.'
+                      'instance_claim', fake_instance_claim)
+
+        # Now that our stub is in place, try to create a server and wait for it
+        # to go to ERROR status.
+        server = self._build_minimal_create_server_request(
+            self.api, 'test_build_reschedule_alt_host_alloc_fails',
+            image_uuid=fake_image.get_valid_image_id(),
+            networks=[{'port': self.neutron.port_1['id']}])
+        server = self.api.post_server({'server': server})
+        # FIXME(mriedem): This is bug 1837955 where the status is stuck in
+        # BUILD rather than the vm_state being set to error and the task_state
+        # being set to None. Uncomment this when the bug is fixed.
+        # server = self._wait_for_state_change(self.api, server, 'ERROR')
+
+        # Wait for the MaxRetriesExceeded fault to be recorded.
+        # set_vm_state_and_notify sets the vm_state to ERROR before the fault
+        # is recorded but after the notification is sent. So wait for the
+        # unversioned notification to show up and then get the fault.
+        # FIXME(mriedem): Uncomment this when bug 1837955 is fixed.
+        # self._wait_for_unversioned_notification(
+        #     'compute_task.build_instances')
+        # server = self.api.get_server(server['id'])
+        # self.assertIn('fault', server)
+        # self.assertIn('Exceeded maximum number of retries',
+        #               server['fault']['message'])
+
+        # TODO(mriedem): Remove this when the bug is fixed. We need to assert
+        # something before the bug is fixed to show the failure so check the
+        # logs.
+        for x in range(20):
+            logs = self.stdlog.logger.output
+            if 'MaxRetriesExceeded' in logs:
+                break
+            time.sleep(.5)
+        else:
+            self.fail('Timed out waiting for MaxRetriesExceeded to show up '
+                      'in the logs.')