Skip to content

Commit f77a9fe

Browse files
committed
libvirt: remove default cputune shares value
Previously, the libvirt driver defaulted to 1024 * (# of CPUs) for the value of domain/cputune/shares in the libvirt XML. This value is then passed directly by libvirt to the cgroups API. Cgroups v2 imposes a maximum value of 10000 that can be passed in. This makes Nova unable to launch instances with more than 9 CPUs on hosts that run cgroups v2, like Ubuntu Jammy or RHEL 9. Fix this by just removing the default entirely. Because there is no longer a guarantee that domain/cputune will contain at least a shares element, we can stop always generating the former, and only generate it if it will actually contain something. We can also make operators's lives easier by leveraging the fact that we update the XML during live migration, so this patch also adds a method to remove the shares value from the live migration XML if one was not set as the quota:cpu_shares flavor extra spec. For operators that *have* set this extra spec to something greater than 10000, their flavors will have to get updates, and their instances resized. Partial-bug: 1978489 Change-Id: I49d757f5f261b3562ada27e6cf57284f615ca395
1 parent d869163 commit f77a9fe

File tree

6 files changed

+80
-39
lines changed

6 files changed

+80
-39
lines changed

doc/source/admin/resource-limits.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ CPU limits
3838
Libvirt enforces CPU limits in terms of *shares* and *quotas*, configured
3939
via :nova:extra-spec:`quota:cpu_shares` and :nova:extra-spec:`quota:cpu_period`
4040
/ :nova:extra-spec:`quota:cpu_quota`, respectively. Both are implemented using
41-
the `cgroups v1 cpu controller`__.
41+
the `cgroups cpu controller`__. Note that allowed values for *shares* are
42+
platform dependant.
4243

4344
CPU shares are a proportional weighted share of total CPU resources relative to
4445
other instances. It does not limit CPU usage if CPUs are not busy. There is no

nova/tests/unit/virt/libvirt/test_driver.py

Lines changed: 9 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2991,7 +2991,7 @@ def test_get_guest_config_numa_host_instance_fits(self, is_able):
29912991
cfg = drvr._get_guest_config(instance_ref, [],
29922992
image_meta, disk_info)
29932993
self.assertIsNone(cfg.cpuset)
2994-
self.assertEqual(0, len(cfg.cputune.vcpupin))
2994+
self.assertIsNone(cfg.cputune)
29952995
self.assertIsNone(cfg.cpu.numa)
29962996

29972997
@mock.patch('nova.privsep.utils.supports_direct_io',
@@ -3028,7 +3028,7 @@ def test_get_guest_config_numa_host_instance_no_fit(self, is_able):
30283028
image_meta, disk_info)
30293029
self.assertFalse(choice_mock.called)
30303030
self.assertIsNone(cfg.cpuset)
3031-
self.assertEqual(0, len(cfg.cputune.vcpupin))
3031+
self.assertIsNone(cfg.cputune)
30323032
self.assertIsNone(cfg.cpu.numa)
30333033

30343034
def _test_get_guest_memory_backing_config(
@@ -3436,7 +3436,7 @@ def test_get_guest_config_numa_host_instance_pci_no_numa_info(
34363436
cfg = conn._get_guest_config(instance_ref, [],
34373437
image_meta, disk_info)
34383438
self.assertEqual(set([3]), cfg.cpuset)
3439-
self.assertEqual(0, len(cfg.cputune.vcpupin))
3439+
self.assertIsNone(cfg.cputune)
34403440
self.assertIsNone(cfg.cpu.numa)
34413441

34423442
@mock.patch('nova.privsep.utils.supports_direct_io',
@@ -3490,7 +3490,7 @@ def test_get_guest_config_numa_host_instance_2pci_no_fit(self, is_able):
34903490
image_meta, disk_info)
34913491
self.assertFalse(choice_mock.called)
34923492
self.assertEqual(set([3]), cfg.cpuset)
3493-
self.assertEqual(0, len(cfg.cputune.vcpupin))
3493+
self.assertIsNone(cfg.cputune)
34943494
self.assertIsNone(cfg.cpu.numa)
34953495

34963496
@mock.patch.object(fakelibvirt.Connection, 'getType')
@@ -3589,7 +3589,7 @@ def test_get_guest_config_numa_host_instance_fit_w_cpu_pinset(
35893589
# NOTE(ndipanov): we make sure that pin_set was taken into account
35903590
# when choosing viable cells
35913591
self.assertEqual(set([2, 3]), cfg.cpuset)
3592-
self.assertEqual(0, len(cfg.cputune.vcpupin))
3592+
self.assertIsNone(cfg.cputune)
35933593
self.assertIsNone(cfg.cpu.numa)
35943594

35953595
@mock.patch.object(
@@ -3631,7 +3631,7 @@ def test_get_guest_config_non_numa_host_instance_topo(self, is_able):
36313631
cfg = drvr._get_guest_config(instance_ref, [],
36323632
image_meta, disk_info)
36333633
self.assertIsNone(cfg.cpuset)
3634-
self.assertEqual(0, len(cfg.cputune.vcpupin))
3634+
self.assertIsNone(cfg.cputune)
36353635
self.assertIsNone(cfg.numatune)
36363636
self.assertIsNotNone(cfg.cpu.numa)
36373637
for instance_cell, numa_cfg_cell in zip(
@@ -7038,26 +7038,6 @@ def test_get_guest_config_with_rng_dev_not_present(self, mock_path):
70387038
[],
70397039
image_meta, disk_info)
70407040

7041-
@mock.patch.object(
7042-
host.Host, "is_cpu_control_policy_capable", return_value=True)
7043-
def test_guest_cpu_shares_with_multi_vcpu(self, is_able):
7044-
self.flags(virt_type='kvm', group='libvirt')
7045-
7046-
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
7047-
7048-
instance_ref = objects.Instance(**self.test_instance)
7049-
instance_ref.flavor.vcpus = 4
7050-
image_meta = objects.ImageMeta.from_dict(self.test_image_meta)
7051-
7052-
disk_info = blockinfo.get_disk_info(CONF.libvirt.virt_type,
7053-
instance_ref,
7054-
image_meta)
7055-
7056-
cfg = drvr._get_guest_config(instance_ref, [],
7057-
image_meta, disk_info)
7058-
7059-
self.assertEqual(4096, cfg.cputune.shares)
7060-
70617041
@mock.patch.object(
70627042
host.Host, "is_cpu_control_policy_capable", return_value=True)
70637043
def test_get_guest_config_with_cpu_quota(self, is_able):
@@ -11689,7 +11669,7 @@ def test_live_migration_update_graphics_xml(self, mock_xml,
1168911669
mock_migrateToURI3,
1169011670
mock_min_version):
1169111671
self.compute = manager.ComputeManager()
11692-
instance_ref = self.test_instance
11672+
instance_ref = objects.Instance(**self.test_instance)
1169311673
target_connection = '127.0.0.2'
1169411674

1169511675
xml_tmpl = ("<domain type='kvm'>"
@@ -12369,7 +12349,7 @@ def test_live_migration_update_serial_console_xml(self, mock_xml,
1236912349
mock_get,
1237012350
mock_min_version):
1237112351
self.compute = manager.ComputeManager()
12372-
instance_ref = self.test_instance
12352+
instance_ref = objects.Instance(**self.test_instance)
1237312353
target_connection = '127.0.0.2'
1237412354

1237512355
xml_tmpl = ("<domain type='kvm'>"
@@ -12659,7 +12639,7 @@ def test_live_migration_raises_exception(self, mock_xml,
1265912639
mock_min_version):
1266012640
# Prepare data
1266112641
self.compute = manager.ComputeManager()
12662-
instance_ref = self.test_instance
12642+
instance_ref = objects.Instance(**self.test_instance)
1266312643
target_connection = '127.0.0.2'
1266412644

1266512645
disk_paths = ['vda', 'vdb']

nova/tests/unit/virt/libvirt/test_migration.py

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from nova import test
2929
from nova.tests import fixtures as nova_fixtures
3030
from nova.tests.fixtures import libvirt as fakelibvirt
31+
from nova.tests.unit.virt.libvirt import test_driver
3132
from nova.virt.libvirt import config as vconfig
3233
from nova.virt.libvirt import guest as libvirt_guest
3334
from nova.virt.libvirt import host
@@ -80,16 +81,51 @@ def test_get_updated_guest_xml(
8081
get_volume_config = mock.MagicMock()
8182
mock_guest.get_xml_desc.return_value = '<domain></domain>'
8283

83-
migration.get_updated_guest_xml(
84-
mock.sentinel.instance, mock_guest, data, get_volume_config)
84+
instance = objects.Instance(**test_driver._create_test_instance())
85+
migration.get_updated_guest_xml(instance, mock_guest, data,
86+
get_volume_config)
8587
mock_graphics.assert_called_once_with(mock.ANY, data)
8688
mock_serial.assert_called_once_with(mock.ANY, data)
8789
mock_volume.assert_called_once_with(
88-
mock.ANY, data, mock.sentinel.instance, get_volume_config)
90+
mock.ANY, data, instance, get_volume_config)
8991
mock_perf_events_xml.assert_called_once_with(mock.ANY, data)
9092
mock_memory_backing.assert_called_once_with(mock.ANY, data)
9193
self.assertEqual(1, mock_tostring.called)
9294

95+
def test_update_quota_xml(self):
96+
old_xml = """<domain>
97+
<name>fake-instance</name>
98+
<cputune>
99+
<shares>42</shares>
100+
<period>1337</period>
101+
</cputune>
102+
</domain>"""
103+
instance = objects.Instance(**test_driver._create_test_instance())
104+
new_xml = migration._update_quota_xml(instance,
105+
etree.fromstring(old_xml))
106+
new_xml = etree.tostring(new_xml, encoding='unicode')
107+
self.assertXmlEqual(
108+
"""<domain>
109+
<name>fake-instance</name>
110+
<cputune>
111+
<period>1337</period>
112+
</cputune>
113+
</domain>""", new_xml)
114+
115+
def test_update_quota_xml_empty_cputune(self):
116+
old_xml = """<domain>
117+
<name>fake-instance</name>
118+
<cputune>
119+
<shares>42</shares>
120+
</cputune>
121+
</domain>"""
122+
instance = objects.Instance(**test_driver._create_test_instance())
123+
new_xml = migration._update_quota_xml(instance,
124+
etree.fromstring(old_xml))
125+
new_xml = etree.tostring(new_xml, encoding='unicode')
126+
self.assertXmlEqual('<domain><name>fake-instance</name></domain>',
127+
new_xml)
128+
93129
def test_update_device_resources_xml_vpmem(self):
94130
# original xml for vpmems, /dev/dax0.1 and /dev/dax0.2 here
95131
# are vpmem device path on source host

nova/virt/libvirt/driver.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5692,15 +5692,11 @@ def _update_guest_cputune(self, guest, flavor):
56925692
if not is_able or CONF.libvirt.virt_type not in ('lxc', 'kvm', 'qemu'):
56935693
return
56945694

5695-
if guest.cputune is None:
5696-
guest.cputune = vconfig.LibvirtConfigGuestCPUTune()
5697-
# Setting the default cpu.shares value to be a value
5698-
# dependent on the number of vcpus
5699-
guest.cputune.shares = 1024 * guest.vcpus
5700-
57015695
for name in cputuning:
57025696
key = "quota:cpu_" + name
57035697
if key in flavor.extra_specs:
5698+
if guest.cputune is None:
5699+
guest.cputune = vconfig.LibvirtConfigGuestCPUTune()
57045700
setattr(guest.cputune, name,
57055701
int(flavor.extra_specs[key]))
57065702

nova/virt/libvirt/migration.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ def get_updated_guest_xml(instance, guest, migrate_data, get_volume_config,
6262
xml_doc, migrate_data, instance, get_volume_config)
6363
xml_doc = _update_perf_events_xml(xml_doc, migrate_data)
6464
xml_doc = _update_memory_backing_xml(xml_doc, migrate_data)
65+
xml_doc = _update_quota_xml(instance, xml_doc)
6566
if get_vif_config is not None:
6667
xml_doc = _update_vif_xml(xml_doc, migrate_data, get_vif_config)
6768
if 'dst_numa_info' in migrate_data:
@@ -71,6 +72,18 @@ def get_updated_guest_xml(instance, guest, migrate_data, get_volume_config,
7172
return etree.tostring(xml_doc, encoding='unicode')
7273

7374

75+
def _update_quota_xml(instance, xml_doc):
76+
flavor_shares = instance.flavor.extra_specs.get('quota:cpu_shares')
77+
cputune = xml_doc.find('./cputune')
78+
shares = xml_doc.find('./cputune/shares')
79+
if shares is not None and not flavor_shares:
80+
cputune.remove(shares)
81+
# Remove the cputune element entirely if it has no children left.
82+
if cputune is not None and not list(cputune):
83+
xml_doc.remove(cputune)
84+
return xml_doc
85+
86+
7487
def _update_device_resources_xml(xml_doc, new_resources):
7588
vpmems = []
7689
for resource in new_resources:
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
upgrade:
2+
- |
3+
In the libvirt driver, the default value of the ``<cputune><shares>``
4+
element has been removed, and is now left to libvirt to decide. This is
5+
because allowed values are platform dependant, and the previous code was
6+
not guaranteed to be supported on all platforms. If any of your flavors are
7+
using the quota:cpu_shares extra spec, you may need to resize to a
8+
supported value before upgrading.
9+
10+
To facilitate the transition to no Nova default for ``<cputune><shares>``,
11+
its value will be removed during live migration unless a value is set in
12+
the ``quota:cpu_shares`` extra spec. This can cause temporary CPU
13+
starvation for the live migrated instance if other instances on the
14+
destination host still have the old default ``<cputune><shares>`` value. To
15+
fix this, hard reboot, cold migrate, or live migrate the other instances.

0 commit comments

Comments
 (0)