
Commit ba3fc8a

Merge branch 'stackhpc/2023.1' into logs_in_grafana
2 parents 2d6f175 + 4c7b7c9 commit ba3fc8a

122 files changed, +51929 -50713 lines


.automation.conf/tempest/tempest-ci-multinode.overrides.conf

Lines changed: 1 addition & 1 deletion
@@ -32,4 +32,4 @@ max_microversion = 3.70
 build_timeout = 600

 [dashboard]
-dashboard_url = http://192.168.39.2
+dashboard_url = https://192.168.39.2

.github/workflows/stackhpc-all-in-one.yml

Lines changed: 44 additions & 3 deletions
@@ -167,7 +167,7 @@ jobs:
   VM_NETWORK: ${{ inputs.vm_network }}
   VM_SUBNET: ${{ inputs.vm_subnet }}
   VM_INTERFACE: ${{ inputs.vm_interface }}
-  VM_VOLUME_SIZE: ${{ inputs.upgrade && '45' || '35' }}
+  VM_VOLUME_SIZE: ${{ inputs.upgrade && '50' || '40' }}
   VM_TAGS: '["skc-ci-aio", "PR=${{ github.event.number }}"]'

 - name: Terraform Plan

@@ -179,6 +179,7 @@ jobs:
   OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }}

 - name: Terraform Apply
+  id: tf_apply
   run: |
     for attempt in $(seq 5); do
       if terraform apply -auto-approve; then

@@ -355,6 +356,7 @@ jobs:
   if: inputs.upgrade

 - name: Tempest tests
+  id: tempest
   run: |
     mkdir -p tempest-artifacts
     docker run -t --rm \

@@ -366,16 +368,55 @@ jobs:
   env:
     KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }}

+- name: StackHPC OpenStack tests
+  id: stackhpc-openstack-tests
+  continue-on-error: true
+  run: |
+    mkdir -p sot-results
+    docker run -t --rm \
+      -v $(pwd):/stack/kayobe-automation-env/src/kayobe-config \
+      -v $(pwd)/sot-results:/stack/sot-results \
+      -e KAYOBE_ENVIRONMENT -e KAYOBE_VAULT_PASSWORD -e KAYOBE_AUTOMATION_SSH_PRIVATE_KEY \
+      $KAYOBE_IMAGE \
+      /stack/kayobe-automation-env/src/kayobe-config/.automation/pipeline/playbook-run.sh '$KAYOBE_CONFIG_PATH/ansible/stackhpc-openstack-tests.yml'
+  env:
+    KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }}
+
+- name: Collect diagnostic information
+  id: diagnostics
+  run: |
+    mkdir -p diagnostics
+    sudo -E docker run -t --rm \
+      -v $(pwd):/stack/kayobe-automation-env/src/kayobe-config \
+      -v $(pwd)/diagnostics:/stack/diagnostics \
+      -e KAYOBE_ENVIRONMENT -e KAYOBE_VAULT_PASSWORD -e KAYOBE_AUTOMATION_SSH_PRIVATE_KEY \
+      $KAYOBE_IMAGE \
+      /stack/kayobe-automation-env/src/kayobe-config/.automation/pipeline/playbook-run.sh '$KAYOBE_CONFIG_PATH/ansible/diagnostics.yml'
+  env:
+    KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }}
+  if: ${{ !cancelled() && steps.tf_apply.outcome == 'success' }}
+
 - name: Upload test result artifacts
   uses: actions/upload-artifact@v4
   with:
-    name: tempest-results-${{ inputs.os_distribution }}-${{ inputs.os_release }}-${{ inputs.neutron_plugin }}${{ inputs.upgrade && '-upgrade' }}
-    path: tempest-artifacts/*
+    name: test-results-${{ inputs.os_distribution }}-${{ inputs.os_release }}-${{ inputs.neutron_plugin }}${{ inputs.upgrade && '-upgrade' || '' }}
+    path: |
+      diagnostics/
+      tempest-artifacts/
+      sot-results/
+  if: ${{ !cancelled() && (steps.tempest.outcome == 'success' || steps.stackhpc-openstack-tests.outcome == 'success' || steps.diagnostics.outcome == 'success') }}

 - name: Fail if any Tempest tests failed
   run: |
     test $(wc -l < tempest-artifacts/failed-tests) -lt 1

+- name: Fail if any StackHPC OpenStack tests failed
+  run: |
+    echo "Some StackHPC OpenStack tests failed."
+    echo "See HTML results artifact (sot-results) for details."
+    exit 1
+  if: steps.stackhpc-openstack-tests.outcome == 'failure'
+
 - name: Destroy
   run: terraform destroy -auto-approve
   working-directory: ${{ github.workspace }}/terraform/aio
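
The new ``tf_apply`` and ``tempest`` step ids exist so that the diagnostics and artifact-upload steps can condition on their outcomes. The Terraform Apply retry loop is only partially visible in the hunk above; a minimal sketch of the full idiom, where the back-off and the failure handling after the last attempt are assumptions rather than lines from this diff:

    #!/usr/bin/env bash
    # Retry terraform apply up to 5 times, as in the step above.
    for attempt in $(seq 5); do
        if terraform apply -auto-approve; then
            echo "Apply succeeded on attempt $attempt"
            break
        elif [ "$attempt" -eq 5 ]; then
            # Assumed: give up and fail the step after the final attempt.
            echo "Apply failed after $attempt attempts" >&2
            exit 1
        else
            # Assumed back-off between attempts.
            sleep 30
        fi
    done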

.github/workflows/stackhpc-container-image-build.yml

Lines changed: 26 additions & 9 deletions
@@ -34,11 +34,10 @@ on:
     required: false
     default: true
   push-dirty:
-    description: Push scanned images that have vulnerabilities?
+    description: Push scanned images that have critical vulnerabilities?
     type: boolean
     required: false
-    # NOTE(Alex-Welsh): This default should be flipped once we resolve existing failures
-    default: true
+    default: false

 env:
   ANSIBLE_FORCE_COLOR: True

@@ -136,6 +135,10 @@ jobs:
   run: |
     curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sudo sh -s -- -b /usr/local/bin v0.49.0

+- name: Install yq
+  run: |
+    curl -sL https://github.com/mikefarah/yq/releases/download/v4.42.1/yq_linux_amd64.tar.gz | tar xz && sudo mv yq_linux_amd64 /usr/bin/yq
+
 - name: Install Kayobe
   run: |
     mkdir -p venvs &&

@@ -149,7 +152,7 @@ jobs:
 # Normally installed during host configure.
 - name: Install Docker Python SDK
   run: |
-    sudo pip install docker
+    sudo pip install docker 'requests<2.32.0'

 - name: Get Kolla tag
   id: write-kolla-tag

@@ -176,7 +179,7 @@ jobs:
   KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}

 - name: Create build logs output directory
-  run: mkdir image-build-logs
+  run: mkdir image-build-logs

 - name: Build kolla overcloud images
   id: build_overcloud_images

@@ -235,9 +238,16 @@ jobs:
   run: cp image-build-logs/image-scan-output/clean-images.txt image-build-logs/push-attempt-images.txt
   if: inputs.push

+# NOTE(seunghun1ee): This always appends dirty images with CVEs severity lower than critical.
+# This should be reverted when it's decided to filter high level CVEs as well.
 - name: Append dirty images to push list
   run: |
     cat image-build-logs/image-scan-output/dirty-images.txt >> image-build-logs/push-attempt-images.txt
+  if: ${{ inputs.push }}
+
+- name: Append images with critical vulnerabilities to push list
+  run: |
+    cat image-build-logs/image-scan-output/critical-images.txt >> image-build-logs/push-attempt-images.txt
   if: ${{ inputs.push && inputs.push-dirty }}

 - name: Push images

@@ -249,11 +259,11 @@ jobs:

     while read -r image; do
       # Retries!
-      for i in {1..5}; do
+      for i in {1..5}; do
         if docker push $image; then
           echo "Pushed $image"
           break
-        elif $i == 5; then
+        elif [ $i -eq 5 ] ; then
           echo "Failed to push $image"
           echo $image >> image-build-logs/push-failed-images.txt
         else

@@ -283,8 +293,15 @@ jobs:
   run: if [ $(wc -l < image-build-logs/push-failed-images.txt) -gt 0 ]; then cat image-build-logs/push-failed-images.txt && exit 1; fi
   if: ${{ !cancelled() }}

-- name: Fail when images failed scanning
-  run: if [ $(wc -l < image-build-logs/dirty-images.txt) -gt 0 ]; then cat image-build-logs/dirty-images.txt && exit 1; fi
+# NOTE(seunghun1ee): Currently we want to mark the job fail only when critical CVEs are detected.
+# This can be used again instead of "Fail when critical vulnerabilities are found" when it's
+# decided to fail the job on detecting high CVEs as well.
+# - name: Fail when images failed scanning
+#   run: if [ $(wc -l < image-build-logs/image-scan-output/dirty-images.txt) -gt 0 ]; then cat image-build-logs/image-scan-output/dirty-images.txt && exit 1; fi
+#   if: ${{ !inputs.push-dirty && !cancelled() }}
+
+- name: Fail when critical vulnerabilities are found
+  run: if [ $(wc -l < image-build-logs/image-scan-output/critical-images.txt) -gt 0 ]; then cat image-build-logs/image-scan-output/critical-images.txt && exit 1; fi
   if: ${{ !inputs.push-dirty && !cancelled() }}

 # NOTE(mgoddard): Trigger another CI workflow in the
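
A note on the push retry fix above: in the old ``elif $i == 5; then``, bash expands ``$i`` and tries to execute the result (``5 == 5``) as a command, which fails with "command not found", so the branch never fired and failed pushes were never recorded in push-failed-images.txt. A standalone sketch contrasting the broken and fixed forms:

    #!/usr/bin/env bash
    i=5

    # Broken: expands to the command `5 == 5`; bash reports
    # "5: command not found" and the condition is always false.
    if $i == 5; then
        echo "not reached"
    fi

    # Fixed: the test builtin performs an integer comparison.
    if [ "$i" -eq 5 ]; then
        echo "i is 5"
    fi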

doc/source/configuration/cephadm.rst

Lines changed: 27 additions & 8 deletions
@@ -1,9 +1,9 @@
-================
-Cephadm & Kayobe
-================
+====
+Ceph
+====

 This section describes how to use the Cephadm integration included in StackHPC
-Kayobe configuration since Xena to deploy Ceph.
+Kayobe configuration to deploy Ceph.

 The Cephadm integration takes the form of custom playbooks that wrap
 around the Ansible `stackhpc.cephadm collection

@@ -19,10 +19,10 @@ create or modify Ceph cluster deployments. Supported features are:
 Resources
 =========

-- https://docs.ceph.com/en/pacific/cephadm/index.html
-- https://docs.ceph.com/en/pacific/
 - https://docs.ceph.com/en/quincy/cephadm/index.html
 - https://docs.ceph.com/en/quincy/
+- https://docs.ceph.com/en/reef/cephadm/index.html
+- https://docs.ceph.com/en/reef/
 - https://github.com/stackhpc/ansible-collection-cephadm

 Configuration

@@ -107,7 +107,7 @@ OSD specification
 ~~~~~~~~~~~~~~~~~

 The following example is a basic OSD spec that adds OSDs for all
-available disks:
+available disks with encryption at rest:

 .. code:: yaml

@@ -118,9 +118,10 @@ available disks:
       host_pattern: "*"
     data_devices:
       all: true
+    encrypted: true

 More information about OSD service placement is available
-`here <https://docs.ceph.com/en/pacific/cephadm/services/osd/#advanced-osd-service-specifications>`__.
+`here <https://docs.ceph.com/en/quincy/cephadm/services/osd/#advanced-osd-service-specifications>`__.

 Container image
 ~~~~~~~~~~~~~~~

@@ -264,6 +265,24 @@ post-deployment configuration is applied. Commands in the
 ``cephadm_commands_post`` list are executed after the rest of the Ceph
 post-deployment configuration is applied.

+Messenger v2 encryption in transit
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Messenger v2 is the default on-wire protocol since the Nautilus release. It
+supports `encryption of data in transit
+<https://docs.ceph.com/en/quincy/rados/configuration/msgr2/#connection-mode-configuration-options>`_,
+but this is not used by default. It may be enabled as follows:
+
+.. code:: yaml
+
+   # A list of commands to pass to cephadm shell -- ceph. See stackhpc.cephadm.commands
+   # for format.
+   cephadm_commands_pre:
+     # Enable messenger v2 encryption in transit.
+     - "config set global ms_cluster_mode secure"
+     - "config set global ms_service_mode secure"
+     - "config set global ms_client_mode secure"
+
 Manila & CephFS
 ~~~~~~~~~~~~~~~
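
Not part of the diff above, but once those three options are set, the applied modes can be read back through the cephadm shell; a minimal check, assuming a deployed cluster and an admin keyring on the host:

    # Read back the messenger v2 connection modes set above.
    for opt in ms_cluster_mode ms_service_mode ms_client_mode; do
        printf '%s: ' "$opt"
        sudo cephadm shell -- ceph config get global "$opt"
    done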

Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@

==========
CloudKitty
==========

Configuring in kayobe-config
============================

By default, CloudKitty uses Gnocchi and Ceilometer as the collector and fetcher
backends. Unless the system has a specific reason not to, we recommend instead
using Prometheus as the backend for both. The following instructions explain
how to do this. Also, see the `Kolla Ansible docs on CloudKitty
<https://docs.openstack.org/kolla-ansible/latest/reference/rating/cloudkitty-guide.html>`__
for more details.

Enable CloudKitty and disable InfluxDB, as we are using OpenSearch as the
storage backend. Set the following in ``kolla.yml``:

.. code-block:: yaml

   kolla_enable_cloudkitty: true
   # Explicitly disable influxdb as we are using OpenSearch as the CloudKitty backend
   kolla_enable_influxdb: false

Set Prometheus as the backend for both the collector and fetcher, and
Elasticsearch as the storage backend. Note that our fork of CloudKitty is
patched so that the CloudKitty Elasticsearch V2 storage backend will also work
with an OpenSearch cluster. Proper support for the V2 OpenSearch storage
backend is still pending in Kolla-Ansible `here
<https://review.opendev.org/c/openstack/kolla-ansible/+/898555>`__. Set the
following in ``kolla/globals.yml``:

.. code-block:: yaml

   cloudkitty_collector_backend: prometheus
   cloudkitty_fetcher_backend: prometheus
   cloudkitty_storage_backend: elasticsearch

If you have TLS enabled, you will also need to set the cafile for Prometheus
and Elasticsearch. Set the following in ``kolla/globals.yml``:

.. code-block::

   {% raw %}
   cloudkitty_prometheus_cafile: "{{ openstack_cacert }}"
   cloudkitty_elasticsearch_cafile: "{{ openstack_cacert }}"
   {% endraw %}

The default collection period is one hour, which is likely too long for most
systems, as CloudKitty charges for the **entire** collection period if any
usage is seen within that timeframe: even one minute of usage is charged as a
full hour. As a result, it is recommended to adjust the collection interval,
``period`` (in units of seconds), appropriately (e.g. ten minutes).
Furthermore, when using Prometheus as the collector, you need to change the
``scope_key`` to match the labels provided by the Prometheus OpenStack
Exporter. Both of these can be achieved by setting the following in
``kolla/config/cloudkitty.conf``:

.. code-block:: console

   [collect]
   scope_key = tenant_id
   period = 600
You will need to configure which metrics CloudKitty should track. The
following example, set in ``kolla/config/cloudkitty/metrics.yml``, will track
VM flavors and the total utilised volume:

.. code-block:: yaml

   metrics:
     openstack_nova_server_status:
       alt_name: instance
       groupby:
         - uuid
         - user_id
         - tenant_id
       metadata:
         - flavor_id
         - name
       mutate: MAP
       mutate_map:
         0.0: 1.0 # ACTIVE
         11.0: 1.0 # SHUTOFF
         12.0: 1.0 # SUSPENDED
         16.0: 1.0 # PAUSED
       unit: instance
     openstack_cinder_limits_volume_used_gb:
       alt_name: storage
       unit: GiB
       groupby:
         - tenant_id
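
Before settling on a ``metrics.yml``, it is worth confirming that the exporter actually emits these metric names with a ``tenant_id`` label (the ``scope_key`` configured earlier). A sketch against the Prometheus HTTP API; the hostname is an assumption for illustration, and 9091 is Kolla Ansible's default Prometheus port:

    # Check the metrics referenced in metrics.yml and their labels.
    prometheus=http://prometheus.example:9091   # assumed endpoint
    for metric in openstack_nova_server_status openstack_cinder_limits_volume_used_gb; do
        curl -sG "$prometheus/api/v1/query" --data-urlencode "query=$metric" | head -c 400
        echo
    done
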
If your system had Monasca deployed in the past, you likely have some
relabelled attributes in the Prometheus OpenStack exporter. To account for
this, you should either remove the custom relabelling (in
``kolla/config/prometheus.yml``) or change your ``metrics.yml`` to use the
correct attributes.

Post-configuration with openstack-config
========================================

This is an example `openstack-config
<https://github.com/stackhpc/openstack-config>`__ setup to create mappings for
the metrics configured above. Note that the costs are scaled for the ten
minute collection period, e.g. a flavor with 1 VCPU will cost 1 unit per hour.

.. code-block:: yaml

   # Map flavors based on VCPUs
   openstack_ratings_hashmap_field_mappings:
     - service: instance
       name: flavor_id
       mappings:
         - value: '1' # tiny compute flavor (1 vcpu) with an OpenStack flavor ID of 1
           cost: 0.1666666666666666
           type: flat
         - value: '2' # small compute flavor (2 vcpus) with an OpenStack flavor ID of 2
           cost: 0.3333333333333333
           type: flat
         - value: '3' # medium compute flavor (3 vcpus) with an OpenStack flavor ID of 3
           cost: 0.5
           type: flat
         - value: '4' # large compute flavor (4 vcpus) with an OpenStack flavor ID of 4
           cost: 0.6666666666666666
           type: flat
         - value: '5' # xlarge compute flavor (8 vcpus) with an OpenStack flavor ID of 5
           cost: 1.3333333333333333
           type: flat
         - value: '6' # tiny 2 compute flavor (2 vcpus) with an OpenStack flavor ID of 6
           cost: 0.3333333333333333
           type: flat

   # Map volumes based on GB
   openstack_ratings_hashmap_service_mappings:
     - service: storage
       cost: 0.16666666666666666
       type: flat

See the `OpenStack CloudKitty Ratings role
<https://github.com/stackhpc/ansible-collection-openstack/tree/main/roles/os_ratings>`__
for more details.
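
The fractional costs above come straight from the ten minute period: CloudKitty applies the rate once per collection period, and there are six periods per hour, so a desired hourly rate is divided by six (1 VCPU at 1 unit per hour becomes roughly 0.1667 per period). A quick check of the arithmetic:

    # Per-period cost for a target hourly rate with period = 600 s.
    hourly_rate=1   # e.g. 1 unit per VCPU-hour
    period=600      # seconds, matching cloudkitty.conf above
    awk -v r="$hourly_rate" -v p="$period" 'BEGIN { printf "%.16f\n", r * p / 3600 }'
    # Prints 0.1666666666666667, matching the tiny (1 vcpu) flavor mapping.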
