Skip to content

Commit ac3a31c

Browse files
wtripp180901 and sjpb authored
Use bootstrap tokens provisioned by ansible for K3s instead of persistent tokens in cloud-init metadata (#589)
* now uses bootstrap tokens instead of cloud-init metadata
* bumped token timeout
* fixed assuming default route
* bump image
* refactored to separate agent and server runtimes + fixes + review comments
* removed inventory_secrets from ci
* image bump
* testing moving agent runtime after nfs
* changed k3s conditionals to host patterns + server tweaks
* comment suggestions

Co-authored-by: Steve Brasier <[email protected]>

* k3s agent no longer uses task vars
* added comment for moving k3s
* Explicitly settings empty string default for bootstrap token

Co-authored-by: Steve Brasier <[email protected]>

* secure pems for agent environment file

---------

Co-authored-by: Steve Brasier <[email protected]>
1 parent 30d6ce4 commit ac3a31c

File tree

24 files changed

+121
-91
lines changed

24 files changed

+121
-91
lines changed

ansible/bootstrap.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -313,10 +313,11 @@
313313
- include_role:
314314
name: azimuth_cloud.image_utils.linux_ansible_init
315315

316-
- hosts: k3s
316+
- hosts: k3s:&builder
317317
become: yes
318318
tags: k3s
319319
tasks:
320-
- ansible.builtin.include_role:
320+
- name: Install k3s
321+
ansible.builtin.include_role:
321322
name: k3s
322323
tasks_from: install.yml

ansible/extras.yml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,23 @@
1+
- hosts: k3s_server:!builder
2+
become: yes
3+
tags: k3s
4+
tasks:
5+
- name: Start k3s server
6+
ansible.builtin.include_role:
7+
name: k3s
8+
tasks_from: server-runtime.yml
9+
10+
# technically should be part of bootstrap.yml but hangs waiting on failed mounts
11+
# if runs before filesystems.yml after the control node has been reimaged
12+
- hosts: k3s_agent:!builder
13+
become: yes
14+
tags: k3s
15+
tasks:
16+
- name: Start k3s agents
17+
ansible.builtin.include_role:
18+
name: k3s
19+
tasks_from: agent-runtime.yml
20+
121
- hosts: basic_users:!builder
222
become: yes
323
tags:

ansible/roles/k3s/defaults/main.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,6 @@ k3s_version: "v1.31.0+k3s1"
33
k3s_selinux_release: v1.6.latest.1
44
k3s_selinux_rpm_version: 1.6-1
55
k3s_helm_version: v3.11.0
6+
k3s_bootstrap_token: '' # matches common environment default
7+
k3s_bootstrap_token_expiry: 10m
8+
k3s_server_name: "{{ None }}" # ansible managed

ansible/roles/k3s/files/start_k3s.yml

Lines changed: 0 additions & 44 deletions
This file was deleted.
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
---
2+
3+
- name: Template k3s agent env file
4+
when: k3s_bootstrap_token != ''
5+
ansible.builtin.template:
6+
dest: /etc/systemd/system/k3s-agent.service.env
7+
src: k3s-agent.service.env.j2
8+
owner: root
9+
group: root
10+
mode: 0640
11+
register: _k3s_agent_token_result
12+
13+
- name: Ensure password directory exists
14+
ansible.builtin.file:
15+
path: "/etc/rancher/node"
16+
state: directory
17+
owner: root
18+
group: root
19+
mode: 0640
20+
21+
- name: Write node password
22+
ansible.builtin.copy:
23+
dest: /etc/rancher/node/password
24+
content: "{{ vault_k3s_node_password }}"
25+
owner: root
26+
group: root
27+
mode: 0640 # normal k3s install is 644 but that doesn't feel right
28+
29+
- name: Start/restart k3s agent
30+
when: _k3s_agent_token_result.changed
31+
ansible.builtin.systemd:
32+
name: k3s-agent
33+
daemon_reload: true
34+
state: restarted
35+
enabled: true

ansible/roles/k3s/tasks/install.yml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,3 @@
7171
ansible.builtin.lineinfile:
7272
path: /etc/environment
7373
line: "KUBECONFIG=/etc/rancher/k3s/k3s.yaml"
74-
75-
- name: Install ansible-init playbook for k3s agent or server activation
76-
copy:
77-
src: start_k3s.yml
78-
dest: /etc/ansible-init/playbooks/0-start-k3s.yml
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
---
2+
3+
- name: Template k3s env file
4+
ansible.builtin.template:
5+
dest: /etc/systemd/system/k3s.service.env
6+
src: k3s.service.env.j2
7+
register: _k3s_env_file_status
8+
9+
- name: Start k3s server
10+
ansible.builtin.systemd:
11+
name: k3s
12+
daemon_reload: "{{ _k3s_env_file_status.changed }}"
13+
state: started
14+
enabled: true
15+
16+
# Possible race here as there is a delay between agents disconnecting and being registered as down, probably won't be hit in general use though
17+
- name: Check which k3s agents are connected
18+
ansible.builtin.shell:
19+
cmd: kubectl get nodes --no-headers | grep -w Ready
20+
register: _k3s_connected_nodes
21+
retries: 6 # task may fail if server is not ready yet
22+
delay: 10
23+
until: not _k3s_connected_nodes.failed
24+
25+
- name: Generate new bootstrap token if not all agents are connected
26+
no_log: true
27+
when: _k3s_connected_nodes.stdout_lines | length != groups['k3s'] | length
28+
shell:
29+
cmd: "k3s token create --ttl {{ k3s_bootstrap_token_expiry }}"
30+
register: _k3s_token_output
31+
32+
- name: Set bootstrap token as fact
33+
set_fact:
34+
k3s_bootstrap_token: "{{ _k3s_token_output.stdout }}"
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
K3S_NODE_IP={{ ansible_host }}
2+
K3S_TOKEN={{ k3s_bootstrap_token }}
3+
K3S_URL=https://{{ k3s_server_name }}:6443
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
K3S_NODE_IP={{ ansible_host }}

ansible/roles/passwords/defaults/main.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ slurm_appliance_secrets:
88
vault_openhpc_mungekey: "{{ secrets_openhpc_mungekey | default(vault_openhpc_mungekey | default(secrets_openhpc_mungekey_default)) }}"
99
vault_freeipa_ds_password: "{{ vault_freeipa_ds_password | default(lookup('password', '/dev/null')) }}"
1010
vault_freeipa_admin_password: "{{ vault_freeipa_admin_password | default(lookup('password', '/dev/null')) }}"
11-
vault_k3s_token: "{{ vault_k3s_token | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}"
11+
vault_k3s_node_password: "{{ vault_k3s_node_password | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}"
1212
vault_pulp_admin_password: "{{ vault_pulp_admin_password | default(lookup('password', '/dev/null', chars=['ascii_letters', 'digits'])) }}"
1313
vault_demo_user_password: "{{ vault_demo_user_password | default(lookup('password', '/dev/null')) }}"
1414

ansible/roles/passwords/templates/k3s-token.auto.tfvars.json.j2

Lines changed: 0 additions & 3 deletions
This file was deleted.
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"cluster_image": {
3-
"RL8": "openhpc-RL8-250211-1540-a0b4a57e",
4-
"RL9": "openhpc-RL9-250211-1540-a0b4a57e"
3+
"RL8": "openhpc-RL8-250221-0904-e4ff694e",
4+
"RL9": "openhpc-RL9-250221-0904-e4ff694e"
55
}
66
}

environments/.stackhpc/tofu/main.tf

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,6 @@ module "cluster" {
6666
key_pair = "slurm-app-ci"
6767
cluster_image_id = data.openstack_images_image_v2.cluster.id
6868
control_node_flavor = var.control_node_flavor
69-
# have to override default, as unusually the actual module path and secrets
70-
# are not in the same environment for stackhpc
71-
inventory_secrets_path = "${path.module}/../inventory/group_vars/all/secrets.yml"
7269

7370
login = {
7471
login: {

environments/common/inventory/group_vars/all/defaults.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ opensearch_address: "127.0.0.1"
2121
prometheus_address: "{{ hostvars[groups['prometheus'].0].api_address }}"
2222
openondemand_address: "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}"
2323
grafana_address: "{{ hostvars[groups['grafana'].0].api_address }}"
24+
k3s_server_name: "{{ hostvars[groups['k3s_server'] | first].ansible_host }}"
2425

2526
############################# bootstrap: local user configuration #########################
2627

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
k3s_bootstrap_token: "{{ hostvars[groups['k3s_server'] | first].k3s_bootstrap_token | default('') }}"

environments/common/inventory/groups

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,16 @@ freeipa_client
145145
[compute_init]
146146
# EXPERIMENTAL: Compute hosts to enable joining cluster on boot on
147147

148-
[k3s]
148+
[k3s:children]
149149
# Hosts to run k3s server/agent
150+
k3s_server
151+
k3s_agent
152+
153+
[k3s_server]
154+
# Hosts to run k3s server (should only be single node i.e control node)
155+
156+
[k3s_agent]
157+
# Hosts to run k3s agent
150158

151159
[k9s]
152160
# Hosts to install k9s on

environments/common/layouts/everything

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,14 @@ cluster
9696
[compute_init]
9797
# EXPERIMENTAL: Compute hosts to enable joining cluster on boot on
9898

99-
[k3s:children]
100-
# Hosts to run k3s server/agent
101-
openhpc
99+
[k3s_server:children]
100+
# Hosts to run k3s server (should only be single node i.e control node)
101+
control
102+
103+
[k3s_agent:children]
104+
# Hosts to run k3s agent
105+
compute
106+
login
102107

103108
[k9s:children]
104109
# Hosts to install k9s on

environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ module "compute" {
2929
availability_zone = lookup(each.value, "availability_zone", "nova")
3030

3131
# computed
32-
k3s_token = local.k3s_token
3332
# not using openstack_compute_instance_v2.control.access_ip_v4 to avoid
3433
# updates to node metadata on deletion/recreation of the control node:
3534
control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0]

environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@ resource "openstack_compute_instance_v2" "control" {
6060

6161
metadata = {
6262
environment_root = var.environment_root
63-
k3s_token = local.k3s_token
6463
access_ip = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0]
6564
}
6665

environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,3 @@
1-
data "external" "inventory_secrets" {
2-
program = ["${path.module}/read-inventory-secrets.py"]
3-
4-
query = {
5-
path = var.inventory_secrets_path == "" ? "${path.module}/../inventory/group_vars/all/secrets.yml" : var.inventory_secrets_path
6-
}
7-
}
8-
91
data "external" "baremetal_nodes" {
102
# returns an empty map if cannot list baremetal nodes
113
program = ["${path.module}/baremetal-node-list.py"]

environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ module "login" {
3333
ignore_image_changes = false
3434

3535
# computed
36-
k3s_token = local.k3s_token
3736
# not using openstack_compute_instance_v2.control.access_ip_v4 to avoid
3837
# updates to node metadata on deletion/recreation of the control node:
3938
control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0]

environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ resource "openstack_compute_instance_v2" "compute_fixed_image" {
8686
metadata = merge(
8787
{
8888
environment_root = var.environment_root
89-
k3s_token = var.k3s_token
9089
control_address = var.control_address
9190
access_ip = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0]
9291
},
@@ -140,7 +139,6 @@ resource "openstack_compute_instance_v2" "compute" {
140139
metadata = merge(
141140
{
142141
environment_root = var.environment_root
143-
k3s_token = var.k3s_token
144142
control_address = var.control_address
145143
access_ip = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0]
146144
},

environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,6 @@ variable "security_group_ids" {
7070
type = list
7171
}
7272

73-
variable "k3s_token" {
74-
type = string
75-
}
76-
7773
variable "control_address" {
7874
description = "Name/address of control node"
7975
type = string

environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -184,13 +184,3 @@ variable "root_volume_size" {
184184
type = number
185185
default = 40
186186
}
187-
188-
variable "inventory_secrets_path" {
189-
description = "Path to inventory secrets.yml file. Default is standard cookiecutter location."
190-
type = string
191-
default = ""
192-
}
193-
194-
locals {
195-
k3s_token = data.external.inventory_secrets.result["vault_k3s_token"]
196-
}

0 commit comments

Comments (0)