Use bootstrap tokens provisioned by ansible for K3s instead of persistent tokens in cloud-init metadata #589

Merged: 16 commits, Feb 27, 2025

5 changes: 3 additions & 2 deletions ansible/bootstrap.yml
@@ -313,10 +313,11 @@
     - include_role:
         name: azimuth_cloud.image_utils.linux_ansible_init
 
-- hosts: k3s
+- hosts: k3s:&builder
   become: yes
   tags: k3s
   tasks:
-    - ansible.builtin.include_role:
+    - name: Install k3s
+      ansible.builtin.include_role:
         name: k3s
         tasks_from: install.yml
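
The k3s:&builder intersection pattern limits this play to hosts that are in both the k3s and builder groups, i.e. build VMs, so k3s is now installed into images rather than started on running clusters. A minimal sketch of how an intersection pattern behaves (play contents hypothetical, not part of this PR):

- hosts: k3s:&builder
  gather_facts: false
  tasks:
    - name: Only reached on hosts in BOTH groups
      ansible.builtin.debug:
        msg: "{{ inventory_hostname }} is a k3s host currently being imaged"
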
20 changes: 20 additions & 0 deletions ansible/extras.yml
@@ -1,3 +1,23 @@
+- hosts: k3s_server:!builder
+  become: yes
+  tags: k3s
+  tasks:
+    - name: Start k3s server
+      ansible.builtin.include_role:
+        name: k3s
+        tasks_from: server-runtime.yml
+
+# technically this should be part of bootstrap.yml, but it hangs waiting on failed mounts
+# if run before filesystems.yml after the control node has been reimaged
+- hosts: k3s_agent:!builder
+  become: yes
+  tags: k3s
+  tasks:
+    - name: Start k3s agents
+      ansible.builtin.include_role:
+        name: k3s
+        tasks_from: agent-runtime.yml
+
 - hosts: basic_users:!builder
   become: yes
   tags:
3 changes: 3 additions & 0 deletions ansible/roles/k3s/defaults/main.yml
@@ -3,3 +3,6 @@ k3s_version: "v1.31.0+k3s1"
 k3s_selinux_release: v1.6.latest.1
 k3s_selinux_rpm_version: 1.6-1
 k3s_helm_version: v3.11.0
+k3s_bootstrap_token: '' # matches common environment default
+k3s_bootstrap_token_expiry: 10m
+k3s_server_name: "{{ None }}" # ansible managed
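
These defaults are deliberately inert: an empty k3s_bootstrap_token makes the agent runtime tasks no-ops, and k3s_server_name is expected to come from environment group vars (see the .stackhpc change further down). The expiry feeds `k3s token create --ttl`, so a site can widen or narrow the bootstrap window with a single override; a hypothetical example:

# e.g. in a site environment's group vars (path and value hypothetical)
k3s_bootstrap_token_expiry: 30m  # passed to `k3s token create --ttl` on the server
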
44 changes: 0 additions & 44 deletions ansible/roles/k3s/files/start_k3s.yml

This file was deleted.

35 changes: 35 additions & 0 deletions ansible/roles/k3s/tasks/agent-runtime.yml
@@ -0,0 +1,35 @@
+---
+
+- name: Template k3s agent env file
+  when: k3s_bootstrap_token != ''
+  ansible.builtin.template:
+    dest: /etc/systemd/system/k3s-agent.service.env
+    src: k3s-agent.service.env.j2
+    owner: root
+    group: root
+    mode: 0640
+  register: _k3s_agent_token_result
+
+- name: Ensure password directory exists
+  ansible.builtin.file:
+    path: "/etc/rancher/node"
+    state: directory
+    owner: root
+    group: root
+    mode: 0640
+
+- name: Write node password
+  ansible.builtin.copy:
+    dest: /etc/rancher/node/password
+    content: "{{ vault_k3s_node_password }}"
+    owner: root
+    group: root
+    mode: 0640 # normal k3s install is 644 but that doesn't feel right
+
+- name: Start/restart k3s agent
+  when: _k3s_agent_token_result.changed
+  ansible.builtin.systemd:
+    name: k3s-agent
+    daemon_reload: true
+    state: restarted
+    enabled: true
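
Because the restart is gated on _k3s_agent_token_result.changed, agents are only bounced when a fresh bootstrap token has actually been templated, and the vault-sourced node password is what lets a reimaged node re-register with the server under its existing name. One hedged way to confirm the unit picked up the env file (a standard systemd query, not part of this PR):

- name: Show env file wired into the k3s-agent unit (illustrative check only)
  ansible.builtin.command:
    cmd: systemctl show k3s-agent --property=EnvironmentFiles
  changed_when: false
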
5 changes: 0 additions & 5 deletions ansible/roles/k3s/tasks/install.yml
@@ -71,8 +71,3 @@
   ansible.builtin.lineinfile:
     path: /etc/environment
     line: "KUBECONFIG=/etc/rancher/k3s/k3s.yaml"
-
-- name: Install ansible-init playbook for k3s agent or server activation
-  copy:
-    src: start_k3s.yml
-    dest: /etc/ansible-init/playbooks/0-start-k3s.yml
34 changes: 34 additions & 0 deletions ansible/roles/k3s/tasks/server-runtime.yml
@@ -0,0 +1,34 @@
+---
+
+- name: Template k3s env file
+  ansible.builtin.template:
+    dest: /etc/systemd/system/k3s.service.env
+    src: k3s.service.env.j2
+  register: _k3s_env_file_status
+
+- name: Start k3s server
+  ansible.builtin.systemd:
+    name: k3s
+    daemon_reload: "{{ _k3s_env_file_status.changed }}"
+    state: started
+    enabled: true
+
+# Possible race here, as there is a delay between agents disconnecting and being registered as down, but this is unlikely to be hit in general use
+- name: Check which k3s agents are connected
+  ansible.builtin.shell:
+    cmd: kubectl get nodes --no-headers | grep -w Ready
+  register: _k3s_connected_nodes
+  retries: 6 # task may fail if server is not ready yet
+  delay: 10
+  until: not _k3s_connected_nodes.failed
+
+- name: Generate new bootstrap token if not all agents are connected
+  no_log: true
+  when: _k3s_connected_nodes.stdout_lines | length != groups['k3s'] | length
+  shell:
+    cmd: "k3s token create --ttl {{ k3s_bootstrap_token_expiry }}"
+  register: _k3s_token_output
+
+- name: Set bootstrap token as fact
+  set_fact:
+    k3s_bootstrap_token: "{{ _k3s_token_output.stdout }}"
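
The server only mints a token when the cluster looks incomplete: with one server and two agents, groups['k3s'] | length is 3, so only two Ready nodes triggers a fresh short-lived token, published as a fact for the agent plays to pick up via hostvars. A hypothetical debug task spelling out that condition (example only, not in this PR):

- name: Illustrate the token-minting condition
  ansible.builtin.debug:
    msg: >-
      ready={{ _k3s_connected_nodes.stdout_lines | length }}
      expected={{ groups['k3s'] | length }}
      mint_new_token={{ _k3s_connected_nodes.stdout_lines | length != groups['k3s'] | length }}
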
3 changes: 3 additions & 0 deletions ansible/roles/k3s/templates/k3s-agent.service.env.j2
@@ -0,0 +1,3 @@
+K3S_NODE_IP={{ ansible_host }}
+K3S_TOKEN={{ k3s_bootstrap_token }}
+K3S_URL=https://{{ k3s_server_name }}:6443
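
Rendered, this wires the node IP, bootstrap token and server URL into the k3s-agent systemd unit. A hypothetical rendering (addresses and token entirely made up; real tokens come from `k3s token create`):

K3S_NODE_IP=10.0.0.11
K3S_TOKEN=K10abc123::abcdef.0123456789abcdef
K3S_URL=https://10.0.0.2:6443
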
1 change: 1 addition & 0 deletions ansible/roles/k3s/templates/k3s.service.env.j2
@@ -0,0 +1 @@
+K3S_NODE_IP={{ ansible_host }}
2 changes: 1 addition & 1 deletion ansible/roles/passwords/defaults/main.yml
@@ -8,7 +8,7 @@ slurm_appliance_secrets:
   vault_openhpc_mungekey: "{{ secrets_openhpc_mungekey | default(vault_openhpc_mungekey | default(secrets_openhpc_mungekey_default)) }}"
   vault_freeipa_ds_password: "{{ vault_freeipa_ds_password | default(lookup('password', '/dev/null')) }}"
   vault_freeipa_admin_password: "{{ vault_freeipa_admin_password | default(lookup('password', '/dev/null')) }}"
-  vault_k3s_token: "{{ vault_k3s_token | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}"
+  vault_k3s_node_password: "{{ vault_k3s_node_password | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}"
   vault_pulp_admin_password: "{{ vault_pulp_admin_password | default(lookup('password', '/dev/null', chars=['ascii_letters', 'digits'])) }}"
   vault_demo_user_password: "{{ vault_demo_user_password | default(lookup('password', '/dev/null')) }}"

This file was deleted.

4 changes: 2 additions & 2 deletions environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
   "cluster_image": {
-    "RL8": "openhpc-RL8-250211-1540-a0b4a57e",
-    "RL9": "openhpc-RL9-250211-1540-a0b4a57e"
+    "RL8": "openhpc-RL8-250221-0904-e4ff694e",
+    "RL9": "openhpc-RL9-250221-0904-e4ff694e"
   }
 }
3 changes: 0 additions & 3 deletions environments/.stackhpc/tofu/main.tf
@@ -66,9 +66,6 @@ module "cluster" {
   key_pair            = "slurm-app-ci"
   cluster_image_id    = data.openstack_images_image_v2.cluster.id
   control_node_flavor = var.control_node_flavor
-  # have to override default, as unusually the actual module path and secrets
-  # are not in the same environment for stackhpc
-  inventory_secrets_path = "${path.module}/../inventory/group_vars/all/secrets.yml"
 
   login = {
     login: {
@@ -21,6 +21,7 @@ opensearch_address: "127.0.0.1"
 prometheus_address: "{{ hostvars[groups['prometheus'].0].api_address }}"
 openondemand_address: "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}"
 grafana_address: "{{ hostvars[groups['grafana'].0].api_address }}"
+k3s_server_name: "{{ hostvars[groups['k3s_server'] | first].ansible_host }}"
 
 ############################# bootstrap: local user configuration #########################
1 change: 1 addition & 0 deletions environments/common/inventory/group_vars/all/k3s.yml
@@ -0,0 +1 @@
+k3s_bootstrap_token: "{{ hostvars[groups['k3s_server'] | first].k3s_bootstrap_token | default('') }}"
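
This one-liner is the hand-off point: agents resolve the bootstrap token fact set on the server host by server-runtime.yml, while the default('') keeps the variable empty wherever the server play has not run (e.g. during image build), which in turn makes the agent tasks skip. An illustrative check (hypothetical play, not in this PR):

- hosts: k3s_agent
  gather_facts: false
  tasks:
    - name: Show whether a bootstrap token was handed over from the server
      ansible.builtin.debug:
        msg: "token {{ 'set' if k3s_bootstrap_token != '' else 'empty, agent start will be skipped' }}"
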
10 changes: 9 additions & 1 deletion environments/common/inventory/groups
@@ -145,8 +145,16 @@ freeipa_client
 [compute_init]
 # EXPERIMENTAL: Compute hosts to enable joining cluster on boot on
 
-[k3s]
+[k3s:children]
 # Hosts to run k3s server/agent
+k3s_server
+k3s_agent
+
+[k3s_server]
+# Hosts to run k3s server (should only be a single node, i.e. the control node)
+
+[k3s_agent]
+# Hosts to run k3s agent
 
 [k9s]
 # Hosts to install k9s on
11 changes: 8 additions & 3 deletions environments/common/layouts/everything
@@ -96,9 +96,14 @@ cluster
 [compute_init]
 # EXPERIMENTAL: Compute hosts to enable joining cluster on boot on
 
-[k3s:children]
-# Hosts to run k3s server/agent
-openhpc
+[k3s_server:children]
+# Hosts to run k3s server (should only be a single node, i.e. the control node)
+control
+
+[k3s_agent:children]
+# Hosts to run k3s agent
+compute
+login
 
 [k9s:children]
 # Hosts to install k9s on
@@ -29,7 +29,6 @@ module "compute" {
   availability_zone = lookup(each.value, "availability_zone", "nova")
 
   # computed
-  k3s_token = local.k3s_token
   # not using openstack_compute_instance_v2.control.access_ip_v4 to avoid
   # updates to node metadata on deletion/recreation of the control node:
   control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0]
@@ -60,7 +60,6 @@ resource "openstack_compute_instance_v2" "control" {

   metadata = {
     environment_root = var.environment_root
-    k3s_token        = local.k3s_token
     access_ip        = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0]
   }

@@ -1,11 +1,3 @@
data "external" "inventory_secrets" {
program = ["${path.module}/read-inventory-secrets.py"]

query = {
path = var.inventory_secrets_path == "" ? "${path.module}/../inventory/group_vars/all/secrets.yml" : var.inventory_secrets_path
}
}

data "external" "baremetal_nodes" {
# returns an empty map if cannot list baremetal nodes
program = ["${path.module}/baremetal-node-list.py"]
@@ -33,7 +33,6 @@ module "login" {
   ignore_image_changes = false
 
   # computed
-  k3s_token = local.k3s_token
   # not using openstack_compute_instance_v2.control.access_ip_v4 to avoid
   # updates to node metadata on deletion/recreation of the control node:
   control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0]
@@ -86,7 +86,6 @@ resource "openstack_compute_instance_v2" "compute_fixed_image" {
   metadata = merge(
     {
       environment_root = var.environment_root
-      k3s_token        = var.k3s_token
       control_address  = var.control_address
       access_ip        = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0]
     },
@@ -140,7 +139,6 @@ resource "openstack_compute_instance_v2" "compute" {
   metadata = merge(
     {
       environment_root = var.environment_root
-      k3s_token        = var.k3s_token
       control_address  = var.control_address
       access_ip        = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0]
     },
@@ -70,10 +70,6 @@ variable "security_group_ids" {
   type = list
 }
 
-variable "k3s_token" {
-  type = string
-}
-
 variable "control_address" {
   description = "Name/address of control node"
   type        = string
@@ -184,13 +184,3 @@ variable "root_volume_size" {
   type    = number
   default = 40
 }
-
-variable "inventory_secrets_path" {
-  description = "Path to inventory secrets.yml file. Default is standard cookiecutter location."
-  type        = string
-  default     = ""
-}
-
-locals {
-  k3s_token = data.external.inventory_secrets.result["vault_k3s_token"]
-}