Skip to content

Compute-init: cope with root-squashed nfs clients #627

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Mar 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions ansible/roles/basic_users/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
when:
- item.state | default('present') == 'present'
- item.create_home | default(true) | bool
- inventory_hostname == basic_users_homedir_server
- ansible_hostname == basic_users_homedir_server

# The following tasks run on a single *client* node, so that home directory
# paths are easily constructed, becoming each user so that root-squash
Expand All @@ -85,7 +85,7 @@
when:
- item.state | default('present') == 'present'
- item.generate_ssh_key | default(true) | bool or item.public_key is defined
- inventory_hostname == basic_users_homedir_client
- ansible_hostname == basic_users_homedir_client

- name: Generate cluster ssh key
community.crypto.openssh_keypair:
Expand All @@ -101,7 +101,7 @@
when:
- item.state | default('present') == 'present'
- item.generate_ssh_key | default(true)
- inventory_hostname == basic_users_homedir_client
- ansible_hostname == basic_users_homedir_client
register: _cluster_ssh_keypair

- name: Write generated cluster ssh key to authorized_keys
Expand All @@ -118,7 +118,7 @@
when:
- item.item.state | default('present') == 'present'
- item.item.generate_ssh_key | default(true)
- inventory_hostname == basic_users_homedir_client
- ansible_hostname == basic_users_homedir_client
- item.public_key is defined # NB this is the *returned* public key

- name: Write supplied public key to authorized_keys
Expand All @@ -134,5 +134,5 @@
label: "{{ item.name }}"
when:
- item.state | default('present') == 'present'
- inventory_hostname == basic_users_homedir_client
- ansible_hostname == basic_users_homedir_client
- item.public_key is defined # NB this is the *provided* public key
2 changes: 1 addition & 1 deletion ansible/roles/cacerts/tasks/export.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
copy:
src: "{{ item }}"
dest: /exports/cluster/cacerts/
owner: root
owner: slurm
group: root
mode: 0644
with_fileglob:
Expand Down
85 changes: 36 additions & 49 deletions ansible/roles/compute_init/files/compute-init.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,9 @@
tuned_enabled: true
tuned_started: true

nfs_client_mnt_point: "/mnt"
nfs_client_mnt_options:
nfs_client_mnt_state: mounted
nfs_configurations:
nfs_enable:
clients: false

# openhpc: no defaults required

os_manila_mount_shares: []
os_manila_mount_ceph_conf_path: /etc/ceph
os_manila_mount_state: mounted
Expand All @@ -47,15 +41,8 @@
- noatime
- _netdev # prevents mount blocking early boot before networking available
- rw

basic_users_groups: []
basic_users_manage_homedir: false # homedir must already exist on shared filesystem
basic_users_userdefaults:
state: present
create_home: "{{ basic_users_manage_homedir }}"
generate_ssh_key: "{{ basic_users_manage_homedir }}"
ssh_key_comment: "{{ item.name }}"
basic_users_users: []
- nodev
- nosuid

tasks:
- block:
Expand Down Expand Up @@ -96,6 +83,7 @@
when: _mount_mnt_cluster.failed

- name: Check if hostvars exist
become_user: slurm
stat:
path: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml"
register: hostvars_stat
Expand All @@ -109,17 +97,33 @@
- meta: end_play
when: not hostvars_stat.stat.exists

- name: Load hostvars from NFS
- name: Sync /mnt/cluster to /var/tmp
become_user: slurm
synchronize:
src: "/mnt/cluster/"
dest: "/var/tmp/cluster/"
archive: yes
recursive: yes

- name: Unmount /mnt/cluster after sync
mount:
path: /mnt/cluster
state: unmounted

- name: Load hostvars
# include_vars has higher precedence than the play-level vars block, so these values act as the node's normal Ansible hostvars
include_vars:
file: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml" # can't use inventory_hostname

# TODO: should /mnt/cluster now be UNMOUNTED to avoid future hang-ups?
file: "/var/tmp/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml"

- name: Run chrony role
ansible.builtin.include_role:
name: mrlesmithjr.chrony
when: enable_chrony | bool
tasks_from: config_chrony.yml
vars:
# workaround for set_facts.yml:
chrony_config: /etc/chrony.conf
chrony_service: chronyd
when: enable_chrony

- name: Configure resolve.conf
block:
Expand Down Expand Up @@ -149,7 +153,7 @@

- name: Copy cluster /etc/hosts
copy:
src: /mnt/cluster/hosts
src: /var/tmp/cluster/hosts
dest: /etc/hosts
owner: root
group: root
Expand All @@ -160,14 +164,14 @@
ansible.builtin.include_role:
name: cacerts
vars:
cacerts_cert_dir: "/mnt/cluster/cacerts"
cacerts_cert_dir: "/var/tmp/cluster/cacerts"
when: enable_cacerts

- name: Configure sshd
ansible.builtin.include_role:
name: sshd
vars:
sshd_conf_src: "/mnt/cluster/hostconfig/{{ ansible_hostname }}/sshd.conf"
sshd_conf_src: "/var/tmp/cluster/hostconfig/{{ ansible_hostname }}/sshd.conf"
when: enable_sshd

- name: Configure tuned
Expand All @@ -179,22 +183,24 @@
name: sssd
tasks_from: configure.yml
vars:
sssd_conf_src: "/mnt/cluster/hostconfig/{{ ansible_hostname }}/sssd.conf"
sssd_conf_src: "/var/tmp/cluster/hostconfig/{{ ansible_hostname }}/sssd.conf"
when: enable_sssd

# NFS client mount
- name: If nfs-clients is present
include_tasks: tasks/nfs-clients.yml
ansible.builtin.include_role:
name: stackhpc.nfs
tasks_from: nfs-clients.yml
when:
- enable_nfs
- nfs_enable.clients | bool or ('nfs_enable' in item and item.nfs_enable.clients | bool)
- nfs_enable.clients | default(item.nfs_enable.clients) | bool
loop: "{{ nfs_configurations }}"

- name: Manila mounts
block:
- name: Read manila share info from nfs file
include_vars:
file: /mnt/cluster/manila_share_info.yml
file: /var/tmp/cluster/manila_share_info.yml
no_log: true # contains secrets

- name: Ensure Ceph configuration directory exists
Expand Down Expand Up @@ -269,34 +275,15 @@
when: enable_lustre

- name: Basic users
block:
- name: Create groups
ansible.builtin.group: "{{ item }}"
loop: "{{ basic_users_groups }}"

- name: Create users
user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() }}"
loop: "{{ basic_users_users }}"
loop_control:
label: "{{ item.name }} [{{ item.state | default('present') }}]"
register: basic_users_info

- name: Write sudo rules
blockinfile:
path: /etc/sudoers.d/80-{{ item.name}}-user
block: "{{ item.sudo }}"
create: true
loop: "{{ basic_users_users }}"
loop_control:
label: "{{ item.name }}"
when: "'sudo' in item"
ansible.builtin.include_role:
name: basic_users
when: enable_basic_users

- name: EESSI
block:
- name: Copy cvmfs config
copy:
src: /mnt/cluster/cvmfs/default.local
src: /var/tmp/cluster/cvmfs/default.local
dest: /etc/cvmfs/default.local
owner: root
group: root
Expand Down
16 changes: 8 additions & 8 deletions ansible/roles/compute_init/tasks/export.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
copy:
src: /etc/hosts
dest: /exports/cluster/hosts
owner: root
owner: slurm
group: root
mode: u=rw,go=
mode: u=r,g=rw,o=
remote_src: true
run_once: true
delegate_to: "{{ groups['control'] | first }}"
Expand All @@ -41,9 +41,9 @@
copy:
content: "{{ os_manila_mount_share_info_var | to_nice_yaml }}"
dest: /exports/cluster/manila_share_info.yml
owner: root
owner: slurm
group: root
mode: u=rw,g=r
mode: u=r,g=rw,o=
run_once: true
delegate_to: "{{ groups['control'] | first }}"
when: os_manila_mount_share_info is defined
Expand All @@ -55,7 +55,7 @@
file:
path: /exports/cluster/cvmfs
state: directory
owner: root
owner: slurm
group: root
mode: 0755
run_once: true
Expand All @@ -65,7 +65,7 @@
copy:
src: /etc/cvmfs/default.local
dest: /exports/cluster/cvmfs/default.local
owner: root
owner: slurm
group: root
mode: 0644
remote_src: true
Expand All @@ -82,9 +82,9 @@
file:
path: "/exports/cluster/hostconfig/{{ inventory_hostname }}/"
state: directory
owner: root
owner: slurm
group: root
mode: u=rw,go=
mode: u=rX,g=rwX,o=
delegate_to: "{{ groups['control'] | first }}"

- name: Template sssd config
Expand Down
8 changes: 4 additions & 4 deletions ansible/roles/compute_init/tasks/install.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@
dest: templates/ceph.keyring.j2
- src: ../../resolv_conf/files/NetworkManager-dns-none.conf
dest: files/NetworkManager-dns-none.conf
- src: ../../basic_users/filter_plugins/filter_keys.py
dest: filter_plugins/filter_keys.py
- src: ../../basic_users
dest: roles/
- src: ../../cacerts
dest: roles/
- src: ../../sssd
Expand All @@ -43,8 +43,8 @@
dest: roles/
- src: ../../tuned/tasks/configure.yml
dest: tasks/tuned.yml
- src: ../../stackhpc.nfs/tasks/nfs-clients.yml
dest: tasks/nfs-clients.yml
- src: ../../stackhpc.nfs
dest: roles/
- src: ../../mrlesmithjr.chrony
dest: roles/
- src: ../../lustre
Expand Down
17 changes: 0 additions & 17 deletions environments/.stackhpc/inventory/group_vars/all/nfs.yml

This file was deleted.

4 changes: 2 additions & 2 deletions environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"cluster_image": {
"RL8": "openhpc-RL8-250312-1522-7e5c051d",
"RL9": "openhpc-RL9-250312-1435-7e5c051d"
"RL8": "openhpc-RL8-250319-1045-69713f23",
"RL9": "openhpc-RL9-250319-1045-69713f23"
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ansible_init_wait: 1200 # seconds
ansible_init_wait: 300 # seconds

ansible_init_pip_packages:
# role defaults:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,7 @@
# See ansible/roles/basic_users/README.md for variable definitions.

basic_users_users: []

# The following are defined for the purpose of compute-init
basic_users_homedir_server: "{{ groups['control'] | first }}"
basic_users_homedir_client: "{{ groups['login'] | first }}"
10 changes: 10 additions & 0 deletions environments/common/inventory/group_vars/all/nfs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,13 @@ nfs_configurations:
# NB: this is set as default for all shares above but is repeated here
# in case nfs_export_clients is overridden
nfs_export_clients: "{{ _nfs_node_ips }}"

- comment: Export /exports/cluster from Slurm control node
nfs_enable:
server: "{{ inventory_hostname in groups['control'] }}"
clients: false
nfs_export: "/exports/cluster"
# prevent non-cluster IPs mounting the share:
# NB: this is set as default for all shares above but is repeated here
# in case nfs_export_clients is overridden
nfs_export_clients: "{{ _nfs_node_ips }}"