
Commit 653cb63

support a subset of gres
1 parent adfc652 commit 653cb63

5 files changed (+42 -4 lines changed)

README.md

Lines changed: 7 additions & 1 deletion
@@ -58,8 +58,14 @@ each list element:
 * `name`: The name of the nodes within this group.
 * `cluster_name`: Optional. An override for the top-level definition `openhpc_cluster_name`.
 * `extra_nodes`: Optional. A list of additional node definitions, e.g. for nodes in this group/partition not controlled by this role. Each item should be a dict, with keys/values as per the ["NODE CONFIGURATION"](https://slurm.schedmd.com/slurm.conf.html#lbAE) docs for slurm.conf. Note the key `NodeName` must be first.
-* `ram_mb`: Optional. The physical RAM available in each server of this group ([slurm.conf](https://slurm.schedmd.com/slurm.conf.html) parameter `RealMemory`) in MiB. This is set using ansible facts if not defined, equivalent to `free --mebi` total * `openhpc_ram_multiplier`.
+* `ram_mb`: Optional. The physical RAM available in each node of this group ([slurm.conf](https://slurm.schedmd.com/slurm.conf.html) parameter `RealMemory`) in MiB. This is set using ansible facts if not defined, equivalent to `free --mebi` total * `openhpc_ram_multiplier`.
 * `ram_multiplier`: Optional. An override for the top-level definition `openhpc_ram_multiplier`. Has no effect if `ram_mb` is set.
+* `gres`: Optional. List of dicts defining [generic resources](https://slurm.schedmd.com/gres.html). Each dict must define:
+    - `conf`: A string with the [resource specification](https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1) but requiring the format `<name>:<type>:<number>`, e.g. `gpu:A100:2`. Note the `type` is an arbitrary string.
+    - `file`: A string with the [File](https://slurm.schedmd.com/gres.conf.html#OPT_File) (path to device(s)) for this resource, e.g. `/dev/nvidia[0-1]` for the above example.
+
+  Note [GresTypes](https://slurm.schedmd.com/slurm.conf.html#OPT_GresTypes) must be set in `openhpc_config` if this is used.
+
 * `default`: Optional. A boolean flag for whether this partition is the default. Valid settings are `YES` and `NO`.
 * `maxtime`: Optional. A partition-specific time limit following the format of [slurm.conf](https://slurm.schedmd.com/slurm.conf.html) parameter `MaxTime`. The default value is
 given by `openhpc_job_maxtime`. The value should be quoted to avoid Ansible conversions.
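
For example, the new `gres` and `GresTypes` options documented above might be used together as follows. This is a minimal sketch only: the cluster name, partition name, GPU type and device paths are illustrative, and the exact way `GresTypes` is expressed depends on how `openhpc_config` entries are rendered into slurm.conf.

```yaml
# Illustrative group_vars snippet; names and device paths are examples only.
openhpc_cluster_name: mycluster

openhpc_slurm_partitions:
  - name: gpu
    gres:
      - conf: gpu:A100:2        # <name>:<type>:<number>
        file: /dev/nvidia[0-1]  # device(s) backing this resource on each node

# GresTypes must list every gres name used above.
openhpc_config:
  GresTypes: gpu
```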

defaults/main.yml

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ openhpc_resume_timeout: 300
 openhpc_retry_delay: 10
 openhpc_job_maxtime: '60-0' # quote this to avoid ansible converting some formats to seconds, which is interpreted as minutes by Slurm
 openhpc_config: "{{ openhpc_extra_config | default({}) }}"
+openhpc_gres_template: gres.conf.j2
 openhpc_slurm_configless: "{{ 'enable_configless' in openhpc_config.get('SlurmctldParameters', []) }}"

 openhpc_state_save_location: /var/spool/slurm
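
The new `openhpc_gres_template` default can be overridden like any other role default if a site-specific gres.conf layout is needed. A hypothetical override (the path is illustrative):

```yaml
# group_vars or play vars; points the role at a local template instead of the bundled gres.conf.j2
openhpc_gres_template: "{{ playbook_dir }}/templates/site-gres.conf.j2"
```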

tasks/runtime.yml

Lines changed: 15 additions & 2 deletions
@@ -108,7 +108,7 @@
   changed_when: false # so molecule doesn't fail
   become: no

-- name: Create slurm.conf on control node
+- name: Create slurm.conf
   copy:
     src: "{{ _slurm_conf_tmpfile.path }}"
     dest: /etc/slurm/slurm.conf
@@ -121,6 +121,19 @@
   register: ohpc_slurm_conf
   # NB uses restart rather than reload as number of nodes might have changed

+- name: Create gres.conf
+  template:
+    src: "{{ openhpc_gres_template }}"
+    dest: /etc/slurm/gres.conf
+    mode: "0600"
+    owner: slurm
+    group: slurm
+  when: openhpc_enable.control | default(false) or not openhpc_slurm_configless
+  notify:
+    - Restart slurmctld service
+  register: ohpc_gres_conf
+  # NB uses restart rather than reload as this is needed in some cases
+
 - name: Remove local tempfile for slurm.conf templating
   ansible.builtin.file:
     path: "{{ _slurm_conf_tmpfile.path }}"
@@ -136,7 +149,7 @@
   changed_when: true
   when:
   - openhpc_slurm_control_host in ansible_play_hosts
-  - hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed # noqa no-handler
+  - hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed or hostvars[openhpc_slurm_control_host].ohpc_gres_conf.changed # noqa no-handler
   notify:
   - Restart slurmd service

templates/gres.conf.j2

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+AutoDetect=off
+{% for part in openhpc_slurm_partitions %}
+{% set nodelist = [] %}
+{% for group in part.get('groups', [part]) %}
+{% if 'gres' in group %}
+{% for gres in group.gres %}
+{% set gres_name, gres_type, _ = gres.conf.split(':') %}
+{% set group_name = group.cluster_name|default(openhpc_cluster_name) ~ '_' ~ group.name %}
+{% set inventory_group_hosts = groups.get(group_name, []) %}
+{% for hostlist in (inventory_group_hosts | hostlist_expression) %}
+NodeName={{ hostlist }} Name={{ gres_name }} Type={{ gres_type }} File={{ gres.file }}
+{% endfor %}
+{% endfor %}
+{% endif %}
+{% endfor %}
+{% endfor %}
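
Given the illustrative `gpu` group above, with hosts `gpu-0` and `gpu-1` in an inventory group named `mycluster_gpu`, this template would be expected to render roughly as follows (the exact hostlist form depends on the role's `hostlist_expression` filter):

```
AutoDetect=off
NodeName=gpu-[0-1] Name=gpu Type=A100 File=/dev/nvidia[0-1]
```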

templates/slurm.conf.j2

Lines changed: 3 additions & 1 deletion
@@ -118,7 +118,9 @@ Epilog=/etc/slurm/slurm.epilog.clean
 {% set first_host_hv = hostvars[first_host] %}
 {% set ram_mb = (first_host_hv['ansible_memory_mb']['real']['total'] * (group.ram_multiplier | default(openhpc_ram_multiplier))) | int %}
 {% for hostlist in (inventory_group_hosts | hostlist_expression) %}
-NodeName={{ hostlist }} State=UNKNOWN RealMemory={{ group.get('ram_mb', ram_mb) }} Sockets={{first_host_hv['ansible_processor_count']}} CoresPerSocket={{ first_host_hv['ansible_processor_cores'] }} ThreadsPerCore={{ first_host_hv['ansible_processor_threads_per_core'] }}
+{% set gres = ' Gres=%s' % (','.join(group.gres | map(attribute='conf') )) if 'gres' in group else '' %}
+
+NodeName={{ hostlist }} State=UNKNOWN RealMemory={{ group.get('ram_mb', ram_mb) }} Sockets={{first_host_hv['ansible_processor_count']}} CoresPerSocket={{ first_host_hv['ansible_processor_cores'] }} ThreadsPerCore={{ first_host_hv['ansible_processor_threads_per_core'] }}{{ gres }}
 {% set _ = nodelist.append(hostlist) %}
 {% endfor %}{# nodes #}
 {% endif %}{# inventory_group_hosts #}
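
With the same illustrative group, the node definition in slurm.conf would then pick up a `Gres=` entry built from the `conf` strings, along these lines (the memory and CPU figures are placeholders that would come from facts on a real host):

```
NodeName=gpu-[0-1] State=UNKNOWN RealMemory=385024 Sockets=2 CoresPerSocket=24 ThreadsPerCore=2 Gres=gpu:A100:2
```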
