---
# Checks that Slurm jobs run by hpctests are shown in Grafana.
# The dashboard itself can't be checked programmatically, so this queries the
# datasource used by the dashboard instead.

- hosts: control  # so proxying etc is irrelevant
  gather_facts: false
  become: false
  tasks:
    # The stats file is produced by a cron job, so wait slightly longer than
    # one cron period (5 mins = 300s) for it to appear.
    - name: Wait for slurm-stats file to exist (run by cron)
      ansible.builtin.wait_for:
        path: /var/log/slurm-stats/finished_jobs.json
        timeout: 315  # slurm stats cron job runs every 5 mins

    # List all datasources so the numeric id of 'slurmstats' can be looked up.
    - name: Get grafana datasources
      ansible.builtin.uri:
        url: "http://{{ grafana_api_address }}:{{ grafana_port }}/api/datasources"
        url_username: grafana
        url_password: "{{ vault_grafana_admin_password }}"
        force_basic_auth: true  # otherwise get a 403
      register: grafana_datasources_api

    # Run a search against the filebeat-* indices via Grafana's datasource proxy.
    - name: Query slurm stats
      ansible.builtin.uri:
        url: "http://{{ grafana_api_address }}:{{ grafana_port }}/api/datasources/proxy/{{ _slurmstats_id }}/filebeat-*/_search"
        url_username: grafana
        url_password: "{{ vault_grafana_admin_password }}"
        # NOTE(review): added for consistency with the datasources call above,
        # which gets a 403 without it - confirm the proxy endpoint behaves the same.
        force_basic_auth: true
      register: _slurmstats_docs
      vars:
        # selectattr() yields a generator, which cannot be indexed with [0];
        # `first` takes the only expected match (only one slurmstats datasource).
        _slurmstats_id: "{{ (grafana_datasources_api.json | selectattr('name', 'eq', 'slurmstats') | first)['id'] }}"

    # Passes when every expected job name appears in the query results;
    # extra jobs in the results are ignored.
    - name: Check all hpctests jobs listed
      ansible.builtin.assert:
        that:
          - "_expected_jobs | difference(_slurm_stats_jobs) == []"
        fail_msg: |
          expected jobs: {{ _expected_jobs }}
          found jobs : {{ _slurm_stats_jobs }}
        success_msg: "Found all expected jobs: {{ _expected_jobs | sort | join(', ') }}"
      vars:
        # see https://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html#search-api-response-body
        # Documents without a JobName get a placeholder so the failure message
        # shows them instead of the template erroring out.
        _slurm_stats_jobs: "{{ _slurmstats_docs.json.hits.hits | map(attribute='_source') | map(attribute='json') | map(attribute='JobName', default='(json error in slurmstats data)') | list }}"
        _expected_jobs: ['hpl-solo.sh', 'pingpong.sh', 'pingmatrix.sh']
0 commit comments