Use Slurm JobID as the OpenSearch document id and archive old data

sjpb · sjpb · commit 293772541a07 · 2023-08-10T14:59:39.000Z
diff --git a/ansible/roles/opensearch/tasks/archive_data.yml b/ansible/roles/opensearch/tasks/archive_data.yml
@@ -0,0 +1,23 @@
+# Archive, then remove, OpenSearch data which was NOT indexed by Slurm Job ID.
+# It will be re-ingested by filebeat from the slurmdbd, with that index.
+
+# Stop OpenSearch before its data directory is archived and removed; retry
+# until systemd reports the unit as inactive or failed.
+- name: Ensure opensearch stopped
+  systemd:
+    name: opensearch
+    state: stopped
+  register: _opensearch_stop
+  until: "_opensearch_stop.status.ActiveState in ['inactive', 'failed']"
+  retries: 15
+  delay: 5
+
+# The archive is written next to the data directory. The timestamp comes from
+# Ansible's now() rather than piping to GNU `date --iso-8601`, so it works on
+# non-GNU control nodes and contains no ':' (which GNU tar would otherwise
+# interpret as a remote host when extracting the archive by name).
+- name: Archive existing data
+  community.general.archive:
+    path: "{{ opensearch_data_path }}"
+    dest: "{{ opensearch_data_path | dirname }}/data-{{ now(utc=true, fmt='%Y%m%dT%H%M%SZ') }}.tar.gz"
+    remove: true
diff --git a/ansible/roles/opensearch/tasks/runtime.yml b/ansible/roles/opensearch/tasks/runtime.yml
@@ -15,6 +15,17 @@
     path: /etc/systemd/system/opendistro.service
     state: absent
 
+- name: Enumerate files in data directory
+  register: _find_opensearch_data
+  find:
+    paths: "{{ opensearch_data_path }}"
+
+- name: Archive incorrectly indexed data
+  when:
+    - (_find_opensearch_data.files | length) > 0
+    - "'slurm_jobid_index' not in (_find_opensearch_data.files | map(attribute='path') | map('basename') | list)"
+  import_tasks: archive_data.yml
+
 - name: Ensure required opensearch host directories exist
   file:
     state: directory
@@ -27,6 +38,15 @@
     - "{{ opensearch_config_path }}"
     - "{{ opensearch_data_path }}"
 
+- name: Set indexed data flag
+  copy:
+    owner: "{{ opensearch_podman_user }}"
+    group: "{{ opensearch_podman_user }}"
+    dest: "{{ opensearch_data_path }}/slurm_jobid_index"
+    content: |
+      This is a flag file to indicate that filebeat is pushing data
+      indexed by Slurm JobID to prevent duplicate OpenSearch records
+
 - name: Create certs
   import_tasks: certs.yml
 
diff --git a/environments/common/files/filebeat/filebeat.yml b/environments/common/files/filebeat/filebeat.yml
@@ -22,6 +22,11 @@ filebeat.inputs:
     fields_under_root: true
 
 processors:
+  # Use a fingerprint of the Slurm JobID as the document id to avoid duplicated records
+  # Don't use filebeat.inputs:json.document_id as this removes the JobID from the record
+  - fingerprint:
+      target_field: "@metadata._id"
+      fields: ["json.JobID"]
   - timestamp:
       field: json.End
       layouts: