Merge pull request #200 from stackhpc/feature/plot_pingpong

sjpb · web-flow · commit fd213a3d9e9c · 2022-08-10T18:41:20.000+01:00
Plot pingpong results
diff --git a/ansible/roles/hpctests/files/plot_imb_pingpong.py b/ansible/roles/hpctests/files/plot_imb_pingpong.py
@@ -0,0 +1,85 @@
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+from matplotlib import ticker
+import numpy as np
+import os
+
+def sizeof_fmt(num, suffix='B'):
+    """ Format a number as a human-readable size using binary (1024-based) prefixes.
+
+        num:= number to format, e.g. a count of bytes
+        suffix:= str, unit appended after the binary prefix
+
+        Returns a str with one decimal place, e.g. sizeof_fmt(1536) gives '1.5KiB'.
+    """
+    # adapted from https://stackoverflow.com/a/1094933/916373
+    scaled = num
+    for prefix in ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi'):
+        if not abs(scaled) < 1024.0:
+            # too large for this prefix, so step up to the next one
+            scaled /= 1024.0
+            continue
+        return "%3.1f%s%s" % (scaled, prefix, suffix)
+    # anything which is still too large after 'Zi' is reported in 'Yi'
+    return "%.1f%s%s" % (scaled, 'Yi', suffix)
+
+def read_imb_out(path):
+    """ Read stdout from an IMB-MPI1 run.
+
+        path:= str, path to a file containing the captured stdout
+
+        Returns a dict with:
+            key:= int, total number of processes involved
+            value:= list of lists, one inner list per results table column (in table
+                    order), with values converted as per COLTYPES below.
+
+        If multiple results tables are present it is assumed that they are all the same benchmark,
+        and only differ in the number of processes.
+
+        Raises ValueError if the benchmark is not listed in COLTYPES, or if the
+        "# Benchmarking" line is not immediately followed by a "# #processes = " line.
+    """
+
+    data = {}
+
+    COLTYPES = { # all benchmark names here should be lowercase
+        'uniband': (int, int, float, int), # #bytes #repetitions Mbytes/sec Msg/sec
+        'biband': (int, int, float, int),
+        'pingpong':(int, int, float, float), # #bytes #repetitions t[usec] Mbytes/sec
+        'alltoall':(int, int, float, float, float) # #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec]
+    }
+    NPROCS_PREFIX = '# #processes = '
+
+    with open(path) as f:
+        for line in f:
+            if not line.startswith('# Benchmarking '):
+                continue
+            benchmark = line.split()[-1].lower()
+            if benchmark not in COLTYPES:
+                raise ValueError('Do not know how to read %r benchmark in %s' % (benchmark, path))
+            converters = COLTYPES[benchmark]
+            # next() is given a '' default throughout, so that a truncated file ends
+            # parsing (or fails the check below) instead of leaking StopIteration
+            line = next(f, '')
+            if not line.startswith(NPROCS_PREFIX):
+                raise ValueError('expected %r, got %r' % (NPROCS_PREFIX, line))
+            n_procs = int(line.split('=')[-1].strip())
+            while line.startswith('#'):
+                line = next(f, '') # may or may not include line "# .. additional processes waiting in MPI_Barrier", plus other # lines
+            # the first line not starting with '#' is not parsed; data rows start on the line after it
+            rows = []
+            while True:
+                line = next(f, '').strip()
+                if line == '':
+                    break # a blank line (or end of file) terminates the table
+                rows.append([convert(v) for (convert, v) in zip(converters, line.split())])
+            # turn data around, from a list of rows into a list of columns:
+            data[n_procs] = [[r[ic] for r in rows] for ic in range(len(converters))]
+    return data
+
+if __name__ == '__main__':
+    # Usage: plot_imb_pingpong.py IMB_OUTPUT_FILE
+    # Plots latency and bandwidth against message size from the results table in
+    # IMB_OUTPUT_FILE, saves the plot as pingpong.png in the same directory as that
+    # file, and prints the path to the image.
+    import sys
+    if len(sys.argv) < 2:
+        sys.exit('usage: %s IMB_OUTPUT_FILE' % sys.argv[0])
+    inpath = sys.argv[1]
+    d = read_imb_out(inpath)
+    if len(d) > 1:
+        raise ValueError('Found > 1 benchmark in %s' % inpath)
+    outdir = os.path.dirname(inpath)
+    for cols in d.values():
+        # cols is a list of columns; for pingpong: #bytes, #repetitions, t[usec], Mbytes/sec
+        fig, ax1 = plt.subplots()
+        ax2 = ax1.twinx() # second y-axis sharing the same x-axis
+        ax1.plot(cols[0], cols[2], label='latency', color='b')
+        ax2.plot(cols[0], cols[3], label='bandwidth', color='r')
+        ax1.set_xscale('log', base=2)
+        ax1.set_yscale('log', base=10)
+        ax1.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: sizeof_fmt(x)))
+        ax1.grid(True, which="both")
+        ax1.set_xlabel('#bytes')
+        # raw string: "\m" is not a valid escape sequence in a normal string literal
+        ax1.set_ylabel(r'latency ($\mu$s)', color='b')
+        ax2.set_ylabel('bandwidth (Mbytes/sec)', color='r')
+        fig.legend(loc='upper left')
+        plt.tight_layout()
+        figpath = os.path.join(outdir, 'pingpong.png')
+        plt.savefig(figpath)
+        print(figpath)
diff --git a/ansible/roles/hpctests/tasks/pingpong.yml b/ansible/roles/hpctests/tasks/pingpong.yml
@@ -24,18 +24,41 @@
     chdir: "{{ hpctests_rootdir }}/pingpong"
   register: hpctests_pingpong_sbatch
 
-- name: Read pingpong
+# The job id is taken as the last whitespace-separated token of the stdout registered in hpctests_pingpong_sbatch
+- set_fact:
+    _pingpong_jobid: "{{ hpctests_pingpong_sbatch.stdout.split()[-1] }}"
+# Defined in its own task as it references _pingpong_jobid, set by the previous task
+- set_fact:
+    _pingpong_local_output: "{{ hpctests_outdir }}/pingpong/{{_pingpong_jobid}}/pingpong.sh.out"
+
+# Copy the job output file to the per-job path defined above.
+# flat: yes means dest is used as the exact file path, rather than having hostname/src path appended to it.
+- name: Retrieve results file
+  ansible.builtin.fetch:
+    src: "{{ hpctests_rootdir }}/pingpong/pingpong.sh.out"
+    dest: "{{ _pingpong_local_output }}"
+    flat: yes
+
+- name: Read pingpong results
   read_imb_pingpong:
-    path: "{{ hpctests_rootdir }}/pingpong/pingpong.sh.out"
+    path: "{{ _pingpong_local_output }}"
   register: hpctests_pingpong_out
+  # the path above is the copy retrieved by the fetch task, so read it on localhost
+  delegate_to: localhost
 
 - name: Read nodes used
-  shell: "grep 'SLURM_JOB_NODELIST:' {{ hpctests_rootdir }}/pingpong/pingpong.sh.out"
+  shell: "grep 'SLURM_JOB_NODELIST:' {{ _pingpong_local_output }}"
   register: hpctests_pingpong_run_nodes
+  # grep runs against the copy of the results file retrieved by the fetch task, so run it on localhost
+  delegate_to: localhost
 
+# Plot the results on localhost. The script prints the path of the image it writes,
+# which is registered here for use in the summary message.
+- name: Plot image
+  shell:
+    cmd: "python {{lookup('env', 'APPLIANCES_REPO_ROOT') }}/ansible/roles/hpctests/files/plot_imb_pingpong.py {{ _pingpong_local_output }}"
+    # must match the image filename written by plot_imb_pingpong.py (pingpong.png), else this guard can never match
+    creates: "{{ _pingpong_local_output | dirname }}/pingpong.png"
+  register: _pingpong_plot
+  delegate_to: localhost
+  
 - debug:
+    # bandwidth is reported in Mbytes/s; 1 Gbit/s = 125 Mbytes/s, hence the division by 125.0 below
     msg: |
-      Summary for pingpong (2x scheduler-selected nodes) job {{ hpctests_pingpong_sbatch.stdout.split()[-1] }} using {{ hpctests_ucx_net_devices }}:
+      Summary for pingpong (2x scheduler-selected nodes) job {{ _pingpong_jobid }} (using interface {{ hpctests_ucx_net_devices }}):
       nodes: {{ hpctests_pingpong_run_nodes.stdout.split()[1] }}
       zero-size msg latency: {{ hpctests_pingpong_out['columns']['latency'][0] }} us
       max bandwidth: {{ hpctests_pingpong_out['columns']['bandwidth'] | max }} Mbytes/s ({{ (hpctests_pingpong_out['columns']['bandwidth'] | max) / 125.0 }} Gbit/s)
+
+      See plot on localhost:
+      {{ _pingpong_plot.stdout }}
diff --git a/requirements.txt b/requirements.txt
@@ -6,3 +6,4 @@ passlib[bcrypt]==1.7.4
 cookiecutter
 selinux # this is a shim to avoid having to use --system-site-packages, you still need sudo yum install libselinux-python3
 netaddr
+matplotlib