Skip to content

Commit 37c3497

Browse files
authored
Calculate 5 summaries while running benchmarks (#42)
* Add option to specify which summary stat you want the benchmark to return; defaults to `:minimum`. * Store all summaries from a benchmark, allow user to specify which summary to use when calling `plot` or `benchmark_result_df` * Remove invalid argument * display the plots * Fix colors, add option to not display plots.
1 parent dd3fbf2 commit 37c3497

File tree

4 files changed

+125
-38
lines changed

4 files changed

+125
-38
lines changed

src/plotting.jl

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ Defines the mapping between libraries and colors
1010
# make sure colors are distinguishable against white background by adding white to the seed list,
1111
# then deleting it from the resultant palette
1212
palette = distinguishable_colors(length(LIBRARIES) + 2, [colorant"white", colorant"black", colorant"#66023C", colorant"#0071c5"])
13-
deleteat!(palette, 1); deleteat!(palette, 2)
13+
deleteat!(palette, 1); deleteat!(palette, 1)
1414
const COLOR_MAP = Dict(zip(LIBRARIES, palette))
1515
getcolor(l::Symbol) = COLOR_MAP[l]
1616
for (alias,ref) [(:BLIS,:blis),(:generic,:Generic),(:GENERIC,:Generic)]
@@ -76,9 +76,17 @@ end
7676
logscale = true,
7777
width = 1200,
7878
height = 600,
79+
measure = :minimum,
7980
plot_directory = default_plot_directory(),
8081
plot_filename = default_plot_filename(br; desc = desc, logscale = logscale),
81-
file_extensions = ["svg", "png"])
82+
file_extensions = ["svg", "png"],
83+
displayplot = true)
84+
85+
`measure` refers to the BenchmarkTools summary on times. Valid options are:
86+
`:minimum`, `:median`, `:mean`, `:maximum`, and `:hmean`.
87+
88+
- `:minimum` would yield the maximum `GFLOPS`, and would be the usual estimate used in Julia.
89+
- `:hmean`, the harmonic mean of the times, is useful if you want an average GFLOPS, instead of a GFLOPS computed with the average times.
8290
"""
8391
function Gadfly.plot(br::BenchmarkResult{T}; kwargs...) where {T}
8492
_plot(br; kwargs...)
@@ -92,10 +100,13 @@ function _plot(
92100
logscale::Bool = true,
93101
width = 12inch,
94102
height = 8inch,
103+
measure = :minimum,
95104
plot_directory::AbstractString = default_plot_directory(),
96105
plot_filename::AbstractString = default_plot_filename(br; desc = desc, logscale = logscale),
97106
file_extensions = ["svg", "png"],
107+
displayplot = true
98108
) where {T}
109+
j = get_measure_index(measure) # throw early if `measure` invalid
99110
colors = getcolor.(br.libraries);
100111
libraries = string.(br.libraries)
101112
xscale = logscale ? Scale.x_log10(labels=string roundint exp10) : Scale.x_continuous
@@ -107,11 +118,12 @@ function _plot(
107118
for i eachindex(libraries)
108119
linestyle = isjulialib(libraries[i]) ? :solid : :dash
109120
l = layer(
110-
x = br.sizes, y = br.gflops[:,i],
121+
x = br.sizes, y = br.gflops[:,i,j],
111122
Geom.line, Theme(default_color = colors[i], line_style = [linestyle])
112123
)
113124
push!(plt, l)
114125
end
126+
displayplot && display(plt)
115127
mkpath(plot_directory)
116128
_filenames = String[]
117129
extension_dict = Dict("svg" => SVG, "png" => PNG, "pdf" => PDF, "ps" => PS)

src/runbenchmark.jl

Lines changed: 64 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
struct BenchmarkResult{T,I<:Union{Int,NTuple{3,Int}}}
22
libraries::Vector{Symbol}
33
sizes::Vector{I}
4-
gflops::Matrix{Float64}
5-
times::Matrix{Float64}
4+
gflops::Array{Float64,3}
5+
times::Array{Float64,3}
66
threaded::Bool
7+
function BenchmarkResult{T}(libraries, sizes, gflops, times, threaded) where {T}
8+
gflopsperm = permutedims(gflops, (2,3,1))
9+
timesperm = permutedims(times, (2,3,1))
10+
I = eltype(sizes)
11+
new{T,I}(libraries, convert(Vector{I},sizes), gflopsperm, timesperm, threaded)
12+
end
713
end
814

915
"""
@@ -13,23 +19,37 @@ function benchmark_result_type(::BenchmarkResult{T}) where {T}
1319
return T
1420
end
1521

16-
function _benchmark_result_df(sizes, libraries, mat)
22+
function get_measure_index(measure::Symbol)::Int
23+
j = findfirst(==(measure), (:minimum,:median,:mean,:maximum,:hmean))
24+
if j === nothing
25+
throw(ArgumentError("`measure` argument must be one of (:minimum,:median,:mean,:maximum,:hmean), but was $(repr(measure))."))
26+
end
27+
return j
28+
end
29+
function _benchmark_result_df(sizes, libraries, mat, measure)
30+
j = get_measure_index(measure)
1731
df = DataFrame(Size = sizes)
1832
for i eachindex(libraries)
19-
setproperty!(df, libraries[i], mat[:,i])
33+
setproperty!(df, libraries[i], mat[:,i,j])
2034
end
2135
return df
2236
end
23-
function _benchmark_result_df(br::BenchmarkResult, s::Symbol = :gflops)
24-
_benchmark_result_df(br.sizes, br.libraries, getproperty(br, s))
37+
function _benchmark_result_df(br::BenchmarkResult, s::Symbol = :gflops, measure = :minimum)
38+
_benchmark_result_df(br.sizes, br.libraries, getproperty(br, s), measure)
2539
end
2640

2741

2842
"""
29-
benchmark_result_df(benchmark_result::BenchmarkResult)
43+
benchmark_result_df(benchmark_result::BenchmarkResult, measure = :minimum)
44+
45+
`measure` refers to the BenchmarkTools summary on times. Valid options are:
46+
`:minimum`, `:median`, `:mean`, `:maximum`, and `:hmean`.
47+
48+
- `:minimum` would yield the maximum `GFLOPS`, and would be the usual estimate used in Julia.
49+
- `:hmean`, the harmonic mean of the times, is useful if you want an average GFLOPS, instead of a GFLOPS computed with the average times.
3050
"""
31-
function benchmark_result_df(benchmark_result::BenchmarkResult)
32-
df = _benchmark_result_df(benchmark_result, :times)
51+
function benchmark_result_df(benchmark_result::BenchmarkResult, measure = :minimum)
52+
df = _benchmark_result_df(benchmark_result, :times, measure)
3353
df = stack(df, Not(:Size), variable_name = :Library, value_name = :Seconds)
3454
df.GFLOPS = @. 2e-9 * matmul_length(df.Size) ./ df.Seconds
3555
return df
@@ -61,10 +81,11 @@ function benchmark_fun!(
6181
if force_belapsed || 2t0 < BenchmarkTools.DEFAULT_PARAMETERS.seconds
6282
maybe_sleep(sleep_time)
6383
br = @benchmark $f!($C, $A, $B)
64-
tret = summarystat(br).time
65-
if summarystat === minimum # don't want to do this for `median` or `mean`, for example
66-
tret = min(tret, t0)
67-
end
84+
tmin = min(1e-9minimum(br).time, t0)
85+
tmedian = 1e-9median(br).time
86+
tmean = 1e-9mean(br).time
87+
tmax = 1e-9maximum(br).time # We'll exclude the first for this...
88+
thmean⁻¹ = 1e9mean(inv, br.times)
6889
else
6990
maybe_sleep(sleep_time)
7091
t1 = @elapsed f!(C, A, B)
@@ -73,12 +94,20 @@ function benchmark_fun!(
7394
if (t0+t1) < 4BenchmarkTools.DEFAULT_PARAMETERS.seconds
7495
maybe_sleep(sleep_time)
7596
t3 = @elapsed f!(C, A, B)
76-
tret = summarystat((t0, t1, t2, t3))
97+
tmin = minimum((t0, t1, t2, t3))
98+
tmedian = median((t0, t1, t2, t3))
99+
tmean = mean((t0, t1, t2, t3))
100+
tmax = maximum((t0, t1, t2, t3))
101+
thmean⁻¹ = mean(inv, (t0, t1, t2, t3))
77102
else
78-
tret = summarystat((t0, t1, t2))
103+
tmin = minimum((t0, t1, t2))
104+
tmedian = median((t0, t1, t2))
105+
tmean = mean((t0, t1, t2))
106+
tmax = maximum((t0, t1, t2))
107+
thmean⁻¹ = mean(inv, (t0, t1, t2))
79108
end
80109
end
81-
return tret
110+
return tmin, tmedian, tmean, tmax, thmean⁻¹
82111
end
83112
_mat_size(M, N, ::typeof(adjoint)) = (N, M)
84113
_mat_size(M, N, ::typeof(transpose)) = (N, M)
@@ -191,8 +220,7 @@ end
191220
threaded::Bool = Threads.nthreads() > 1,
192221
A_transform = identity,
193222
B_transform = identity,
194-
sleep_time = 0.0,
195-
summarystat = median)
223+
sleep_time = 0.0)
196224
197225
- T: The element type of the matrices.
198226
- libs: Libraries to benchmark.
@@ -207,9 +235,8 @@ end
207235
- sleep_time: The use of this keyword argument is discouraged. If set, it will call `sleep`
208236
in between benchmarks, the idea being to help keep the CPU cool. This is an unreliable
209237
means of trying to get more reliable benchmarks. Instead, it's recommended you disable
210-
your systems turbo. Disabling it -- and reenabling when you're done benchmarking --
238+
your systems turbo. Disabling it -- and reenabling when you're done benchmarking --
211239
should be possible without requiring a reboot.
212-
- summarystat: Which summary statistic should be reported? Defaults to `minimum`
213240
214241
"""
215242
function runbench(
@@ -241,14 +268,15 @@ function runbench(
241268
end
242269
memory = Vector{T}(undef, max_matrix_sizes)
243270
library = reduce(vcat, (libs for _ eachindex(sizevec)))
244-
times = Matrix{Float64}(undef, length(sizes), length(libs))
271+
times = Array{Float64}(undef, 5, length(sizes), length(libs))
245272
gflop = similar(times);
246273
k = 0
247274

248275
force_belapsed = true # force when compiling
249276

250277
p = Progress(length(sizes))
251-
last_perfs = Vector{Tuple{Symbol,Union{Float64,NTuple{3,Int}}}}(undef, length(libs)+1)
278+
gflop_report_type = NamedTuple{(:MedianGFLOPS, :MaxGFLOPS), Tuple{Float64, Float64}}
279+
last_perfs = Vector{Tuple{Symbol,Union{gflop_report_type,NTuple{3,Int}}}}(undef, length(libs)+1)
252280
for (j,s) enumerate(sizevec)
253281
M, K, N = matmul_sizes(s)
254282
A, off = alloc_mat(M, K, memory, 0, A_transform)
@@ -262,13 +290,22 @@ function runbench(
262290
t = benchmark_fun!(
263291
funcs[i], summarystat, C, A, B, sleep_time, force_belapsed, ref
264292
)
265-
gflops = 2e-9M*K*N / t
266-
times[j,i] = t
267-
gflop[j,i] = gflops
268-
last_perfs[i+1] = (libs[i], round(gflops,sigdigits=4))
293+
gffactor = 2e-9M*K*N
294+
@inbounds for k 1:4
295+
times[k,j,i] = t[k]
296+
gflop[k,j,i] = gffactor / t[k]
297+
end
298+
times[5,j,i] = inv(t[5])
299+
gflop[5,j,i] = gffactor * t[5]
300+
gflops = round.((gflop[1,j,i], gflop[2,j,i]), sigdigits = 4)
301+
gflops = (
302+
MedianGFLOPS = round(gflop[2,j,i], sigdigits = 4),
303+
MaxGFLOPS = round(gflop[1,j,i], sigdigits = 4)
304+
)
305+
last_perfs[i+1] = (libs[i], gflops)
269306
end
270307
ProgressMeter.next!(p; showvalues = last_perfs)
271308
force_belapsed = false
272309
end
273-
BenchmarkResult{T,eltype(sizes)}(libs, sizes, gflop, times, threaded)
310+
BenchmarkResult{T}(libs, sizes, gflop, times, threaded)
274311
end

test/interface.jl

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,31 @@
22
import BLASBenchmarksCPU
33
import StatsPlots
44
@testset "Interface" begin
5-
benchmark_result = BLASBenchmarksCPU.runbench(Float64; sizes = [1, 2, 5, 10, 20, 50, 100, 200], threaded=false, summarystat = BLASBenchmarksCPU.median) #test that threads=false at least doesn't throw somewhere.
6-
df = BLASBenchmarksCPU.benchmark_result_df(benchmark_result)
7-
@test df isa BLASBenchmarksCPU.DataFrame
8-
df[!, :Size] = Float64.(df[!, :Size]);
9-
df[!, :GFLOPS] = Float64.(df[!, :GFLOPS]);
10-
df[!, :Seconds] = Float64.(df[!, :Seconds]);
11-
p = StatsPlots.@df df StatsPlots.plot(:Size, :GFLOPS; group = :Library, legend = :bottomright)
12-
@test p isa StatsPlots.Plots.Plot
5+
benchmark_result = BLASBenchmarksCPU.runbench(Float64; sizes = [1, 2, 5, 10, 20, 50, 100, 200], threaded=false) #test that threads=false at least doesn't throw somewhere.
6+
dfmin = BLASBenchmarksCPU.benchmark_result_df(benchmark_result) # minimum
7+
dfmedian = BLASBenchmarksCPU.benchmark_result_df(benchmark_result, :median)
8+
dfmean = BLASBenchmarksCPU.benchmark_result_df(benchmark_result, :mean)
9+
dfmax = BLASBenchmarksCPU.benchmark_result_df(benchmark_result, :maximum)
10+
@test_throws ArgumentError BLASBenchmarksCPU.benchmark_result_df(benchmark_result, :foobar)
11+
@test dfmin isa BLASBenchmarksCPU.DataFrame
12+
@test dfmedian isa BLASBenchmarksCPU.DataFrame
13+
@test dfmean isa BLASBenchmarksCPU.DataFrame
14+
@test dfmax isa BLASBenchmarksCPU.DataFrame
15+
for df (dfmin,dfmedian,dfmean,dfmax)
16+
df[!, :Size] = Float64.(df[!, :Size]);
17+
df[!, :GFLOPS] = Float64.(df[!, :GFLOPS]);
18+
df[!, :Seconds] = Float64.(df[!, :Seconds]);
19+
p = StatsPlots.@df df StatsPlots.plot(:Size, :GFLOPS; group = :Library, legend = :bottomright)
20+
@test p isa StatsPlots.Plots.Plot
21+
end
22+
@test all(dfmin[!, :GFLOPS] .≥ dfmedian[!, :GFLOPS])
23+
@test all(dfmin[!, :GFLOPS] .≥ dfmean[!, :GFLOPS])
24+
@test all(dfmin[!, :GFLOPS] .≥ dfmax[!, :GFLOPS])
25+
@test any(dfmin[!, :GFLOPS] .≠ dfmedian[!, :GFLOPS])
26+
@test any(dfmin[!, :GFLOPS] .≠ dfmean[!, :GFLOPS])
27+
@test any(dfmin[!, :GFLOPS] .≠ dfmax[!, :GFLOPS])
28+
@test any(dfmedian[!, :GFLOPS] .≥ dfmax[!, :GFLOPS])
29+
@test any(dfmean[!, :GFLOPS] .≥ dfmax[!, :GFLOPS])
30+
@test any(dfmedian[!, :GFLOPS] .≠ dfmax[!, :GFLOPS])
31+
@test any(dfmean[!, :GFLOPS] .≠ dfmax[!, :GFLOPS])
1332
end

test/main.jl

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,24 @@ for T in [Float64, Float32]
1919
BLASBenchmarksCPU.plot(
2020
benchmark_result;
2121
plot_directory = plot_directory,
22+
displayplot = false
23+
)
24+
BLASBenchmarksCPU.plot(
25+
benchmark_result;
26+
plot_directory = plot_directory,
27+
measure = :median,
28+
displayplot = false
29+
)
30+
BLASBenchmarksCPU.plot(
31+
benchmark_result;
32+
plot_directory = plot_directory,
33+
measure = :mean,
34+
displayplot = false
35+
)
36+
BLASBenchmarksCPU.plot(
37+
benchmark_result;
38+
plot_directory = plot_directory,
39+
measure = :maximum,
40+
displayplot = false
2241
)
2342
end

0 commit comments

Comments
 (0)