Skip to content

Commit 5d64b1d

Browse files
BLIS now provides 64 bit binaries, add a LoopVectorization matmul (#52)
* BLIS now provides 64 bit binaries, add a default LoopVectorization matrix multiply
* Bump version
* Require LoopVectorization 0.12
* Require Gaius 0.6.4
* Update ci.yml

Co-authored-by: Dilum Aluthge <[email protected]>
1 parent 5064398 commit 5d64b1d

File tree

6 files changed

+42
-22
lines changed

6 files changed

+42
-22
lines changed

.github/workflows/ci.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ jobs:
6060
Pkg.develop(PackageSpec(path=pwd()))
6161
Pkg.instantiate()'
6262
shell: bash
63+
env:
64+
JULIA_PKG_SERVER: ""
6365
- run: |
6466
julia --project=docs -e '
6567
using Documenter: doctest

Project.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "BLASBenchmarksCPU"
22
uuid = "5fdc822c-4560-4d20-af7e-e5ee461714d5"
33
authors = ["Chris Elrod <[email protected]> and contributors"]
4-
version = "0.3.1"
4+
version = "0.3.2"
55

66
[deps]
77
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
@@ -29,12 +29,12 @@ Colors = "0.12"
2929
DataFrames = "0.22"
3030
Fontconfig = "0.4"
3131
Gadfly = "1.3"
32-
Gaius = "0.5,0.6"
33-
LoopVectorization = "0.10, 0.11"
32+
Gaius = "0.6.4"
33+
LoopVectorization = "0.12"
3434
Octavian = "0.2"
3535
ProgressMeter = "1.4"
3636
Tullio = "0.2"
37-
VectorizationBase = "0.16, 0.17, 0.18, 0.19"
37+
VectorizationBase = "0.19"
3838
julia = "1.5"
3939

4040
[extras]

src/benchconfig.jl

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,27 @@
33
function tmul_threads!(C, A, B)
44
@tullio C[m,n] = A[m,k] * B[k,n]
55
end
6-
76
function tmul_no_threads!(C, A, B)
87
@tullio C[m,n] = A[m,k] * B[k,n] threads=false
98
end
9+
function lvmul_threads!(C, A, B)
10+
@avxt for n ∈ indices((C,B), 2), m ∈ indices((C,A), 1)
11+
Cmn = zero(eltype(C))
12+
for k ∈ indices((A,B), (2,1))
13+
Cmn += A[m,k] * B[k,n]
14+
end
15+
C[m,n] = Cmn
16+
end
17+
end
18+
function lvmul_no_threads!(C, A, B)
19+
@avx for n ∈ indices((C,B), 2), m ∈ indices((C,A), 1)
20+
Cmn = zero(eltype(C))
21+
for k ∈ indices((A,B), (2,1))
22+
Cmn += A[m,k] * B[k,n]
23+
end
24+
C[m,n] = Cmn
25+
end
26+
end
1027

1128
function generic_matmul!(C, A, B)
1229
istransposed(C) === 'N' || (generic_matmul!(untransposed(C), _transpose(B), _transpose(A)); return C)
@@ -17,6 +34,8 @@ function generic_matmul!(C, A, B)
1734
LinearAlgebra.generic_matmatmul!(C, transA, transB, pA, pB)
1835
end
1936

37+
38+
2039
function getfuncs(libs::Vector{Symbol}, threaded::Bool)::Vector{Function}
2140
map(libs) do i
2241
if i === :MKL
@@ -31,6 +50,8 @@ function getfuncs(libs::Vector{Symbol}, threaded::Bool)::Vector{Function}
3150
threaded ? tmul_threads! : tmul_no_threads!
3251
elseif i === :Gaius
3352
threaded ? Gaius.mul! : Gaius.mul_serial!
53+
elseif i === :LoopVectorization
54+
threaded ? lvmul_threads! : lvmul_no_threads!
3455
elseif i === :generic || i === :Generic || i === :GENERIC
3556
generic_matmul!
3657
else

src/ccallblas.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ _transpose(A::Transpose) = transpose(A)
1515
for (name,typ,suff) ∈ [
1616
("mkl", :Int32, ""),
1717
("openblas", :Int64, "_64_"),
18-
("blis", :Int32, "_")
18+
("blis", :Int64, "_64_")
1919
]
2020
uname = uppercase(name)
2121
lib = Symbol("lib", uname)

src/plotting.jl

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
####################################### Colors #####################################################
44
####################################################################################################
55

6-
const LIBRARIES = [:Octavian, :MKL, :OpenBLAS, :blis, :Tullio, :Gaius, :Generic];
6+
const LIBRARIES = [:Octavian, :MKL, :OpenBLAS, :blis, :Tullio, :Gaius, :LoopVectorization, :Generic];
77
"""
88
Defines the mapping between libraries and colors
99
"""# #0071c5 == Intel Blue
@@ -27,14 +27,14 @@ isjulialib(x) = x ∈ JULIA_LIBS
2727

2828

2929
function pick_suffix(desc = "")
30-
suffix = if VectorizationBase.has_feature("x86_64_avx512f")
30+
suffix = if Bool(VectorizationBase.has_feature(Val(:x86_64_avx512f)))
3131
"AVX512"
32-
elseif VectorizationBase.has_feature("x86_64_avx2")
32+
elseif Bool(VectorizationBase.has_feature(Val(:x86_64_avx2)))
3333
"AVX2"
34-
elseif VectorizationBase.has_feature("x86_64_avx")
34+
elseif Bool(VectorizationBase.has_feature(Val(:x86_64_avx)))
3535
"AVX"
3636
else
37-
"REGSIZE$(VectorizationBase.register_size())"
37+
"REGSIZE$(Int(VectorizationBase.register_size()))"
3838
end
3939
if desc != ""
4040
suffix *= '_' * desc
@@ -90,7 +90,6 @@ end
9090
"""
9191
function Gadfly.plot(br::BenchmarkResult{T}; kwargs...) where {T}
9292
_plot(br; kwargs...)
93-
return nothing
9493
end
9594
roundint(x) = round(Int,x)
9695
# `_plot` is just like `plot`, except _plot returns the filenames

src/runbenchmark.jl

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@ struct BenchmarkResult{T,I<:Union{Int,NTuple{3,Int}}}
44
gflops::Array{Float64,3}
55
times::Array{Float64,3}
66
threaded::Bool
7-
function BenchmarkResult{T}(libraries, sizes, gflops, times, threaded) where {T}
8-
gflopsperm = permutedims(gflops, (2,3,1))
9-
timesperm = permutedims(times, (2,3,1))
10-
I = eltype(sizes)
11-
new{T,I}(libraries, convert(Vector{I},sizes), gflopsperm, timesperm, threaded)
12-
end
7+
end
8+
function BenchmarkResult{T}(libraries, sizes, gflops, times, threaded) where {T}
9+
gflopsperm = permutedims(gflops, (2,3,1))
10+
timesperm = permutedims(times, (2,3,1))
11+
I = eltype(sizes)
12+
BenchmarkResult{T,I}(libraries, convert(Vector{I},sizes), gflopsperm, timesperm, threaded)
1313
end
1414

1515
"""
@@ -74,7 +74,6 @@ end
7474

7575
function benchmark_fun!(
7676
f!::F,
77-
summarystat,
7877
C,
7978
A,
8079
B,
@@ -203,6 +202,7 @@ function all_libs()
203202
:Octavian,
204203
:OpenBLAS,
205204
:Tullio,
205+
:LoopVectorization
206206
]
207207
return libs
208208
end
@@ -258,8 +258,7 @@ function runbench(
258258
threaded::Bool = Threads.nthreads() > 1,
259259
A_transform = identity,
260260
B_transform = identity,
261-
sleep_time = 0.0,
262-
summarystat = minimum
261+
sleep_time = 0.0
263262
) where {T}
264263
if threaded
265264
mkl_set_num_threads(num_cores())
@@ -303,7 +302,6 @@ function runbench(
303302
comment = "lib=$(lib), M=$(M), K=$(K), N=$(N)"
304303
t = benchmark_fun!(
305304
funcs[i],
306-
summarystat,
307305
C,
308306
A,
309307
B,

0 commit comments

Comments
 (0)