Skip to content

Commit b65eb33

Browse files
committed
Added a benchmark, for now inlining generated functions.
1 parent 3c833cc commit b65eb33

File tree

9 files changed

+163
-55
lines changed

9 files changed

+163
-55
lines changed

benchmark/benchmarkflops.jl

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -49,17 +49,17 @@ function benchmark_gemm(sizes)
4949
n_gflop = M*K*N*2e-9
5050
br[1,i] = n_gflop / @belapsed mul!($C, $A, $B)
5151
Cblas = copy(C)
52-
br[2,i] = n_gflop / @belapsed jgemm_nkm!($C, $A, $B)
52+
br[2,i] = n_gflop / @belapsed jgemm!($C, $A, $B)
5353
@assert C Cblas "Julia gemm wrong?"
54-
br[3,i] = n_gflop / @belapsed cgemm_nkm!($C, $A, $B)
54+
br[3,i] = n_gflop / @belapsed cgemm!($C, $A, $B)
5555
@assert C Cblas "Polly gemm wrong?"
56-
br[4,i] = n_gflop / @belapsed fgemm_nkm!($C, $A, $B)
56+
br[4,i] = n_gflop / @belapsed fgemm!($C, $A, $B)
5757
@assert C Cblas "Fort gemm wrong?"
5858
br[5,i] = n_gflop / @belapsed fgemm_builtin!($C, $A, $B)
5959
@assert C Cblas "Fort intrinsic gemm wrong?"
60-
br[6,i] = n_gflop / @belapsed icgemm_nkm!($C, $A, $B)
60+
br[6,i] = n_gflop / @belapsed icgemm!($C, $A, $B)
6161
@assert C Cblas "icc gemm wrong?"
62-
br[7,i] = n_gflop / @belapsed ifgemm_nkm!($C, $A, $B)
62+
br[7,i] = n_gflop / @belapsed ifgemm!($C, $A, $B)
6363
@assert C Cblas "ifort gemm wrong?"
6464
br[8,i] = n_gflop / @belapsed ifgemm_builtin!($C, $A, $B)
6565
@assert C Cblas "ifort intrinsic gemm wrong?"
@@ -83,21 +83,21 @@ function benchmark_AtmulB(sizes)
8383
n_gflop = M*K*N*2e-9
8484
br[1,i] = n_gflop / @belapsed mul!($C, $At', $B)
8585
Cblas = copy(C)
86-
br[2,i] = n_gflop / @belapsed jAtmulB!($C, $At, $B)
86+
br[2,i] = n_gflop / @belapsed jgemm!($C, $At', $B)
8787
@assert C Cblas "Julia gemm wrong?"
88-
br[3,i] = n_gflop / @belapsed cAtmulB!($C, $At, $B)
88+
br[3,i] = n_gflop / @belapsed cgemm!($C, $At', $B)
8989
@assert C Cblas "Polly gemm wrong?"
90-
br[4,i] = n_gflop / @belapsed fAtmulB!($C, $At, $B)
90+
br[4,i] = n_gflop / @belapsed fgemm!($C, $At', $B)
9191
@assert C Cblas "Fort gemm wrong?"
92-
br[5,i] = n_gflop / @belapsed fAtmulB_builtin!($C, $At, $B)
92+
br[5,i] = n_gflop / @belapsed fgemm_builtin!($C, $At', $B)
9393
@assert C Cblas "Fort intrinsic gemm wrong?"
94-
br[6,i] = n_gflop / @belapsed cAtmulB!($C, $At, $B)
94+
br[6,i] = n_gflop / @belapsed icgemm!($C, $At', $B)
9595
@assert C Cblas "icc gemm wrong?"
96-
br[7,i] = n_gflop / @belapsed ifAtmulB!($C, $At, $B)
96+
br[7,i] = n_gflop / @belapsed ifgemm!($C, $At', $B)
9797
@assert C Cblas "ifort gemm wrong?"
98-
br[8,i] = n_gflop / @belapsed ifAtmulB_builtin!($C, $At, $B)
98+
br[8,i] = n_gflop / @belapsed ifgemm_builtin!($C, $At', $B)
9999
@assert C Cblas "ifort intrinsic gemm wrong?"
100-
br[9,i] = n_gflop / @belapsed jAtmulBavx!($C, $At, $B)
100+
br[9,i] = n_gflop / @belapsed gemmavx!($C, $At', $B)
101101
@assert C Cblas "LoopVec gemm wrong?"
102102
# if i % 10 == 0
103103
# percent_complete = round(100i/ length(sizes), sigdigits = 4)
@@ -223,11 +223,14 @@ function sse!(Xβ, y, X, β)
223223
mul!(copyto!(Xβ, y), X, β, 1.0, -1.0)
224224
dot(Xβ, Xβ)
225225
end
226+
# Turn one benchmark size entry into the `(N, P)` pair consumed by `benchmark_sse`.
# A two-element tuple is already such a pair and is handed back untouched.
function sse_totwotuple(s::NTuple{2})
    return s
end
# An integer `s` expands to `((3s) >> 1, s >> 1)` (arithmetic right shifts).
function sse_totwotuple(s::Integer)
    tripled = 3s
    return (tripled >> 1, s >> 1)
end
228+
226229
function benchmark_sse(sizes)
227230
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
228231
br = BenchmarkResult(tests, sizes)
229232
for (i,s) enumerate(sizes)
230-
N, P = totwotuple(s)
233+
N, P = sse_totwotuple(s)
231234
y = rand(N); β = rand(P)
232235
X = randn(N, P)
233236
= similar(y)
@@ -338,3 +341,32 @@ function benchmark_AplusAt(sizes)
338341
br
339342
end
340343

344+
"""
    benchmark_random_access(sizes)

Time the indirect-indexing kernel (`randomaccess` and its compiled / `@avx` counterparts)
once per entry of `sizes` and return the filled `BenchmarkResult`.

For each size `s`, `totwotuple(s)` supplies `(A, C)`; the inputs are a random `A × C`
matrix `P`, an `A × C` matrix `basis` of indices drawn from `1:C`, and `C` random
coefficients. Row `k` of the result holds `n_gflop / seconds` for implementation
`tests[k]`, where `n_gflop = 1e-9 * (A*C + C)`. Every non-Julia implementation is
checked against the Julia result with `≈`.
"""
function benchmark_random_access(sizes)
    tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
    br = BenchmarkResult(tests, sizes)
    for (i, s) ∈ enumerate(sizes)
        A, C = totwotuple(s)
        P = rand(A, C)
        basis = rand(1:C, A, C)
        coefs = randn(C)
        n_gflop = 1e-9 * (A * C + C)
        # Reference value that every other implementation is compared against.
        p = randomaccess(P, basis, coefs)
        br[1,i] = n_gflop / @belapsed randomaccess($P, $basis, $coefs)
        br[2,i] = n_gflop / @belapsed crandomaccess($P, $basis, $coefs)
        @assert p ≈ crandomaccess(P, basis, coefs) "Clang wrong?"
        br[3,i] = n_gflop / @belapsed frandomaccess($P, $basis, $coefs)
        @assert p ≈ frandomaccess(P, basis, coefs) "Fort wrong?"
        br[4,i] = n_gflop / @belapsed icrandomaccess($P, $basis, $coefs)
        @assert p ≈ icrandomaccess(P, basis, coefs) "icc wrong?"
        br[5,i] = n_gflop / @belapsed ifrandomaccess($P, $basis, $coefs)
        @assert p ≈ ifrandomaccess(P, basis, coefs) "ifort wrong?"
        # `randomaccessavx` applies `@avx` to its own loop, so it is timed as an
        # ordinary call, like the other LoopVectorization benchmarks in this file.
        br[6,i] = n_gflop / @belapsed randomaccessavx($P, $basis, $coefs)
        @assert p ≈ randomaccessavx(P, basis, coefs) "LoopVec wrong?"
        # if i % 10 == 0
        #     percent_complete = round(100i/ length(sizes), sigdigits = 4)
        #     @show percent_complete
        # end
    end
    br
end
372+

benchmark/driver.jl

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ include(joinpath(LOOPVECBENCHDIR, "plotbenchmarks.jl"))
1010

1111
using Distributed
1212

13-
addprocs(10);
13+
addprocs(11);
1414

1515
@everywhere begin
1616
pkgdir(pkg::String) = abspath(joinpath(dirname(Base.find_package(pkg)), ".."))
@@ -29,11 +29,13 @@ sse_future = @spawnat 8 benchmark_sse(2:256);
2929
exp_future = @spawnat 9 benchmark_exp(2:256);
3030
aplusBc_future = @spawnat 10 benchmark_aplusBc(2:256);
3131
AplusAt_future = @spawnat 11 benchmark_AplusAt(2:256);
32+
randomaccess_future = @spawnat 12 benchmark_random_access(2:256);
3233

3334
dot_bench = fetch(dot_future)
3435
selfdot_bench = fetch(selfdot_future)
3536
AplusAt_bench = fetch(AplusAt_future)
3637
gemv_bench = fetch(gemv_future)
38+
randomaccess_bench = fetch(randomaccess_future)
3739
dot3_bench = fetch(dot3_future)
3840
sse_bench = fetch(sse_future)
3941
exp_bench = fetch(exp_future)
@@ -42,6 +44,21 @@ gemm_bench = fetch(gemm_future)
4244
AtmulB_bench = fetch(AtmulB_future)
4345

4446

47+
# Version tag embedded in every output file name (`bench_<name>_v<v>.png`).
v = 1
# Output directory: the current user's `Pictures` folder, resolved through `homedir()`
# rather than a machine-specific absolute path.
const PICTURES = joinpath(homedir(), "Pictures")
# Render and save one plot per fetched benchmark result, in the original order.
for (name, bench) ∈ (
        "gemm" => gemm_bench,
        "AtmulB" => AtmulB_bench,
        "dot" => dot_bench,
        "selfdot" => selfdot_bench,
        "gemv" => gemv_bench,
        "dot3" => dot3_bench,
        "sse" => sse_bench,
        "exp" => exp_bench,
        "aplusBc" => aplusBc_bench,
        "AplusAt" => AplusAt_bench,
        "random_access" => randomaccess_bench,
    )
    save(joinpath(PICTURES, "bench_$(name)_v$v.png"), plot(bench))
end
60+
61+
4562
plot(gemm_bench)
4663
plot(AtmulB_bench)
4764
plot(dot_bench)
@@ -53,13 +70,5 @@ plot(exp_bench)
5370
plot(aplusBc_bench)
5471
plot(AplusAt_bench)
5572

56-
save(joinpath("~/Pictures", "bench_gemm_v3.png"), plot(gemm_bench));
57-
save(joinpath("~/Pictures", "bench_AtmulB_v3.png"), plot(AtmulB_bench));
58-
save(joinpath("~/Pictures", "bench_dot_v3.png"), plot(dot_bench));
59-
save(joinpath("~/Pictures", "bench_selfdot_v3.png"), plot(selfdot_bench));
60-
save(joinpath("~/Pictures", "bench_gemv_v3.png"), plot(gemv_bench));
61-
save(joinpath("~/Pictures", "bench_dot3_v3.png"), plot(dot3_bench));
62-
save(joinpath("~/Pictures", "bench_sse_v3.png"), plot(sse_bench));
63-
save(joinpath("~/Pictures", "bench_exp_v3.png"), plot(exp_bench));
64-
save(joinpath("~/Pictures", "bench_aplusBc_v3.png"), plot(aplusBc_bench));
73+
6574

benchmark/loadsharedlibs.jl

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ for (prefix,Cshared,Fshared) ∈ ((Symbol(""),LIBCTEST,LIBFTEST), (:i,LIBICTEST,
4747
)
4848
end
4949
end
50-
51-
50+
# Generic three-argument entry points for the current `prefix`: each one simply
# forwards to the corresponding `*_nkm!` wrapper. (`Adjoint` first operands are
# handled by separate, more specific methods.)
@eval @inline function $(Symbol(prefix,:cgemm!))(C, A, B)
    return $(Symbol(prefix, :cgemm_nkm!))(C, A, B)
end
@eval @inline function $(Symbol(prefix,:fgemm!))(C, A, B)
    return $(Symbol(prefix, :fgemm_nkm!))(C, A, B)
end
5252
@eval function $(Symbol(prefix,:fgemm_builtin!))(C, A, B)
5353
M, N = size(C); K = size(B, 1)
5454
ccall(
@@ -57,28 +57,28 @@ for (prefix,Cshared,Fshared) ∈ ((Symbol(""),LIBCTEST,LIBFTEST), (:i,LIBICTEST,
5757
C, A, B, Ref(M), Ref(K), Ref(N)
5858
)
5959
end
60-
@eval function $(Symbol(prefix,:cAtmulB!))(C, A, B)
60+
@eval @inline function $(Symbol(prefix,:cgemm!))(C, A::Adjoint, B)
6161
M, N = size(C); K = size(B, 1)
6262
ccall(
6363
(:AtmulB, $Cshared), Cvoid,
6464
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Clong, Clong, Clong),
65-
C, A, B, M, K, N
65+
C, parent(A), B, M, K, N
6666
)
6767
end
68-
@eval function $(Symbol(prefix,:fAtmulB!))(C, A, B)
68+
@eval @inline function $(Symbol(prefix,:fgemm!))(C, A::Adjoint, B)
6969
M, N = size(C); K = size(B, 1)
7070
ccall(
7171
(:AtmulB, $Fshared), Cvoid,
7272
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
73-
C, A, B, Ref(M), Ref(K), Ref(N)
73+
C, parent(A), B, Ref(M), Ref(K), Ref(N)
7474
)
7575
end
76-
@eval function $(Symbol(prefix,:fAtmulB_builtin!))(C, A, B)
76+
@eval @inline function $(Symbol(prefix,:fgemm_builtin!))(C, A::Adjoint, B)
7777
M, N = size(C); K = size(B, 1)
7878
ccall(
7979
(:AtmulBbuiltin, $Fshared), Cvoid,
8080
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
81-
C, A, B, Ref(M), Ref(K), Ref(N)
81+
C, parent(A), B, Ref(M), Ref(K), Ref(N)
8282
)
8383
end
8484

@@ -247,4 +247,23 @@ for (prefix,Cshared,Fshared) ∈ ((Symbol(""),LIBCTEST,LIBFTEST), (:i,LIBICTEST,
247247
B, A, N
248248
)
249249
end
250+
# Julia wrapper around the `randomaccess` symbol of the C shared library `Cshared`.
# The first two dimensions of `P` are passed by value as `Clong`; the arrays are
# passed as raw pointers, and the `Float64` produced by the C routine is returned.
@eval function $(Symbol(prefix,:crandomaccess))(P, basis, coefs)
    nrow, ncol = size(P)
    result = ccall(
        (:randomaccess, $Cshared), Float64,
        (Ptr{Float64}, Ptr{Clong}, Ptr{Float64}, Clong, Clong),
        P, basis, coefs, nrow, ncol
    )
    return result
end
258+
# Julia wrapper around the Fortran `randomaccess` subroutine.
# The subroutine writes its result through its first argument and receives the
# dimensions by reference, so it must be looked up in the Fortran shared library
# (`Fshared`). The C library exports a symbol with the same name but a different
# signature (value-returning, dimensions by value).
@eval function $(Symbol(prefix,:frandomaccess))(P, basis, coefs)
    A, C = size(P)
    # Output slot filled in by the subroutine.
    p = Ref{Float64}()
    ccall(
        (:randomaccess, $Fshared), Cvoid,
        (Ref{Float64}, Ptr{Float64}, Ptr{Clong}, Ptr{Float64}, Ref{Clong}, Ref{Clong}),
        p, P, basis, coefs, Ref(A), Ref(C)
    )
    p[]
end
268+
250269
end

benchmark/looptests.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,4 +178,16 @@ void AplusAt(double* restrict B, double* restrict A, long N){
178178
}
179179
}
180180
}
181+
/*
 * Returns the sum over c in [0, C) of coefs[c] multiplied by the product, over
 * a in [0, A), of P[a + (basis[a + c*A] - 1)*A].
 * Both P and basis are addressed column-major with leading dimension A; the
 * entries of basis are 1-based column numbers, hence the "- 1" in the offset.
 */
double randomaccess(double* restrict P, long* restrict basis, double* restrict coefs, long A, long C){
  double total = 0.0;
  long col = 0;
  while (col < C){
    /* Start of column `col` inside basis. */
    const long* picks = basis + col*A;
    double term = coefs[col];
    long row = 0;
    while (row < A){
      term *= P[row + (picks[row] - 1)*A];
      row++;
    }
    total += term;
    col++;
  }
  return total;
}
192+
181193

benchmark/looptests.f90

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,4 +253,20 @@ subroutine AplusAtbuiltin(B, A, N) BIND(C, name="AplusAtbuiltin")
253253
real(C_double), dimension(N,N), intent(in) :: A
254254
B = A + transpose(A)
255255
end subroutine AplusAtbuiltin
256+
! Stores in pp the sum, over columns col = 1..C, of coefs(col) multiplied by the
! product, over rows row = 1..A, of P(row, basis(row, col)).
! Exported with the C binding name "randomaccess"; all arguments are by reference.
subroutine randomaccess(pp, P, basis, coefs, A, C) BIND(C, name="randomaccess")
  integer(C_long), intent(in) :: A, C
  real(C_double), intent(in) :: P(A,C), coefs(C)
  integer(C_long), intent(in) :: basis(A,C)
  real(C_double), intent(out) :: pp
  real(C_double) :: term
  integer(C_long) :: row, col
  pp = 0.0_C_double
  columns: do col = 1, C
     ! Running product for this column, seeded with its coefficient.
     term = coefs(col)
     rows: do row = 1, A
        term = term * P(row, basis(row, col))
     end do rows
     pp = pp + term
  end do columns
end subroutine randomaccess
256272
end module looptests

benchmark/looptests.jl

Lines changed: 32 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
using LoopVectorization, LinearAlgebra
22
BLAS.set_num_threads(1)
33

4-
function jgemm_nkm!(C, A, B)
4+
function jgemm!(C, A, B)
55
C .= 0
66
M, N = size(C); K = size(B,1)
77
@inbounds for n 1:N, k 1:K
@@ -10,16 +10,17 @@ function jgemm_nkm!(C, A, B)
1010
end
1111
end
1212
end
13-
function gemmavx!(C, A, B)
14-
@avx for i 1:size(A,1), j 1:size(B,2)
15-
Cᵢⱼ = 0.0
13+
@inline function gemmavx!(C, A, B)
14+
@avx inline=true for i 1:size(A,1), j 1:size(B,2)
15+
Cᵢⱼ = zero(eltype(C))
1616
for k 1:size(A,2)
1717
Cᵢⱼ += A[i,k] * B[k,j]
1818
end
1919
C[i,j] = Cᵢⱼ
2020
end
2121
end
22-
function jAtmulB!(C, A, B)
22+
@inline function jgemm!(C, Aᵀ::Adjoint, B)
23+
A = parent(Aᵀ)
2324
@inbounds for n 1:size(C,2), m 1:size(C,1)
2425
Cₘₙ = zero(eltype(C))
2526
@simd ivdep for k 1:size(A,1)
@@ -28,15 +29,6 @@ function jAtmulB!(C, A, B)
2829
C[m,n] = Cₘₙ
2930
end
3031
end
31-
function jAtmulBavx!(C, A, B)
32-
@avx for n 1:size(C,2), m 1:size(C,1)
33-
Cₘₙ = zero(eltype(C))
34-
for k 1:size(A,1)
35-
Cₘₙ += A[k,m] * B[k,n]
36-
end
37-
C[m,n] = Cₘₙ
38-
end
39-
end
4032
function jdot(a, b)
4133
s = 0.0
4234
@inbounds @simd ivdep for i eachindex(a, b)
@@ -169,6 +161,32 @@ function jOLSlp_avx(y, X, β)
169161
end
170162
lp
171163
end
164+
"""
    randomaccess(P, basis, coeffs::Vector{T}) where {T}

Return `Σ_c coeffs[c] * Π_a P[a, basis[a, c]]`, with `c` running over
`1:length(coeffs)` and `a` over `1:size(P, 1)`.

This is the plain-Julia baseline of the random-access benchmark, so it deliberately
does not use `@avx` (that variant is `randomaccessavx`). Bounds checks are disabled
with `@inbounds`: every `basis[a, c]` must be a valid column index of `P`, and `basis`
must have at least `size(P, 1)` rows and `length(coeffs)` columns.
"""
function randomaccess(P, basis, coeffs::Vector{T}) where {T}
    C = length(coeffs)
    A = size(P, 1)
    p = zero(T)
    @inbounds for c ∈ 1:C
        # Running product for column `c`, seeded with its coefficient.
        pc = coeffs[c]
        for a ∈ 1:A
            pc *= P[a, basis[a, c]]
        end
        p += pc
    end
    return p
end
177+
"""
    randomaccessavx(P, basis, coeffs::Vector{T}) where {T}

Return `Σ_c coeffs[c] * Π_a P[a, basis[a, c]]`, with `c` running over
`1:length(coeffs)` and `a` over `1:size(P, 1)`; the loop nest is handed to
LoopVectorization's `@avx` macro.
"""
function randomaccessavx(P, basis, coeffs::Vector{T}) where {T}
    C = length(coeffs)
    A = size(P, 1)
    p = zero(T)
    @avx for c ∈ 1:C
        # Running product for column `c`, seeded with its coefficient.
        pc = coeffs[c]
        for a = 1:A
            pc *= P[a, basis[a, c]]
        end
        p += pc
    end
    return p
end
172190

173191

174192

benchmark/plotbenchmarks.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ function plot(br::BenchmarkResult)
1717
:line,
1818
x = :Size,
1919
y = :GFLOPS,
20-
color = :Method
20+
color = :Method,
21+
width = 549,
22+
height = 372
2123
)
2224
end
2325

src/constructors.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ function check_unroll(arg)
116116
a1 === :unroll || return nothing
117117
convert(Int8, arg.args[2])
118118
end
119-
function check_macro_kwarg(arg, inline::Int8 = one(Int8), U::Int8 = zero(Int8), T::Int8 = zero(Int8))
119+
function check_macro_kwarg(arg, inline::Int8 = Int8(2), U::Int8 = zero(Int8), T::Int8 = zero(Int8))
120120
@assert arg.head === :(=)
121121
i = check_inline(arg)
122122
if i !== nothing

test/runtests.jl

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1053,15 +1053,15 @@ end
10531053
return p
10541054
end
10551055
function mvpavx(P, basis, coeffs::Vector{T}) where {T}
1056-
len_c = length(coeffs)
1057-
len_P = size(P, 1)
1056+
C = length(coeffs)
1057+
A = size(P, 1)
10581058
p = zero(T)
1059-
@avx for n = 1:len_c
1060-
pn = coeffs[n]
1061-
for a = 1:len_P
1062-
pn *= P[a, basis[a, n]]
1059+
@avx for c 1:C
1060+
pc = coeffs[c]
1061+
for a = 1:A
1062+
pc *= P[a, basis[a, c]]
10631063
end
1064-
p += pn
1064+
p += pc
10651065
end
10661066
return p
10671067
end

0 commit comments

Comments
 (0)