Skip to content

Commit 21cb787

Browse files
committed
dgemmjit performs identically to regular MKL, which suggests that (a) the JIT isn't working (compiler warnings about ignored preprocessor directives reinforce this), and (b) it at least provides a way to benchmark both OpenBLAS and MKL.
1 parent 47f2789 commit 21cb787

File tree

5 files changed

+84
-26
lines changed

5 files changed

+84
-26
lines changed

benchmark/benchmarkflops.jl

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,9 @@ function matmul_bench!(br, C, A, B, i)
5252
@assert C Cblas "eigen gemm wrong?"; fill!(C, NaN)
5353
br[10,i] = n_gflop / @belapsed iegemm!($C, $A, $B)
5454
@assert C Cblas "i-eigen gemm wrong?"; fill!(C, NaN)
55-
br[11,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
55+
br[11,i] = n_gflop / @belapsed dgemmjit!($C, $A, $B)
56+
@assert C Cblas "MKL JIT gemm wrong?"; fill!(C, NaN)
57+
br[12,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
5658
@assert C Cblas "LoopVec gemm wrong?"
5759
end
5860
function A_mul_B_bench!(br, s, i)
@@ -97,7 +99,7 @@ const BLASTESTS = [
9799
"GFortran", "GFort-intrinsic",
98100
"icc", "ifort", "ifort-intrinsic",
99101
"Clang++ & Eigen-3", "icpc & Eigen-3",
100-
"LoopVectorization"
102+
"MKL JIT", "LoopVectorization"
101103
]
102104

103105
function benchmark_AmulB(sizes)
@@ -209,7 +211,9 @@ function gemv_bench!(br, x, A, y, i)
209211
@assert x xblas "eigen wrong?"; fill!(x, NaN);
210212
br[10,i] = n_gflop / @belapsed iegemv!($x, $A, $y)
211213
@assert x xblas "i-eigen wrong?"; fill!(x, NaN);
212-
br[11,i] = n_gflop / @belapsed jgemvavx!($x, $A, $y)
214+
br[11,i] = n_gflop / @belapsed dgemmjit!($x, $A, $y)
215+
@assert x xblas "gemmjit wrong?"; fill!(x, NaN);
216+
br[12,i] = n_gflop / @belapsed jgemvavx!($x, $A, $y)
213217
@assert x xblas "LoopVec wrong?"
214218
end
215219
function A_mul_vb_bench!(br, s, i)

benchmark/directcalljit.f90

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
! The MKL direct-call header contains preprocessor directives, so it has to be
! pulled in with the preprocessor form "#include" (the build already passes
! -cpp / -fpp together with -DMKL_DIRECT_CALL_SEQ_JIT). A Fortran "include"
! line is expanded by the compiler proper, after preprocessing, which leaves
! those directives unprocessed and the direct-call/JIT path disabled.
! NOTE(review): the macro-expanded dgemm call may exceed the default
! free-form line length with gfortran; confirm against the build flags.
#include "/opt/intel/mkl/include/mkl_direct_call.fi"

module jitmul

  use ISO_C_BINDING
  implicit none

contains

  ! Compute C = op(A) * op(B) through DGEMM with alpha = 1 and beta = 0.
  !
  !   C      : M x N result, overwritten.
  !   A, B   : operands as stored in memory (column-major). When At (Bt) is 1
  !            the stored array is the transpose of the operand, i.e. K x M
  !            for A and N x K for B; otherwise M x K and K x N.
  !   M,K,N  : dimensions of the product, (M x K) * (K x N).
  !   At, Bt : 1 selects op(X) = transpose(X); any other value selects
  !            op(X) = X.
  subroutine dgemmjit(C,A,B,M,K,N,At,Bt) bind(C, name = "dgemmjit")
    integer(C_int32_t), intent(in) :: M, K, N
    integer(C_int8_t), intent(in) :: At, Bt
    real(C_double), parameter :: alpha = 1.0D0, beta = 0.0D0
    ! Assumed-size: the stored shape of A and B depends on At / Bt.
    real(C_double), dimension(*), intent(in) :: A
    real(C_double), dimension(*), intent(in) :: B
    real(C_double), dimension(M,N), intent(out) :: C
    character :: Atc, Btc
    integer(C_int32_t) :: lda, ldb, ldc

    ! The leading dimension is the row count of the array as stored, which
    ! differs between the transposed and non-transposed cases. DGEMM also
    ! requires every leading dimension to be at least 1.
    if (At == 1_C_int8_t) then
      Atc = 'T'
      lda = max(1_C_int32_t, K)
    else
      Atc = 'N'
      lda = max(1_C_int32_t, M)
    end if
    if (Bt == 1_C_int8_t) then
      Btc = 'T'
      ldb = max(1_C_int32_t, N)
    else
      Btc = 'N'
      ldb = max(1_C_int32_t, K)
    end if
    ldc = max(1_C_int32_t, M)

    call dgemm(Atc, Btc, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
  end subroutine dgemmjit

end module jitmul

benchmark/driver.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ save(joinpath(PICTURES, "bench_AplusAt_v$v.$filetype"), plot(AplusAt_bench));
5151
save(joinpath(PICTURES, "bench_Amulvb_v$v.$filetype"), plot(Amulvb_bench));
5252
save(joinpath(PICTURES, "bench_Atmulvb_v$v.$filetype"), plot(Atmulvb_bench));
5353
save(joinpath(PICTURES, "bench_exp_v$v.$filetype"), plot(exp_bench));
54-
save(joinpath(PICTURES, "bench_logdettriangle_v$v.$filetype"), plot(logdettriangle_bench));
5554
save(joinpath(PICTURES, "bench_random_access_v$v.$filetype"), plot(randomaccess_bench));
55+
save(joinpath(PICTURES, "bench_logdettriangle_v$v.$filetype"), plot(logdettriangle_bench));
5656

5757

5858

benchmark/loadsharedlibs.jl

Lines changed: 37 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
using LinearAlgebra
22
using LoopVectorization.VectorizationBase: REGISTER_SIZE
33

44
pkgdir(pkg::String) = abspath(joinpath(dirname(Base.find_package(pkg)), ".."))
@@ -11,6 +11,7 @@ const LIBICTEST = joinpath(LOOPVECBENCHDIR, "libictests.so")
1111
const LIBIFTEST = joinpath(LOOPVECBENCHDIR, "libiftests.so")
1212
const LIBEIGENTEST = joinpath(LOOPVECBENCHDIR, "libetest.so")
1313
const LIBIEIGENTEST = joinpath(LOOPVECBENCHDIR, "libietest.so")
14+
const LIBDIRECTCALLJIT = joinpath(LOOPVECBENCHDIR, "libdcjtest.so")
1415

1516
# requires Clang with polly to build
1617
cfile = joinpath(LOOPVECBENCHDIR, "looptests.c")
@@ -23,7 +24,7 @@ end
2324
ffile = joinpath(LOOPVECBENCHDIR, "looptests.f90")
2425
if !isfile(LIBFTEST) || mtime(ffile) > mtime(LIBFTEST)
2526
# --param max-unroll-times defaults to ≥8, which is generally excessive
26-
run(`gfortran -Ofast -march=native -funroll-loops --param max-unroll-times=4 -floop-nest-optimize -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $ffile -o $LIBFTEST`)
27+
run(`gfortran -Ofast -march=native -funroll-loops -floop-nest-optimize -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $ffile -o $LIBFTEST`)
2728
end
2829
if !isfile(LIBIFTEST) || mtime(ffile) > mtime(LIBIFTEST)
2930
run(`ifort -fast -qopt-zmm-usage=high -qoverride-limits -shared -fPIC $ffile -o $LIBIFTEST`)
@@ -39,6 +40,26 @@ if !isfile(LIBIEIGENTEST) || mtime(eigenfile) > mtime(LIBIEIGENTEST)
3940
run(`icpc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
4041
end
4142

43+
directcalljitfile = joinpath(LOOPVECBENCHDIR, "directcalljit.f90")
44+
if !isfile(LIBDIRECTCALLJIT) || mtime(directcalljitfile) > mtime(LIBDIRECTCALLJIT)
45+
# run(`ifort -fast -DMKL_DIRECT_CALL_SEQ_JIT -fpp -qopt-zmm-usage=high -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
46+
run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
47+
end
48+
49+
# Predicate used to pick the BLAS transpose flag: `true` for the lazy
# `Adjoint` / `Transpose` wrappers, `false` for every other argument.
istransposed(::Any) = false
istransposed(::Union{Adjoint,Transpose}) = true
52+
"""
    dgemmjit!(C, A, B)

Overwrite `C` with the product `A * B` by calling the `dgemmjit` routine from
the shared library `LIBDIRECTCALLJIT`.

`A` and `B` may be wrapped in `Adjoint` or `Transpose`; the underlying parent
array is passed to the library together with a flag from `istransposed`.
`C` and `B` may be vectors (matrix-vector product), in which case the missing
second dimension is treated as 1.

Returns `nothing` (the result of the `Cvoid` foreign call).

# Throws
- `InexactError` if a dimension does not fit in a 32-bit integer.
"""
function dgemmjit!(C::AbstractVecOrMat{Float64}, A::AbstractVecOrMat{Float64}, B::AbstractVecOrMat{Float64})
    # `size(x, d)` yields 1 for trailing dimensions, so this also works when
    # `C` is a vector, where destructuring `size(C)` into two values throws.
    M = size(C, 1)
    N = size(C, 2)
    K = size(B, 1)
    # The Fortran routine declares its dimensions as 32-bit integers and its
    # transpose flags as 8-bit integers, so pass exactly those widths rather
    # than relying on the low bytes of a native `Int`.
    return ccall(
        (:dgemmjit, LIBDIRECTCALLJIT), Cvoid,
        (Ptr{Float64}, Ptr{Float64}, Ptr{Float64},
         Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Int8}, Ref{Int8}),
        parent(C), parent(A), parent(B),
        Int32(M), Int32(K), Int32(N),
        Int8(istransposed(A)), Int8(istransposed(B))
    )
end
62+
4263
for (prefix,Cshared,Fshared,Eshared) ((Symbol(""),LIBCTEST,LIBFTEST,LIBEIGENTEST), (:i,LIBICTEST,LIBIFTEST,LIBIEIGENTEST))
4364
for order (:kmn, :knm, :mkn, :mnk, :nkm, :nmk)
4465
gemm = Symbol(:gemm_, order)
@@ -59,9 +80,9 @@ for (prefix,Cshared,Fshared,Eshared) ∈ ((Symbol(""),LIBCTEST,LIBFTEST,LIBEIGEN
5980
)
6081
end
6182
end
62-
@eval @inline $(Symbol(prefix,:cgemm!))(C, A, B) = $(Symbol(prefix, :cgemm_nkm!))(C, A, B)
63-
@eval @inline $(Symbol(prefix,:fgemm!))(C, A, B) = $(Symbol(prefix, :fgemm_nkm!))(C, A, B)
64-
@eval @inline function $(Symbol(prefix,:egemm!))(C, A, B)
83+
@eval $(Symbol(prefix,:cgemm!))(C, A, B) = $(Symbol(prefix, :cgemm_nkm!))(C, A, B)
84+
@eval $(Symbol(prefix,:fgemm!))(C, A, B) = $(Symbol(prefix, :fgemm_nkm!))(C, A, B)
85+
@eval function $(Symbol(prefix,:egemm!))(C, A, B)
6586
M, N = size(C); K = size(B, 1)
6687
ccall(
6788
(:AmulB, $Eshared), Cvoid,
@@ -78,7 +99,7 @@ for (prefix,Cshared,Fshared,Eshared) ∈ ((Symbol(""),LIBCTEST,LIBFTEST,LIBEIGEN
7899
)
79100
end
80101
for (p,s) [(:c,Cshared) (:e,Eshared)]
81-
@eval @inline function $(Symbol(prefix,p,:gemm!))(C, A::Adjoint, B)
102+
@eval function $(Symbol(prefix,p,:gemm!))(C, A::Adjoint, B)
82103
M, N = size(C); K = size(B, 1)
83104
ccall(
84105
(:AtmulB, $s), Cvoid,
@@ -87,15 +108,15 @@ for (p,s) ∈ [(:c,Cshared) (:e,Eshared)]
87108
)
88109
end
89110
end
90-
@eval @inline function $(Symbol(prefix,:fgemm!))(C, A::Adjoint, B)
111+
@eval function $(Symbol(prefix,:fgemm!))(C, A::Adjoint, B)
91112
M, N = size(C); K = size(B, 1)
92113
ccall(
93114
(:AtmulB, $Fshared), Cvoid,
94115
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
95116
C, parent(A), B, Ref(M), Ref(K), Ref(N)
96117
)
97118
end
98-
@eval @inline function $(Symbol(prefix,:fgemm_builtin!))(C, A::Adjoint, B)
119+
@eval function $(Symbol(prefix,:fgemm_builtin!))(C, A::Adjoint, B)
99120
M, N = size(C); K = size(B, 1)
100121
ccall(
101122
(:AtmulBbuiltin, $Fshared), Cvoid,
@@ -104,7 +125,7 @@ end
104125
)
105126
end
106127
for (p,s) [(:c,Cshared) (:e,Eshared)]
107-
@eval @inline function $(Symbol(prefix,p,:gemm!))(C, A, B::Adjoint)
128+
@eval function $(Symbol(prefix,p,:gemm!))(C, A, B::Adjoint)
108129
M, N = size(C); K = size(B, 1)
109130
ccall(
110131
(:AmulBt, $s), Cvoid,
@@ -113,15 +134,15 @@ for (p,s) ∈ [(:c,Cshared) (:e,Eshared)]
113134
)
114135
end
115136
end
116-
@eval @inline function $(Symbol(prefix,:fgemm!))(C, A, B::Adjoint)
137+
@eval function $(Symbol(prefix,:fgemm!))(C, A, B::Adjoint)
117138
M, N = size(C); K = size(B, 1)
118139
ccall(
119140
(:AmulBt, $Fshared), Cvoid,
120141
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
121142
C, A, parent(B), Ref(M), Ref(K), Ref(N)
122143
)
123144
end
124-
@eval @inline function $(Symbol(prefix,:fgemm_builtin!))(C, A, B::Adjoint)
145+
@eval function $(Symbol(prefix,:fgemm_builtin!))(C, A, B::Adjoint)
125146
M, N = size(C); K = size(B, 1)
126147
ccall(
127148
(:AmulBtbuiltin, $Fshared), Cvoid,
@@ -130,7 +151,7 @@ end
130151
)
131152
end
132153
for (p,s) [(:c,Cshared) (:e,Eshared)]
133-
@eval @inline function $(Symbol(prefix,p,:gemm!))(C, A::Adjoint, B::Adjoint)
154+
@eval function $(Symbol(prefix,p,:gemm!))(C, A::Adjoint, B::Adjoint)
134155
M, N = size(C); K = size(B, 1)
135156
ccall(
136157
(:AtmulBt, $s), Cvoid,
@@ -139,15 +160,15 @@ for (p,s) ∈ [(:c,Cshared) (:e,Eshared)]
139160
)
140161
end
141162
end
142-
@eval @inline function $(Symbol(prefix,:fgemm!))(C, A::Adjoint, B::Adjoint)
163+
@eval function $(Symbol(prefix,:fgemm!))(C, A::Adjoint, B::Adjoint)
143164
M, N = size(C); K = size(B, 1)
144165
ccall(
145166
(:AtmulBt, $Fshared), Cvoid,
146167
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
147168
C, parent(A), parent(B), Ref(M), Ref(K), Ref(N)
148169
)
149170
end
150-
@eval @inline function $(Symbol(prefix,:fgemm_builtin!))(C, A::Adjoint, B::Adjoint)
171+
@eval function $(Symbol(prefix,:fgemm_builtin!))(C, A::Adjoint, B::Adjoint)
151172
M, N = size(C); K = size(B, 1)
152173
ccall(
153174
(:AtmulBtbuiltin, $Fshared), Cvoid,
@@ -242,7 +263,7 @@ end
242263
)
243264
end
244265
for (p,s) [(:c,Cshared) (:e,Eshared)]
245-
@eval @inline function $(Symbol(prefix,p,:gemv!))(y, A::Adjoint, x)
266+
@eval function $(Symbol(prefix,p,:gemv!))(y, A::Adjoint, x)
246267
M, K = size(A)
247268
ccall(
248269
(:Atmulvb, $s), Cvoid,
@@ -251,7 +272,7 @@ for (p,s) ∈ [(:c,Cshared) (:e,Eshared)]
251272
)
252273
end
253274
end
254-
@eval @inline function $(Symbol(prefix,:fgemv!))(y, A::Adjoint, x)
275+
@eval function $(Symbol(prefix,:fgemv!))(y, A::Adjoint, x)
255276
M, K = size(A)
256277
ccall(
257278
(:Atmulvb, $Fshared), Cvoid,

benchmark/looptests.jl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ function jgemm!(𝐂, 𝐀, 𝐁)
1010
end
1111
end
1212
end
13-
@inline function jgemm!(𝐂, 𝐀ᵀ::Adjoint, 𝐁)
13+
function jgemm!(𝐂, 𝐀ᵀ::Adjoint, 𝐁)
1414
𝐀 = parent(𝐀ᵀ)
1515
@inbounds for n 1:size(𝐂,2), m 1:size(𝐂,1)
1616
𝐂ₘₙ = zero(eltype(𝐂))
@@ -20,7 +20,7 @@ end
2020
𝐂[m,n] = 𝐂ₘₙ
2121
end
2222
end
23-
@inline function jgemm!(𝐂, 𝐀, 𝐁ᵀ::Adjoint)
23+
function jgemm!(𝐂, 𝐀, 𝐁ᵀ::Adjoint)
2424
𝐂 .= 0
2525
𝐁 = parent(𝐁ᵀ)
2626
M, N = size(𝐂); K = size(𝐁ᵀ,1)
@@ -30,7 +30,7 @@ end
3030
end
3131
end
3232
end
33-
@inline function jgemm!(𝐂, 𝐀ᵀ::Adjoint, 𝐁ᵀ::Adjoint)
33+
function jgemm!(𝐂, 𝐀ᵀ::Adjoint, 𝐁ᵀ::Adjoint)
3434
𝐂 .= 0
3535
𝐀 = parent(𝐀ᵀ)
3636
𝐁 = parent(𝐁ᵀ)
@@ -41,7 +41,7 @@ end
4141
end
4242
end
4343
end
44-
@inline function gemmavx!(𝐂, 𝐀, 𝐁)
44+
function gemmavx!(𝐂, 𝐀, 𝐁)
4545
@avx for m 1:size(𝐀,1), n 1:size(𝐁,2)
4646
𝐂ₘₙ = zero(eltype(𝐂))
4747
for k 1:size(𝐀,2)
@@ -128,7 +128,7 @@ function jgemv!(y, 𝐀, x)
128128
end
129129
end
130130
end
131-
@inline function jgemv!(𝐲, 𝐀ᵀ::Adjoint, 𝐱)
131+
function jgemv!(𝐲, 𝐀ᵀ::Adjoint, 𝐱)
132132
𝐀 = parent(𝐀ᵀ)
133133
@inbounds for i eachindex(𝐲)
134134
𝐲ᵢ = zero(eltype(𝐲))
@@ -138,7 +138,7 @@ end
138138
𝐲[i] = 𝐲ᵢ
139139
end
140140
end
141-
@inline function jgemvavx!(𝐲, 𝐀, 𝐱)
141+
function jgemvavx!(𝐲, 𝐀, 𝐱)
142142
@avx for i eachindex(𝐲)
143143
𝐲ᵢ = zero(eltype(𝐲))
144144
for j eachindex(𝐱)

0 commit comments

Comments
 (0)