Skip to content

Commit 054c8b9

Browse files
committed
Bump version, update benchmarks. Random access benchmark is broken.
1 parent a45eacc commit 054c8b9

File tree

8 files changed

+241
-38
lines changed

8 files changed

+241
-38
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.4.1"
4+
version = "0.4.2"
55

66
[deps]
77
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

benchmark/benchmarkflops.jl

Lines changed: 77 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
include(joinpath(LOOPVECBENCHDIR, "looptests.jl"))
22
include(joinpath(LOOPVECBENCHDIR, "loadsharedlibs.jl"))
33

4-
using PrettyTables, BenchmarkTools
4+
using BenchmarkTools
55
struct SizedResults{V <: AbstractVector} <: AbstractMatrix{String}
66
results::Matrix{Float64}
77
sizes::V
@@ -26,15 +26,6 @@ function Base.getindex(br::SizedResults, row, col)
2626
end
2727
Base.setindex!(br::BenchmarkResult, v, i...) = br.sizedresults.results[i...] = v
2828

29-
const HIGHLIGHT_BEST = Highlighter(
30-
(br,i,j) -> (j > 1 && maximum(@view(br.results[:, i])) == br.results[j-1,i]),
31-
foreground = :green
32-
);
33-
function Base.show(io::IO, br::BenchmarkResult)
34-
pretty_table(
35-
io, br.sizedresults, br.tests, crop = :none, highlighters = (HIGHLIGHT_BEST,)
36-
)
37-
end
3829

3930
tothreetuple(i::Int) = (i,i,i)
4031
tothreetuple(i::NTuple{3,Int}) = i
@@ -106,6 +97,40 @@ function benchmark_AtmulB(sizes)
10697
end
10798
br
10899
end
100+
function benchmark_AmulBt(sizes)
101+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "GFort-intrinsic", "icc", "ifort", "ifort-intrinsic", "LoopVectorization"]
102+
br = BenchmarkResult(tests, sizes)
103+
for (i,s) ∈ enumerate(sizes)
104+
M, K, N = tothreetuple(s)
105+
C = Matrix{Float64}(undef, M, N)
106+
A = rand(M, K)
107+
Bt = rand(N, K)
108+
n_gflop = M*K*N*2e-9
109+
br[1,i] = n_gflop / @belapsed mul!($C, $A, $Bt')
110+
Cblas = copy(C)
111+
br[2,i] = n_gflop / @belapsed jgemm!($C, $A, $Bt')
112+
@assert C ≈ Cblas "Julia gemm wrong?"
113+
br[3,i] = n_gflop / @belapsed cgemm!($C, $A, $Bt')
114+
@assert C ≈ Cblas "Polly gemm wrong?"
115+
br[4,i] = n_gflop / @belapsed fgemm!($C, $A, $Bt')
116+
@assert C ≈ Cblas "Fort gemm wrong?"
117+
br[5,i] = n_gflop / @belapsed fgemm_builtin!($C, $A, $Bt')
118+
@assert C ≈ Cblas "Fort intrinsic gemm wrong?"
119+
br[6,i] = n_gflop / @belapsed icgemm!($C, $A, $Bt')
120+
@assert C ≈ Cblas "icc gemm wrong?"
121+
br[7,i] = n_gflop / @belapsed ifgemm!($C, $A, $Bt')
122+
@assert C ≈ Cblas "ifort gemm wrong?"
123+
br[8,i] = n_gflop / @belapsed ifgemm_builtin!($C, $A, $Bt')
124+
@assert C ≈ Cblas "ifort intrinsic gemm wrong?"
125+
br[9,i] = n_gflop / @belapsed gemmavx!($C, $A, $Bt')
126+
@assert C ≈ Cblas "LoopVec gemm wrong?"
127+
# if i % 10 == 0
128+
# percent_complete = round(100i/ length(sizes), sigdigits = 4)
129+
# @show percent_complete
130+
# end
131+
end
132+
br
133+
end
109134

110135
function benchmark_dot(sizes)
111136
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
@@ -164,7 +189,7 @@ end
164189
totwotuple(i::Int) = (i,i)
165190
totwotuple(i::Tuple{Int,Int}) = i
166191
function benchmark_gemv(sizes)
167-
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
192+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "GFort-intrinsic", "icc", "ifort", "ifort-intrinsic", "LoopVectorization"]
168193
br = BenchmarkResult(tests, sizes)
169194
for (i,s) ∈ enumerate(sizes)
170195
M, N = totwotuple(s)
@@ -178,11 +203,47 @@ function benchmark_gemv(sizes)
178203
@assert x ≈ xblas "Polly wrong?"
179204
br[4,i] = n_gflop / @belapsed fgemv!($x, $A, $y)
180205
@assert x ≈ xblas "Fort wrong?"
181-
br[5,i] = n_gflop / @belapsed icgemv!($x, $A, $y)
206+
br[5,i] = n_gflop / @belapsed fgemv_builtin!($x, $A, $y)
207+
@assert x ≈ xblas "Fort wrong?"
208+
br[6,i] = n_gflop / @belapsed icgemv!($x, $A, $y)
182209
@assert x ≈ xblas "icc wrong?"
183-
br[6,i] = n_gflop / @belapsed ifgemv!($x, $A, $y)
210+
br[7,i] = n_gflop / @belapsed ifgemv!($x, $A, $y)
211+
@assert x ≈ xblas "ifort wrong?"
212+
br[8,i] = n_gflop / @belapsed ifgemv_builtin!($x, $A, $y)
213+
@assert x ≈ xblas "ifort wrong?"
214+
br[9,i] = n_gflop / @belapsed jgemvavx!($x, $A, $y)
215+
@assert x ≈ xblas "LoopVec wrong?"
216+
# if i % 10 == 0
217+
# percent_complete = round(100i/ length(sizes), sigdigits = 4)
218+
# @show percent_complete
219+
# end
220+
end
221+
br
222+
end
223+
function benchmark_Atmulvb(sizes)
224+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "GFort-intrinsic", "icc", "ifort", "ifort-intrinsic", "LoopVectorization"]
225+
br = BenchmarkResult(tests, sizes)
226+
for (i,s) ∈ enumerate(sizes)
227+
M, N = totwotuple(s)
228+
x = Vector{Float64}(undef, M); A = rand(N, M); y = rand(N);
229+
n_gflop = M*N * 2e-9
230+
br[1,i] = n_gflop / @belapsed mul!($x, $A', $y)
231+
xblas = copy(x)
232+
br[2,i] = n_gflop / @belapsed jgemv!($x, $A', $y)
233+
@assert x ≈ xblas "Julia wrong?"
234+
br[3,i] = n_gflop / @belapsed cgemv!($x, $A', $y)
235+
@assert x ≈ xblas "Polly wrong?"
236+
br[4,i] = n_gflop / @belapsed fgemv!($x, $A', $y)
237+
@assert x ≈ xblas "Fort wrong?"
238+
br[5,i] = n_gflop / @belapsed fgemv_builtin!($x, $A', $y)
239+
@assert x ≈ xblas "Fort wrong?"
240+
br[6,i] = n_gflop / @belapsed icgemv!($x, $A', $y)
241+
@assert x ≈ xblas "icc wrong?"
242+
br[7,i] = n_gflop / @belapsed ifgemv!($x, $A', $y)
243+
@assert x ≈ xblas "ifort wrong?"
244+
br[8,i] = n_gflop / @belapsed ifgemv_builtin!($x, $A', $y)
184245
@assert x ≈ xblas "ifort wrong?"
185-
br[7,i] = n_gflop / @belapsed jgemvavx!($x, $A, $y)
246+
br[9,i] = n_gflop / @belapsed jgemvavx!($x, $A', $y)
186247
@assert x ≈ xblas "LoopVec wrong?"
187248
# if i % 10 == 0
188249
# percent_complete = round(100i/ length(sizes), sigdigits = 4)
@@ -323,13 +384,13 @@ function benchmark_AplusAt(sizes)
323384
@assert B ≈ baseB "Clang wrong?"
324385
br[3,i] = n_gflop / @belapsed fAplusAt!($B, $A)
325386
@assert B ≈ baseB "Fort wrong?"
326-
br[4,i] = n_gflop / @belapsed fAplusAtbuiltin!($B, $A)
387+
br[4,i] = n_gflop / @belapsed fAplusAt_builtin!($B, $A)
327388
@assert B ≈ baseB "Fort-builtin wrong?"
328389
br[5,i] = n_gflop / @belapsed icAplusAt!($B, $A)
329390
@assert B ≈ baseB "icc wrong?"
330391
br[6,i] = n_gflop / @belapsed ifAplusAt!($B, $A)
331392
@assert B ≈ baseB "ifort wrong?"
332-
br[7,i] = n_gflop / @belapsed ifAplusAtbuiltin!($B, $A)
393+
br[7,i] = n_gflop / @belapsed ifAplusAt_builtin!($B, $A)
333394
@assert B ≈ baseB "ifort-builtin wrong?"
334395
br[8,i] = n_gflop / @belapsed @avx @. $B = $A + $A'
335396
@assert B ≈ baseB "LoopVec wrong?"

benchmark/driver.jl

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ include(joinpath(LOOPVECBENCHDIR, "plotbenchmarks.jl"))
1010

1111
using Distributed
1212

13-
addprocs(11);
13+
addprocs(13);
1414

1515
@everywhere begin
1616
pkgdir(pkg::String) = abspath(joinpath(dirname(Base.find_package(pkg)), ".."))
@@ -30,6 +30,8 @@ exp_future = @spawnat 9 benchmark_exp(2:256);
3030
aplusBc_future = @spawnat 10 benchmark_aplusBc(2:256);
3131
AplusAt_future = @spawnat 11 benchmark_AplusAt(2:256);
3232
randomaccess_future = @spawnat 12 benchmark_random_access(2:256);
33+
AmulBt_future = @spawnat 13 benchmark_AmulBt(2:256);
34+
Atmulvb_future = @spawnat 14 benchmark_Atmulvb(2:256);
3335

3436
dot_bench = fetch(dot_future)
3537
selfdot_bench = fetch(selfdot_future)
@@ -42,9 +44,10 @@ exp_bench = fetch(exp_future)
4244
aplusBc_bench = fetch(aplusBc_future)
4345
gemm_bench = fetch(gemm_future)
4446
AtmulB_bench = fetch(AtmulB_future)
47+
AmulBt_bench = fetch(AmulBt_future)
48+
Atmulvb_bench = fetch(Atmulvb_future)
4549

46-
47-
v = 1
50+
v = 2
4851
const PICTURES = "/home/chriselrod/Pictures"
4952
save(joinpath(PICTURES, "bench_gemm_v$v.png"), plot(gemm_bench));
5053
save(joinpath(PICTURES, "bench_AtmulB_v$v.png"), plot(AtmulB_bench));
@@ -57,7 +60,8 @@ save(joinpath(PICTURES, "bench_exp_v$v.png"), plot(exp_bench));
5760
save(joinpath(PICTURES, "bench_aplusBc_v$v.png"), plot(aplusBc_bench));
5861
save(joinpath(PICTURES, "bench_AplusAt_v$v.png"), plot(AplusAt_bench));
5962
save(joinpath(PICTURES, "bench_random_access_v$v.png"), plot(randomaccess_bench));
60-
63+
save(joinpath(PICTURES, "bench_AmulBt_v$v.png"), plot(AmulBt_bench));
64+
save(joinpath(PICTURES, "bench_Atmulvb_v$v.png"), plot(Atmulvb_bench));
6165

6266
plot(gemm_bench)
6367
plot(AtmulB_bench)

benchmark/loadsharedlibs.jl

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,30 @@ for (prefix,Cshared,Fshared) ∈ ((Symbol(""),LIBCTEST,LIBFTEST), (:i,LIBICTEST,
8181
C, parent(A), B, Ref(M), Ref(K), Ref(N)
8282
)
8383
end
84-
84+
@eval @inline function $(Symbol(prefix,:cgemm!))(C, A, B::Adjoint)
85+
M, N = size(C); K = size(B, 1)
86+
ccall(
87+
(:AmulBt, $Cshared), Cvoid,
88+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Clong, Clong, Clong),
89+
C, A, parent(B), M, K, N
90+
)
91+
end
92+
@eval @inline function $(Symbol(prefix,:fgemm!))(C, A, B::Adjoint)
93+
M, N = size(C); K = size(B, 1)
94+
ccall(
95+
(:AmulBt, $Fshared), Cvoid,
96+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
97+
C, A, parent(B), Ref(M), Ref(K), Ref(N)
98+
)
99+
end
100+
@eval @inline function $(Symbol(prefix,:fgemm_builtin!))(C, A, B::Adjoint)
101+
M, N = size(C); K = size(B, 1)
102+
ccall(
103+
(:AmulBtbuiltin, $Fshared), Cvoid,
104+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
105+
C, A, parent(B), Ref(M), Ref(K), Ref(N)
106+
)
107+
end
85108
@eval function $(Symbol(prefix,:cdot))(a, b)
86109
N = length(a)
87110
ccall(
@@ -161,6 +184,30 @@ for (prefix,Cshared,Fshared) ∈ ((Symbol(""),LIBCTEST,LIBFTEST), (:i,LIBICTEST,
161184
y, A, x, Ref(M), Ref(K)
162185
)
163186
end
187+
@eval @inline function $(Symbol(prefix,:cgemv!))(y, A::Adjoint, x)
188+
M, K = size(A)
189+
ccall(
190+
(:Atmulvb, $Cshared), Cvoid,
191+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Clong, Clong),
192+
y, parent(A), x, M, K
193+
)
194+
end
195+
@eval @inline function $(Symbol(prefix,:fgemv!))(y, A::Adjoint, x)
196+
M, K = size(A)
197+
ccall(
198+
(:Atmulvb, $Fshared), Cvoid,
199+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}),
200+
y, parent(A), x, Ref(M), Ref(K)
201+
)
202+
end
203+
@eval function $(Symbol(prefix,:fgemv_builtin!))(y, A::Adjoint, x)
204+
M, K = size(A)
205+
ccall(
206+
(:Atmulvbbuiltin, $Fshared), Cvoid,
207+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}),
208+
y, parent(A), x, Ref(M), Ref(K)
209+
)
210+
end
164211

165212
@eval function $(Symbol(prefix,:caplusBc!))(D, a, B, c)
166213
M, K = size(B)
@@ -231,7 +278,7 @@ for (prefix,Cshared,Fshared) ∈ ((Symbol(""),LIBCTEST,LIBFTEST), (:i,LIBICTEST,
231278
B, A, Ref(N)
232279
)
233280
end
234-
@eval function $(Symbol(prefix,:fAplusAtbuiltin!))(B, A)
281+
@eval function $(Symbol(prefix,:fAplusAt_builtin!))(B, A)
235282
N = size(B,1)
236283
ccall(
237284
(:AplusAtbuiltin, $Fshared), Cvoid,

benchmark/looptests.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,19 @@ void AtmulB(double* restrict C, double* restrict At, double* restrict B, long M,
9191
}
9292
return;
9393
}
94+
void AmulBt(double* restrict C, double* restrict A, double* restrict Bt, long M, long K, long N){
95+
for (long i = 0; i < M*N; i++){
96+
C[i] = 0.0;
97+
}
98+
for (long k = 0; k < K; k++){
99+
for (long n = 0; n < N; n++){
100+
for (long m = 0; m < M; m++){
101+
C[m + n*M] += A[m + M*k] * Bt[n + N*k];
102+
}
103+
}
104+
}
105+
return;
106+
}
94107
double dot(double* restrict a, double* restrict b, long N){
95108
double s = 0.0;
96109
for (long n = 0; n < N; n++){
@@ -125,6 +138,17 @@ void gemv(double* restrict y, double* restrict A, double* restrict x, long M, l
125138
}
126139
return;
127140
}
141+
void Atmulvb(double* restrict y, double* restrict A, double* restrict x, long M, long K){
142+
for (long m = 0; m < M; m++){
143+
y[m] = 0.0;
144+
}
145+
for (long m = 0; m < M; m++){
146+
for (long k = 0; k < K; k++){
147+
y[m] += A[k + m*K] * x[k];
148+
}
149+
}
150+
return;
151+
}
128152
double svexp(double* restrict a, long N){
129153
double s = 0.0;
130154
for (long n = 0; n < N; n++){

benchmark/looptests.f90

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,30 @@ subroutine AtmulBbuiltin(C, A, B, M, K, N) BIND(C, name="AtmulBbuiltin")
123123
real(C_double), dimension(M, N), intent(out) :: C
124124
real(C_double), dimension(K, M), intent(in) :: A
125125
real(C_double), dimension(K, N), intent(in) :: B
126-
integer(C_long) :: mm, kk, nn
127126
C = matmul(transpose(A), B)
128127
end subroutine AtmulBbuiltin
128+
subroutine AmulBt(C, A, B, M, K, N) BIND(C, name="AmulBt")
129+
integer(C_long), intent(in) :: M, K, N
130+
real(C_double), dimension(M, N), intent(out) :: C
131+
real(C_double), dimension(M, K), intent(in) :: A
132+
real(C_double), dimension(N, K), intent(in) :: B
133+
integer(C_long) :: mm, kk, nn
134+
C = 0.0
135+
do concurrent(kk = 1:K)
136+
do concurrent(nn = 1:N)
137+
do concurrent(mm = 1:M)
138+
C(mm,nn) = C(mm,nn) + A(mm,kk) * B(nn,kk)
139+
end do
140+
end do
141+
end do
142+
end subroutine AmulBt
143+
subroutine AmulBtbuiltin(C, A, B, M, K, N) BIND(C, name="AmulBtbuiltin")
144+
integer(C_long), intent(in) :: M, K, N
145+
real(C_double), dimension(M, N), intent(out) :: C
146+
real(C_double), dimension(M, K), intent(in) :: A
147+
real(C_double), dimension(N, K), intent(in) :: B
148+
C = matmul(A, transpose(B))
149+
end subroutine AmulBtbuiltin
129150
subroutine dot(s, a, b, N) BIND(C, name="dot")
130151
integer(C_long), intent(in) :: N
131152
real(C_double), dimension(N), intent(in) :: a, b
@@ -189,13 +210,30 @@ subroutine gemv(y, A, x, M, K) BIND(C, name="gemv")
189210
end do
190211
end do
191212
end subroutine gemv
192-
subroutine gemvbuiltin(y, A, x, M, K) BIND(C, name="gemv_builtin")
213+
subroutine gemvbuiltin(y, A, x, M, K) BIND(C, name="gemvbuiltin")
193214
integer(C_long), intent(in) :: M, K
194215
real(C_double), intent(in) :: A(M,K), x(K)
195216
real(C_double), dimension(M), intent(out) :: y
196-
integer(C_long) :: mm, kk
197217
y = matmul(A, x)
198218
end subroutine gemvbuiltin
219+
subroutine Atmulvb(y, A, x, M, K) BIND(C, name="Atmulvb")
220+
integer(C_long), intent(in) :: M, K
221+
real(C_double), intent(in) :: A(K,M), x(K)
222+
real(C_double), dimension(M), intent(out) :: y
223+
integer(C_long) :: mm, kk
224+
y = 0.0
225+
do concurrent(mm = 1:M)
226+
do concurrent(kk = 1:K)
227+
y(mm) = y(mm) + A(kk,mm) * x(kk)
228+
end do
229+
end do
230+
end subroutine Atmulvb
231+
subroutine Atmulvbbuiltin(y, A, x, M, K) BIND(C, name="Atmulvbbuiltin")
232+
integer(C_long), intent(in) :: M, K
233+
real(C_double), intent(in) :: A(K,M), x(K)
234+
real(C_double), dimension(M), intent(out) :: y
235+
y = matmul(transpose(A), x)
236+
end subroutine Atmulvbbuiltin
199237
subroutine unscaledvar(s, A, x, M, N) BIND(C, name="unscaledvar")
200238
integer(C_long), intent(in) :: M, N
201239
real(C_double), intent(in) :: A(M,N), x(M)

0 commit comments

Comments
 (0)