Skip to content

Commit 7c98fb7

Browse files
committed
Started working on documentation, stop tiler from choosing 0 size, update precompile.
1 parent 4b481a3 commit 7c98fb7

30 files changed

+430
-102
lines changed

.gitignore

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,9 @@
55
*.jl.*.cov
66
*.jl.mem
77
*~
8-
src/#*#
9-
tests/#*#
10-
benchmark/#*#
118
*.mem
129
*.mod
1310
*.mod0
1411
*.so
1512
*.s
13+
*#

benchmark/driver.jl

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -35,33 +35,35 @@ Atmulvb_future = @spawnat 14 benchmark_Atmulvb(2:256);
3535

3636
dot_bench = fetch(dot_future)
3737
selfdot_bench = fetch(selfdot_future)
38-
AplusAt_bench = fetch(AplusAt_future)
3938
gemv_bench = fetch(gemv_future)
4039
randomaccess_bench = fetch(randomaccess_future)
4140
dot3_bench = fetch(dot3_future)
4241
sse_bench = fetch(sse_future)
4342
exp_bench = fetch(exp_future)
43+
AplusAt_bench = fetch(AplusAt_future)
4444
aplusBc_bench = fetch(aplusBc_future)
4545
gemm_bench = fetch(gemm_future)
4646
AtmulB_bench = fetch(AtmulB_future)
4747
AmulBt_bench = fetch(AmulBt_future)
4848
Atmulvb_bench = fetch(Atmulvb_future)
4949

50-
v = 6
51-
const PICTURES = "/home/chriselrod/Pictures"
52-
save(joinpath(PICTURES, "bench_gemm_v$v.png"), plot(gemm_bench));
53-
save(joinpath(PICTURES, "bench_AtmulB_v$v.png"), plot(AtmulB_bench));
54-
save(joinpath(PICTURES, "bench_dot_v$v.png"), plot(dot_bench));
55-
save(joinpath(PICTURES, "bench_selfdot_v$v.png"), plot(selfdot_bench));
56-
save(joinpath(PICTURES, "bench_gemv_v$v.png"), plot(gemv_bench));
57-
save(joinpath(PICTURES, "bench_dot3_v$v.png"), plot(dot3_bench));
58-
save(joinpath(PICTURES, "bench_sse_v$v.png"), plot(sse_bench));
59-
save(joinpath(PICTURES, "bench_exp_v$v.png"), plot(exp_bench));
60-
save(joinpath(PICTURES, "bench_aplusBc_v$v.png"), plot(aplusBc_bench));
61-
save(joinpath(PICTURES, "bench_AplusAt_v$v.png"), plot(AplusAt_bench));
62-
save(joinpath(PICTURES, "bench_random_access_v$v.png"), plot(randomaccess_bench));
63-
save(joinpath(PICTURES, "bench_AmulBt_v$v.png"), plot(AmulBt_bench));
64-
save(joinpath(PICTURES, "bench_Atmulvb_v$v.png"), plot(Atmulvb_bench));
50+
51+
v = 1
52+
filetype = "svg"
53+
const PICTURES = joinpath(pkgdir("LoopVectorization"), "docs", "src", "assets")
54+
save(joinpath(PICTURES, "bench_gemm_v$v.$filetype"), plot(gemm_bench));
55+
save(joinpath(PICTURES, "bench_AtmulB_v$v.$filetype"), plot(AtmulB_bench));
56+
save(joinpath(PICTURES, "bench_dot_v$v.$filetype"), plot(dot_bench));
57+
save(joinpath(PICTURES, "bench_selfdot_v$v.$filetype"), plot(selfdot_bench));
58+
save(joinpath(PICTURES, "bench_gemv_v$v.$filetype"), plot(gemv_bench));
59+
save(joinpath(PICTURES, "bench_dot3_v$v.$filetype"), plot(dot3_bench));
60+
save(joinpath(PICTURES, "bench_sse_v$v.$filetype"), plot(sse_bench));
61+
save(joinpath(PICTURES, "bench_aplusBc_v$v.$filetype"), plot(aplusBc_bench));
62+
save(joinpath(PICTURES, "bench_AplusAt_v$v.$filetype"), plot(AplusAt_bench));
63+
save(joinpath(PICTURES, "bench_random_access_v$v.$filetype"), plot(randomaccess_bench));
64+
save(joinpath(PICTURES, "bench_AmulBt_v$v.$filetype"), plot(AmulBt_bench));
65+
save(joinpath(PICTURES, "bench_Atmulvb_v$v.$filetype"), plot(Atmulvb_bench));
66+
save(joinpath(PICTURES, "bench_exp_v$v.$filetype"), plot(exp_bench));
6567

6668

6769

benchmark/looptests.jl

Lines changed: 59 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,67 +1,67 @@
11
using LoopVectorization, LinearAlgebra
22
BLAS.set_num_threads(1)
33

4-
function jgemm!(C, A, B)
5-
C .= 0
6-
M, N = size(C); K = size(B,1)
4+
function jgemm!(𝐂, 𝐀, 𝐁)
5+
𝐂 .= 0
6+
M, N = size(𝐂); K = size(𝐁,1)
77
@inbounds for n ∈ 1:N, k ∈ 1:K
88
@simd ivdep for m ∈ 1:M
9-
C[m,n] += A[m,k] * B[k,n]
9+
𝐂[m,n] += 𝐀[m,k] * 𝐁[k,n]
1010
end
1111
end
1212
end
13-
@inline function jgemm!(C, Aᵀ::Adjoint, B)
14-
A = parent(Aᵀ)
15-
@inbounds for n ∈ 1:size(C,2), m ∈ 1:size(C,1)
16-
Cₘₙ = zero(eltype(C))
17-
@simd ivdep for k ∈ 1:size(A,1)
18-
Cₘₙ += A[k,m] * B[k,n]
13+
@inline function jgemm!(𝐂, 𝐀ᵀ::Adjoint, 𝐁)
14+
𝐀 = parent(𝐀ᵀ)
15+
@inbounds for n ∈ 1:size(𝐂,2), m ∈ 1:size(𝐂,1)
16+
𝐂ₘₙ = zero(eltype(𝐂))
17+
@simd ivdep for k ∈ 1:size(𝐀,1)
18+
𝐂ₘₙ += 𝐀[k,m] * 𝐁[k,n]
1919
end
20-
C[m,n] = Cₘₙ
20+
𝐂[m,n] = 𝐂ₘₙ
2121
end
2222
end
23-
@inline function jgemm!(C, A, Bᵀ::Adjoint)
24-
C .= 0
25-
B = parent(Bᵀ)
26-
M, N = size(C); K = size(B,1)
23+
@inline function jgemm!(𝐂, 𝐀, 𝐁ᵀ::Adjoint)
24+
𝐂 .= 0
25+
𝐁 = parent(𝐁ᵀ)
26+
M, N = size(𝐂); K = size(𝐁,1)
2727
@inbounds for k ∈ 1:K, n ∈ 1:N
2828
@simd ivdep for m ∈ 1:M
29-
C[m,n] += A[m,k] * B[n,k]
29+
𝐂[m,n] += 𝐀[m,k] * 𝐁[n,k]
3030
end
3131
end
3232
end
33-
@inline function gemmavx!(C, A, B)
34-
@avx for i ∈ 1:size(A,1), j ∈ 1:size(B,2)
35-
Cᵢⱼ = zero(eltype(C))
36-
for k ∈ 1:size(A,2)
37-
Cᵢⱼ += A[i,k] * B[k,j]
33+
@inline function gemmavx!(𝐂, 𝐀, 𝐁)
34+
@avx for m ∈ 1:size(𝐀,1), n ∈ 1:size(𝐁,2)
35+
𝐂ₘₙ = zero(eltype(𝐂))
36+
for k ∈ 1:size(𝐀,2)
37+
𝐂ₘₙ += 𝐀[m,k] * 𝐁[k,n]
3838
end
39-
C[i,j] = Cᵢⱼ
39+
𝐂[m,n] = 𝐂ₘₙ
4040
end
4141
end
4242
function jdot(a, b)
43-
s = 0.0
43+
s = zero(eltype(a))
4444
@inbounds @simd ivdep for i ∈ eachindex(a, b)
4545
s += a[i] * b[i]
4646
end
4747
s
4848
end
4949
function jdotavx(a, b)
50-
s = 0.0
50+
s = zero(eltype(a))
5151
@avx for i ∈ eachindex(a, b)
5252
s += a[i] * b[i]
5353
end
5454
s
5555
end
5656
function jselfdot(a)
57-
s = 0.0
57+
s = zero(eltype(a))
5858
@inbounds @simd ivdep for i ∈ eachindex(a)
5959
s += a[i] * a[i]
6060
end
6161
s
6262
end
6363
function jselfdotavx(a)
64-
s = 0.0
64+
s = zero(eltype(a))
6565
@avx for i ∈ eachindex(a)
6666
s += a[i] * a[i]
6767
end
@@ -96,71 +96,71 @@ function jvexpavx!(b, a)
9696
end
9797
end
9898
function jsvexp(a)
99-
s = 0.0
99+
s = zero(eltype(a))
100100
@inbounds for i ∈ eachindex(a)
101101
s += exp(a[i])
102102
end
103103
s
104104
end
105105
function jsvexpavx(a)
106-
s = 0.0
106+
s = zero(eltype(a))
107107
@avx for i ∈ eachindex(a)
108108
s += exp(a[i])
109109
end
110110
s
111111
end
112-
function jgemv!(y, A, x)
113-
y .= 0.0
112+
function jgemv!(y, 𝐀, x)
113+
y .= zero(eltype(y))
114114
@inbounds for j ∈ eachindex(x)
115115
@simd ivdep for i ∈ eachindex(y)
116-
y[i] += A[i,j] * x[j]
116+
y[i] += 𝐀[i,j] * x[j]
117117
end
118118
end
119119
end
120-
@inline function jgemv!(y, Aᵀ::Adjoint, x)
121-
A = parent(Aᵀ)
122-
@inbounds for i ∈ eachindex(y)
123-
yᵢ = 0.0
124-
@simd ivdep for j ∈ eachindex(x)
125-
yᵢ += A[j,i] * x[j]
120+
@inline function jgemv!(𝐲, 𝐀ᵀ::Adjoint, 𝐱)
121+
𝐀 = parent(𝐀ᵀ)
122+
@inbounds for i ∈ eachindex(𝐲)
123+
𝐲ᵢ = zero(eltype(𝐲))
124+
@simd ivdep for j ∈ eachindex(𝐱)
125+
𝐲ᵢ += 𝐀[j,i] * 𝐱[j]
126126
end
127-
y[i] = yᵢ
127+
𝐲[i] = 𝐲ᵢ
128128
end
129129
end
130-
@inline function jgemvavx!(y, A, x)
131-
@avx for i ∈ eachindex(y)
132-
yᵢ = 0.0
133-
for j ∈ eachindex(x)
134-
yᵢ += A[i,j] * x[j]
130+
@inline function jgemvavx!(𝐲, 𝐀, 𝐱)
131+
@avx for i ∈ eachindex(𝐲)
132+
𝐲ᵢ = zero(eltype(𝐲))
133+
for j ∈ eachindex(𝐱)
134+
𝐲ᵢ += 𝐀[i,j] * 𝐱[j]
135135
end
136-
y[i] = yᵢ
136+
𝐲[i] = 𝐲ᵢ
137137
end
138138
end
139-
function jvar!(s², A, x̄)
140-
@. s² = 0
141-
@inbounds for i ∈ 1:size(A,2)
142-
@simd for j ∈ eachindex(s²)
143-
δ = A[j,i] - x̄[j]
144-
s²[j] += δ*δ
139+
function jvar!(𝐬², 𝐀, x̄)
140+
@. 𝐬² = zero(eltype(𝐬²))
141+
@inbounds for i ∈ 1:size(𝐀,2)
142+
@simd for j ∈ eachindex(𝐬²)
143+
δ = 𝐀[j,i] - x̄[j]
144+
𝐬²[j] += δ*δ
145145
end
146146
end
147147
end
148-
function jvaravx!(s², A, x̄)
149-
@avx for j ∈ eachindex(s²)
150-
s²ⱼ = 0.0
148+
function jvaravx!(𝐬², 𝐀, x̄)
149+
@avx for j ∈ eachindex(𝐬²)
150+
𝐬²ⱼ = zero(eltype(𝐬²))
151151
x̄ⱼ = x̄[j]
152-
for i ∈ 1:size(A,2)
153-
δ = A[j,i] - x̄ⱼ
154-
s²ⱼ += δ*δ
152+
for i ∈ 1:size(𝐀,2)
153+
δ = 𝐀[j,i] - x̄ⱼ
154+
𝐬²ⱼ += δ*δ
155155
end
156-
s²[j] = s²ⱼ
156+
𝐬²[j] = 𝐬²ⱼ
157157
end
158158
end
159159
japlucBc!(d, a, B, c) = @. d = a + B * c';
160160
japlucBcavx!(d, a, B, c) = @avx @. d = a + B * c';
161161

162162
function jOLSlp(y, X, β)
163-
lp = 0.0
163+
lp = zero(eltype(y))
164164
@inbounds @fastmath for i ∈ eachindex(y)
165165
δ = y[i]
166166
@simd for j ∈ eachindex(β)
@@ -171,7 +171,7 @@ function jOLSlp(y, X, β)
171171
lp
172172
end
173173
function jOLSlp_avx(y, X, β)
174-
lp = 0.0
174+
lp = zero(eltype(y))
175175
@avx for i ∈ eachindex(y)
176176
δ = y[i]
177177
for j ∈ eachindex(β)

docs/make.jl

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,15 @@ makedocs(;
55
format=Documenter.HTML(),
66
pages=[
77
"Home" => "index.md",
8+
"Getting Started" => "getting_started.md",
9+
"Examples" => Any[
10+
"examples/matrix_multiplication.md",
11+
"examples/matrix_vector_ops.md",
12+
"examples/dot_product.md",
13+
"examples/sum_of_squared_error.md"
14+
],
15+
"Vectorized Convenience Functions" => "vectorized_convenience_functions.md",
16+
"Future Work" => "future_work.md"
817
],
918
repo="https://github.com/chriselrod/LoopVectorization.jl/blob/{commit}{path}#L{line}",
1019
sitename="LoopVectorization.jl",

docs/src/assets/bench_AmulBt_v1.svg

Lines changed: 1 addition & 0 deletions
Loading

docs/src/assets/bench_AplusAt_v1.svg

Lines changed: 1 addition & 0 deletions
Loading

docs/src/assets/bench_AtmulB_v1.svg

Lines changed: 1 addition & 0 deletions
Loading

docs/src/assets/bench_Atmulvb_v1.svg

Lines changed: 1 addition & 0 deletions
Loading

docs/src/assets/bench_aplusBc_v1.svg

Lines changed: 1 addition & 0 deletions
Loading

docs/src/assets/bench_dot3_v1.svg

Lines changed: 1 addition & 0 deletions
Loading

docs/src/assets/bench_dot_v1.svg

Lines changed: 1 addition & 0 deletions
Loading

docs/src/assets/bench_gemm_v1.svg

Lines changed: 1 addition & 0 deletions
Loading

docs/src/assets/bench_gemv_v1.svg

Lines changed: 1 addition & 0 deletions
Loading

docs/src/assets/bench_random_access_v1.svg

Lines changed: 1 addition & 0 deletions
Loading

docs/src/assets/bench_selfdot_v1.svg

Lines changed: 1 addition & 0 deletions
Loading

docs/src/assets/bench_sse_v1.svg

Lines changed: 1 addition & 0 deletions
Loading

docs/src/examples/dot_product.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Dot Products
2+
3+
Dot products are so simple that it is almost surprising compilers leave any performance on the table.
4+
5+
```julia
6+
function jdotavx(a, b)
7+
s = zero(eltype(a))
8+
@avx for i ∈ eachindex(a, b)
9+
s += a[i] * b[i]
10+
end
11+
s
12+
end
13+
```
14+
![dot](../assets/bench_dot_v1.svg)
15+
16+
17+
```julia
18+
function jselfdotavx(a)
19+
s = zero(eltype(a))
20+
@avx for i ∈ eachindex(a)
21+
s += a[i] * a[i]
22+
end
23+
s
24+
end
25+
```
26+
![selfdot](../assets/bench_selfdot_v1.svg)
27+
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Matrix Multiplication
2+
3+
```julia
4+
@inline function A_mul_B!(𝐂, 𝐀, 𝐁)
5+
@avx for m ∈ 1:size(𝐀,1), n ∈ 1:size(𝐁,2)
6+
𝐂ₘₙ = zero(eltype(𝐂))
7+
for k ∈ 1:size(𝐀,2)
8+
𝐂ₘₙ += 𝐀[m,k] * 𝐁[k,n]
9+
end
10+
𝐂[m,n] = 𝐂ₘₙ
11+
end
12+
end
13+
```
14+
15+
Letting all three matrices be square and `Size` x `Size`, we attain the following benchmark results:
16+
17+
![AmulB](../assets/bench_gemm_v1.svg)
18+
19+
20+
![AtmulB](../assets/bench_AtmulB_v1.svg)
21+
22+
23+
![AmulBt](../assets/bench_AmulBt_v1.svg)
24+
25+
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Matrix-Vector Operations
2+
3+
Here I'll discuss a variety of matrix-vector operations, naturally starting with matrix-vector multiplication.
4+
5+
```julia
6+
@inline function jgemvavx!(𝐲, 𝐀, 𝐱)
7+
@avx for i ∈ eachindex(𝐲)
8+
𝐲ᵢ = zero(eltype(𝐲))
9+
for j ∈ eachindex(𝐱)
10+
𝐲ᵢ += 𝐀[i,j] * 𝐱[j]
11+
end
12+
𝐲[i] = 𝐲ᵢ
13+
end
14+
end
15+
```
16+
17+
Using a square `Size` x `Size` matrix `𝐀`, we find the following results.
18+
![Amulvb](../assets/bench_gemv_v1.svg)
19+
20+
21+
![Atmulvb](../assets/bench_Atmulvb_v1.svg)
22+
23+
24+
![dot3](../assets/bench_dot3_v1.svg)
25+
26+
27+
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Sum of squared error
2+
3+
To calculate `(y - X * β)'(y - X * β)`, we can use the following loop.
4+
```julia
5+
function sse_avx(y, X, β)
6+
lp = zero(eltype(y))
7+
@avx for i ∈ eachindex(y)
8+
δ = y[i]
9+
for j ∈ eachindex(β)
10+
δ -= X[i,j] * β[j]
11+
end
12+
lp += δ * δ
13+
end
14+
lp
15+
end
16+
```
17+
18+
LoopVectorization does not model memory access yet.
19+
That seems important for this example, where performance starts to decline for sizes larger than 60.
20+
Letting `N` be the size, `X` was a `3N/2` x `N/2` matrix. Therefore, performance started to suffer
21+
when `X` had more than about 30 columns (performance is much less sensitive to the number of rows).
22+
23+
![sse](../assets/bench_sse_v1.svg)
24+
25+
26+

docs/src/future_work.md

Whitespace-only changes.

0 commit comments

Comments
 (0)