Skip to content

Commit 5ce49bf

Browse files
committed
Fix loopvalue-associated costs in determinestrategy and rerun filter benchmarks.
1 parent 07e71a1 commit 5ce49bf

12 files changed

+47
-34
lines changed

benchmark/benchmarkflops.jl

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,15 @@ function Base.getindex(br::SizedResults, row, col)
2525
col == 1 ? string(br.sizes[row]) : string(br.results[col - 1, row])
2626
end
2727
Base.setindex!(br::BenchmarkResult, v, i...) = br.sizedresults.results[i...] = v
28-
28+
function Base.vcat(br1::BenchmarkResult, br2::BenchmarkResult)
29+
BenchmarkResult(
30+
br1.tests,
31+
SizedResults(
32+
SharedMatrix(hcat(br1.sizedresults.results, br2.sizedresults.results)),
33+
vcat(br1.sizedresults.sizes, br2.sizedresults.sizes)
34+
)
35+
)
36+
end
2937

3038
tothreetuple(i::Int) = (i,i,i)
3139
tothreetuple(i::NTuple{3,Int}) = i

benchmark/driver.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ end
2323
# sizes = 23:23
2424
sizes = 256:-1:2
2525

26-
filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes)
27-
filter2d_3x3_bench = benchmark_filter2d3x3(sizes)
28-
filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes)
26+
filter2d_dynamic_bench = benchmark_filter2ddynamic(512:-1:2)
27+
filter2d_3x3_bench = benchmark_filter2d3x3(512:-1:2)
28+
filter2d_unrolled_bench = benchmark_filter2dunrolled(512:-1:2)
2929

3030
AmulB_bench = benchmark_AmulB(sizes)
3131
AmulBt_bench = benchmark_AmulBt(sizes)

benchmark/loadsharedlibs.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ const LIBDIRECTCALLJIT = joinpath(LOOPVECBENCHDIR, "libdcjtest.so")
1616
# requires Clang with polly to build
1717
cfile = joinpath(LOOPVECBENCHDIR, "looptests.c")
1818
if !isfile(LIBCTEST) || mtime(cfile) > mtime(LIBCTEST)
19-
run(`clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -mllvm -polly -mllvm -polly-vectorizer=stripmine -shared -fPIC $cfile -o $LIBCTEST`)
19+
run(`/usr/local/bin/clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -mllvm -polly -mllvm -polly-vectorizer=stripmine -shared -fPIC $cfile -o $LIBCTEST`)
2020
end
2121
if !isfile(LIBICTEST) || mtime(cfile) > mtime(LIBICTEST)
2222
run(`icc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -shared -fPIC $cfile -o $LIBICTEST`)

benchmark/plotbenchmarks.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
using PrettyTables
22

3-
const HIGHLIGHT_BEST = Highlighter(
4-
(br,i,j) -> (j > 1 && maximum(@view(br.results[:, i])) == br.results[j-1,i]),
5-
foreground = :green
6-
);
73
function Base.show(io::IO, br::BenchmarkResult)
4+
hb = Highlighter(
5+
(br,i,j) -> (j > 1 && maximum(@view(br.results[:, i])) == br.results[j-1,i]),
6+
foreground = :green
7+
);
88
pretty_table(
9-
io, br.sizedresults, br.tests, crop = :none, highlighters = (HIGHLIGHT_BEST,)
9+
io, br.sizedresults, br.tests, crop = :none, highlighters = (hb,)
1010
)
1111
end
1212

docs/src/assets/bench_filter2d_3x3_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_filter2d_dynamic_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_filter2d_unrolled_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/examples/filtering.md

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,11 @@ These are four nested loops. For all the benchmarks, `kern` was only 3 by 3, mak
2121
LoopVectorization achieved much better performance than all the alternatives, which tended to prefer vectorizing the inner loops.
2222
By making the compilers aware that the `ik` loop is too short to be worth vectorizing, we can get them to vectorize something else instead. By defining the size of `kern` as constant in `C` and `Fortran`, and using size parameters in Julia, we can inform the compilers:
2323
![staticsizefilter](../assets/bench_filter2d_3x3_v1.svg)
24-
Now all are doing much better than they were before, although still well shy of the 131.2 GFLOPS theoretical limit for the host CPU cores. While they all improved, three are lagging behind the main group:
24+
Now all are doing much better than they were before, although still well shy of the 131.2 GFLOPS theoretical limit for the host CPU cores. While they all improved, two are lagging behind the main group:
2525
- `ifort` lags behind all the others except base Julia. I'd need to do more investigation to find out why.
26-
- Providing static size information was enough for all to realize vectorizing the inner loops was not worth it. However, all but base Julia decided to vectorize a different loop instead, while the base Julia version I tested just didn't vectorize at all.
27-
- LoopVectorization currently only unrolls up to 2 loops. To get optimal performance in this problem, if you know the size of the inner loops, you should completely unroll them, and then also partially unroll the outer loops. I'll have to lift that restriction ([tracking issue](https://github.com/chriselrod/LoopVectorization.jl/issues/73)), and also make it aware that unrolling the outer loops is cheap, thanks to the ability to reuse neighboring `A` entries.
26+
- Providing static size information was enough for all to realize vectorizing the inner loops was not worth it. However, all but base Julia decided to vectorize a different loop instead, while the base Julia version I tested just didn't vectorize at all.
2827

29-
Trying to provide hints by manually unrolling produces:
28+
Helping Base Julia out by manually unrolling the inner loops:
3029
![unrolledfilter](../assets/bench_filter2d_unrolled_v1.svg)
31-
This manual unrolling helped both Julia versions, while there was no change in any of the others.
30+
This manual unrolling helped Julia, but had no real impact on any of the others.
3231

src/determinestrategy.jl

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ function register_pressure(op::Operation)
5151
end
5252
function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_T::Int = op.elementbytes)
5353
isconstant(op) && return 0.0, 0, 1
54-
isloopvalue(op) && return 0.0, 0, 1
54+
isloopvalue(op) && return 0.0, 0, 0
5555
# Wshift == dependson(op, vectorized) ? Wshift : 0
5656
# c = first(cost(instruction(op), Wshift, size_T))::Int
5757
instr = Instruction(:LoopVectorization, instruction(op).instr)
@@ -60,6 +60,8 @@ function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_
6060
if instr == Instruction(:-) || instr === Instruction(:vsub) || instr == Instruction(:+) || instr == Instruction(:vadd)
6161
return 0.0, 0, 1
6262
end
63+
elseif iscompute(op) && all(isloopvalue, parents(op))
64+
return 0.0, 0, 1
6365
end
6466
opisvectorized = dependson(op, vectorized)
6567
srt, sl, srp = opisvectorized ? vector_cost(instr, Wshift, size_T) : scalar_cost(instr)
@@ -244,7 +246,8 @@ function tile_cost(X, U, T, UL, TL)
244246
X[1] + X[4] + X[2] * Tfactor + X[3] * Ufactor
245247
end
246248
function solve_tilesize(X, R, UL, TL)
247-
@inbounds any(iszero, (R[1],R[2],R[3])) && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
249+
# @inbounds any(iszero, (R[1],R[2],R[3])) && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
250+
first(iszero(R)) && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
248251
# @inbounds any(iszero, (R[1],R[2],R[3])) && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
249252
# We use a lagrange multiplier to find floating point values for U and T
250253
# first solving for U via quadratic formula
@@ -256,7 +259,9 @@ function solve_tilesize(X, R, UL, TL)
256259
Ufloat = (sqrt(b^2 - 4a*c) - b) / (2a)
257260
Tfloat = (RR - Ufloat*R[2])/(Ufloat*R[1])
258261
# @show Ufloat, Tfloat
259-
(isfinite(Tfloat) && isfinite(Ufloat)) || return -1,-1,Inf
262+
if !(isfinite(Tfloat) && isfinite(Ufloat))
263+
return 4, 4, tile_cost(X, 4, 4, UL, TL)
264+
end
260265
Ulow = max(1, floor(Int, Ufloat)) # must be at least 1
261266
Tlow = max(1, floor(Int, Tfloat)) # must be at least 1
262267
Uhigh = Ulow + 1 #ceil(Int, Ufloat)
@@ -301,7 +306,7 @@ function solve_tilesize_constT(ls, T)
301306
end
302307
# Tiling here is about alleviating register pressure for the UxT
303308
function solve_tilesize(X, R, Umax, Tmax, UL, TL)
304-
first(R) == 0 && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
309+
iszero(first(R)) && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
305310
U, T, cost = solve_tilesize(X, R, UL, TL)
306311
# T -= T & 1
307312
# U = min(U, T)
@@ -482,6 +487,7 @@ function evaluate_cost_tile(
482487
factor = convolution_cost_factor(ls, op, unrolled, tiled, vectorized)
483488
rt *= factor#; rp *= factor;
484489
end
490+
# @show op rt, lat, rp
485491
rp = opisininnerloop ? rp : 0 # we only care about register pressure within the inner most loop
486492
rt *= iters[id]
487493
if isunrolled && istiled # no cost decrease; cost must be repeated

src/lower_store.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ function lower_conditionalstore_vectorized!(
7575
suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned}, isunrolled::Bool
7676
)
7777
loopdeps = loopdependencies(op)
78-
@assert unrolled ∈ loopdeps
78+
@assert vectorized ∈ loopdeps
7979
var = pvariable_name(op, suffix, tiled)
8080
parentisunrolled = unrolled ∈ loopdependencies(first(parents(op)))
8181
if isunrolled

test/ifelsemasks.jl

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ T = Float32
2121
end
2222
z
2323
end
24-
@macroexpand @_avx for i ∈ eachindex(x)
25-
z[i] = (x[i]*x[i] + y[i]*y[i]) < 1
26-
end
24+
# @macroexpand @_avx for i ∈ eachindex(x)
25+
# z[i] = (x[i]*x[i] + y[i]*y[i]) < 1
26+
# end
2727
function promote_bool_storeavx2!(z, x, y)
2828
@avx for i ∈ eachindex(x)
2929
z[i] = (x[i]*x[i] + y[i]*y[i]) < 1 ? 1 : 0
@@ -301,14 +301,14 @@ T = Float32
301301
promote_bool_store!(c1, a, b);
302302
promote_bool_storeavx!(c2, a, b);
303303
@test c1 == c2
304-
fill!(c2, -999999999); promote_bool_store_avx!(c2, a, b)
304+
fill!(c2, -999999999); promote_bool_store_avx!(c2, a, b);
305305
@test c1 == c2
306-
fill!(c2, -999999999); promote_bool_storeavx2!(c2, a, b)
306+
fill!(c2, -999999999); promote_bool_storeavx2!(c2, a, b);
307307
@test c1 == c2
308-
fill!(c2, -999999999); promote_bool_store_avx2!(c2, a, b)
308+
fill!(c2, -999999999); promote_bool_store_avx2!(c2, a, b);
309309
@test c1 == c2
310310

311-
fill!(c2, -999999999); addormul!(c1, a, b)
311+
fill!(c1, 999999999); addormul!(c1, a, b)
312312
fill!(c2, -999999999); addormul_avx!(c2, a, b)
313313
@test c1 ≈ c2
314314
fill!(c2, -999999999); addormulavx!(c2, a, b)
@@ -369,7 +369,7 @@ T = Float32
369369
A = randn(T, K, M);
370370
B = randn(T, K, N);
371371
C1 = randn(T, M, N);
372-
end
372+
end;
373373
C2 = copy(C1); C3 = copy(C1);
374374
AtmulBpos!(C1, A, B)
375375
AtmulBposavx!(C2, A, B)

test/miscellaneous.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
B[j,i] = A[j,i] - x[j]
3434
end)
3535
lssubcol = LoopVectorization.LoopSet(subcolq);
36-
@test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 4, -1)
36+
@test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :i, :j, :j, Unum, Tnum)
3737
## @avx is SLOWER!!!!
3838
## need to fix!
3939
function mysubcol!(B, A, x)
@@ -58,7 +58,7 @@
5858
x[j] += A[j,i] - 0.25
5959
end)
6060
lscolsum = LoopVectorization.LoopSet(colsumq);
61-
@test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 4, -1)
61+
@test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :i, :j, :j, Unum, Tnum)
6262

6363
# my colsum is wrong (by 0.25), but slightly more interesting
6464
function mycolsum!(x, A)
@@ -95,7 +95,7 @@
9595
lsvar = LoopVectorization.LoopSet(varq);
9696
# LoopVectorization.choose_order(lsvar)
9797
# @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 4, -1)
98-
@test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 4, -1)
98+
@test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :i, :j, :j, Unum, Tnum)
9999

100100
function myvar!(s², A, x̄)
101101
@. s² = 0

0 commit comments

Comments
 (0)