Fix loopvalue-associated costs in determinestrategy and rerun filter benchmarks.

chriselrod · chriselrod · commit 5ce49bf8788a · 2020-03-15T20:49:31.000-04:00
diff --git a/benchmark/benchmarkflops.jl b/benchmark/benchmarkflops.jl
@@ -25,7 +25,15 @@ function Base.getindex(br::SizedResults, row, col)
     col == 1 ? string(br.sizes[row]) : string(br.results[col - 1, row])
 end
 Base.setindex!(br::BenchmarkResult, v, i...) = br.sizedresults.results[i...] = v
-
+function Base.vcat(br1::BenchmarkResult, br2::BenchmarkResult) # combine two benchmark runs into one result
+    BenchmarkResult(
+        br1.tests, # labels are taken from br1 only; br2.tests is not compared against them
+        SizedResults(
+            SharedMatrix(hcat(br1.sizedresults.results, br2.sizedresults.results)), # results is indexed [test, size], so br2's sizes are appended as columns
+            vcat(br1.sizedresults.sizes, br2.sizedresults.sizes) # sizes listed in the same order as the result columns
+        )
+    )
+end
 
 tothreetuple(i::Int) = (i,i,i)
 tothreetuple(i::NTuple{3,Int}) = i
diff --git a/benchmark/driver.jl b/benchmark/driver.jl
@@ -23,9 +23,9 @@ end
 # sizes = 23:23
 sizes = 256:-1:2
 
-filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes)
-filter2d_3x3_bench = benchmark_filter2d3x3(sizes)
-filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes)
+filter2d_dynamic_bench = benchmark_filter2ddynamic(512:-1:2)
+filter2d_3x3_bench = benchmark_filter2d3x3(512:-1:2)
+filter2d_unrolled_bench = benchmark_filter2dunrolled(512:-1:2)
 
 AmulB_bench = benchmark_AmulB(sizes)
 AmulBt_bench = benchmark_AmulBt(sizes)
diff --git a/benchmark/loadsharedlibs.jl b/benchmark/loadsharedlibs.jl
@@ -16,7 +16,7 @@ const LIBDIRECTCALLJIT = joinpath(LOOPVECBENCHDIR, "libdcjtest.so")
 # requires Clang with polly to build
 cfile = joinpath(LOOPVECBENCHDIR, "looptests.c")
 if !isfile(LIBCTEST) || mtime(cfile) > mtime(LIBCTEST)    
-    run(`clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -mllvm -polly -mllvm -polly-vectorizer=stripmine -shared -fPIC $cfile -o $LIBCTEST`)
+    run(`/usr/local/bin/clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -mllvm -polly -mllvm -polly-vectorizer=stripmine -shared -fPIC $cfile -o $LIBCTEST`)
 end
 if !isfile(LIBICTEST) || mtime(cfile) > mtime(LIBICTEST)
     run(`icc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -shared -fPIC $cfile -o $LIBICTEST`)
diff --git a/benchmark/plotbenchmarks.jl b/benchmark/plotbenchmarks.jl
@@ -1,12 +1,12 @@
 using PrettyTables
 
-const HIGHLIGHT_BEST = Highlighter(
-    (br,i,j) -> (j > 1 && maximum(@view(br.results[:, i])) == br.results[j-1,i]),
-    foreground = :green
-);
 function Base.show(io::IO, br::BenchmarkResult)
+    hb = Highlighter(
+        (br,i,j) -> (j > 1 && maximum(@view(br.results[:, i])) == br.results[j-1,i]),
+        foreground = :green
+    );
     pretty_table(
-        io, br.sizedresults, br.tests, crop = :none, highlighters = (HIGHLIGHT_BEST,)
+        io, br.sizedresults, br.tests, crop = :none, highlighters = (hb,)
     )
 end
 
diff --git a/docs/src/assets/bench_filter2d_3x3_v1.svg b/docs/src/assets/bench_filter2d_3x3_v1.svg
diff --git a/docs/src/assets/bench_filter2d_dynamic_v1.svg b/docs/src/assets/bench_filter2d_dynamic_v1.svg
diff --git a/docs/src/assets/bench_filter2d_unrolled_v1.svg b/docs/src/assets/bench_filter2d_unrolled_v1.svg
diff --git a/docs/src/examples/filtering.md b/docs/src/examples/filtering.md
@@ -21,12 +21,11 @@ These are four nested loops. For all the benchmarks, `kern` was only 3 by 3, mak
 LoopVectorization achieved much better performance than all the alternatives, which tended to prefer vectorizing the inner loops.
 By making the compilers aware that the `ik` loop is too short to be worth vectorizing, we can get them to vectorize something else instead. By defining the size of `kern` as constant in `C` and `Fortran`, and using size parameters in Julia, we can inform the compilers:
 ![staticsizefilter](../assets/bench_filter2d_3x3_v1.svg)
-Now all are doing much better than they were before, although still well shy of the 131.2 GFLOPS theoretical limit for the host CPU cores. While they all improved, three are lagging behind the main group:
+Now all are doing much better than they were before, although still well shy of the 131.2 GFLOPS theoretical limit for the host CPU cores. While they all improved, two are lagging behind the main group:
 - `ifort` lags behind all the others except base Julia. I'd need to do more investigating to find out why.
-- Providing static size information was enough for all to realize vectorizing the inner loops was not worth it. However, all but base Julia decided to vectorize a different loop instead, while the base Julia version I tested just didn't vectorize at all.
-- LoopVectorization currently only unrolls up to 2 loops. To get optimal performance in this problem, if you know the size of the inner loops, you should completely unroll them, and then also partially unroll the outer loops. I'll have to lift that restriction ([tracking issue](https://github.com/chriselrod/LoopVectorization.jl/issues/73)), and also make it aware that unrolling the outer loops is cheap, thanks to the ability to reuse neighboring `A` entries.
+- Providing static size information was enough for all to realize vectorizing the inner loops was not worth it. However, all but base Julia decided to vectorize a different loop instead, while the base Julia version I tested just didn't vectorize at all.
 
-Trying to provide hints by manually unrolling produces:
+Helping base Julia out by manually unrolling the inner loops produces:
 ![unrolledfilter](../assets/bench_filter2d_unrolled_v1.svg)
-This manual unrolling helped both Julia versions, while there was no change in any of the others.
+This manual unrolling helped base Julia, but had no real impact on any of the others.
 
diff --git a/src/determinestrategy.jl b/src/determinestrategy.jl
@@ -51,7 +51,7 @@ function register_pressure(op::Operation)
 end
 function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_T::Int = op.elementbytes)
     isconstant(op) && return 0.0, 0, 1
-    isloopvalue(op) && return 0.0, 0, 1
+    isloopvalue(op) && return 0.0, 0, 0
     # Wshift == dependson(op, vectorized) ? Wshift : 0
     # c = first(cost(instruction(op), Wshift, size_T))::Int
     instr = Instruction(:LoopVectorization, instruction(op).instr)
@@ -60,6 +60,8 @@ function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_
         if instr == Instruction(:-) || instr === Instruction(:vsub) || instr == Instruction(:+) || instr == Instruction(:vadd)
             return 0.0, 0, 1
         end
+    elseif iscompute(op) && all(isloopvalue, parents(op))
+        return 0.0, 0, 1
     end
     opisvectorized = dependson(op, vectorized)
     srt, sl, srp = opisvectorized ? vector_cost(instr, Wshift, size_T) : scalar_cost(instr)
@@ -244,7 +246,8 @@ function tile_cost(X, U, T, UL, TL)
     X[1] + X[4] + X[2] * Tfactor + X[3] * Ufactor
 end
 function solve_tilesize(X, R, UL, TL)
-    @inbounds any(iszero, (R[1],R[2],R[3])) && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
+    # @inbounds any(iszero, (R[1],R[2],R[3])) && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
+    first(iszero(R)) && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
     # @inbounds any(iszero, (R[1],R[2],R[3])) && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
     # We use a lagrange multiplier to find floating point values for U and T
     # first solving for U via quadratic formula
@@ -256,7 +259,9 @@ function solve_tilesize(X, R, UL, TL)
     Ufloat = (sqrt(b^2 - 4a*c) - b) / (2a)
     Tfloat = (RR - Ufloat*R[2])/(Ufloat*R[1])
     # @show Ufloat, Tfloat
-    (isfinite(Tfloat) && isfinite(Ufloat)) || return -1,-1,Inf
+    if !(isfinite(Tfloat) && isfinite(Ufloat))
+        return 4, 4, tile_cost(X, 4, 4, UL, TL)
+    end
     Ulow = max(1, floor(Int, Ufloat)) # must be at least 1
     Tlow = max(1, floor(Int, Tfloat)) # must be at least 1
     Uhigh = Ulow + 1 #ceil(Int, Ufloat)
@@ -301,7 +306,7 @@ function solve_tilesize_constT(ls, T)
 end
 # Tiling here is about alleviating register pressure for the UxT
 function solve_tilesize(X, R, Umax, Tmax, UL, TL)
-    first(R) == 0 && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
+    iszero(first(R)) && return -1,-1,Inf #solve_smalltilesize(X, R, Umax, Tmax)
     U, T, cost = solve_tilesize(X, R, UL, TL)
     # T -= T & 1
     # U = min(U, T)
@@ -482,6 +487,7 @@ function evaluate_cost_tile(
             factor = convolution_cost_factor(ls, op, unrolled, tiled, vectorized)
             rt *= factor#; rp *= factor;
         end
+        # @show op rt, lat, rp
         rp = opisininnerloop ? rp : 0 # we only care about register pressure within the inner most loop
         rt *= iters[id]
         if isunrolled && istiled # no cost decrease; cost must be repeated
diff --git a/src/lower_store.jl b/src/lower_store.jl
@@ -75,7 +75,7 @@ function lower_conditionalstore_vectorized!(
     suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned}, isunrolled::Bool
 )
     loopdeps = loopdependencies(op)
-    @assert unrolled ∈ loopdeps
+    @assert vectorized ∈ loopdeps
     var = pvariable_name(op, suffix, tiled)
     parentisunrolled = unrolled ∈ loopdependencies(first(parents(op)))
     if isunrolled
diff --git a/test/ifelsemasks.jl b/test/ifelsemasks.jl
@@ -21,9 +21,9 @@ T = Float32
         end
         z
     end
-    @macroexpand @_avx for i ∈ eachindex(x)
-            z[i] = (x[i]*x[i] + y[i]*y[i]) < 1
-        end
+    # @macroexpand @_avx for i ∈ eachindex(x)
+            # z[i] = (x[i]*x[i] + y[i]*y[i]) < 1
+        # end
     function promote_bool_storeavx2!(z, x, y)
         @avx for i ∈ eachindex(x)
             z[i] = (x[i]*x[i] + y[i]*y[i]) < 1 ? 1 : 0
@@ -301,14 +301,14 @@ T = Float32
         promote_bool_store!(c1, a, b);
         promote_bool_storeavx!(c2, a, b);
         @test c1 == c2
-        fill!(c2, -999999999); promote_bool_store_avx!(c2, a, b)
+        fill!(c2, -999999999); promote_bool_store_avx!(c2, a, b);
         @test c1 == c2
-        fill!(c2, -999999999); promote_bool_storeavx2!(c2, a, b)
+        fill!(c2, -999999999); promote_bool_storeavx2!(c2, a, b);
         @test c1 == c2
-        fill!(c2, -999999999); promote_bool_store_avx2!(c2, a, b)
+        fill!(c2, -999999999); promote_bool_store_avx2!(c2, a, b);
         @test c1 == c2
 
-        fill!(c2, -999999999); addormul!(c1, a, b)
+        fill!(c1,  999999999); addormul!(c1, a, b)
         fill!(c2, -999999999); addormul_avx!(c2, a, b)
         @test c1 ≈ c2
         fill!(c2, -999999999); addormulavx!(c2, a, b)
@@ -369,7 +369,7 @@ T = Float32
             A = randn(T, K, M);
             B = randn(T, K, N);
             C1 = randn(T, M, N);
-        end
+        end;
         C2 = copy(C1); C3 = copy(C1);
         AtmulBpos!(C1, A, B)
         AtmulBposavx!(C2, A, B)
diff --git a/test/miscellaneous.jl b/test/miscellaneous.jl
@@ -33,7 +33,7 @@
                 B[j,i] = A[j,i] - x[j]
                 end)
     lssubcol = LoopVectorization.LoopSet(subcolq);
-    @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 4, -1)
+    @test LoopVectorization.choose_order(lssubcol) == (Symbol[:j,:i], :i, :j, :j, Unum, Tnum)
     ## @avx is SLOWER!!!!
     ## need to fix!
     function mysubcol!(B, A, x)
@@ -58,7 +58,7 @@
                 x[j] += A[j,i] - 0.25
                 end)
     lscolsum = LoopVectorization.LoopSet(colsumq);
-    @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 4, -1)
+    @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :i, :j, :j, Unum, Tnum)
 
     # my colsum is wrong (by 0.25), but slightly more interesting
     function mycolsum!(x, A)
@@ -95,7 +95,7 @@
     lsvar = LoopVectorization.LoopSet(varq);
     # LoopVectorization.choose_order(lsvar)
     # @test LoopVectorization.choose_order(lscolsum) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 4, -1)
-    @test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :j, Symbol("##undefined##"), :j, 4, -1)
+    @test LoopVectorization.choose_order(lsvar) == (Symbol[:j,:i], :i, :j, :j, Unum, Tnum)
 
     function myvar!(s², A, x̄)
         @. s² = 0

Original file line number	Diff line number	Diff line change
`@@ -75,7 +75,7 @@ function lower_conditionalstore_vectorized!(`
`75`	`75`	`suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned}, isunrolled::Bool`
`76`	`76`	`)`
`77`	`77`	`loopdeps = loopdependencies(op)`
`78`		`- @assert unrolled ∈ loopdeps`
	`78`	`+ @assert vectorized ∈ loopdeps`
`79`	`79`	`var = pvariable_name(op, suffix, tiled)`
`80`	`80`	`parentisunrolled = unrolled ∈ loopdependencies(first(parents(op)))`
`81`	`81`	`if isunrolled`