
Commit 07e71a1

Added image filtering to documentation.
1 parent 134ee6f commit 07e71a1

8 files changed: +41 −6 lines changed

benchmark/looptests.jl

Lines changed: 1 addition & 1 deletion

```diff
@@ -244,7 +244,7 @@ function randomaccess(P, basis, coeffs::Vector{T}) where {T}
         end
         p += pc
     end
-    return p
+    return p
 end
 function randomaccessavx(P, basis, coeffs::Vector{T}) where {T}
     C = length(coeffs)
```

docs/make.jl

Lines changed: 2 additions & 1 deletion

```diff
@@ -10,7 +10,8 @@ makedocs(;
             "examples/matrix_multiplication.md",
             "examples/matrix_vector_ops.md",
             "examples/dot_product.md",
-            "examples/sum_of_squared_error.md"
+            "examples/sum_of_squared_error.md",
+            "examples/filtering.md"
         ],
         "Vectorized Convenience Functions" => "vectorized_convenience_functions.md",
         "Future Work" => "future_work.md",
```

docs/src/assets/bench_filter2d_3x3_v1.svg

Lines changed: 1 addition & 1 deletion

docs/src/examples/filtering.md

Lines changed: 32 additions & 0 deletions (new file)

# Image Filtering

Here, we convolve a small matrix `kern` with a larger matrix `A`, storing the results in `out`:

```julia
function filter2davx!(out::AbstractMatrix, A::AbstractMatrix, kern)
    rng1k, rng2k = axes(kern)
    rng1, rng2 = axes(out)
    @avx for j in rng2, i in rng1
        tmp = zero(eltype(out))
        for jk in rng2k, ik in rng1k
            tmp += A[i+ik,j+jk]*kern[ik,jk]
        end
        out[i,j] = tmp
    end
    out
end
```
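
The indexing `A[i+ik,j+jk]` stays in bounds only when the axes of `kern`, `out`, and `A` line up. The setup code isn't shown in this commit; one way to construct compatible inputs, a sketch using `OffsetArrays` with illustrative sizes, is:

```julia
using LoopVectorization, OffsetArrays

A    = rand(100, 100);
kern = OffsetArray(rand(3, 3), -1:1, -1:1);          # kernel axes are -1:1, so i+ik spans i-1:i+1
out  = OffsetArray(similar(A, size(A) .- 2), 1, 1);  # out axes are 2:99, keeping A[i+ik,j+jk] in bounds

filter2davx!(out, A, kern)
```
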
The function contains four nested loops. For all the benchmarks, `kern` was only 3 by 3, making it too small for vectorizing the inner (kernel) loops to be particularly profitable. By vectorizing the `i` loop instead, the code can benefit from SIMD and also avoid a reduction (horizontal addition) of a vector before storing into `out`, since the vectors can then be stored directly.
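
For comparison, the autovectorizing compilers in these benchmarks start from a plain loop nest, essentially the same code without the macro. A minimal sketch follows; the name `filter2d!` and the `@inbounds @fastmath` annotations are assumptions, not part of the commit:

```julia
# Scalar reference with the same axis assumptions as filter2davx! above.
# Compiler heuristics tend to vectorize the short inner ik loop of this version.
function filter2d!(out::AbstractMatrix, A::AbstractMatrix, kern)
    rng1k, rng2k = axes(kern)
    rng1, rng2 = axes(out)
    @inbounds @fastmath for j in rng2, i in rng1
        tmp = zero(eltype(out))
        for jk in rng2k, ik in rng1k
            tmp += A[i+ik,j+jk]*kern[ik,jk]
        end
        out[i,j] = tmp
    end
    out
end
```
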
![dynamicfilter](../assets/bench_filter2d_dynamic_v1.svg)

LoopVectorization achieved much better performance than all the alternatives, which tended to prefer vectorizing the inner loops.

By making the compilers aware that the `ik` loop is too short to be worth vectorizing, we can get them to vectorize something else instead. Defining the size of `kern` as a compile-time constant in C and Fortran, and using size parameters in Julia, informs the compilers:

![staticsizefilter](../assets/bench_filter2d_3x3_v1.svg)
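
The statically sized Julia variants themselves aren't included in this commit. As a rough illustration of the idea, and not the same mechanism as the size-parameter types just mentioned, hard-coding the kernel's ranges gives the compiler fixed trip counts:

```julia
# Sketch only: the 3×3 extent is hard-coded, and kern is assumed to have
# axes -1:1 × -1:1 as in the OffsetArrays setup above.
function filter2d_3x3!(out::AbstractMatrix, A::AbstractMatrix, kern)
    @inbounds @fastmath for j in axes(out, 2), i in axes(out, 1)
        tmp = zero(eltype(out))
        for jk in -1:1, ik in -1:1   # fixed, known trip counts
            tmp += A[i+ik,j+jk]*kern[ik,jk]
        end
        out[i,j] = tmp
    end
    out
end
```
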

Now all are doing much better than they were before, although still well shy of the 131.2 GFLOPS theoretical limit for the host CPU cores. While they all improved, three lag behind the main group:

- `ifort` lags behind all the others except base Julia. I'd need to do more investigating to find out why.
- Providing static size information was enough for all the compilers to realize that vectorizing the inner loops was not worth it. However, all but base Julia then vectorized a different loop instead, while the base Julia version I tested simply didn't vectorize at all.
- LoopVectorization currently only unrolls up to 2 loops. To get optimal performance on this problem, if you know the size of the inner loops, you should completely unroll them and then also partially unroll the outer loops. I'll have to lift that restriction ([tracking issue](https://github.com/chriselrod/LoopVectorization.jl/issues/73)), and also make it aware that unrolling the outer loops is cheap, thanks to the ability to reuse neighboring `A` entries.

Trying to provide hints by manually unrolling produces:

![unrolledfilter](../assets/bench_filter2d_unrolled_v1.svg)

This manual unrolling helped both Julia versions, while there was no change in any of the others.
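
The manually unrolled variants themselves (such as the `avx2dunrolled3x3!` exercised in `test/offsetarrays.jl` below) aren't reproduced in this documentation page; a sketch of what fully unrolling the 3×3 kernel can look like, under the same axis assumptions as above and with a hypothetical name, is:

```julia
# Illustrative only: kern is assumed to have axes -1:1 × -1:1.
function filter2dunrolled!(out::AbstractMatrix, A::AbstractMatrix, kern)
    # Hoist the nine kernel coefficients into locals.
    kmm, kzm, kpm = kern[-1,-1], kern[0,-1], kern[1,-1]
    kmz, kzz, kpz = kern[-1, 0], kern[0, 0], kern[1, 0]
    kmp, kzp, kpp = kern[-1, 1], kern[0, 1], kern[1, 1]
    @inbounds @fastmath for j in axes(out, 2), i in axes(out, 1)
        tmp  = A[i-1,j-1]*kmm + A[i,j-1]*kzm + A[i+1,j-1]*kpm
        tmp += A[i-1,j  ]*kmz + A[i,j  ]*kzz + A[i+1,j  ]*kpz
        tmp += A[i-1,j+1]*kmp + A[i,j+1]*kzp + A[i+1,j+1]*kpp
        out[i,j] = tmp
    end
    out
end
```
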

docs/src/examples/matrix_multiplication.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -5,7 +5,7 @@ One of the friendliest problems for vectorization is matrix multiplication. Give
 LoopVectorization currently doesn't do any memory-modeling or memory-based optimizations, so it will still run into problems as the size of matrices increases. But at smaller sizes, it's capable of achieving a healthy percent of potential GFLOPS.
 We can write a single function:
 ```julia
-@inline function A_mul_B!(𝐂, 𝐀, 𝐁)
+function A_mul_B!(𝐂, 𝐀, 𝐁)
     @avx for m ∈ 1:size(𝐀,1), n ∈ 1:size(𝐁,2)
         𝐂ₘₙ = zero(eltype(𝐂))
         for k ∈ 1:size(𝐀,2)
````

docs/src/index.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -11,6 +11,7 @@ Pages = [
     "examples/matrix_multiplication.md",
     "examples/matrix_vector_ops.md",
     "examples/dot_product.md",
+    "examples/filtering.md",
     "examples/sum_of_squared_error.md",
     "vectorized_convenience_functions.md",
     "future_work.md",
```

src/LoopVectorization.jl

Lines changed: 1 addition & 1 deletion

```diff
@@ -7,7 +7,7 @@ using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector
     Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange, unwrap, maybestaticrange,
     PackedStridedPointer, SparseStridedPointer, RowMajorStridedPointer, StaticStridedPointer, StaticStridedStruct,
     maybestaticfirst, maybestaticlast
-using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod,
+using SIMDPirates: VECTOR_SYMBOLS, evadd, evsub, evmul, evfdiv, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod,
     sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!,
     vfmadd231, vfmsub231, vfnmadd231, vfnmsub231, sizeequivalentfloat, sizeequivalentint, #prefetch,
     vmullog2, vmullog10, vdivlog2, vdivlog10, vmullog2add!, vmullog10add!, vdivlog2add!, vdivlog10add!, vfmaddaddone
```

test/offsetarrays.jl

Lines changed: 2 additions & 1 deletion

```diff
@@ -171,7 +171,8 @@ using Test
 
     fill!(out3, NaN); avx2dunrolled3x3!(out3, A, skern);
     @test out1 ≈ out3
-end
+
+end
 
 
 end
```
