Big fixes; tests pass locally on latest VectorizationBase and SIMDPirates.

chriselrod · chriselrod · commit 39b069b127a8 · 2020-01-22T11:24:19.000-05:00
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
@@ -17,6 +17,7 @@ export LowDimArray, stridedpointer, vectorizable,
     vmap, vmap!
 
 
+include("map.jl")
 include("costs.jl")
 include("operations.jl")
 include("graphs.jl")
@@ -29,11 +30,9 @@ include("add_ifelse.jl")
 include("broadcast.jl")
 include("determinestrategy.jl")
 include("lowering.jl")
-include("constructors.jl")
-include("map.jl")
-# include("_avx.jl")
 include("condense_loopset.jl")
 include("reconstruct_loopset.jl")
+include("constructors.jl")
 
 export @_avx, _avx, @_avx_, avx_!
 
diff --git a/src/broadcast.jl b/src/broadcast.jl
@@ -38,7 +38,6 @@ end
 
 @inline *ˡ(a::A, b::B) where {A,B} = Product{A,B}(a, b)
 @inline Base.Broadcast.broadcasted(::typeof(*ˡ), a::A, b::B) where {A, B} = Product{A,B}(a, b)
-const ∗ = *ˡ
 # TODO: Need to make this handle A or B being (1 or 2)-D broadcast objects.
 function add_broadcast!(
     ls::LoopSet, mC::Symbol, bcname::Symbol, loopsyms::Vector{Symbol},
diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl
@@ -161,7 +161,7 @@ end
 
 
 # Try to condense in type stable manner
-function generate_call(ls::LoopSet)
+function generate_call(ls::LoopSet, IUT)
     operation_descriptions = Expr(:curly, :Tuple)
     varnames = Symbol[]
     for op ∈ operations(ls)
@@ -176,16 +176,16 @@ function generate_call(ls::LoopSet)
     argmeta = argmeta_and_consts_description(ls, arraysymbolinds)
     loop_bounds = loop_boundaries(ls)
 
-    q = Expr(:call, lv(:_avx_!), operation_descriptions, arrayref_descriptions, argmeta, loop_bounds)
+    q = Expr(:call, lv(:_avx_!), Expr(:call, Expr(:curly, :Val, IUT)), operation_descriptions, arrayref_descriptions, argmeta, loop_bounds)
 
     foreach(ref -> push!(q.args, vptr(ref)), ls.refs_aliasing_syms)
     foreach(is -> push!(q.args, last(is)), ls.preamble_symsym)
     append!(q.args, arraysymbolinds)
     q
 end
 
-function setup_call(ls::LoopSet)
-    call = generate_call(ls)
+function setup_call(ls::LoopSet, inline = one(Int8), U = zero(Int8), T = zero(Int8))
+    call = generate_call(ls, (inline,U,T))
     hasouterreductions = length(ls.outer_reductions) > 0
     if hasouterreductions
         retv = loopset_return_value(ls, Val(false))
@@ -208,7 +208,4 @@ function setup_call(ls::LoopSet)
     ls.preamble
 end
 
-macro _avx(q)
-    esc(setup_call(LoopSet(q)))
-end
 
diff --git a/src/constructors.jl b/src/constructors.jl
@@ -92,26 +92,69 @@ true
 """
 macro avx(q)
     q2 = if q.head === :for
-        lower(LoopSet(q))
+        setup_call(LoopSet(q))
     else# assume broadcast
         substitute_broadcast(q)
     end
     esc(q2)
 end
+
+function check_inline(arg)
+    key = (arg.args[1])::Symbol
+    # Yields the Bool on the right of `inline=...`; any other key yields `nothing`.
+    key === :inline ? (arg.args[2])::Bool : nothing
+end
+function check_tile(arg)
+    a1 = (arg.args[1])::Symbol
+    a1 === :tile || return nothing
+    tup = arg.args[2]  # right-hand side of `tile=(U,T)`; `tup` was previously read without being defined
+    @assert tup.head === :tuple
+    convert(Int8, tup.args[1]), convert(Int8, tup.args[2])  # (U, T) as Int8; `nothing` above for other keys
+end
+function check_unroll(arg)
+    key = (arg.args[1])::Symbol
+    # Yields the value on the right of `unroll=...` converted to Int8; any other key yields `nothing`.
+    key === :unroll ? convert(Int8, arg.args[2]) : nothing
+end
+function check_macro_kwarg(arg, inline::Int8 = one(Int8), U::Int8 = zero(Int8), T::Int8 = zero(Int8))
+    # Fold one `key=value` macro option into the running (inline, U, T) settings and return them.
+    @assert arg.head === :(=)
+    i = check_inline(arg)
+    # `inline=true` maps to 2 and `inline=false` to -1; U and T pass through unchanged.
+    i === nothing || return ((i ? Int8(2) : Int8(-1)), U, T)
+    u = check_unroll(arg)
+    # `unroll=U` pairs the requested factor with T = -1.
+    u === nothing || return (inline, u, Int8(-1))
+    ut = check_tile(arg)
+    if ut === nothing  # unknown key: this used to fail while destructuring `nothing`
+        throw(ArgumentError("unsupported macro option `$(arg.args[1])`; expected inline, unroll, or tile"))
+    end
+    U, T = ut
+    inline, U, T
+end
 macro avx(arg, q)
     @assert q.head === :for
     @assert arg.head === :(=)
-    local U::Int, T::Int
-    if arg.args[1] === :unroll
-        U = arg.args[2]
-        T = -1
-    elseif arg.args[1] === :tile
-        tup = arg.args[2]
-        @assert tup.head === :tuple
-        U = tup.args[1]
-        T = tup.args[2]
-    end
+    inline, U, T = check_macro_kwarg(arg) # parse the single `key=value` option into (inline, U, T)
+    esc(setup_call(LoopSet(q), inline, U, T)) # build the LoopSet and pass the settings on to setup_call
+end
+macro avx(arg1, arg2, q)
+    @assert q.head === :for
+    # Options are folded left to right: arg2 is parsed starting from the settings arg1 produced.
+    inline, U, T = check_macro_kwarg(arg2, check_macro_kwarg(arg1)...)
+    esc(setup_call(LoopSet(q), inline, U, T))
+end
+
+
+
+macro _avx(q)
+    esc(lower(LoopSet(q))) # expands to the expression returned by `lower`, escaped into the caller's scope
+end
+macro _avx(arg, q)
+    @assert q.head === :for
+    inline, U, T = check_macro_kwarg(arg) # NOTE(review): `inline` is not forwarded below, and an `inline=` option leaves U = T = 0 — confirm intended
     esc(lower(LoopSet(q), U, T))
 end
-    
+
+
 
diff --git a/src/lowering.jl b/src/lowering.jl
@@ -1001,22 +1001,41 @@ end
 
 
 
+function maybeinline!(q, ls, istiled, prependinlineORorUnroll)
+    # Prepends an :inline or :noinline meta expression to `q.args` according to the mode, then returns `q`.
+    # NOTE(review): `|` binds tighter than `>`, so the mode-1 test is `(!istiled | n) > 1`,
+    # which equals `n > 1` for Bool `istiled`; confirm `!istiled || n > 1` was not intended.
+    mode = prependinlineORorUnroll
+    if mode == 2 || (mode == 1 && (!istiled | length(ls.outer_reductions)) > 1)
+        pushfirst!(q.args, Expr(:meta, :inline))
+    elseif mode == -1
+        pushfirst!(q.args, Expr(:meta, :noinline))
+    end
+    q
+end
 # Here, we have to figure out how to convert the loopset into a vectorized expression.
 # This must traverse in a parent -> child pattern
 # but order is also dependent on which loop inds they depend on.
-# Requires sorting 
-function lower(ls::LoopSet)
+# Requires sorting
+# values for prependinlineORorUnroll:
+# -1 : force @noinline
+# 0 : nothing
+# 1 : inline if length(ls.outer_reductions) > 1
+# 2 : force inline
+function lower(ls::LoopSet, prependinlineORorUnroll = 0) # second argument is the inline mode handed to maybeinline!
     order, vectorized, U, T = choose_order(ls)
     istiled = T != -1
     fillorder!(ls, order, istiled)
-    istiled ? lower_tiled(ls, vectorized, U, T) : lower_unrolled(ls, vectorized, U)
+    q = istiled ? lower_tiled(ls, vectorized, U, T) : lower_unrolled(ls, vectorized, U) # U and T here come from choose_order
+    maybeinline!(q, ls, istiled, prependinlineORorUnroll) # maybeinline! returns q, which is this method's result
 end
-function lower(ls::LoopSet, U, T = -1)
+function lower(ls::LoopSet, U, T, prependinlineORorUnroll = 0) # U and T are caller-supplied; T == -1 selects the unrolled path
     num_loops(ls) == 1 && @assert T == -1
     order, vectorized, _U, _T = choose_order(ls)
     istiled = T != -1
     fillorder!(ls, order, istiled)
-    istiled ? lower_tiled(ls, vectorized, U, T) : lower_unrolled(ls, vectorized, U)
+    q = istiled ? lower_tiled(ls, vectorized, Int(U), Int(T)) : lower_unrolled(ls, vectorized, Int(U)) # choose_order's _U, _T are unused; U and T are converted to Int
+    maybeinline!(q, ls, istiled, prependinlineORorUnroll) # maybeinline! returns q, which is this method's result
 end
 
 Base.convert(::Type{Expr}, ls::LoopSet) = lower(ls)
diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl
@@ -190,7 +190,7 @@ function sizeofeltypes(v, num_arrays)::Int
     sizeof(T)
 end
 
-function avx_body(instr, ops, arf, AM, LB, vargs)
+function avx_body(IUT, instr, ops, arf, AM, LB, vargs)
     ls = LoopSet()
     # elementbytes = mapreduce(elbytes, min, @view(vargs[Base.OneTo(length(arf))]))::Int
     num_arrays = length(arf)
@@ -203,20 +203,20 @@ function avx_body(instr, ops, arf, AM, LB, vargs)
     add_ops!(ls, instr, ops, mrefs, opsymbols, elementbytes)
     add_array_symbols!(ls, arraysymbolinds, num_arrays + length(ls.preamble_symsym))
     pushpreamble!(ls, Expr(:(=), ls.T, Expr(:call, :promote_type, [Expr(:call, :eltype, vptr(mref)) for mref ∈ mrefs]...)))
-    q = lower(ls)
-    push!(q.args, loopset_return_value(ls, Val(true)))
+    inline, U, T = IUT
+    q = iszero(U) ? lower(ls, inline) : lower(ls, U, T, inline)
+    length(ls.outer_reductions) > 0 ? push!(q.args, loopset_return_value(ls, Val(true))) : push!(q.args, nothing)
     # @show q
-    length(ls.outer_reductions) > 1 && pushfirst!(q.args, Expr(:meta, :inline))
     q
 end
 
-@generated function _avx_!(::Type{OPS}, ::Type{ARF}, ::Type{AM}, lb::LB, vargs...) where {OPS, ARF, AM, LB}
+@generated function _avx_!(::Val{IUT}, ::Type{OPS}, ::Type{ARF}, ::Type{AM}, lb::LB, vargs...) where {IUT, OPS, ARF, AM, LB}
     OPSsv = OPS.parameters
     nops = length(OPSsv) ÷ 3
     instr = Instruction[Instruction(OPSsv[3i+1], OPSsv[3i+2]) for i ∈ 0:nops-1]
     ops = OperationStruct[ OPSsv[3i] for i ∈ 1:nops ]
     avx_body(
-        instr, ops,
+        IUT, instr, ops,
         ArrayRefStruct[ARF.parameters...],
         AM.parameters, LB.parameters, vargs
     )       
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -355,47 +355,76 @@ using LinearAlgebra
                 AmulB!(C2, A, B)
                 AmulBavx1!(C, A, B)
                 @test C ≈ C2
+                fill!(C, 999.99); AmulBavx1!(C, At', B)
+                @test C ≈ C2
                 fill!(C, 999.99); AmulBavx2!(C, A, B)
                 @test C ≈ C2
+                fill!(C, 999.99); AmulBavx2!(C, At', B)
+                @test C ≈ C2
                 fill!(C, 999.99); AmulBavx3!(C, A, B)
                 @test C ≈ C2
+                fill!(C, 999.99); AmulBavx3!(C, At', B)
+                @test C ≈ C2
                 fill!(C, 0.0); AmuladdBavx!(C, A, B)
                 @test C ≈ C2
-                AmuladdBavx!(C, A, B)
+                AmuladdBavx!(C, At', B)
                 @test C ≈ 2C2
                 AmuladdBavx!(C, A, B, -1)
                 @test C ≈ C2
+                AmuladdBavx!(C, At', B, -2)
+                @test C ≈ -C2
                 fill!(C, 9999.999); AtmulBavx!(C, At, B)
                 @test C ≈ C2
+                fill!(C, 9999.999); AtmulBavx!(C, A', B)
+                @test C ≈ C2
                 fill!(C, 9999.999); mulCAtB_2x2blockavx!(C, At, B);
                 @test C ≈ C2
+                fill!(C, 9999.999); mulCAtB_2x2blockavx!(C, A', B);
+                @test C ≈ C2
             end
             @time @testset "_avx $T gemm" begin
-                fill!(C, 999.99); AmulB_avx1!(C, A, B)
+                fill!(C, 999.99); AmulB_avx1!(C, A, B) # reset C first: it still equals C2 from the previous testset
+                @test C ≈ C2
+                fill!(C, 999.99); AmulB_avx1!(C, At', B)
                 @test C ≈ C2
                 fill!(C, 999.99); AmulB_avx2!(C, A, B)
                 @test C ≈ C2
+                fill!(C, 999.99); AmulB_avx2!(C, At', B)
+                @test C ≈ C2
                 fill!(C, 999.99); AmulB_avx3!(C, A, B)
                 @test C ≈ C2
+                fill!(C, 999.99); AmulB_avx3!(C, At', B)
+                @test C ≈ C2
                 fill!(C, 0.0); AmuladdB_avx!(C, A, B)
                 @test C ≈ C2
-                AmuladdB_avx!(C, A, B)
+                AmuladdB_avx!(C, At', B)
                 @test C ≈ 2C2
                 AmuladdB_avx!(C, A, B, -1)
                 @test C ≈ C2
+                AmuladdB_avx!(C, At', B, -2)
+                @test C ≈ -C2
                 fill!(C, 9999.999); AtmulB_avx!(C, At, B)
                 @test C ≈ C2
+                fill!(C, 9999.999); AtmulB_avx!(C, A', B)
+                @test C ≈ C2
                 fill!(C, 9999.999); mulCAtB_2x2block_avx!(C, At, B);
                 @test C ≈ C2
+                fill!(C, 9999.999); mulCAtB_2x2block_avx!(C, A', B);
+                @test C ≈ C2
             end
 
             @time @testset "$T rank2mul" begin
                 Aₘ= rand(R, M, 2); Aₖ = rand(R, 2, K);
+                Aₖ′ = copy(Aₖ') # materialized copy of Aₖ', so `Aₖ′'` below has the same values as Aₖ behind an adjoint wrapper
                 rank2AmulB!(C2, Aₘ, Aₖ, B)
                 rank2AmulBavx!(C, Aₘ, Aₖ, B)
                 @test C ≈ C2
                 fill!(C, 9999.999); rank2AmulB_avx!(C, Aₘ, Aₖ, B)
                 @test C ≈ C2
+                fill!(C, 9999.999); rank2AmulBavx!(C, Aₘ, Aₖ′', B) # same product as above with the adjoint-wrapped input
+                @test C ≈ C2
+                fill!(C, 9999.999); rank2AmulB_avx!(C, Aₘ, Aₖ′', B) # same product as above with the adjoint-wrapped input
+                @test C ≈ C2
             end
 
         end
@@ -456,20 +485,51 @@ using LinearAlgebra
         end
         s
     end
-    function dot_unroll2(x::Vector{T}, y::Vector{T}) where {T<:AbstractFloat}
+    function dot_unroll2avx(x::Vector{T}, y::Vector{T}) where {T<:AbstractFloat}
         z = zero(T)
         @avx unroll=2 for i ∈ 1:length(x)
             z += x[i]*y[i]
         end
         return z
     end
-    function dot_unroll3(x::Vector{T}, y::Vector{T}) where {T<:AbstractFloat}
+    @macroexpand @avx unroll=2 for i ∈ 1:length(x) # NOTE(review): the expansion result is unused here — possibly leftover debugging
+            z += x[i]*y[i]
+        end
+    function dot_unroll3avx(x::Vector{T}, y::Vector{T}) where {T<:AbstractFloat}
         z = zero(T)
         @avx unroll=3 for i ∈ 1:length(x)
             z += x[i]*y[i]
         end
         return z
     end
+    function dot_unroll2avx_noinline(x::Vector{T}, y::Vector{T}) where {T<:AbstractFloat}
+        z = zero(T)
+        @avx inline=false unroll=2 for i ∈ 1:length(x) # `inline=false` matches the `_noinline` name; inline option given first
+            z += x[i]*y[i]
+        end
+        return z
+    end
+    function dot_unroll3avx_inline(x::Vector{T}, y::Vector{T}) where {T<:AbstractFloat}
+        z = zero(T)
+        @avx unroll=3 inline=true for i ∈ 1:length(x) # `inline=true` matches the `_inline` name; unroll option given first
+            z += x[i]*y[i]
+        end
+        return z
+    end
+    function dot_unroll2_avx(x::Vector{T}, y::Vector{T}) where {T<:AbstractFloat}
+        z = zero(T)
+        @_avx unroll=2 for i ∈ 1:length(x) # exercises the two-argument `@_avx` form with an unroll option of 2
+            z += x[i]*y[i]
+        end
+        return z
+    end
+    function dot_unroll3_avx(x::Vector{T}, y::Vector{T}) where {T<:AbstractFloat}
+        z = zero(T)
+        @_avx unroll=3 for i ∈ 1:length(x) # exercises the two-argument `@_avx` form with an unroll option of 3
+            z += x[i]*y[i]
+        end
+        return z
+    end
     function complex_dot_soa(
         xre::AbstractVector{T}, xim::AbstractVector{T},
         yre::AbstractVector{T}, yim::AbstractVector{T}
@@ -495,8 +555,12 @@ using LinearAlgebra
         s = mydot(a,b)
         @test mydotavx(a,b) ≈ s
         @test mydot_avx(a,b) ≈ s
-        @test dot_unroll2(a,b) ≈ s
-        @test dot_unroll3(a,b) ≈ s
+        @test dot_unroll2avx(a,b) ≈ s
+        @test dot_unroll3avx(a,b) ≈ s
+        @test dot_unroll2_avx(a,b) ≈ s
+        @test dot_unroll3_avx(a,b) ≈ s
+        @test dot_unroll2avx_noinline(a,b) ≈ s
+        @test dot_unroll3avx_inline(a,b) ≈ s
         s = myselfdot(a)
         @test myselfdotavx(a) ≈ s
         @test myselfdot_avx(a) ≈ s