Use the Mask type in place of unsigned integers to represent bitmasks, and update to the dependency versions that added more support for Mask; fixes #60.

chriselrod · chriselrod · commit 51a75c94957c · 2020-02-26T08:12:31.000-05:00
diff --git a/Manifest.toml b/Manifest.toml
@@ -49,15 +49,15 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 
 [[SIMDPirates]]
 deps = ["VectorizationBase"]
-git-tree-sha1 = "4b1e0b1442fb4af5e6b93b9c7fdeacf287d2653b"
+git-tree-sha1 = "839625f8699855a7d5ca96be25bc24d71c5c00ff"
 uuid = "21efa798-c60a-11e8-04d3-e1a92915a26a"
-version = "0.5.0"
+version = "0.6.0"
 
 [[SLEEFPirates]]
 deps = ["Libdl", "SIMDPirates", "VectorizationBase"]
-git-tree-sha1 = "769fd039d0835e8e628d61e2f0c80822ba668497"
+git-tree-sha1 = "62368836fef70b461ac005ed0112315222eab5b5"
 uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa"
-version = "0.3.9"
+version = "0.4.0"
 
 [[Serialization]]
 uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
@@ -71,6 +71,6 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [[VectorizationBase]]
 deps = ["CpuId", "LinearAlgebra"]
-git-tree-sha1 = "9f8caaa5d033f88e188f62a3dba0dab5f429447a"
+git-tree-sha1 = "9410db46eeb38d9fb108fae9758713cfafc4cb91"
 uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
-version = "0.5.0"
+version = "0.6.1"
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.6.15"
+version = "0.6.16"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -12,9 +12,9 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
 [compat]
 Parameters = "0"
-SIMDPirates = "~0.5"
-SLEEFPirates = "~0.3.9"
-VectorizationBase = "~0.5"
+SIMDPirates = "~0.6"
+SLEEFPirates = "~0.4"
+VectorizationBase = "~0.6.1"
 julia = "1.1"
 
 [extras]
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
@@ -7,7 +7,7 @@ using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector
     Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange,
     PackedStridedPointer, SparseStridedPointer, RowMajorStridedPointer, StaticStridedPointer, StaticStridedStruct
 using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod,
-    sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vfmadd!, vfnmadd!,
+    sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!,
     vmullog2, vmullog10, vdivlog2, vdivlog10, vmullog2add!, vmullog10add!, vdivlog2add!, vdivlog10add!, vfmaddaddone
 using Base.Broadcast: Broadcasted, DefaultArrayStyle
 using LinearAlgebra: Adjoint, Transpose
diff --git a/src/add_compute.jl b/src/add_compute.jl
@@ -137,7 +137,15 @@ function add_reduction_update_parent!(
         reductcombine = Symbol("")
     end
     combineddeps = copy(deps); mergesetv!(combineddeps, reduceddeps)
-    directdependency && pushparent!(vparents, deps, reduceddeps, reductinit)#parent) # deps and reduced deps will not be disjoint
+    # directdependency && pushparent!(vparents, deps, reduceddeps, reductinit)#parent) # deps and reduced deps will not be disjoint
+    if directdependency
+        if instr ∈ (:-, :vsub!, :vsub, :/, :vfdiv!, :vfidiv!)
+            pushfirst!(vparents, reductinit)
+            update_deps!(deps, reduceddeps, reductinit)#parent) # deps and reduced deps will not be disjoint
+        else
+            push!(vparents, reductinit)
+        end
+    end
     update_reduction_status!(vparents, reduceddeps, name(reductinit))
     # this is the op added by add_compute
     op = Operation(length(operations(ls)), reductsym, elementbytes, instr, compute, deps, reduceddeps, vparents)
diff --git a/src/add_ifelse.jl b/src/add_ifelse.jl
@@ -8,14 +8,25 @@ function add_if!(ls::LoopSet, LHS::Symbol, RHS::Expr, elementbytes::Int, positio
     # for now, just simple 1-liners
     @assert length(RHS.args) == 3 "if statements without an else cannot be assigned to a variable."
     condition = first(RHS.args)
-    condop = add_compute!(ls, gensym(:mask), condition, elementbytes, position, mpref)
+    condop = if mpref === nothing
+        add_operation!(ls, gensym(:mask), condition, elementbytes, position)
+    else
+        add_operation!(ls, gensym(:mask), condition, mpref, elementbytes, position)
+    end
     iftrue = RHS.args[2]
-    (iftrue isa Expr && iftrue.head !== :call) && throw("Only calls or constant expressions are currently supported in if/else blocks.")
-    trueop = add_operation!(ls, Symbol(:iftrue), iftrue, elementbytes, position)
+    trueop = if iftrue isa Expr
+        (iftrue isa Expr && iftrue.head !== :call) && throw("Only calls or constant expressions are currently supported in if/else blocks.")
+        add_operation!(ls, Symbol(:iftrue), iftrue, elementbytes, position)
+    else
+        getop(ls, iftrue, elementbytes)
+    end
     iffalse = RHS.args[3]
-    (iffalse isa Expr && iffalse.head !== :call) && throw("Only calls or constant expressions are currently supported in if/else blocks.")
-    falseop = add_operation!(ls, Symbol(:iffalse), iffalse, elementbytes, position)
-
+    falseop = if iffalse isa Expr
+        (iffalse isa Expr && iffalse.head !== :call) && throw("Only calls or constant expressions are currently supported in if/else blocks.")
+        add_operation!(ls, Symbol(:iffalse), iffalse, elementbytes, position)
+    else
+        getop(ls, iffalse, elementbytes)
+    end
     add_compute!(ls, LHS, :vifelse, [condop, trueop, falseop], elementbytes)
 end
 
@@ -38,7 +49,7 @@ function add_andblock!(ls::LoopSet, condop::Operation, LHS, RHS, elementbytes::I
     add_andblock!(ls, condop, LHS, rhsop, elementbytes, position)
 end
 function add_andblock!(ls::LoopSet, condexpr::Expr, condeval::Expr, elementbytes::Int, position::Int)
-    condop = add_compute!(ls, gensym(:mask), condexpr, elementbytes, position)
+    condop = add_operation!(ls, gensym(:mask), condexpr, elementbytes, position)
     if condeval.head === :call
         @assert first(condeval.args) === :setindex!
         array, raw_indices = ref_from_setindex(condeval)
@@ -79,7 +90,7 @@ function add_orblock!(ls::LoopSet, condop::Operation, LHS, RHS, elementbytes::In
     add_orblock!(ls, condop, LHS, rhsop, elementbytes, position)
 end
 function add_orblock!(ls::LoopSet, condexpr::Expr, condeval::Expr, elementbytes::Int, position::Int)
-    condop = add_compute!(ls, gensym(:mask), condexpr, elementbytes, position)
+    condop = add_operation!(ls, gensym(:mask), condexpr, elementbytes, position)
     if condeval.head === :call
         @assert first(condeval.args) === :setindex!
         array, raw_indices = ref_from_setindex(condeval)
diff --git a/src/costs.jl b/src/costs.jl
@@ -110,8 +110,10 @@ const COST = Dict{Instruction,InstructionCost}(
     Instruction(:vsub) => InstructionCost(4,0.5),
     Instruction(:vadd!) => InstructionCost(4,0.5),
     Instruction(:vsub!) => InstructionCost(4,0.5),
+    Instruction(:vmul!) => InstructionCost(4,0.5),
     Instruction(:vmul) => InstructionCost(4,0.5),
     Instruction(:vfdiv) => InstructionCost(13,4.0,-2.0),
+    Instruction(:vfdiv!) => InstructionCost(13,4.0,-2.0),
     Instruction(:evadd) => InstructionCost(4,0.5),
     Instruction(:evsub) => InstructionCost(4,0.5),
     Instruction(:evmul) => InstructionCost(4,0.5),
@@ -152,6 +154,8 @@ const COST = Dict{Instruction,InstructionCost}(
     Instruction(:vfnmsub) => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
     Instruction(:vfmadd!) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
     Instruction(:vfnmadd!) => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:vfmsub!) => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
+    Instruction(:vfnmsub!) => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
     Instruction(:vfmadd_fast) => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
     Instruction(:vfmsub_fast) => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
     Instruction(:vfnmadd_fast) => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
@@ -212,7 +216,10 @@ const REDUCTION_CLASS = Dict{Symbol,Float64}(
     :* => MULTIPLICATIVE_IN_REDUCTIONS,
     :vadd => ADDITIVE_IN_REDUCTIONS,
     :vsub => ADDITIVE_IN_REDUCTIONS,
+    :vadd! => ADDITIVE_IN_REDUCTIONS,
+    :vsub! => ADDITIVE_IN_REDUCTIONS,
     :vmul => MULTIPLICATIVE_IN_REDUCTIONS,
+    :vmul! => MULTIPLICATIVE_IN_REDUCTIONS,
     :evadd => ADDITIVE_IN_REDUCTIONS,
     :evsub => ADDITIVE_IN_REDUCTIONS,
     :evmul => MULTIPLICATIVE_IN_REDUCTIONS,
@@ -228,6 +235,8 @@ const REDUCTION_CLASS = Dict{Symbol,Float64}(
     :vfnmsub => ADDITIVE_IN_REDUCTIONS,
     :vfmadd! => ADDITIVE_IN_REDUCTIONS,
     :vfnmadd! => ADDITIVE_IN_REDUCTIONS,
+    :vfmsub! => ADDITIVE_IN_REDUCTIONS,
+    :vfnmsub! => ADDITIVE_IN_REDUCTIONS,
     :vfmadd_fast => ADDITIVE_IN_REDUCTIONS,
     :vfmsub_fast => ADDITIVE_IN_REDUCTIONS,
     :vfnmadd_fast => ADDITIVE_IN_REDUCTIONS,
@@ -283,8 +292,11 @@ const FUNCTIONSYMBOLS = Dict{Type{<:Function},Instruction}(
     typeof(Base.FastMath.sub_fast) => :(-),
     typeof(*) => :(*),
     typeof(SIMDPirates.vmul) => :(*),
+    typeof(SIMDPirates.vmul!) => :(*),
     typeof(Base.FastMath.mul_fast) => :(*),
     typeof(/) => :(/),
+    typeof(SIMDPirates.vfdiv) => :(/),
+    typeof(SIMDPirates.vfdiv!) => :(/),
     typeof(SIMDPirates.vdiv) => :(/),
     typeof(Base.FastMath.div_fast) => :(/),
     typeof(==) => :(==),
@@ -306,6 +318,8 @@ const FUNCTIONSYMBOLS = Dict{Type{<:Function},Instruction}(
     typeof(SIMDPirates.vfnmsub) => :vfnmsub,
     typeof(SIMDPirates.vfmadd!) => :vfmadd!,
     typeof(SIMDPirates.vfnmadd!) => :vfnmadd!,
+    typeof(SIMDPirates.vfmsub!) => :vfmsub!,
+    typeof(SIMDPirates.vfnmsub!) => :vfnmsub!,
     typeof(SIMDPirates.vfmadd_fast) => :vfmadd_fast,
     typeof(SIMDPirates.vfmsub_fast) => :vfmsub_fast,
     typeof(SIMDPirates.vfnmadd_fast) => :vfnmadd_fast,
diff --git a/src/graphs.jl b/src/graphs.jl
@@ -290,6 +290,7 @@ Base.length(ls::LoopSet, s::Symbol) = length(getloop(ls, s))
 isstaticloop(ls::LoopSet, s::Symbol) = isstaticloop(getloop(ls,s))
 looprangehint(ls::LoopSet, s::Symbol) = length(getloop(ls, s))
 looprangesym(ls::LoopSet, s::Symbol) = getloop(ls, s).rangesym
+getop(ls::LoopSet, var::Number, elementbytes) = add_constant!(ls, var, elementbytes)
 function getop(ls::LoopSet, var::Symbol, elementbytes::Int)
     get!(ls.opdict, var) do
         add_constant!(ls, var, elementbytes)
diff --git a/src/precompile.jl b/src/precompile.jl
@@ -58,7 +58,7 @@ function _precompile_()
     precompile(Tuple{typeof(Base.Broadcast.broadcasted),Function,Array{Int64,3},LowDimArray{(false, true, true),Int64,3,Array{Int64,3}}})
     precompile(Tuple{typeof(Base.Broadcast.broadcasted),Function,Array{Int64,3},LowDimArray{(true, false, true),Int64,3,Array{Int64,3}}})
     precompile(Tuple{typeof(Base.Broadcast.broadcasted),Function,Array{Int64,3},LowDimArray{(true, true, false),Int64,3,Array{Int64,3}}})
-    precompile(Tuple{typeof(Base.Broadcast.broadcasted),typeof(*ˡ),Array{Float64,2},Array{Float64,1}})
+    precompile(Tuple{typeof(Base.Broadcast.broadcasted),typeof(*ˡ),Array{Int32,2},Array{Int32,1}})
     precompile(Tuple{typeof(Base.Broadcast.broadcasted),typeof(*ˡ),Array{Int64,2},Array{Int64,1}})
     precompile(Tuple{typeof(LoopVectorization._avx_loopset),Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,NTuple{4,DataType}})
     precompile(Tuple{typeof(LoopVectorization._avx_loopset),Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,NTuple{5,DataType}})
diff --git a/test/ifelsemasks.jl b/test/ifelsemasks.jl
@@ -1,4 +1,68 @@
 @testset "ifelse (masks)" begin
+
+    function promote_bool_store!(z, x, y)
+        for i ∈ eachindex(x)
+            z[i] = (x[i]*x[i] + y[i]*y[i]) < 1
+        end
+        z
+    end
+    function promote_bool_storeavx!(z, x, y)
+        @avx for i ∈ eachindex(x)
+            z[i] = (x[i]*x[i] + y[i]*y[i]) < 1
+        end
+        z
+    end
+    function promote_bool_store_avx!(z, x, y)
+        @_avx for i ∈ eachindex(x)
+            z[i] = (x[i]*x[i] + y[i]*y[i]) < 1
+        end
+        z
+    end
+    function promote_bool_storeavx2!(z, x, y)
+        @avx for i ∈ eachindex(x)
+            z[i] = (x[i]*x[i] + y[i]*y[i]) < 1 ? 1 : 0
+        end
+        z
+    end
+    function promote_bool_store_avx2!(z, x, y)
+        @_avx for i ∈ eachindex(x)
+            z[i] = (x[i]*x[i] + y[i]*y[i]) < 1 ? 1 : 0
+        end
+        z
+    end
+
+    function Bernoulli_logit(y::BitVector, α::AbstractVector{T}) where {T}
+        t = zero(promote_type(Float32,T))
+        @inbounds for i ∈ eachindex(α)
+            invOmP = 1 + exp(α[i])
+            nlogOmP = log(invOmP)
+            nlogP = nlogOmP - α[i]
+            t -= y[i] ? nlogP : nlogOmP
+        end
+        t
+    end
+    function Bernoulli_logitavx(y::BitVector, α::AbstractVector{T}) where {T}
+        t = zero(promote_type(Float32,T))
+        @avx for i ∈ eachindex(α)
+            invOmP = 1 + exp(α[i])
+            nlogOmP = log(invOmP)
+            nlogP = nlogOmP - α[i]
+            t -= y[i] ? nlogP : nlogOmP
+        end
+        t
+    end
+    function Bernoulli_logit_avx(y::BitVector, α::AbstractVector{T}) where {T}
+        t = zero(promote_type(Float32,T))
+        @_avx for i ∈ eachindex(α)
+            invOmP = 1 + exp(α[i])
+            nlogOmP = log(invOmP)
+            nlogP = nlogOmP - α[i]
+            t -= y[i] ? nlogP : nlogOmP
+        end
+        t
+    end
+
+
     function addormul!(c, a, b)
         for i ∈ eachindex(c,a,b)
             c[i] = a[i] > b[i] ? a[i] + b[i] : a[i] * b[i]
@@ -227,8 +291,19 @@
             a = rand(T, N); b = rand(T, N);
         end;
         c1 = similar(a); c2 = similar(a);
-        addormul!(c1, a, b)
-        addormul_avx!(c2, a, b)
+
+        promote_bool_store!(c1, a, b)
+        promote_bool_storeavx!(c2, a, b)
+        @test c1 == c2
+        fill!(c2, -999999999); promote_bool_store_avx!(c2, a, b)
+        @test c1 == c2
+        fill!(c2, -999999999); promote_bool_storeavx2!(c2, a, b)
+        @test c1 == c2
+        fill!(c2, -999999999); promote_bool_store_avx2!(c2, a, b)
+        @test c1 == c2
+
+        fill!(c2, -999999999); addormul!(c1, a, b)
+        fill!(c2, -999999999); addormul_avx!(c2, a, b)
         @test c1 ≈ c2
         fill!(c2, -999999999); addormulavx!(c2, a, b)
         @test c1 ≈ c2
@@ -296,4 +371,17 @@
         @test C1 ≈ C2
         @test C1 ≈ C3
     end
+    
+    
+    a = rand(-10:10, 43);
+    bit = a .> 0.5;
+    t = Bernoulli_logit(bit, a);
+    @test t ≈ Bernoulli_logitavx(bit, a)
+    @test t ≈ Bernoulli_logit_avx(bit, a)
+    a = rand(43)
+    bit = a .> 0.5;
+    t = Bernoulli_logit(bit, a);
+    @test t ≈ Bernoulli_logitavx(bit, a)
+    @test t ≈ Bernoulli_logit_avx(bit, a)
+
 end