Add a few more docstrings #70

Merged 1 commit on Mar 8, 2020
7 changes: 4 additions & 3 deletions docs/make.jl
@@ -14,12 +14,13 @@ makedocs(;
],
"Vectorized Convenience Functions" => "vectorized_convenience_functions.md",
"Future Work" => "future_work.md",
"API reference" => "api.md",
"Developer Documentation" => [
"devdocs/overview.md",
"devdocs/loopset_structure.md",
"devdocs/constructing_loopsets.md",
"devdocs/evaluating_loops.md",
"devdocs/lowering.md"
"devdocs/constructing_loopsets.md",
"devdocs/evaluating_loops.md",
"devdocs/lowering.md"
]
],
# repo="https://github.com/chriselrod/LoopVectorization.jl/blob/{commit}{path}#L{line}",
26 changes: 26 additions & 0 deletions docs/src/api.md
@@ -0,0 +1,26 @@
# API reference

## Macros

```@docs
@avx
@_avx
```

## `map`-like constructs

```@docs
vmap
vmap!
vmapnt
vmapnt!
vmapntt
vmapntt!
```

## `filter`-like constructs

```@docs
vfilter
vfilter!
```
12 changes: 12 additions & 0 deletions src/LoopVectorization.jl
@@ -47,6 +47,18 @@ include("condense_loopset.jl")
include("reconstruct_loopset.jl")
include("constructors.jl")

"""
LoopVectorization provides macros and functions that combine SIMD vectorization and
loop-reordering so as to improve performance:

- [`@avx`](@ref): transform `for`-loops and broadcasting
- [`@_avx`](@ref): similar to `@avx` but does not use type information
- [`vmap`](@ref) and `vmap!`: vectorized versions of `map` and `map!`
- [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!`
- [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!`
- [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!`
"""
LoopVectorization

include("precompile.jl")
_precompile_()
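For orientation, a minimal sketch of the kind of loop these entry points target; `mydot` and the arrays `a` and `b` are illustrative names, not part of the package:

```julia
using LoopVectorization

# A simple reduction; @avx may reorder and SIMD-vectorize this loop.
function mydot(a, b)
    s = zero(eltype(a))
    @avx for i in eachindex(a, b)
        s += a[i] * b[i]
    end
    s
end

a = rand(256); b = rand(256);
mydot(a, b) ≈ a' * b  # expected to hold up to floating-point rounding
```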
19 changes: 17 additions & 2 deletions src/broadcast.jl
@@ -40,6 +40,21 @@ end
# recursive_eltype(ARGS)
# end

"""
A *ˡ B

A lazy product of `A` and `B`. While functionally identical to `A * B`, this may avoid the
need for intermediate storage for any computations in `A` or `B`. Example:

@avx @. a + B *ˡ (c + d')

which is equivalent to

a .+ B * (c .+ d')

It should only be used inside an `@avx` block, and it cannot be the final operation if the
result is to be materialized.
"""
@inline *ˡ(a::A, b::B) where {A,B} = Product{A,B}(a, b)
@inline Base.Broadcast.broadcasted(::typeof(*ˡ), a::A, b::B) where {A, B} = Product{A,B}(a, b)
# TODO: Need to make this handle A or B being (1 or 2)-D broadcast objects.
@@ -88,7 +103,7 @@ function add_broadcast!(
reductop = Operation(
ls, mC, elementbytes, :vfmadd231, compute, reductdeps, kvec, Operation[loadA, loadB, setC]
)
reductop = pushop!(ls, reductop, mC)
reductop = pushop!(ls, reductop, mC)
reductfinal = Operation(
ls, mCt, elementbytes, :reduce_to_add, compute, cloopsyms, kvec, Operation[reductop, targetC]
)
@@ -123,7 +138,7 @@
parent = gensym(:parent)
pushpreamble!(ls, Expr(:(=), parent, Expr(:call, :parent, bcname)))
ref = ArrayReference(parent, Symbol[loopsyms[N + 1 - n] for n ∈ 1:N])
add_simple_load!( ls, destname, ref, elementbytes, true, true )::Operation
add_simple_load!( ls, destname, ref, elementbytes, true, true )::Operation
end
function add_broadcast_adjoint_array!(
ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol}, ::Type{<:AbstractVector}, elementbytes::Int
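A short sketch of how the lazy product documented above might be used, assuming `*ˡ` is in scope and the array shapes are compatible:

```julia
using LoopVectorization

B = rand(4, 4); a = rand(4); c = rand(4); d = rand(4);

# B *ˡ (c + d') stays lazy, so the product is fused into the surrounding
# broadcast instead of being materialized separately.
e = @avx @. a + B *ˡ (c + d')

e ≈ a .+ B * (c .+ d')  # expected to hold up to rounding
```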
11 changes: 6 additions & 5 deletions src/constructors.jl
@@ -66,7 +66,8 @@ Annotate a `for` loop, or a set of nested `for` loops whose bounds are constant
end
end

The macro models the set of nested loops, and chooses a
The macro models the set of nested loops, and chooses an ordering of the three loops to
minimize predicted computation time.

It may also apply to broadcasts:

@@ -81,7 +82,7 @@ julia> c = similar(b);

julia> @avx @. c = exp(2a);

julia> b ≈ c
julia> b ≈ c
true
```

@@ -148,8 +149,9 @@ end
"""
@_avx

This macro transforms loops, making default assumptions rather than punting to a generated
function that is often capable of using type information in place of some assumptions.
This macro transforms loops similarly to [`@avx`](@ref).
While `@avx` punts to a generated function to enable type-based analysis, `@_avx`
works directly on the expression, which requires it to make a number of default assumptions.
"""
macro _avx(q)
esc(lower(LoopSet(q, __module__)))
@@ -164,4 +166,3 @@
macro avx_debug(q)
esc(LoopVectorization.setup_call_debug(LoopSet(q, __module__)))
end

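To contrast the two macros documented in this file, a hypothetical pair of sum functions (the names, element type, and array length are illustrative):

```julia
using LoopVectorization

# @avx defers to a generated function with type information;
# @_avx analyzes only the expression and falls back on default assumptions.
function sum_avx(x)
    s = zero(eltype(x))
    @avx for i in 1:length(x)
        s += x[i]
    end
    s
end

function sum_underscore_avx(x)
    s = zero(eltype(x))
    @_avx for i in 1:length(x)
        s += x[i]
    end
    s
end

x = rand(Float32, 512)
sum_avx(x) ≈ sum_underscore_avx(x) ≈ sum(x)  # both should agree with Base.sum
```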
5 changes: 2 additions & 3 deletions src/costs.jl
@@ -68,7 +68,7 @@ function vector_cost(ic::InstructionCost, Wshift, sizeof_T)
else # we assume custom cost, and that latency == recip_throughput
scaling = ic.scaling
sl, srt = round(Int,scaling), scaling
end
end
srt, sl, srp
end
# instruction_cost(instruction::Symbol) = get(COST, instruction, OPAQUE_INSTRUCTION)
@@ -278,7 +278,7 @@ function reduction_combine_to(x::Float64)
x == 1.0 ? :reduce_to_add : x == 2.0 ? :reduce_to_prod : x == 5.0 ? :reduce_to_max : x == 6.0 ? :reduce_to_min : throw("Reduction not found.")
end
reduction_combine_to(x) = reduction_combine_to(reduction_instruction_class(x))
function reduction_zero(x::Float64)
function reduction_zero(x::Float64)
# x == 1.0 ? :zero : x == 2.0 ? :one : x == 3.0 ? :false : x == 4.0 ? :true : x == 5.0 ? :typemin : x == 6.0 ? :typemax : throw("Reduction not found.")
x == 1.0 ? :zero : x == 2.0 ? :one : x == 5.0 ? :typemin : x == 6.0 ? :typemax : throw("Reduction not found.")
end
@@ -373,4 +373,3 @@ const FUNCTIONSYMBOLS = Dict{Type{<:Function},Instruction}(
typeof(ifelse) => :vifelse,
typeof(vifelse) => :vifelse
)

13 changes: 13 additions & 0 deletions src/filter.jl
@@ -30,3 +30,16 @@ if (Base.libllvm_version ≥ v"7" && VectorizationBase.AVX512F) || Base.libllvm_
end
vfilter(f, y) = filter(f, y)

"""
vfilter(f, a::AbstractArray)

SIMD-vectorized `filter`, returning an array containing the elements of `a` for which `f` returns `true`.
"""
vfilter

"""
vfilter!(f, a::AbstractArray)

SIMD-vectorized `filter!`, removing the elements of `a` for which `f` returns `false`.
"""
vfilter!
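A brief usage sketch for the two filters documented above, assuming a plain `Vector{Float64}` input and a simple predicate:

```julia
using LoopVectorization

x = randn(200)

keep = v -> v > 0.0      # a simple, SIMD-friendly predicate
y = vfilter(keep, x)     # new array holding the elements where the predicate holds
vfilter!(keep, x)        # in-place counterpart, analogous to Base.filter!

y == x                   # both should now hold exactly the positive elements, in order
```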
42 changes: 35 additions & 7 deletions src/map.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@


# Expression-generator for vmap!
function vmap_quote(N, ::Type{T}) where {T}
W, Wshift = VectorizationBase.pick_vector_width_shift(T)
val = Expr(:call, Expr(:curly, :Val, W))
@@ -21,7 +20,15 @@ function vmap_quote(N, ::Type{T}) where {T}
push!(q.args, :dest)
q
end
"""
vmap!(f, destination, a::AbstractArray)
vmap!(f, destination, a::AbstractArray, b::AbstractArray, ...)

SIMD-vectorized `map!`, applying `f` to each element of `a` (or to paired elements of `a`, `b`, ...)
and storing the result in `destination`.
"""
@generated function vmap!(f::F, dest::AbstractArray{T}, args::Vararg{<:AbstractArray,N}) where {F,T,N}
# do not change argnames here without compensatory changes in vmap_quote
vmap_quote(N, T)
end

@@ -112,7 +119,7 @@ function vmapnt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,A
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
end
while i < N - (W - 1) # stops at 16 when
while i < N - (W - 1) # stops at 16 when
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
end
if i < N
@@ -125,7 +132,7 @@ end
"""
vmapntt!(::Function, dest, args...)

Like `vmapnt!` (see `vmapnt!`), but but uses `Threads.@threads` for parallel execution.
Like `vmapnt!` (see `vmapnt!`), but uses `Threads.@threads` for parallel execution.
"""
function vmapntt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,A}
ptry, ptrargs, N = alignstores!(f, y, args...)
@@ -142,7 +149,7 @@ function vmapntt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i)
end
ii = Niter << Wsh
while ii < N - (W - 1) # stops at 16 when
while ii < N - (W - 1) # stops at 16 when
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, ii)...)), ii); ii += W
end
if ii < N
@@ -157,12 +164,33 @@ function vmap_call(f::F, vm!::V, args::Vararg{<:Any,N}) where {V,F,N}
dest = similar(first(args), T)
vm!(f, dest, args...)
end

"""
vmap(f, a::AbstractArray)
vmap(f, a::AbstractArray, b::AbstractArray, ...)

SIMD-vectorized `map`, applying `f` to each element of `a` (or paired elements of `a`, `b`, ...)
and returning a new array.
"""
vmap(f::F, args::Vararg{<:Any,N}) where {F,N} = vmap_call(f, vmap!, args...)

"""
vmapnt(f, a::AbstractArray)
vmapnt(f, a::AbstractArray, b::AbstractArray, ...)

A "non-temporal" variant of [`vmap`](@ref). This can improve performance in cases where
`destination` will not be needed soon.
"""
vmapnt(f::F, args::Vararg{<:Any,N}) where {F,N} = vmap_call(f, vmapnt!, args...)

"""
vmapntt(f, a::AbstractArray)
vmapntt(f, a::AbstractArray, b::AbstractArray, ...)

A threaded variant of [`vmapnt`](@ref).
"""
vmapntt(f::F, args::Vararg{<:Any,N}) where {F,N} = vmap_call(f, vmapntt!, args...)


# @inline vmap!(f, y, x...) = @avx y .= f.(x...)
# @inline vmap(f, x...) = @avx f.(x...)
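Finally, a sketch tying the `vmap` family together, assuming dense `Vector{Float64}` inputs; the array sizes and operations are illustrative:

```julia
using LoopVectorization

a = rand(10_000); b = rand(10_000);

c = vmap(+, a, b)        # like map(+, a, b), but SIMD-vectorized
d = similar(a)
vmap!(*, d, a, b)        # in-place form, like map!(*, d, a, b)

# The non-temporal variants write the output with streaming stores, which can
# help when the result will not be read again soon; vmapntt additionally threads.
e = vmapnt(+, a, b)
f = vmapntt(+, a, b)

c ≈ a .+ b && d ≈ a .* b && e ≈ c && f ≈ c   # all expected to hold
```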