Add a few more docstrings #70

Merged 1 commit on Mar 8, 2020
7 changes: 4 additions & 3 deletions docs/make.jl
@@ -14,12 +14,13 @@ makedocs(;
],
"Vectorized Convenience Functions" => "vectorized_convenience_functions.md",
"Future Work" => "future_work.md",
"API reference" => "api.md",
"Developer Documentation" => [
"devdocs/overview.md",
"devdocs/loopset_structure.md",
"devdocs/constructing_loopsets.md",
"devdocs/evaluating_loops.md",
"devdocs/lowering.md"
"devdocs/constructing_loopsets.md",
"devdocs/evaluating_loops.md",
"devdocs/lowering.md"
]
],
# repo="https://github.com/chriselrod/LoopVectorization.jl/blob/{commit}{path}#L{line}",
26 changes: 26 additions & 0 deletions docs/src/api.md
@@ -0,0 +1,26 @@
# API reference

## Macros

```@docs
@avx
@_avx
```

## `map`-like constructs

```@docs
vmap
vmap!
vmapnt
vmapnt!
vmapntt
vmapntt!
```

## `filter`-like constructs

```@docs
vfilter
vfilter!
```
12 changes: 12 additions & 0 deletions src/LoopVectorization.jl
@@ -47,6 +47,18 @@ include("condense_loopset.jl")
include("reconstruct_loopset.jl")
include("constructors.jl")

"""
LoopVectorization provides macros and functions that combine SIMD vectorization and
loop-reordering so as to improve performance:

- [`@avx`](@ref): transform `for`-loops and broadcasting
- [`@_avx`](@ref): similar to `@avx` but does not use type information
- [`vmap`](@ref) and `vmap!`: vectorized versions of `map` and `map!`
- [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!`
- [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!`
- [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!`
"""
LoopVectorization

include("precompile.jl")
_precompile_()
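For orientation, a minimal sketch of the kind of loop these entry points target; `mydot` and the arrays `a` and `b` are illustrative names, not part of the package:

```julia
using LoopVectorization

# A simple reduction; @avx may reorder and SIMD-vectorize this loop.
function mydot(a, b)
    s = zero(eltype(a))
    @avx for i in eachindex(a, b)
        s += a[i] * b[i]
    end
    s
end

a = rand(256); b = rand(256);
mydot(a, b) ≈ a' * b  # expected to hold up to floating-point rounding
```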
19 changes: 17 additions & 2 deletions src/broadcast.jl
@@ -40,6 +40,21 @@ end
# recursive_eltype(ARGS)
# end

"""
A *ˡ B

A lazy product of `A` and `B`. While functionally identical to `A * B`, this may avoid the
need for intermediate storage for any computations in `A` or `B`. Example:

@avx @. a + B *ˡ (c + d')

which is equivalent to

a .+ B * (c .+ d')

It should only be used inside an `@avx` block, and it cannot be the final operation if the
result is to be materialized.
"""
@inline *ˡ(a::A, b::B) where {A,B} = Product{A,B}(a, b)
@inline Base.Broadcast.broadcasted(::typeof(*ˡ), a::A, b::B) where {A, B} = Product{A,B}(a, b)
# TODO: Need to make this handle A or B being (1 or 2)-D broadcast objects.
@@ -88,7 +103,7 @@ function add_broadcast!(
reductop = Operation(
ls, mC, elementbytes, :vfmadd231, compute, reductdeps, kvec, Operation[loadA, loadB, setC]
)
reductop = pushop!(ls, reductop, mC)
reductop = pushop!(ls, reductop, mC)
reductfinal = Operation(
ls, mCt, elementbytes, :reduce_to_add, compute, cloopsyms, kvec, Operation[reductop, targetC]
)
@@ -123,7 +138,7 @@
parent = gensym(:parent)
pushpreamble!(ls, Expr(:(=), parent, Expr(:call, :parent, bcname)))
ref = ArrayReference(parent, Symbol[loopsyms[N + 1 - n] for n ∈ 1:N])
add_simple_load!( ls, destname, ref, elementbytes, true, true )::Operation
add_simple_load!( ls, destname, ref, elementbytes, true, true )::Operation
end
function add_broadcast_adjoint_array!(
ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol}, ::Type{<:AbstractVector}, elementbytes::Int
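A short sketch of how the lazy product documented above might be used, assuming `*ˡ` is in scope and the array shapes are compatible:

```julia
using LoopVectorization

B = rand(4, 4); a = rand(4); c = rand(4); d = rand(4);

# B *ˡ (c + d') stays lazy, so the product is fused into the surrounding
# broadcast instead of being materialized separately.
e = @avx @. a + B *ˡ (c + d')

e ≈ a .+ B * (c .+ d')  # expected to hold up to rounding
```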
11 changes: 6 additions & 5 deletions src/constructors.jl
@@ -66,7 +66,8 @@ Annotate a `for` loop, or a set of nested `for` loops whose bounds are constant
end
end

The macro models the set of nested loops, and chooses a
The macro models the set of nested loops, and chooses an ordering of the three loops to
minimize predicted computation time.

It may also apply to broadcasts:

@@ -81,7 +82,7 @@ julia> c = similar(b);

julia> @avx @. c = exp(2a);

julia> b ≈ c
julia> b ≈ c
true
```

@@ -148,8 +149,9 @@ end
"""
@_avx

This macro transforms loops, making default assumptions rather than punting to a generated
function that is often capable of using type information in place of some assumptions.
This macro transforms loops similarly to [`@avx`](@ref).
While `@avx` punts to a generated function to enable type-based analysis, `@_avx`
works directly on the expression, which requires it to make a number of default assumptions.
"""
macro _avx(q)
esc(lower(LoopSet(q, __module__)))
@@ -164,4 +166,3 @@
macro avx_debug(q)
esc(LoopVectorization.setup_call_debug(LoopSet(q, __module__)))
end

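To contrast the two macros documented in this file, a hypothetical pair of sum functions (the names, element type, and array length are illustrative):

```julia
using LoopVectorization

# @avx defers to a generated function with type information;
# @_avx analyzes only the expression and falls back on default assumptions.
function sum_avx(x)
    s = zero(eltype(x))
    @avx for i in 1:length(x)
        s += x[i]
    end
    s
end

function sum_underscore_avx(x)
    s = zero(eltype(x))
    @_avx for i in 1:length(x)
        s += x[i]
    end
    s
end

x = rand(Float32, 512)
sum_avx(x) ≈ sum_underscore_avx(x) ≈ sum(x)  # both should agree with Base.sum
```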
5 changes: 2 additions & 3 deletions src/costs.jl
@@ -68,7 +68,7 @@ function vector_cost(ic::InstructionCost, Wshift, sizeof_T)
else # we assume custom cost, and that latency == recip_throughput
scaling = ic.scaling
sl, srt = round(Int,scaling), scaling
end
end
srt, sl, srp
end
# instruction_cost(instruction::Symbol) = get(COST, instruction, OPAQUE_INSTRUCTION)
@@ -278,7 +278,7 @@ function reduction_combine_to(x::Float64)
x == 1.0 ? :reduce_to_add : x == 2.0 ? :reduce_to_prod : x == 5.0 ? :reduce_to_max : x == 6.0 ? :reduce_to_min : throw("Reduction not found.")
end
reduction_combine_to(x) = reduction_combine_to(reduction_instruction_class(x))
function reduction_zero(x::Float64)
function reduction_zero(x::Float64)
# x == 1.0 ? :zero : x == 2.0 ? :one : x == 3.0 ? :false : x == 4.0 ? :true : x == 5.0 ? :typemin : x == 6.0 ? :typemax : throw("Reduction not found.")
x == 1.0 ? :zero : x == 2.0 ? :one : x == 5.0 ? :typemin : x == 6.0 ? :typemax : throw("Reduction not found.")
end
@@ -373,4 +373,3 @@ const FUNCTIONSYMBOLS = Dict{Type{<:Function},Instruction}(
typeof(ifelse) => :vifelse,
typeof(vifelse) => :vifelse
)

13 changes: 13 additions & 0 deletions src/filter.jl
@@ -30,3 +30,16 @@ if (Base.libllvm_version ≥ v"7" && VectorizationBase.AVX512F) || Base.libllvm_
end
vfilter(f, y) = filter(f, y)

"""
vfilter(f, a::AbstractArray)

SIMD-vectorized `filter`, returning an array containing the elements of `a` for which `f` returns `true`.
"""
vfilter

"""
vfilter!(f, a::AbstractArray)

SIMD-vectorized `filter!`, removing the elements of `a` for which `f` returns `false`.
"""
vfilter!
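A brief usage sketch for the two filters documented above, assuming a plain `Vector{Float64}` input and a simple predicate:

```julia
using LoopVectorization

x = randn(200)

keep = v -> v > 0.0      # a simple, SIMD-friendly predicate
y = vfilter(keep, x)     # new array holding the elements where the predicate holds
vfilter!(keep, x)        # in-place counterpart, analogous to Base.filter!

y == x                   # both should now hold exactly the positive elements, in order
```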
42 changes: 35 additions & 7 deletions src/map.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@


# Expression-generator for vmap!
function vmap_quote(N, ::Type{T}) where {T}
W, Wshift = VectorizationBase.pick_vector_width_shift(T)
val = Expr(:call, Expr(:curly, :Val, W))
@@ -21,7 +20,15 @@ function vmap_quote(N, ::Type{T}) where {T}
push!(q.args, :dest)
q
end
"""
vmap!(f, destination, a::AbstractArray)
vmap!(f, destination, a::AbstractArray, b::AbstractArray, ...)

SIMD-vectorized `map!`, applying `f` to each element of `a` (or to paired elements of `a`, `b`, ...)
and storing the result in `destination`.
"""
@generated function vmap!(f::F, dest::AbstractArray{T}, args::Vararg{<:AbstractArray,N}) where {F,T,N}
# do not change argnames here without compensatory changes in vmap_quote
vmap_quote(N, T)
end

@@ -112,7 +119,7 @@ function vmapnt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,A
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
end
while i < N - (W - 1) # stops at 16 when
while i < N - (W - 1) # stops at 16 when
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
end
if i < N
@@ -125,7 +132,7 @@ end
"""
vmapntt!(::Function, dest, args...)

Like `vmapnt!` (see `vmapnt!`), but but uses `Threads.@threads` for parallel execution.
Like `vmapnt!` (see `vmapnt!`), but uses `Threads.@threads` for parallel execution.
"""
function vmapntt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,A}
ptry, ptrargs, N = alignstores!(f, y, args...)
@@ -142,7 +149,7 @@ function vmapntt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i)
end
ii = Niter << Wsh
while ii < N - (W - 1) # stops at 16 when
while ii < N - (W - 1) # stops at 16 when
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, ii)...)), ii); ii += W
end
if ii < N
@@ -157,12 +164,33 @@ function vmap_call(f::F, vm!::V, args::Vararg{<:Any,N}) where {V,F,N}
dest = similar(first(args), T)
vm!(f, dest, args...)
end

"""
vmap(f, a::AbstractArray)
vmap(f, a::AbstractArray, b::AbstractArray, ...)

SIMD-vectorized `map`, applying `f` to each element of `a` (or paired elements of `a`, `b`, ...)
and returning a new array.
"""
vmap(f::F, args::Vararg{<:Any,N}) where {F,N} = vmap_call(f, vmap!, args...)

"""
vmapnt(f, a::AbstractArray)
vmapnt(f, a::AbstractArray, b::AbstractArray, ...)

A "non-temporal" variant of [`vmap`](@ref). This can improve performance in cases where
`destination` will not be needed soon.
"""
vmapnt(f::F, args::Vararg{<:Any,N}) where {F,N} = vmap_call(f, vmapnt!, args...)

"""
vmapntt(f, a::AbstractArray)
vmapntt(f, a::AbstractArray, b::AbstractArray, ...)

A threaded variant of [`vmapnt`](@ref).
"""
vmapntt(f::F, args::Vararg{<:Any,N}) where {F,N} = vmap_call(f, vmapntt!, args...)


# @inline vmap!(f, y, x...) = @avx y .= f.(x...)
# @inline vmap(f, x...) = @avx f.(x...)
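Finally, a sketch tying the `vmap` family together, assuming dense `Vector{Float64}` inputs; the array sizes and operations are illustrative:

```julia
using LoopVectorization

a = rand(10_000); b = rand(10_000);

c = vmap(+, a, b)        # like map(+, a, b), but SIMD-vectorized
d = similar(a)
vmap!(*, d, a, b)        # in-place form, like map!(*, d, a, b)

# The non-temporal variants write the output with streaming stores, which can
# help when the result will not be read again soon; vmapntt additionally threads.
e = vmapnt(+, a, b)
f = vmapntt(+, a, b)

c ≈ a .+ b && d ≈ a .* b && e ≈ c && f ≈ c   # all expected to hold
```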