Skip to content

Commit 64f765d

Browse files
authored
Merge pull request #70 from timholy/teh/docstrings
Add a few more docstrings
2 parents 290c7a3 + eebe3b1 commit 64f765d

File tree

8 files changed

+115
-20
lines changed

8 files changed

+115
-20
lines changed

docs/make.jl

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,13 @@ makedocs(;
1414
],
1515
"Vectorized Convenience Functions" => "vectorized_convenience_functions.md",
1616
"Future Work" => "future_work.md",
17+
"API reference" => "api.md",
1718
"Developer Documentation" => [
1819
"devdocs/overview.md",
1920
"devdocs/loopset_structure.md",
20-
"devdocs/constructing_loopsets.md",
21-
"devdocs/evaluating_loops.md",
22-
"devdocs/lowering.md"
21+
"devdocs/constructing_loopsets.md",
22+
"devdocs/evaluating_loops.md",
23+
"devdocs/lowering.md"
2324
]
2425
],
2526
# repo="https://github.com/chriselrod/LoopVectorization.jl/blob/{commit}{path}#L{line}",

docs/src/api.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# API reference
2+
3+
## Macros
4+
5+
```@docs
6+
@avx
7+
@_avx
8+
```
9+
10+
## `map`-like constructs
11+
12+
```@docs
13+
vmap
14+
vmap!
15+
vmapnt
16+
vmapnt!
17+
vmapntt
18+
vmapntt!
19+
```
20+
21+
## `filter`-like constructs
22+
23+
```@docs
24+
vfilter
25+
vfilter!
26+
```

src/LoopVectorization.jl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,18 @@ include("condense_loopset.jl")
4848
include("reconstruct_loopset.jl")
4949
include("constructors.jl")
5050

51+
"""
52+
LoopVectorization provides macros and functions that combine SIMD vectorization and
53+
loop-reordering so as to improve performance:
54+
55+
- [`@avx`](@ref): transform `for`-loops and broadcasting
56+
- [`@_avx`](@ref): similar to `@avx` but does not use type information
57+
- [`vmap`](@ref) and `vmap!`: vectorized versions of `map` and `map!`
58+
- [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!`
59+
- [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!`
60+
- [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!`
61+
"""
62+
LoopVectorization
5163

5264
include("precompile.jl")
5365
_precompile_()

src/broadcast.jl

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,21 @@ end
4040
# recursive_eltype(ARGS)
4141
# end
4242

43+
"""
44+
A *ˡ B
45+
46+
A lazy product of `A` and `B`. While functionally identical to `A * B`, this may avoid the
47+
need for intermediate storage for any computations in `A` or `B`. Example:
48+
49+
@avx @. a + B *ˡ (c + d')
50+
51+
which is equivalent to
52+
53+
a .+ B * (c .+ d')
54+
55+
It should only be used inside an `@avx` block, and to materialize the result it cannot be
56+
the final operation.
57+
"""
4358
@inline *ˡ(a::A, b::B) where {A,B} = Product{A,B}(a, b)
4459
@inline Base.Broadcast.broadcasted(::typeof(*ˡ), a::A, b::B) where {A, B} = Product{A,B}(a, b)
4560
# TODO: Need to make this handle A or B being (1 or 2)-D broadcast objects.
@@ -88,7 +103,7 @@ function add_broadcast!(
88103
reductop = Operation(
89104
ls, mC, elementbytes, :vfmadd231, compute, reductdeps, kvec, Operation[loadA, loadB, setC]
90105
)
91-
reductop = pushop!(ls, reductop, mC)
106+
reductop = pushop!(ls, reductop, mC)
92107
reductfinal = Operation(
93108
ls, mCt, elementbytes, :reduce_to_add, compute, cloopsyms, kvec, Operation[reductop, targetC]
94109
)
@@ -123,7 +138,7 @@ function add_broadcast_adjoint_array!(
123138
parent = gensym(:parent)
124139
pushpreamble!(ls, Expr(:(=), parent, Expr(:call, :parent, bcname)))
125140
ref = ArrayReference(parent, Symbol[loopsyms[N + 1 - n] for n 1:N])
126-
add_simple_load!( ls, destname, ref, elementbytes, true, true )::Operation
141+
add_simple_load!( ls, destname, ref, elementbytes, true, true )::Operation
127142
end
128143
function add_broadcast_adjoint_array!(
129144
ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol}, ::Type{<:AbstractVector}, elementbytes::Int

src/constructors.jl

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@ Annotate a `for` loop, or a set of nested `for` loops whose bounds are constant
6666
end
6767
end
6868
69-
The macro models the set of nested loops, and chooses a
69+
The macro models the set of nested loops, and chooses an ordering of the three loops to
70+
minimize predicted computation time.
7071
7172
It may also apply to broadcasts:
7273
@@ -81,7 +82,7 @@ julia> c = similar(b);
8182
8283
julia> @avx @. c = exp(2a);
8384
84-
julia> b ≈ c
85+
julia> b ≈ c
8586
true
8687
```
8788
@@ -148,8 +149,9 @@ end
148149
"""
149150
@_avx
150151
151-
This macro transforms loops, making default assumptions rather than punting to a generated
152-
function that is often capable of using type information in place of some assumptions.
152+
This macro transforms loops similarly to [`@avx`](@ref).
153+
While `@avx` punts to a generated function to enable type-based analysis, `@_avx`
154+
works on just the expressions. This requires that it makes a number of default assumptions.
153155
"""
154156
macro _avx(q)
155157
esc(lower(LoopSet(q, __module__)))
@@ -164,4 +166,3 @@ end
164166
macro avx_debug(q)
165167
esc(LoopVectorization.setup_call_debug(LoopSet(q, __module__)))
166168
end
167-

src/costs.jl

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ function vector_cost(ic::InstructionCost, Wshift, sizeof_T)
6868
else # we assume custom cost, and that latency == recip_throughput
6969
scaling = ic.scaling
7070
sl, srt = round(Int,scaling), scaling
71-
end
71+
end
7272
srt, sl, srp
7373
end
7474
# instruction_cost(instruction::Symbol) = get(COST, instruction, OPAQUE_INSTRUCTION)
@@ -278,7 +278,7 @@ function reduction_combine_to(x::Float64)
278278
x == 1.0 ? :reduce_to_add : x == 2.0 ? :reduce_to_prod : x == 5.0 ? :reduce_to_max : x == 6.0 ? :reduce_to_min : throw("Reduction not found.")
279279
end
280280
reduction_combine_to(x) = reduction_combine_to(reduction_instruction_class(x))
281-
function reduction_zero(x::Float64)
281+
function reduction_zero(x::Float64)
282282
# x == 1.0 ? :zero : x == 2.0 ? :one : x == 3.0 ? :false : x == 4.0 ? :true : x == 5.0 ? :typemin : x == 6.0 ? :typemax : throw("Reduction not found.")
283283
x == 1.0 ? :zero : x == 2.0 ? :one : x == 5.0 ? :typemin : x == 6.0 ? :typemax : throw("Reduction not found.")
284284
end
@@ -373,4 +373,3 @@ const FUNCTIONSYMBOLS = IdDict{Type{<:Function},Instruction}(
373373
typeof(ifelse) => :vifelse,
374374
typeof(vifelse) => :vifelse
375375
)
376-

src/filter.jl

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,16 @@ if (Base.libllvm_version ≥ v"7" && VectorizationBase.AVX512F) || Base.libllvm_
3030
end
3131
vfilter(f, y) = filter(f, y)
3232

33+
"""
34+
vfilter(f, a::AbstractArray)
35+
36+
SIMD-vectorized `filter`, returning an array containing the elements of `a` for which `f` returns `true`.
37+
"""
38+
vfilter
39+
40+
"""
41+
vfilter!(f, a::AbstractArray)
42+
43+
SIMD-vectorized `filter!`, removing the elements of `a` for which `f` returns `false`.
44+
"""
45+
vfilter!

src/map.jl

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
2-
1+
# Expression-generator for vmap!
32
function vmap_quote(N, ::Type{T}) where {T}
43
W, Wshift = VectorizationBase.pick_vector_width_shift(T)
54
val = Expr(:call, Expr(:curly, :Val, W))
@@ -21,7 +20,15 @@ function vmap_quote(N, ::Type{T}) where {T}
2120
push!(q.args, :dest)
2221
q
2322
end
23+
"""
24+
vmap!(f, destination, a::AbstractArray)
25+
vmap!(f, destination, a::AbstractArray, b::AbstractArray, ...)
26+
27+
Vectorized-`map!`, applying `f` to each element of `a` (or paired elements of `a`, `b`, ...)
28+
and storing the result in `destination`.
29+
"""
2430
@generated function vmap!(f::F, dest::AbstractArray{T}, args::Vararg{<:AbstractArray,N}) where {F,T,N}
31+
# do not change argnames here without compensatory changes in vmap_quote
2532
vmap_quote(N, T)
2633
end
2734

@@ -112,7 +119,7 @@ function vmapnt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,A
112119
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
113120
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
114121
end
115-
while i < N - (W - 1) # stops at 16 when
122+
while i < N - (W - 1) # stops at 16 when
116123
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
117124
end
118125
if i < N
@@ -125,7 +132,7 @@ end
125132
"""
126133
vmapntt!(::Function, dest, args...)
127134
128-
Like `vmapnt!` (see `vmapnt!`), but but uses `Threads.@threads` for parallel execution.
135+
Like `vmapnt!` (see `vmapnt!`), but uses `Threads.@threads` for parallel execution.
129136
"""
130137
function vmapntt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,A}
131138
ptry, ptrargs, N = alignstores!(f, y, args...)
@@ -142,7 +149,7 @@ function vmapntt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,
142149
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i)
143150
end
144151
ii = Niter << Wsh
145-
while ii < N - (W - 1) # stops at 16 when
152+
while ii < N - (W - 1) # stops at 16 when
146153
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, ii)...)), ii); ii += W
147154
end
148155
if ii < N
@@ -157,12 +164,33 @@ function vmap_call(f::F, vm!::V, args::Vararg{<:Any,N}) where {V,F,N}
157164
dest = similar(first(args), T)
158165
vm!(f, dest, args...)
159166
end
167+
168+
"""
169+
vmap(f, a::AbstractArray)
170+
vmap(f, a::AbstractArray, b::AbstractArray, ...)
171+
172+
SIMD-vectorized `map`, applying `f` to each element of `a` (or paired elements of `a`, `b`, ...)
173+
and returning a new array.
174+
"""
160175
vmap(f::F, args::Vararg{<:Any,N}) where {F,N} = vmap_call(f, vmap!, args...)
176+
177+
"""
178+
vmapnt(f, a::AbstractArray)
179+
vmapnt(f, a::AbstractArray, b::AbstractArray, ...)
180+
181+
A "non-temporal" variant of [`vmap`](@ref). This can improve performance in cases where
182+
the returned array will not be needed soon.
183+
"""
161184
vmapnt(f::F, args::Vararg{<:Any,N}) where {F,N} = vmap_call(f, vmapnt!, args...)
185+
186+
"""
187+
vmapntt(f, a::AbstractArray)
188+
vmapntt(f, a::AbstractArray, b::AbstractArray, ...)
189+
190+
A threaded variant of [`vmapnt`](@ref).
191+
"""
162192
vmapntt(f::F, args::Vararg{<:Any,N}) where {F,N} = vmap_call(f, vmapntt!, args...)
163193

164194

165195
# @inline vmap!(f, y, x...) = @avx y .= f.(x...)
166196
# @inline vmap(f, x...) = @avx f.(x...)
167-
168-

0 commit comments

Comments
 (0)