Skip to content

Commit 1fddba8

Browse files
authored
Merge pull request #72 from timholy/teh/devdocs
Small tweaks to the devdocs
2 parents 64f765d + b47c95f commit 1fddba8

File tree

7 files changed

+81
-42
lines changed

7 files changed

+81
-42
lines changed

docs/Manifest.toml

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,32 +13,51 @@ uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
1313

1414
[[DocStringExtensions]]
1515
deps = ["LibGit2", "Markdown", "Pkg", "Test"]
16-
git-tree-sha1 = "1df01539a1c952cef21f2d2d1c092c2bcf0177d7"
16+
git-tree-sha1 = "88bb0edb352b16608036faadcc071adda068582a"
1717
uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
18-
version = "0.6.0"
18+
version = "0.8.1"
1919

2020
[[Documenter]]
21-
deps = ["Base64", "DocStringExtensions", "InteractiveUtils", "LibGit2", "Logging", "Markdown", "Pkg", "REPL", "Random", "Test", "Unicode"]
22-
git-tree-sha1 = "a6db1c69925cdc53aafb38caec4446be26e0c617"
21+
deps = ["Base64", "Dates", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"]
22+
git-tree-sha1 = "d497bcc45bb98a1fbe19445a774cfafeabc6c6df"
2323
uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
24-
version = "0.21.0"
24+
version = "0.24.5"
2525

2626
[[InteractiveUtils]]
2727
deps = ["Markdown"]
2828
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
2929

30+
[[JSON]]
31+
deps = ["Dates", "Mmap", "Parsers", "Unicode"]
32+
git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e"
33+
uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
34+
version = "0.21.0"
35+
3036
[[LibGit2]]
37+
deps = ["Printf"]
3138
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
3239

40+
[[Libdl]]
41+
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
42+
3343
[[Logging]]
3444
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
3545

3646
[[Markdown]]
3747
deps = ["Base64"]
3848
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
3949

50+
[[Mmap]]
51+
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
52+
53+
[[Parsers]]
54+
deps = ["Dates", "Test"]
55+
git-tree-sha1 = "0c16b3179190d3046c073440d94172cfc3bb0553"
56+
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
57+
version = "0.3.12"
58+
4059
[[Pkg]]
41-
deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
60+
deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
4261
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
4362

4463
[[Printf]]

docs/Project.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
[deps]
22
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
3+
4+
[compat]
5+
Documenter = "0.24"

docs/make.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ using Documenter, LoopVectorization
22

33
makedocs(;
44
modules=[LoopVectorization],
5-
format=Documenter.HTML(),
5+
format=Documenter.HTML(prettyurls = get(ENV, "CI", nothing) == "true"),
66
pages=[
77
"Home" => "index.md",
88
"Getting Started" => "getting_started.md",
Lines changed: 26 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,6 @@
11
# Constructing LoopSets
22

3-
When applying the `@avx` macro to a broadcast expression, the LoopSet object is constructed by recursively evaluating [add_broadcast!](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/broadcast.jl#L166) on all the fields. The function and involved operations are their relationships are straightforward to infer from the structure of nested broadcasts.
4-
```julia
5-
julia> Meta.@lower @. f(g(a,b) + c) / d
6-
:($(Expr(:thunk, CodeInfo(
7-
@ none within `top-level scope'
8-
1 ─ %1 = Base.broadcasted(g, a, b)
9-
│ %2 = Base.broadcasted(+, %1, c)
10-
│ %3 = Base.broadcasted(f, %2)
11-
│ %4 = Base.broadcasted(/, %3, d)
12-
│ %5 = Base.materialize(%4)
13-
└── return %5
14-
))))
15-
16-
julia> @macroexpand @avx @. f(g(a,b) + c) / d
17-
quote
18-
var"##262" = Base.broadcasted(g, a, b)
19-
var"##263" = Base.broadcasted(+, var"##262", c)
20-
var"##264" = Base.broadcasted(f, var"##263")
21-
var"##265" = Base.broadcasted(/, var"##264", d)
22-
var"##266" = LoopVectorization.vmaterialize(var"##265", Val{:Main}())
23-
end
24-
```
25-
These nested broadcasted objects already express information very similar to what the `LoopSet` objects hold. The dimensionality of the objects provides the information on the associated loop dependencies.
3+
## Loop expressions
264

275
When applying `@avx` to a loop expression, it creates a `LoopSet` without awareness of type information, and then [condenses the information](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/condense_loopset.jl) into a summary which is passed as type information to a generated function.
286
```julia
@@ -41,7 +19,8 @@ quote
4119
end
4220
end
4321
```
44-
This summary is then [reconstruced](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/reconstruct_loopset.jl) using the available type information. This type information can be used, for example, to realize an array has been tranposed, and thus correctly identify which axis contains contiguous elements that are efficient to load from. This is why
22+
When the corresponding method gets compiled for specific types of `A`, `B`, and `C`, the call to the `@generated` function `_avx_!` gets compiled. This causes the summary to be [reconstructed](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/reconstruct_loopset.jl) using the available type information. This type information can be used, for example, to realize an array has been transposed, and thus correctly identify which axis contains contiguous elements that are efficient to load from. This kind of information cannot be extracted from the raw expression, which is why these decisions are made when the method gets compiled for specific types via the `@generated` function `_avx_!`.
23+
4524
The three chief components of the summaries are the definitions of operations, e.g.:
4625
```julia
4726
:LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x0000000000000013, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, LoopVectorization.memload, 0x02, 0x03)
@@ -55,6 +34,28 @@ and the set of loop bounds:
5534
(LoopVectorization.StaticLowerUnitRange{0}(M), LoopVectorization.StaticLowerUnitRange{0}(N), LoopVectorization.StaticLowerUnitRange{0}(K))
5635
```
5736

37+
## Broadcasting
5838

39+
When applying the `@avx` macro to a broadcast expression, there are no explicit loops, and even the dimensionality of the operation is unknown. Consequently the `LoopSet` object must be constructed at compile time. The function and involved operations and their relationships are straightforward to infer from the structure of nested broadcasts:
40+
```julia
41+
julia> Meta.@lower @. f(g(a,b) + c) / d
42+
:($(Expr(:thunk, CodeInfo(
43+
@ none within `top-level scope'
44+
1 ─ %1 = Base.broadcasted(g, a, b)
45+
│ %2 = Base.broadcasted(+, %1, c)
46+
│ %3 = Base.broadcasted(f, %2)
47+
│ %4 = Base.broadcasted(/, %3, d)
48+
│ %5 = Base.materialize(%4)
49+
└── return %5
50+
))))
5951
60-
52+
julia> @macroexpand @avx @. f(g(a,b) + c) / d
53+
quote
54+
var"##262" = Base.broadcasted(g, a, b)
55+
var"##263" = Base.broadcasted(+, var"##262", c)
56+
var"##264" = Base.broadcasted(f, var"##263")
57+
var"##265" = Base.broadcasted(/, var"##264", d)
58+
var"##266" = LoopVectorization.vmaterialize(var"##265", Val{:Main}())
59+
end
60+
```
61+
These nested broadcasted objects already express information very similar to what the `LoopSet` objects hold. The dimensionality of the objects provides the information on the associated loop dependencies, but again this information is available only when the method is compiled for specific types. The `@generated` function `vmaterialize` constructs the LoopSet by recursively evaluating [add_broadcast!](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/broadcast.jl#L166) on all the fields.

docs/src/devdocs/evaluating_loops.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,18 @@
33
The heart of the optimizations performed by LoopVectorization is given in the [determinestrategy.jl](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/determinestrategy.jl) file, utilizing instruction costs specified in [costs.jl](https://github.com/chriselrod/LoopVectorization.jl/blob/master/src/costs.jl).
44
Essentially, it estimates the cost of different means of evaluating the loops. It iterates through the different possible loop orders, as well as considering which loops to unroll, and which to vectorize. It will consider unrolling 1 or 2 loops (but it could settle on unrolling by a factor of 1, i.e. not unrolling), and vectorizing 1.
55

6+
The cost estimate is based on the costs of individual instructions and the number of times each one needs to be executed for the given strategy. The instruction cost can be broken into several components:
7+
8+
- The `scalar latency` is the minimum delay, in clock cycles, associated with the instruction. Think of it as the delay from turning on the water to when water starts coming out the hose.
9+
- The `reciprocal throughput` is similar to the latency, but it measures the number of cycles per operation when many of the same operation are repeated in sequence. Continuing our hose analogy, think of it as the inverse of the flow rate at steady-state. It is typically ≤ the `scalar latency`.
10+
- The `register pressure` measures the register consumption by the operation.
11+
12+
Data on individual instructions for specific architectures can be found on [Agner Fog's website](https://agner.org/optimize/instruction_tables.pdf). Most of the costs used were those for the Skylake-X architecture.
13+
14+
Examples of how these come into play:
15+
- Vectorizing a loop will result in each instruction evaluating multiple iterations, but the costs of loads and stores will change based on the memory layouts of the accessed arrays.
16+
- Unrolling can help reduce the number of times an operation must be performed, for example if it can allow us to reuse memory multiple times rather than reloading it every time it is needed.
17+
- When there is a reduction, such as performing a sum, there is a dependency chain. Each `+` has to wait for the previous `+` to finish executing before it can begin, thus execution time is bounded by latency rather than the minimum of the throughput of the `+` and load operations. By unrolling the loop, we can create multiple independent dependency chains.
18+
19+
20+

docs/src/devdocs/loopset_structure.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# LoopSet Structure
22

3-
The loopsets define loops as a set of operations that depend on one another, and also on loops. Cycles are not allowed, making it a directed acyclic graph. Currently, only single return values are supported.
4-
Lets use a set of nested loops performing matrix multiplication as an example. We can create a naive `LoopSet` from an expression (naive due to being created without access to any type information):
3+
The loopsets define loops as a set of operations that depend on one another, and also on loops. Cycles are not allowed, making it a directed acyclic graph.
4+
Let's use a set of nested loops performing matrix multiplication as an example. We can create a naive `LoopSet` from an expression (naive due to being created without access to any type information):
55
```julia
66
julia> using LoopVectorization
77

@@ -50,13 +50,14 @@ julia> LoopVectorization.parents(ans)
5050
var"##tempload#258" = A[m, k]
5151
var"##tempload#259" = B[k, n]
5252
var"##reduction#260" = var"##reductzero#261"
53-
```
54-
References to arrays are represtened with an `ArrayReferenceMeta` data structure:
53+
```
54+
References to arrays are represented with an `ArrayReferenceMeta` data structure:
5555
```julia
5656
julia> LoopVectorization.operations(lsAmulB)[3].ref
5757
LoopVectorization.ArrayReferenceMeta(LoopVectorization.ArrayReference(:A, [:m, :k], Int8[0, 0]), Bool[1, 1], Symbol("##vptr##_A"))
5858
```
5959
It contains the name of the parent array (`:A`), the indices `[:m,:k]`, and a boolean vector (`Bool[1, 1]`) indicating whether these indices are loop iterables. Note that the optimizer assumes arrays are column-major, and thus that it is efficient to read contiguous elements from the first index. In lower level terms, it means that [high-throughput vmov](https://www.felixcloutier.com/x86/movupd) instructions can be used rather than [low-throughput](https://www.felixcloutier.com/x86/vgatherdpd:vgatherqpd) [gathers](https://www.felixcloutier.com/x86/vgatherqps:vgatherqpd). Similar story for storing elements.
6060
When no axis has unit stride, the first given index will be the dummy `Symbol("##DISCONTIGUOUSSUBARRAY##")`.
6161

62-
62+
!!! warning
63+
Currently, only single return values are supported (tuple destructuring is not supported in assignments).

src/costs.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,21 +35,21 @@ Base.isequal(ins1::Instruction, ins2::Instruction) = (ins1.instr === ins2.instr)
3535
const LOOPCONSTANT = Instruction(gensym())
3636

3737
struct InstructionCost
38-
scaling::Float64 # sentinel values: -3 == no scaling; -2 == offset_scaling, -1 == linear scaling, >0 -> == latency == reciprical throughput
39-
scalar_reciprical_throughput::Float64
38+
scaling::Float64 # sentinel values: -3 == no scaling; -2 == offset_scaling, -1 == linear scaling, >0 -> == latency == reciprocal throughput
39+
scalar_reciprocal_throughput::Float64
4040
scalar_latency::Int
4141
register_pressure::Int
4242
end
4343
InstructionCost(sl::Int, srt::Float64, scaling::Float64 = -3.0) = InstructionCost(scaling, srt, sl, 0)
4444

45-
nocost(c::InstructionCost) = c.scalar_reciprical_throughput == 0.0
45+
nocost(c::InstructionCost) = c.scalar_reciprocal_throughput == 0.0
4646
flatcost(c::InstructionCost) = c.scaling == -3.0
4747
offsetscaling(c::InstructionCost) = c.scaling == -2.0
4848
linearscaling(c::InstructionCost) = c.scaling == -1.0
4949

5050
function scalar_cost(ic::InstructionCost)#, ::Type{T} = Float64) where {T}
51-
@unpack scalar_reciprical_throughput, scalar_latency, register_pressure = ic
52-
scalar_reciprical_throughput, scalar_latency, register_pressure
51+
@unpack scalar_reciprocal_throughput, scalar_latency, register_pressure = ic
52+
scalar_reciprocal_throughput, scalar_latency, register_pressure
5353
end
5454
function vector_cost(ic::InstructionCost, Wshift, sizeof_T)
5555
srt, sl, srp = scalar_cost(ic)

0 commit comments

Comments
 (0)