Skip to content

Commit 37f43c4

Browse files
committed
Update for latest VectorizationBase and SIMDPirates using 1-based indexing, so that LoopVectorization now uses the same indexing as the original arrays, hopefully simplifying code generation.
1 parent 1fddba8 commit 37f43c4

22 files changed

+185
-144
lines changed

Manifest.toml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@ deps = ["Base64"]
3232
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
3333

3434
[[OffsetArrays]]
35-
git-tree-sha1 = "707e34562700b81e8aa13548eb6b23b18112e49b"
35+
git-tree-sha1 = "6a35d9446b40ae5004cd7bd0f1ae3505528c7fd6"
3636
uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
37-
version = "1.0.2"
37+
version = "1.0.3"
3838

3939
[[OrderedCollections]]
4040
deps = ["Random", "Serialization", "Test"]
@@ -54,15 +54,15 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
5454

5555
[[SIMDPirates]]
5656
deps = ["VectorizationBase"]
57-
git-tree-sha1 = "f91198b7ef74b04028f98e0eed7c556b93538a2e"
57+
git-tree-sha1 = "8705f93f66f789605baa670e8df0244ad4c80d40"
5858
uuid = "21efa798-c60a-11e8-04d3-e1a92915a26a"
59-
version = "0.6.6"
59+
version = "0.7.0"
6060

6161
[[SLEEFPirates]]
6262
deps = ["Libdl", "SIMDPirates", "VectorizationBase"]
63-
git-tree-sha1 = "66014d5d3f77753a70b706aeaede1ad70a942915"
63+
git-tree-sha1 = "d53dea0b025c01e5a40c827236505507d4572a05"
6464
uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa"
65-
version = "0.4.1"
65+
version = "0.4.2"
6666

6767
[[Serialization]]
6868
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
@@ -76,6 +76,6 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
7676

7777
[[VectorizationBase]]
7878
deps = ["CpuId", "LinearAlgebra"]
79-
git-tree-sha1 = "8abb5697fb64cadccd1bba444c955942d3181e5c"
79+
git-tree-sha1 = "34004c00c561c6d4de6d30970bbd9c0a5c2be0a3"
8080
uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
81-
version = "0.7.1"
81+
version = "0.8.1"

Project.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1414
[compat]
1515
OffsetArrays = "1"
1616
Parameters = "0"
17-
SIMDPirates = "~0.6.6"
18-
SLEEFPirates = "~0.4"
19-
VectorizationBase = "~0.7.1"
17+
SIMDPirates = "0.7"
18+
SLEEFPirates = "0.4"
19+
VectorizationBase = "0.8"
2020
julia = "1.1"
2121

2222
[extras]

src/LoopVectorization.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector
55
mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valadd, valsub, _MM,
66
maybestaticlength, maybestaticsize, staticm1, subsetview, vzero, stridedpointer_for_broadcast,
77
Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange, unwrap, maybestaticrange,
8-
PackedStridedPointer, SparseStridedPointer, RowMajorStridedPointer, StaticStridedPointer, StaticStridedStruct
8+
PackedStridedPointer, SparseStridedPointer, RowMajorStridedPointer, StaticStridedPointer, StaticStridedStruct,
9+
maybestaticfirst, maybestaticlast
910
using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod,
1011
sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!,
1112
vfmadd231, vfmsub231, vfnmadd231, vfnmsub231, #prefetch,

src/add_loads.jl

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -56,17 +56,13 @@ end
5656

5757
function add_loopvalue!(ls::LoopSet, arg::Symbol, elementbytes::Int)
5858
# check for CSE opportunity
59-
loopsym = Symbol("##LOOPSYMBOL##", arg)
60-
ar = ArrayReference(loopsym, [arg])
61-
for op ∈ operations(ls)
62-
if isload(op) && op.ref.ref == ar
63-
return op
64-
end
59+
instr = Instruction(arg, arg)
60+
for op ∈ operations(ls)#check to CSE
61+
(op.variable === arg && instr == instruction(op)) && return op
6562
end
66-
pushpreamble!(ls, Expr(:(=), loopsym, LoopValue()))
67-
loopsymop = add_simple_load!(ls, gensym(loopsym), ar, elementbytes, false)
68-
push!(ls.syms_aliasing_refs, name(loopsymop))
69-
push!(ls.refs_aliasing_syms, loopsymop.ref)
70-
loopsymop
63+
op = Operation(
64+
length(operations(ls)), arg, elementbytes, instr, loopvalue, [arg]
65+
)
66+
pushop!(ls, op, arg)
7167
end
7268

src/broadcast.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ function add_broadcast!(
6969
pushpreamble!(ls, Expr(:(=), mB, Expr(:(.), bcname, QuoteNode(:b))))
7070
pushpreamble!(ls, Expr(:(=), K, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), Expr(:ref, Expr(:call, :size, mB), 1))))
7171
k = gensym(:k)
72-
add_loop!(ls, Loop(k, 0, K), k)
72+
add_loop!(ls, Loop(k, 1, K), k)
7373
m = loopsyms[1];
7474
if numdims(B) == 1
7575
bloopsyms = Symbol[k]
@@ -226,7 +226,7 @@ end
226226
sizes = Expr(:tuple)
227227
for (n,itersym) ∈ enumerate(loopsyms)
228228
Nsym = gensym(:N)
229-
add_loop!(ls, Loop(itersym, 0, Nsym), itersym)
229+
add_loop!(ls, Loop(itersym, 1, Nsym), itersym)
230230
push!(sizes.args, Nsym)
231231
end
232232
pushpreamble!(ls, Expr(:(=), sizes, Expr(:call, :size, :dest)))
@@ -251,7 +251,7 @@ end
251251
sizes = Expr(:tuple)
252252
for (n,itersym) ∈ enumerate(loopsyms)
253253
Nsym = gensym(:N)
254-
add_loop!(ls, Loop(itersym, 0, Nsym), itersym)
254+
add_loop!(ls, Loop(itersym, 1, Nsym), itersym)
255255
push!(sizes.args, Nsym)
256256
end
257257
pushpreamble!(ls, Expr(:(=), sizes, Expr(:call, :size, :dest′)))

src/condense_loopset.jl

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ function ArrayRefStruct(ls::LoopSet, mref::ArrayReferenceMeta, arraysymbolinds::
4646
ArrayRefStruct( index_types, indices, offsets )
4747
end
4848

49-
struct OperationStruct
49+
struct OperationStruct <: AbstractLoopOperation
5050
# instruction::Instruction
5151
loopdeps::UInt64
5252
reduceddeps::UInt64
@@ -56,10 +56,8 @@ struct OperationStruct
5656
array::UInt8
5757
symid::UInt8
5858
end
59-
isload(os::OperationStruct) = os.node_type == memload
60-
isstore(os::OperationStruct) = os.node_type == memstore
61-
iscompute(os::OperationStruct) = os.node_type == compute
62-
isconstant(os::OperationStruct) = os.node_type == constant
59+
optype(os) = os.node_type
60+
6361
function findmatchingarray(ls::LoopSet, mref::ArrayReferenceMeta)
6462
id = 0x01
6563
for r ∈ ls.refs_aliasing_syms
@@ -213,12 +211,8 @@ end
213211
assigned_names[i] = LHS
214212
d = (D[i])::Union{Nothing,Int}
215213
if d === nothing # stridedpointer
216-
if ari == -1
217-
RHS = Expr(:call, :LoopValue)
218-
else
219-
num_arrays += 1
220-
RHS = Expr(:call, lv(:stridedpointer), Expr(:ref, :vargs, ari), Expr(:ref, :arraydescript, ari))
221-
end
214+
num_arrays += 1
215+
RHS = Expr(:call, lv(:stridedpointer), Expr(:ref, :vargs, ari), Expr(:ref, :arraydescript, ari))
222216
else #subsetview
223217
j += 1
224218
RHS = Expr(:call, :subsetview, assigned_names[ari], Expr(:call, Expr(:curly, :Val, d)), Expr(:ref, :subsetvals, j))
@@ -317,8 +311,6 @@ function setup_call_noinline(ls::LoopSet, U = zero(Int8), T = zero(Int8))
317311
push!(stridedpointerLHS, vp)
318312
push!(vptrindices.args, findfirst(a -> vptr(a) == vp, ls.refs_aliasing_syms))
319313
end
320-
elseif ex.args[2] == LoopValue()
321-
push!(loopvalueLHS, first(ex.args))
322314
end
323315
end
324316
push!(q.args, ex)

src/determinestrategy.jl

Lines changed: 54 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,46 @@
11

2-
# TODO: FIXME for general case
3-
# wrong for transposed matrices, and certain views/SubArrays.
4-
unitstride(op::Operation, s) = first(getindices(op)) === s
2+
function indexappearences(op::Operation, s::Symbol)
3+
s ∉ loopdependencies(op) && return 0
4+
appearences = 0
5+
if isloopvalue(op)
6+
return s === first(loopdependencies(op)) ? 1 : 0
7+
elseif isload(op)
8+
return 100
9+
end
10+
newapp = 0
11+
for opp ∈ parents(op)
12+
newapp += indexappearences(opp, s)
13+
end
14+
factor = instruction(op).instr ∈ (:+, :vadd, :add_fast, :evadd) ? 1 : 10
15+
newapp * factor
16+
end
17+
function findparent(ls::LoopSet, s::Symbol)#opdict isn't filled when reconstructing
18+
id = findfirst(op -> name(op) === s, operations(ls))
19+
id === nothing && throw("$s not found")
20+
operations(ls)[id]
21+
end
22+
function unitstride(ls::LoopSet, op::Operation, s::Symbol)
23+
inds = getindices(op)
24+
li = op.ref.loopedindex
25+
# The first index is allowed to be indexed by `s`
26+
fi = first(inds)
27+
if fi === Symbol("##DISCONTIGUOUSSUBARRAY##")
28+
return false
29+
elseif !first(li)
30+
# We must check if this index, which is computed by another operation, depends on `s` in a way that breaks unit stride
31+
parent = findparent(ls, fi)
32+
indexappearences(parent, s) > 1 && return false
33+
end
34+
for i ∈ 2:length(inds)
35+
if li[i]
36+
s === inds[i] && return false
37+
else
38+
parent = findparent(ls, inds[i])
39+
s ∈ loopdependencies(parent) && return false
40+
end
41+
end
42+
true
43+
end
544

645
function register_pressure(op::Operation)
746
if isconstant(op)
@@ -10,7 +49,7 @@ function register_pressure(op::Operation)
1049
instruction_cost(instruction(op)).register_pressure
1150
end
1251
end
13-
function cost(op::Operation, unrolled::Symbol, Wshift::Int, size_T::Int = op.elementbytes)
52+
function cost(ls::LoopSet, op::Operation, unrolled::Symbol, Wshift::Int, size_T::Int = op.elementbytes)
1453
isconstant(op) && return 0.0, 0, 1
1554
# Wshift == dependson(op, unrolled) ? Wshift : 0
1655
# c = first(cost(instruction(op), Wshift, size_T))::Int
@@ -27,7 +66,7 @@ function cost(op::Operation, unrolled::Symbol, Wshift::Int, size_T::Int = op.ele
2766
# either vbroadcast/reductionstore, vmov(a/u)pd, or gather/scatter
2867
# @show instr, unrolled, loopdependencies(op), unitstride(op, unrolled)
2968
if opisunrolled
30-
if !unitstride(op, unrolled)# || !isdense(op) # need gather/scatter
69+
if !unitstride(ls, op, unrolled)# || !isdense(op) # need gather/scatter
3170
r = (1 << Wshift)
3271
srt *= r
3372
sl *= r
@@ -93,7 +132,7 @@ function evaluate_cost_unroll(
93132
hasintersection(rd, nested_loop_syms[1:end-length(rd)]) && return Inf
94133
included_vars[id] = true
95134
# @show op first(cost(op, vectorized, Wshift, size_T)), iter
96-
total_cost += iter * first(cost(op, vectorized, Wshift, size_T))
135+
total_cost += iter * first(cost(ls, op, vectorized, Wshift, size_T))
97136
total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest
98137
end
99138
end
@@ -102,18 +141,18 @@ end
102141

103142
# only covers vectorized ops; everything else considered lifted?
104143
function depchain_cost!(
105-
skip::Vector{Bool}, op::Operation, vectorized::Symbol, Wshift::Int, size_T::Int, rt::Float64 = 0.0, sl::Int = 0
144+
ls::LoopSet, skip::Vector{Bool}, op::Operation, vectorized::Symbol, Wshift::Int, size_T::Int, rt::Float64 = 0.0, sl::Int = 0
106145
)
107146
skip[identifier(op)] = true
108147
# depth first search
109148
for opp ∈ parents(op)
110149
skip[identifier(opp)] && continue
111-
rt, sl = depchain_cost!(skip, opp, vectorized, Wshift, size_T, rt, sl)
150+
rt, sl = depchain_cost!(ls, skip, opp, vectorized, Wshift, size_T, rt, sl)
112151
end
113152
# Basically assuming memory and compute don't conflict, but everything else does
114153
# Ie, ignoring the fact that integer and floating point operations likely don't either
115154
if iscompute(op)
116-
rtᵢ, slᵢ = cost(op, vectorized, Wshift, size_T)
155+
rtᵢ, slᵢ = cost(ls, op, vectorized, Wshift, size_T)
117156
rt += rtᵢ; sl += slᵢ
118157
end
119158
rt, sl
@@ -139,9 +178,9 @@ function unroll_no_reductions(ls, order, vectorized, Wshift, size_T)
139178
for op ∈ operations(ls)
140179
dependson(op, innermost) || continue
141180
if iscompute(op)
142-
compute_rt += first(cost(op, vectorized, Wshift, size_T))
181+
compute_rt += first(cost(ls, op, vectorized, Wshift, size_T))
143182
elseif isload(op)
144-
load_rt += first(cost(op, vectorized, Wshift, size_T))
183+
load_rt += first(cost(ls, op, vectorized, Wshift, size_T))
145184
end
146185
end
147186
# heuristic guess
@@ -181,13 +220,13 @@ function determine_unroll_factor(
181220
for op ∈ operations(ls)
182221
dependson(op, unrolled) || continue
183222
if isreduction(op)
184-
rt, sl = depchain_cost!(visited_nodes, op, vectorized, Wshift, size_T)
223+
rt, sl = depchain_cost!(ls, visited_nodes, op, vectorized, Wshift, size_T)
185224
latency = max(sl, latency)
186225
compute_recip_throughput += rt
187226
elseif isload(op)
188-
load_recip_throughput += first(cost(op, vectorized, Wshift, size_T))
227+
load_recip_throughput += first(cost(ls, op, vectorized, Wshift, size_T))
189228
elseif isstore(op)
190-
store_recip_throughput += first(cost(op, vectorized, Wshift, size_T))
229+
store_recip_throughput += first(cost(ls, op, vectorized, Wshift, size_T))
191230
end
192231
end
193232
recip_throughput = max(
@@ -424,7 +463,7 @@ function evaluate_cost_tile(
424463
opisininnerloop = descendentsininnerloop[id]
425464
isunrolled = unrolledtiled[1,id]
426465
istiled = unrolledtiled[2,id]
427-
rt, lat, rp = cost(op, vectorized, Wshift, size_T)
466+
rt, lat, rp = cost(ls, op, vectorized, Wshift, size_T)
428467
rp = opisininnerloop ? rp : 0 # we only care about register pressure within the inner most loop
429468
rt *= iters[id]
430469
if isunrolled && istiled # no cost decrease; cost must be repeated

0 commit comments

Comments
 (0)