Skip to content

Commit 6485332

Browse files
committed
Update benchmark plots, make a few changes to improve handling of aliasing, and quit using masktable now that mask in VectorizationBase v0.9 takes a length argument and returns an all-on mask when the remainder is 0.
1 parent 681f828 commit 6485332

24 files changed

+111
-63
lines changed

docs/src/assets/bench_AmulB_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_AmulBt_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_Amulvb_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_AplusAt_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_AtmulB_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_AtmulBt_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_Atmulvb_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_aplusBc_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_dot3_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_dot_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_exp_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_filter2d_3x3_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_filter2d_dynamic_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_filter2d_unrolled_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_logdettriangle_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_random_access_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_selfdot_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_sse_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

src/condense_loopset.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ end
213213
LHS = ind === nothing ? gensym() : vptrs[ind]
214214
assigned_names[i] = LHS
215215
d = (D[i])::Union{Nothing,Int}
216-
if d === nothing # stridedpointer instead of noaliasstridedpointer, because alias info will be lost across function boundary...
216+
if d === nothing
217217
num_arrays += 1
218218
RHS = Expr(:call, lv(:stridedpointer), Expr(:ref, :vargs, ari), Expr(:ref, :arraydescript, ari))
219219
else #subsetview
@@ -255,7 +255,7 @@ function generate_call(ls::LoopSet, IUT, debug::Bool = false)
255255
)
256256
debug && deleteat!(q.args, 2)
257257
foreach(ref -> push!(q.args, vptr(ref)), ls.refs_aliasing_syms)
258-
else
258+
else# not forcing inline; calling __avx__! which calls an inlined _avx_!
259259
arraydescript = Expr(:tuple)
260260
q = Expr(
261261
:call, lv(:__avx__!), Expr(:call, Expr(:curly, :Val, (U,T))),
@@ -288,7 +288,7 @@ function setup_call_noinline(ls::LoopSet, U = zero(Int8), T = zero(Int8))
288288
if ex isa Expr && ex.head === :(=) && length(ex.args) == 2
289289
if ex.args[2] isa Expr && ex.args[2].head === :call
290290
gr = first(ex.args[2].args)
291-
if gr == lv(:noaliasstridedpointer)
291+
if gr == lv(:stridedpointer)
292292
array = ex.args[2].args[2]
293293
arrayid = findfirst(a -> a === array, ls.includedactualarrays)
294294
if arrayid isa Int

src/lower_compute.jl

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,18 @@ function lower_compute!(
5757
# parentsyms = [opp.variable for opp ∈ parents(op)]
5858
Uiter = opunrolled ? U - 1 : 0
5959
isreduct = isreduction(op)
60-
if !isnothing(suffix) && isreduct && tiledouterreduction == -1
61-
instrfid = findfirst(isequal(instr.instr), (:vfmadd, :vfnmadd, :vfmsub, :vfnmsub))
62-
if instrfid !== nothing
60+
# if instr.instr === :vfmadd_fast
61+
# diffdeps = !any(opp -> isload(opp) && all(in(loopdependencies(opp)), loopdependencies(op)), parents(op)) # want to instcombine when parent load's deps are superset
62+
# @show suffix, !isnothing(suffix), isreduct, diffdeps
63+
# end
64+
if !isnothing(suffix) && isreduct
65+
# instrfid = findfirst(isequal(instr.instr), (:vfmadd, :vfnmadd, :vfmsub, :vfnmsub))
66+
instrfid = findfirst(isequal(instr.instr), (:vfmadd_fast, :vfnmadd_fast, :vfmsub_fast, :vfnmsub_fast))
67+
if instrfid !== nothing && !any(opp -> isload(opp) && all(in(loopdependencies(opp)), loopdependencies(op)), parents(op)) # want to instcombine when parent load's deps are superset
6368
instr = Instruction((:vfmadd231, :vfnmadd231, :vfmsub231, :vfnmsub231)[instrfid])
6469
end
6570
end
71+
# @show instr.instr
6672
maskreduct = mask !== nothing && isreduct && vectorized ∈ reduceddependencies(op) #any(opp -> opp.variable === var, parents_op)
6773
# if a parent is not unrolled, the compiler should handle broadcasting CSE.
6874
# because unrolled/tiled parents result in an unrolled/tiled dependency,

src/lower_store.jl

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
using VectorizationBase: vnoaliasstore!
2+
const STOREOP = :vnoaliasstore!
3+
# const STOREOP = :vstore!
14
variable_name(op::Operation, ::Nothing) = mangledvar(op)
25
variable_name(op::Operation, suffix) = Symbol(mangledvar(op), suffix, :_)
36
function reduce_range!(q::Expr, toreduct::Symbol, instr::Instruction, Uh::Int, Uh2::Int)
@@ -66,7 +69,7 @@ function lower_conditionalstore_scalar!(
6669
varname = varassignname(var, u, parentisunrolled)
6770
condvarname = varassignname(condvar, u, condunrolled)
6871
td = UnrollArgs(u, unrolled, tiled, suffix)
69-
push!(q.args, Expr(:&&, condvarname, Expr(:call, lv(:vstore!), ptr, varname, mem_offset_u(op, td))))
72+
push!(q.args, Expr(:&&, condvarname, Expr(:call, lv(STOREOP), ptr, varname, mem_offset_u(op, td))))
7073
end
7174
nothing
7275
end
@@ -99,7 +102,7 @@ function lower_conditionalstore_vectorized!(
99102
td = UnrollArgs(u, unrolled, tiled, suffix)
100103
name, mo = name_memoffset(var, op, td, W, vecnotunrolled, parentisunrolled)
101104
condvarname = varassignname(condvar, u, condunrolled)
102-
instrcall = Expr(:call, lv(:vstore!), ptr, name, mo)
105+
instrcall = Expr(:call, lv(STOREOP), ptr, name, mo)
103106
if mask !== nothing && (vecnotunrolled || u == U - 1)
104107
push!(instrcall.args, Expr(:call, :&, condvarname, mask))
105108
else
@@ -119,7 +122,7 @@ function lower_store_scalar!(
119122
for u ∈ 0:U-1
120123
varname = varassignname(var, u, parentisunrolled)
121124
td = UnrollArgs(u, unrolled, tiled, suffix)
122-
push!(q.args, Expr(:call, lv(:vstore!), ptr, varname, mem_offset_u(op, td)))
125+
push!(q.args, Expr(:call, lv(STOREOP), ptr, varname, mem_offset_u(op, td)))
123126
end
124127
nothing
125128
end
@@ -143,7 +146,7 @@ function lower_store_vectorized!(
143146
for u ∈ umin:U-1
144147
td = UnrollArgs(u, unrolled, tiled, suffix)
145148
name, mo = name_memoffset(var, op, td, W, vecnotunrolled, parentisunrolled)
146-
instrcall = Expr(:call, lv(:vstore!), ptr, name, mo)
149+
instrcall = Expr(:call, lv(STOREOP), ptr, name, mo)
147150
if mask !== nothing && (vecnotunrolled || u == U - 1)
148151
push!(instrcall.args, mask)
149152
end

src/lowering.jl

Lines changed: 59 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
function lower!(
1212
q::Expr, op::Operation, vectorized::Symbol, ls::LoopSet, unrolled::Symbol, tiled::Symbol, U::Int,
13-
suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned}
13+
suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned}, ::Nothing
1414
)
1515
W = ls.W
1616
if isconstant(op)
@@ -29,11 +29,35 @@ function lower!(
2929
# elseif isloopvalue(op)
3030
end
3131
end
32+
function lower!(
33+
q::Expr, op::Operation, vectorized::Symbol, ls::LoopSet, unrolled::Symbol, tiled::Symbol, U::Int,
34+
suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned}, filterstore::Bool
35+
)
36+
W = ls.W
37+
if filterstore
38+
if isstore(op)
39+
lower_store!(q, op, vectorized, W, unrolled, tiled, U, suffix, mask)
40+
end
41+
else
42+
if isconstant(op)
43+
zerotyp = zerotype(ls, op)
44+
if zerotyp == INVALID
45+
lower_constant!(q, op, vectorized, ls, unrolled, U, suffix)
46+
else
47+
lower_zero!(q, op, vectorized, ls, unrolled, U, suffix, zerotyp)
48+
end
49+
elseif isload(op)
50+
lower_load!(q, op, vectorized, W, unrolled, tiled, U, suffix, mask)
51+
elseif iscompute(op)
52+
lower_compute!(q, op, vectorized, W, unrolled, tiled, U, suffix, mask)
53+
end
54+
end
55+
end
3256
function lower!(
3357
q::Expr, ops::AbstractVector{Operation}, vectorized::Symbol, ls::LoopSet, unrolled::Symbol, tiled::Symbol, U::Int,
34-
suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned}
58+
suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned}, filterstore = nothing
3559
)
36-
foreach(op -> lower!(q, op, vectorized, ls, unrolled, tiled, U, suffix, mask), ops)
60+
foreach(op -> lower!(q, op, vectorized, ls, unrolled, tiled, U, suffix, mask, filterstore), ops)
3761
end
3862

3963
function lower_block(ls::LoopSet, us::UnrollSpecification, n::Int, inclmask::Bool, UF)
@@ -62,26 +86,36 @@ function lower_block(
6286
lower!(blockq, ops[2,1,prepost,n], vectorized, ls, unrolled, tiled, U, nothing, mask)
6387
# end
6488
if length(ops[1,2,prepost,n]) + length(ops[2,2,prepost,n]) > 0
65-
for t ∈ 0:T-1
66-
if t == 0
67-
push!(blockq.args, Expr(:(=), tiled, tiledsym(tiled)))
68-
elseif tiledloopnum == vectorizedloopnum
69-
push!(blockq.args, Expr(:(=), tiled, Expr(:call, lv(:valadd), ls.W, tiled)))
70-
else
71-
push!(blockq.args, Expr(:+=, tiled, 1))
72-
end
73-
# !U && T
74-
if dontmaskfirsttiles && t < T - 1
75-
lower!(blockq, ops[1,2,prepost,n], vectorized, ls, unrolled, tiled, U, t, nothing)
76-
# for u ∈ 0:U-1 # U && T
77-
lower!(blockq, ops[2,2,prepost,n], vectorized, ls, unrolled, tiled, U, t, nothing)
78-
# end
79-
else
80-
lower!(blockq, ops[1,2,prepost,n], vectorized, ls, unrolled, tiled, U, t, mask)
81-
# for u ∈ 0:U-1 # U && T
82-
lower!(blockq, ops[2,2,prepost,n], vectorized, ls, unrolled, tiled, U, t, mask)
83-
# end
89+
for store ∈ (false,true)
90+
# let store = nothing
91+
nstores = 0
92+
opsv1 = ops[1,2,prepost,n]
93+
opsv2 = ops[2,2,prepost,n]
94+
iszero(length(opsv1) + length(opsv2)) && continue
95+
iszero(length(opsv1)) || (nstores += sum(isstore, opsv1))
96+
iszero(length(opsv2)) || (nstores += sum(isstore, opsv2))
97+
for t ∈ 0:T-1
98+
if t == 0
99+
push!(blockq.args, Expr(:(=), tiled, tiledsym(tiled)))
100+
elseif tiledloopnum == vectorizedloopnum
101+
push!(blockq.args, Expr(:(=), tiled, Expr(:call, lv(:valadd), ls.W, tiled)))
102+
else
103+
push!(blockq.args, Expr(:+=, tiled, 1))
104+
end
105+
# !U && T
106+
if dontmaskfirsttiles && t < T - 1
107+
lower!(blockq, opsv1, vectorized, ls, unrolled, tiled, U, t, nothing, store)
108+
# for u ∈ 0:U-1 # U && T
109+
lower!(blockq, opsv2, vectorized, ls, unrolled, tiled, U, t, nothing, store)
110+
# end
111+
else
112+
lower!(blockq, opsv1, vectorized, ls, unrolled, tiled, U, t, mask, store)
113+
# for u ∈ 0:U-1 # U && T
114+
lower!(blockq, opsv2, vectorized, ls, unrolled, tiled, U, t, mask, store)
115+
# end
116+
end
84117
end
118+
nstores == 0 && break
85119
end
86120
end
87121
if n > 1 && prepost == 1
@@ -276,8 +310,9 @@ function init_remblock(unrolledloop::Loop, unrolled::Symbol = unrolledloop.iters
276310
end
277311

278312
function maskexpr(W::Symbol, looplimit)
279-
rem = Expr(:call, lv(:valrem), W, looplimit)
280-
Expr(:(=), Symbol("##mask##"), Expr(:call, lv(:masktable), W, rem))
313+
Expr(:(=), Symbol("##mask##"), Expr(:call, lv(:mask), W, looplimit))
314+
# rem = Expr(:call, lv(:valrem), W, looplimit)
315+
# Expr(:(=), Symbol("##mask##"), Expr(:call, lv(:masktable), W, rem))
281316
end
282317
function definemask(loop::Loop, W::Symbol)
283318
if isstaticloop(loop)

src/memory_ops_common.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ function add_vptr!(ls::LoopSet, array::Symbol, vptrarray::Symbol = vptr(array),
2323
if broadcast
2424
pushpreamble!(ls, Expr(:(=), vptrarray, Expr(:call, lv(:stridedpointer_for_broadcast), array)))
2525
else
26-
# pushpreamble!(ls, Expr(:(=), vptrarray, Expr(:call, lv(:stridedpointer), array)))
27-
pushpreamble!(ls, Expr(:(=), vptrarray, Expr(:call, lv(:noaliasstridedpointer), array)))
26+
pushpreamble!(ls, Expr(:(=), vptrarray, Expr(:call, lv(:stridedpointer), array)))
27+
# pushpreamble!(ls, Expr(:(=), vptrarray, Expr(:call, lv(:noaliasstridedpointer), array)))
2828
end
2929
end
3030
nothing

test/dot.jl

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -69,42 +69,46 @@ using Test
6969
@avx unroll=2 for i ∈ 1:length(x)
7070
z += x[i]*y[i]
7171
end
72-
return z
72+
z
7373
end
7474
function dot_unroll3avx(x::Vector{T}, y::Vector{T}) where {T<:Number}
7575
z = zero(T)
7676
@avx unroll=3 for i ∈ 1:length(x)
7777
z += x[i]*y[i]
7878
end
79-
return z
79+
z
8080
end
81+
@macroexpand @avx inline=false unroll=2 for i ∈ 1:length(x)
82+
z += x[i]*y[i]
83+
end
84+
8185
function dot_unroll2avx_noinline(x::Vector{T}, y::Vector{T}) where {T<:Number}
8286
z = zero(T)
83-
@avx inline=false unroll=2 for i ∈ 1:length(x)
87+
@avx inline=true unroll=2 for i ∈ 1:length(x)
8488
z += x[i]*y[i]
8589
end
86-
return z
90+
z
8791
end
8892
function dot_unroll3avx_inline(x::Vector{T}, y::Vector{T}) where {T<:Number}
8993
z = zero(T)
9094
@avx unroll=3 inline=true for i ∈ 1:length(x)
9195
z += x[i]*y[i]
9296
end
93-
return z
97+
z
9498
end
9599
function dot_unroll2_avx(x::Vector{T}, y::Vector{T}) where {T<:Number}
96100
z = zero(T)
97101
@_avx unroll=2 for i ∈ 1:length(x)
98102
z += x[i]*y[i]
99103
end
100-
return z
104+
z
101105
end
102106
function dot_unroll3_avx(x::Vector{T}, y::Vector{T}) where {T<:Number}
103107
z = zero(T)
104108
@_avx unroll=3 for i ∈ 1:length(x)
105109
z += x[i]*y[i]
106110
end
107-
return z
111+
z
108112
end
109113
function complex_dot_soa(
110114
xre::AbstractVector{T}, xim::AbstractVector{T},
@@ -116,13 +120,13 @@ using Test
116120
zre += xre[i]*yre[i] - xim[i]*yim[i]
117121
zim += xre[i]*yim[i] + xim[i]*yre[i]
118122
end
119-
return Complex{T}(zre,zim)
123+
Complex{T}(zre,zim)
120124
end
121125
qc = :(for i ∈ 1:length(xre)
122126
zre += xre[i]*yre[i] - xim[i]*yim[i]
123127
zim += xre[i]*yim[i] + xim[i]*yre[i]
124128
end);
125-
lsc = LoopVectorization.LoopSet(qc)
129+
lsc = LoopVectorization.LoopSet(qc);
126130
function complex_mul_with_index_offset!(c_re, c_im, a_re, a_im, b_re, b_im)
127131
@inbounds @simd ivdep for i = 1:length(a_re) - 1
128132
c_re[i] = b_re[i] * a_re[i + 1] - b_im[i] * a_im[i + 1]

0 commit comments

Comments
 (0)