Skip to content

Commit 73e2ed5

Browse files
committed
Updates for better OffsetArray support. Check the start values of iterables, and handle different starting offsets of OffsetArrays based on linear vs cartesian indexing.
1 parent b720d11 commit 73e2ed5

10 files changed

+144
-23
lines changed

Manifest.toml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
3131
deps = ["Base64"]
3232
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
3333

34+
[[OffsetArrays]]
35+
git-tree-sha1 = "707e34562700b81e8aa13548eb6b23b18112e49b"
36+
uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
37+
version = "1.0.2"
38+
3439
[[OrderedCollections]]
3540
deps = ["Random", "Serialization", "Test"]
3641
git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
@@ -49,9 +54,9 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
4954

5055
[[SIMDPirates]]
5156
deps = ["VectorizationBase"]
52-
git-tree-sha1 = "34dff4f4715f871e71b38f31397d96e62621f14d"
57+
git-tree-sha1 = "f91198b7ef74b04028f98e0eed7c556b93538a2e"
5358
uuid = "21efa798-c60a-11e8-04d3-e1a92915a26a"
54-
version = "0.6.5"
59+
version = "0.6.6"
5560

5661
[[SLEEFPirates]]
5762
deps = ["Libdl", "SIMDPirates", "VectorizationBase"]
@@ -71,6 +76,6 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
7176

7277
[[VectorizationBase]]
7378
deps = ["CpuId", "LinearAlgebra"]
74-
git-tree-sha1 = "006d7b7f276db8d728f8bfd70ebf2efd132f9548"
79+
git-tree-sha1 = "8abb5697fb64cadccd1bba444c955942d3181e5c"
7580
uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
76-
version = "0.7.0"
81+
version = "0.7.1"

Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ version = "0.6.20"
55

66
[deps]
77
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
8+
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
89
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
910
SIMDPirates = "21efa798-c60a-11e8-04d3-e1a92915a26a"
1011
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"

src/LoopVectorization.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ using VectorizationBase, SIMDPirates, SLEEFPirates, Parameters
44
using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector_load_expr,
55
mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valadd, valsub, _MM,
66
maybestaticlength, maybestaticsize, staticm1, subsetview, vzero, stridedpointer_for_broadcast,
7-
Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange,
7+
Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange, unwrap, maybestaticrange,
88
PackedStridedPointer, SparseStridedPointer, RowMajorStridedPointer, StaticStridedPointer, StaticStridedStruct
99
using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod,
1010
sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!,
@@ -22,6 +22,7 @@ export LowDimArray, stridedpointer, vectorizable,
2222
vfilter, vfilter!
2323

2424

25+
include("vectorizationbase_extensions.jl")
2526
include("map.jl")
2627
include("filter.jl")
2728
include("costs.jl")

src/add_loads.jl

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -70,13 +70,3 @@ function add_loopvalue!(ls::LoopSet, arg::Symbol, elementbytes::Int)
7070
loopsymop
7171
end
7272

73-
74-
struct LoopValue end
75-
@inline VectorizationBase.stridedpointer(::LoopValue) = LoopValue()
76-
@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
77-
# @inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Unsigned) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
78-
@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Mask) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
79-
@inline VectorizationBase.vload(::LoopValue, i::Integer) = i + one(i)
80-
@inline VectorizationBase.vload(::LoopValue, i::Tuple{I}) where {I<:Integer} = @inbounds(i[1]) + one(I)
81-
@inline Base.eltype(::LoopValue) = Int8
82-

src/graphs.jl

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ function startloop(loop::Loop, isvectorized, W, itersymbol = loop.itersymbol)
7070
elseif startexact
7171
Expr(:(=), itersymbol, loop.starthint)
7272
else
73-
Expr(:(=), itersymbol, loop.startsym)
73+
Expr(:(=), itersymbol, Expr(:call, lv(:unwrap), loop.startsym))
7474
end
7575
end
7676
function vec_looprange(loop::Loop, isunrolled::Bool, W::Symbol, U::Int)
@@ -397,9 +397,11 @@ function register_single_loop!(ls::LoopSet, looprange::Expr)
397397
Loop(itersym, L, U)
398398
end
399399
elseif f === :eachindex
400-
N = gensym(Symbol(:loop, itersym))
401-
pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticlength), r.args[2])))
402-
Loop(itersym, 0, N)
400+
N = gensym(Symbol(:loopeachindex, itersym))
401+
pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticrange), r)))
402+
L = add_loop_bound!(ls, itersym, Expr(:call, :first, N), false)
403+
U = add_loop_bound!(ls, itersym, Expr(:call, :last, N), true)
404+
Loop(itersym, L, U)
403405
elseif f === :OneTo || f == Expr(:(.), :Base, QuoteNode(:OneTo))
404406
otN = r.args[2]
405407
if otN isa Integer
@@ -416,8 +418,10 @@ function register_single_loop!(ls::LoopSet, looprange::Expr)
416418
elseif isa(r, Symbol)
417419
# Treat similar to `eachindex`
418420
N = gensym(Symbol(:loop, itersym))
419-
pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticlength), r)))
420-
loop = Loop(itersym, 0, N)
421+
pushpreamble!(ls, Expr(:(=), N, Expr(:call, lv(:maybestaticrange), r)))
422+
L = add_loop_bound!(ls, itersym, Expr(:call, :first, N), false)
423+
U = add_loop_bound!(ls, itersym, Expr(:call, :last, N), true)
424+
loop = Loop(itersym, L, U)
421425
else
422426
throw("Unrecognized loop range type: $r.")
423427
end

src/reconstruct_loopset.jl

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,13 @@ function Loop(ls::LoopSet, l::Int, ::Type{StaticLowerUnitRange{L}}) where {L}
1414
pushpreamble!(ls, Expr(:(=), stop, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), Expr(:(.), Expr(:ref, :lb, l), QuoteNode(:U)))))
1515
Loop(gensym(:n), L, L + 1024, Symbol(""), stop, true, false)::Loop
1616
end
17+
# Is there any likely way to generate such a range?
18+
# function Loop(ls::LoopSet, l::Int, ::Type{StaticLengthUnitRange{N}}) where {N}
19+
# start = gensym(:loopstart); stop = gensym(:loopstop)
20+
# pushpreamble!(ls, Expr(:(=), start, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), Expr(:(.), Expr(:ref, :lb, l), QuoteNode(:L)))))
21+
# pushpreamble!(ls, Expr(:(=), stop, Expr(:call, :(+), start, N - 1)))
22+
# Loop(gensym(:n), 0, N, start, stop, false, false)::Loop
23+
# end
1724
function Loop(ls, l, ::Type{StaticUnitRange{L,U}}) where {L,U}
1825
Loop(gensym(:n), L, U, Symbol(""), Symbol(""), true, true)::Loop
1926
end
@@ -63,14 +70,18 @@ extract_varg(i) = Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__
6370
pushvarg!(ls::LoopSet, ar::ArrayReferenceMeta, i) = pushpreamble!(ls, Expr(:(=), vptr(ar), extract_varg(i)))
6471
function pushvarg′!(ls::LoopSet, ar::ArrayReferenceMeta, i)
6572
reverse!(ar.loopedindex); reverse!(getindices(ar)) # reverse the listed indices here, and transpose it to make it column major
66-
pushpreamble!(ls, Expr(:(=), vptr(ar), Expr(:call, lv(:Transpose), extract_varg(i))))
73+
pushpreamble!(ls, Expr(:(=), vptr(ar), Expr(:call, lv(:transpose), extract_varg(i))))
6774
end
6875
function add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{PackedStridedPointer{T, N}}) where {T, N}
6976
pushvarg!(ls, ar, i)
7077
end
7178
function add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{RowMajorStridedPointer{T, N}}) where {T, N}
7279
pushvarg′!(ls, ar, i)
7380
end
81+
function add_mref!(ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{OffsetStridedPointer{T,N,P}}) where {T,N,P}
82+
add_mref!(ls, ar, i, P)
83+
end
84+
7485
function add_mref!(
7586
ls::LoopSet, ar::ArrayReferenceMeta, i::Int, ::Type{S}
7687
) where {T, X <: Tuple, S <: VectorizationBase.AbstractStaticStridedPointer{T,X}}

src/vectorizationbase_extensions.jl

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
2+
struct LoopValue end
3+
@inline VectorizationBase.stridedpointer(::LoopValue) = LoopValue()
4+
@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
5+
# @inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Unsigned) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
6+
@inline VectorizationBase.vload(::LoopValue, i::Tuple{_MM{W}}, ::Mask) where {W} = _MM{W}(@inbounds(i[1].i) + 1)
7+
@inline VectorizationBase.vload(::LoopValue, i::Integer) = i + one(i)
8+
@inline VectorizationBase.vload(::LoopValue, i::Tuple{I}) where {I<:Integer} = @inbounds(i[1]) + one(I)
9+
@inline Base.eltype(::LoopValue) = Int8
10+
11+
import OffsetArrays
12+
13+
# If ndim(::OffsetArray) == 1, we can convert to a regular strided pointer and offset.
14+
@inline VectorizationBase.stridedpointer(a::OffsetArrays.OffsetArray{<:Any,1}) = gesp(stridedpointer(parent(a)), (-@inbounds(a.offsets[1]),))
15+
16+
struct OffsetStridedPointer{T, N, P <: VectorizationBase.AbstractStridedPointer{T}} <: VectorizationBase.AbstractStridedPointer{T}
17+
ptr::P
18+
offsets::NTuple{N,Int}
19+
end
20+
# if ndim(A::OffsetArray) ≥ 2, then eachindex(A) isa Base.OneTo, index starting at 1.
21+
# but multiple indexing is calculated using offsets, so we need a special type to express this.
22+
@inline function VectorizationBase.stridedpointer(A::OffsetArrays.OffsetArray)
23+
OffsetStridedPointer(stridedpointer(parent(A)), A.offsets)
24+
end
25+
# Tuple of length == 1, use ind directly.
26+
# @inline VectorizationBase.offset(ptr::OffsetStridedPointer, ind::Tuple{I}) where {I} = VectorizationBase.offset(ptr.ptr, ind)
27+
# Tuple of length > 1, subtract offsets.
28+
# @inline VectorizationBase.offset(ptr::OffsetStridedPointer{<:Any,N}, ind::Tuple) where {N} = VectorizationBase.offset(ptr.ptr, ntuple(n -> ind[n] + ptr.offsets[n], Val{N}()))
29+
@inline VectorizationBase.offset(ptr::OffsetStridedPointer, ind::Tuple{I}) where {I} = ind
30+
# Tuple of length > 1, subtract offsets.
31+
@inline VectorizationBase.offset(ptr::OffsetStridedPointer{<:Any,N}, ind::Tuple) where {N} = ntuple(n -> ind[n] - ptr.offsets[n], Val{N}())
32+
@inline Base.similar(p::OffsetStridedPointer, ptr::Ptr) = OffsetStridedPointer(similar(p.ptr, ptr), p.offsets)
33+
34+
# If an OffsetArray is getting indexed by a (loop-)constant value, then this particular vptr object cannot also be eachindexed, so we can safely return a stridedpointer
35+
@inline function VectorizationBase.subsetview(ptr::OffsetStridedPointer{<:Any,N}, ::Val{I}, i) where {I,N}
36+
subsetview(gesp(ptr.ptr, ntuple(n -> 0 - @inbounds(ptr.offsets[n]), Val{N}())), Val{I}(), i)
37+
end
38+

test/dot.jl

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
using LoopVectorization
1+
using LoopVectorization, OffsetArrays
22
using Test
33

44
@testset "dot" begin
@@ -190,9 +190,12 @@ using Test
190190
N = 127
191191
R = T <: Integer ? (T(-100):T(100)) : T
192192
a = rand(T, N); b = rand(R, N);
193+
ao = OffsetArray(a, -60:66); bo = OffsetArray(b, -60:66);
193194
s = mydot(a, b)
194195
@test mydotavx(a,b) ≈ s
195196
@test mydot_avx(a,b) ≈ s
197+
@test mydotavx(ao,bo) ≈ s
198+
@test mydot_avx(ao,bo) ≈ s
196199
@test dot_unroll2avx(a,b) ≈ s
197200
@test dot_unroll3avx(a,b) ≈ s
198201
@test dot_unroll2_avx(a,b) ≈ s

test/offsetarrays.jl

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
using LoopVectorization, OffsetArrays
2+
using Test
3+
4+
@testset "OffsetArrays" begin
5+
6+
function old2d!(out::AbstractMatrix, A::AbstractMatrix, kern, R=CartesianIndices(out), z=zero(eltype(out)))
7+
rng1k, rng2k = axes(kern)
8+
rng1, rng2 = R.indices
9+
for j in rng2, i in rng1
10+
tmp = z
11+
@inbounds for jk in rng2k, ik in rng1k
12+
tmp += oftype(tmp, A[i+ik,j+jk])*kern[ik,jk]
13+
end
14+
@inbounds out[i,j] = tmp
15+
end
16+
out
17+
end
18+
function avx2d!(out::AbstractMatrix, A::AbstractMatrix, kern::OffsetArray, R=CartesianIndices(out), z=zero(eltype(out)))
19+
rng1k, rng2k = axes(kern)
20+
rng1, rng2 = R.indices
21+
# Manually unpack the OffsetArray
22+
kernA = parent(kern)
23+
o1, o2 = kern.offsets
24+
for j in rng2, i in rng1
25+
tmp = z
26+
@avx for jk in rng2k, ik in rng1k
27+
tmp += A[i+ik,j+jk]*kernA[ik-o1,jk-o2]
28+
end
29+
out[i,j] = tmp
30+
end
31+
out
32+
end
33+
function avx2douter!(out::AbstractMatrix, A::AbstractMatrix, kern::OffsetArray, R=CartesianIndices(out), z=zero(eltype(out)))
34+
rng1k, rng2k = axes(kern)
35+
rng1, rng2 = R.indices
36+
# Manually unpack the OffsetArray
37+
kernA = parent(kern)
38+
o1, o2 = kern.offsets
39+
@avx for j in rng2, i in rng1
40+
tmp = z
41+
for jk in rng2k, ik in rng1k
42+
tmp += A[i+ik,j+jk]*kernA[ik-o1,jk-o2]
43+
1
44+
end
45+
out[i,j] = tmp
46+
end
47+
out
48+
end
49+
50+
for T ∈ (Float32, Float64)
51+
@show T, @__LINE__
52+
A = rand(T, 100, 100);
53+
kern = OffsetArray(rand(T, 3, 3), -1:1, -1:1);
54+
out1 = OffsetArray(similar(A, size(A).-2), 1, 1); # stay away from the edges of A
55+
out2 = similar(out1); out3 = similar(out1);
56+
57+
old2d!(out1, A, kern);
58+
avx2d!(out2, A, kern);
59+
@test out1 ≈ out2
60+
avx2douter!(out3, A, kern);
61+
@test out1 ≈ out3
62+
end
63+
64+
65+
end
66+

test/runtests.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ end
1717
@time @testset "LoopVectorization.jl" begin
1818

1919
@time include("printmethods.jl")
20+
21+
@time include("offsetarrays.jl")
2022

2123
@time include("map.jl")
2224

0 commit comments

Comments
 (0)