Skip to content

Commit d47fb09

Browse files
committed
Added tests for macros inside @avx loops.
1 parent 6d6dcca commit d47fb09

File tree

1 file changed

+63
-10
lines changed

1 file changed

+63
-10
lines changed

test/offsetarrays.jl

Lines changed: 63 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
using LoopVectorization, OffsetArrays
22
using LoopVectorization.VectorizationBase: StaticUnitRange
33
using Test
4-
T = Float32
4+
# T = Float32
55

66
@testset "OffsetArrays" begin
77

@@ -28,23 +28,24 @@ T = Float32
2828
# tmp += A[i+ik,j+jk]*skern[ik,jk]
2929
# end;
3030
# ls1
31-
# ls2 = LoopVectorization.@avx_debug for j in rng2, i in rng1
31+
# rng1, rng2 = CartesianIndices(out1).indices;
32+
# rng1k, rng2k = axes(skern);
33+
# ls2dstatic = LoopVectorization.@avx_debug for j in rng2, i in rng1
3234
# tmp = zero(eltype(out))
3335
# for jk in rng2k, ik in rng1k
34-
# tmp += A[i+ik,j+jk]*kern[ik,jk]
36+
# tmp += A[i+ik,j+jk]*skern[ik,jk]
3537
# end
36-
# out[i,j] = tmp
38+
# out1[i,j] = tmp
3739
# end;
38-
# ls2
39-
# oq = :(for j in rng2, i in rng1
40+
# LoopVectorization.choose_order(ls2dstatic)
41+
# q2d = :(for j in rng2, i in rng1
4042
# tmp = zero(eltype(out))
4143
# for jk in rng2k, ik in rng1k
4244
# tmp += A[i+ik,j+jk]*kern[ik,jk]
4345
# end
4446
# out[i,j] = tmp
4547
# end);
46-
# lsoq = LoopVectorization.LoopSet(oq);
47-
# LoopVectorization.choose_order(lsoq)
48+
# lsq2d = LoopVectorization.LoopSet(q2d); LoopVectorization.choose_order(lsq2d)
4849

4950
# oq2 = :(for j in rng2, i in rng1
5051
# tmp = zero(eltype(out))
@@ -82,6 +83,8 @@ T = Float32
8283
end
8384
out
8485
end
86+
87+
8588
# Matrix wrapper whose type carries four extra parameters (LR, UR, LC, RC) in addition to
# the element type T; the only stored field is the backing `Matrix{T}`.
# NOTE(review): LR/UR and LC/RC are presumably the lower/upper row and column index bounds
# (the kernels below are typed as SizedOffsetMatrix{T,-1,1,-1,1}) — confirm against the
# size/axes methods defined for this type.
struct SizedOffsetMatrix{T,LR,UR,LC,RC} <: AbstractMatrix{T}
8689
# Underlying storage; none of the type parameters LR, UR, LC, RC appears in a field.
data::Matrix{T}
8790
end
@@ -97,7 +100,48 @@ T = Float32
97100
end
98101
# Base.size(A::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (1 + UR-LR, 1 + UC-LC)
99102
# Base.CartesianIndices(::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = CartesianIndices((LR:UR,LC:UC))
100-
# Base.getindex(A::SizedOffsetMatrix, i, j) = LoopVectorization.vload(LoopVectorization.stridedpointer(A), (i,j)) # only needed to print
103+
# Two-index scalar read for SizedOffsetMatrix: loads the element at (i, j) via
# LoopVectorization.vload on the array's strided pointer.
# No bounds check is performed in this method itself.
Base.getindex(A::SizedOffsetMatrix, i, j) = LoopVectorization.vload(LoopVectorization.stridedpointer(A), (i,j)) # needed to print; also appears to serve the scalar kern[ik-2,jk-2] reads in the avx2dunrolled* functions
104+
# avx2dunrolled!(out, A, kern)
#
# Fill `out` with a 3x3 weighted neighbourhood sum of `A` and return `out`:
#     out[i,j] = sum of A[i+ik, j+jk] * kern[ik, jk]   for ik, jk in -1:1
# for every (i, j) in axes(out). The nine kernel entries are read into scalars before the
# loop, and the 3x3 reduction is written out with Base.Cartesian.@nexprs inside the @avx
# loop, so this exercises macro calls nested in an @avx body.
# NOTE(review): nothing here checks that A is indexable at i-1:i+1, j-1:j+1 for all of
# axes(out); the caller is assumed to supply a suitably padded/offset A.
function avx2dunrolled!(out::AbstractMatrix, A::AbstractMatrix, kern::SizedOffsetMatrix{T,-1,1,-1,1}) where {T}
105+
# Iterate over out's own axes (first dimension -> rng1, second -> rng2).
rng1, rng2 = axes(out)
106+
# Expands to nine assignments kern_1_1 = kern[-1,-1], ..., kern_3_3 = kern[1,1].
Base.Cartesian.@nexprs 3 jk -> Base.Cartesian.@nexprs 3 ik -> kern_ik_jk = kern[ik-2,jk-2]
107+
# Manually unpack the OffsetArray
108+
@avx for j in rng2, i in rng1
109+
# Seed of the accumulation chain tmp_0 -> tmp_1 -> ... -> tmp_9.
tmp_0 = zero(eltype(out))
110+
# Expands to tmp_k = A[i+(ik-2), j+(jk-2)] * kern_ik_jk + tmp_{k-1},
# with k = ik + (jk-1)*3 running from 1 to 9.
Base.Cartesian.@nexprs 3 jk -> Base.Cartesian.@nexprs 3 ik -> tmp_{ik+(jk-1)*3} = A[i+(ik-2),j+(jk-2)] * kern_ik_jk + tmp_{ik+(jk-1)*3-1}
111+
# tmp_9 holds the sum of all nine products.
out[i,j] = tmp_9
112+
end
113+
out
114+
end
115+
# avx2dunrolled2x2!(out, A, kern)
#
# Same computation as avx2dunrolled!, but the loop is annotated `@avx tile=(2,2)`:
#     out[i,j] = sum of A[i+ik, j+jk] * kern[ik, jk]   for ik, jk in -1:1
# for every (i, j) in axes(out). Returns `out`.
# NOTE(review): nothing here checks that A is indexable at i-1:i+1, j-1:j+1 for all of
# axes(out); the caller is assumed to supply a suitably padded/offset A.
function avx2dunrolled2x2!(out::AbstractMatrix, A::AbstractMatrix, kern::SizedOffsetMatrix{T,-1,1,-1,1}) where {T}
116+
# Iterate over out's own axes (first dimension -> rng1, second -> rng2).
rng1, rng2 = axes(out)
117+
# Expands to nine assignments kern_1_1 = kern[-1,-1], ..., kern_3_3 = kern[1,1].
Base.Cartesian.@nexprs 3 jk -> Base.Cartesian.@nexprs 3 ik -> kern_ik_jk = kern[ik-2,jk-2]
118+
# Manually unpack the OffsetArray
119+
# NOTE(review): tile=(2,2) is passed to @avx as an option; presumably it fixes the
# unroll/tiling factors of the two loops to 2 and 2 — confirm against the @avx docs.
@avx tile=(2,2) for j in rng2, i in rng1
120+
# Seed of the accumulation chain tmp_0 -> tmp_1 -> ... -> tmp_9.
tmp_0 = zero(eltype(out))
121+
# Expands to tmp_k = A[i+(ik-2), j+(jk-2)] * kern_ik_jk + tmp_{k-1},
# with k = ik + (jk-1)*3 running from 1 to 9.
Base.Cartesian.@nexprs 3 jk -> Base.Cartesian.@nexprs 3 ik -> tmp_{ik+(jk-1)*3} = A[i+(ik-2),j+(jk-2)] * kern_ik_jk + tmp_{ik+(jk-1)*3-1}
122+
# tmp_9 holds the sum of all nine products.
out[i,j] = tmp_9
123+
end
124+
out
125+
end
126+
# avx2dunrolled3x3!(out, A, kern)
#
# Same computation as avx2dunrolled!, but the loop is annotated `@avx tile=(3,3)`:
#     out[i,j] = sum of A[i+ik, j+jk] * kern[ik, jk]   for ik, jk in -1:1
# for every (i, j) in axes(out). Returns `out`.
# NOTE(review): nothing here checks that A is indexable at i-1:i+1, j-1:j+1 for all of
# axes(out); the caller is assumed to supply a suitably padded/offset A.
function avx2dunrolled3x3!(out::AbstractMatrix, A::AbstractMatrix, kern::SizedOffsetMatrix{T,-1,1,-1,1}) where {T}
127+
# Iterate over out's own axes (first dimension -> rng1, second -> rng2).
rng1, rng2 = axes(out)
128+
# Expands to nine assignments kern_1_1 = kern[-1,-1], ..., kern_3_3 = kern[1,1].
Base.Cartesian.@nexprs 3 jk -> Base.Cartesian.@nexprs 3 ik -> kern_ik_jk = kern[ik-2,jk-2]
129+
# Manually unpack the OffsetArray
130+
# NOTE(review): tile=(3,3) is passed to @avx as an option; presumably it fixes the
# unroll/tiling factors of the two loops to 3 and 3 — confirm against the @avx docs.
@avx tile=(3,3) for j in rng2, i in rng1
131+
# Seed of the accumulation chain tmp_0 -> tmp_1 -> ... -> tmp_9.
tmp_0 = zero(eltype(out))
132+
# Expands to tmp_k = A[i+(ik-2), j+(jk-2)] * kern_ik_jk + tmp_{k-1},
# with k = ik + (jk-1)*3 running from 1 to 9.
Base.Cartesian.@nexprs 3 jk -> Base.Cartesian.@nexprs 3 ik -> tmp_{ik+(jk-1)*3} = A[i+(ik-2),j+(jk-2)] * kern_ik_jk + tmp_{ik+(jk-1)*3-1}
133+
# tmp_9 holds the sum of all nine products.
out[i,j] = tmp_9
134+
end
135+
out
136+
end
137+
# uq = :(for j in rng2, i in rng1
138+
# tmp_0 = zero(eltype(out))
139+
# Base.Cartesian.@nexprs 3 jk -> Base.Cartesian.@nexprs 3 ik -> tmp_{ik+(jk-1)*3} = A[i+(ik-2),j+(jk-2)] * kern_ik_jk + tmp_{ik+(jk-1)*3-1}
140+
# out[i,j] = tmp_9
141+
# end);
142+
# lsuq = LoopVectorization.LoopSet(macroexpand(Base, uq));
143+
# LoopVectorization.choose_order(lsuq)
144+
101145

102146
for T ∈ (Float32, Float64)
103147
@show T, @__LINE__
@@ -119,7 +163,16 @@ T = Float32
119163

120164
fill!(out3, NaN); avx2douter!(out3, A, skern);
121165
@test out1 ≈ out3
122-
end
166+
167+
fill!(out3, NaN); avx2dunrolled!(out3, A, skern);
168+
@test out1 ≈ out3
169+
170+
fill!(out3, NaN); avx2dunrolled2x2!(out3, A, skern);
171+
@test out1 ≈ out3
172+
173+
fill!(out3, NaN); avx2dunrolled3x3!(out3, A, skern);
174+
@test out1 ≈ out3
175+
end
123176

124177

125178
end

0 commit comments

Comments
 (0)