1
1
using LoopVectorization, OffsetArrays
2
2
using LoopVectorization. VectorizationBase: StaticUnitRange
3
3
using Test
4
- T = Float32
4
+ # T = Float32
5
5
6
6
@testset " OffsetArrays" begin
7
7
@@ -28,23 +28,24 @@ T = Float32
28
28
# tmp += A[i+ik,j+jk]*skern[ik,jk]
29
29
# end;
30
30
# ls1
31
- # ls2 = LoopVectorization.@avx_debug for j in rng2, i in rng1
31
+ # rng1, rng2 = CartesianIndices(out1).indices;
32
+ # rng1k, rng2k = axes(skern);
33
+ # ls2dstatic = LoopVectorization.@avx_debug for j in rng2, i in rng1
32
34
# tmp = zero(eltype(out))
33
35
# for jk in rng2k, ik in rng1k
34
- # tmp += A[i+ik,j+jk]*kern [ik,jk]
36
+ # tmp += A[i+ik,j+jk]*skern [ik,jk]
35
37
# end
36
- # out [i,j] = tmp
38
+ # out1 [i,j] = tmp
37
39
# end;
38
- # ls2
39
- # oq = :(for j in rng2, i in rng1
40
+ # LoopVectorization.choose_order(ls2dstatic)
41
+ # q2d = :(for j in rng2, i in rng1
40
42
# tmp = zero(eltype(out))
41
43
# for jk in rng2k, ik in rng1k
42
44
# tmp += A[i+ik,j+jk]*kern[ik,jk]
43
45
# end
44
46
# out[i,j] = tmp
45
47
# end);
46
- # lsoq = LoopVectorization.LoopSet(oq);
47
- # LoopVectorization.choose_order(lsoq)
48
+ # lsq2d = LoopVectorization.LoopSet(q2d); LoopVectorization.choose_order(lsq2d)
48
49
49
50
# oq2 = :(for j in rng2, i in rng1
50
51
# tmp = zero(eltype(out))
@@ -82,6 +83,8 @@ T = Float32
82
83
end
83
84
out
84
85
end
86
+
87
+
85
88
# Thin wrapper around a dense `Matrix{T}` that carries four extra integer-like
# type parameters in addition to the element type. The only stored state is `data`.
# NOTE(review): LR/UR and LC/UC are presumably the lower/upper row and column
# index bounds (the kernels here use `SizedOffsetMatrix{T,-1,1,-1,1}` and index
# it with -1:1 in both dimensions) — confirm against the `axes`/pointer methods.
struct SizedOffsetMatrix{T,LR,UR,LC,UC} <: AbstractMatrix{T}
    data::Matrix{T}
end
@@ -97,7 +100,48 @@ T = Float32
97
100
end
98
101
# Base.size(A::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = (1 + UR-LR, 1 + UC-LC)
99
102
# Base.CartesianIndices(::SizedOffsetMatrix{T,LR,UR,LC,UC}) where {T,LR,UR,LC,UC} = CartesianIndices((LR:UR,LC:UC))
100
- # Base.getindex(A::SizedOffsetMatrix, i, j) = LoopVectorization.vload(LoopVectorization.stridedpointer(A), (i,j)) # only needed to print
103
# Scalar indexing for `SizedOffsetMatrix`; only needed so the matrix can be printed.
# Reads element `(i, j)` through the matrix's strided pointer via
# `LoopVectorization.vload`; no bounds check is done in this method itself.
function Base.getindex(A::SizedOffsetMatrix, i, j)
    ptr = LoopVectorization.stridedpointer(A)
    return LoopVectorization.vload(ptr, (i, j))
end
104
# Apply the 3×3 kernel `kern` (indexed -1:1 in both dimensions) to `A` around
# every index of `out`, with the two kernel loops unrolled by hand through
# `Base.Cartesian.@nexprs`. `out` is overwritten and returned.
# `A` must be indexable one step beyond `axes(out)` in each direction.
function avx2dunrolled!(
    out::AbstractMatrix, A::AbstractMatrix, kern::SizedOffsetMatrix{T,-1,1,-1,1}
) where {T}
    rows, cols = axes(out)
    # Manually unpack the OffsetArray into nine scalars:
    # w_1_1 = kern[-1,-1], w_2_1 = kern[0,-1], ..., w_3_3 = kern[1,1].
    Base.Cartesian.@nexprs 3 kj -> Base.Cartesian.@nexprs 3 ki -> w_ki_kj = kern[ki-2,kj-2]
    @avx for j in cols, i in rows
        acc_0 = zero(eltype(out))
        # Chain of nine multiply-adds: acc_n = A[...] * w + acc_{n-1}, with
        # n = ki + 3*(kj-1), so the row offset varies fastest.
        Base.Cartesian.@nexprs 3 kj -> Base.Cartesian.@nexprs 3 ki -> acc_{ki+(kj-1)*3} = A[i+(ki-2),j+(kj-2)] * w_ki_kj + acc_{ki+(kj-1)*3-1}
        out[i,j] = acc_9
    end
    return out
end
115
# Same hand-unrolled 3×3 stencil as the untiled variant, but the `@avx` loop is
# invoked with the `tile=(2,2)` option. `out` is overwritten and returned.
# `A` must be indexable one step beyond `axes(out)` in each direction.
function avx2dunrolled2x2!(
    out::AbstractMatrix, A::AbstractMatrix, kern::SizedOffsetMatrix{T,-1,1,-1,1}
) where {T}
    rows, cols = axes(out)
    # Manually unpack the OffsetArray into nine scalars:
    # w_1_1 = kern[-1,-1], w_2_1 = kern[0,-1], ..., w_3_3 = kern[1,1].
    Base.Cartesian.@nexprs 3 kj -> Base.Cartesian.@nexprs 3 ki -> w_ki_kj = kern[ki-2,kj-2]
    @avx tile=(2,2) for j in cols, i in rows
        acc_0 = zero(eltype(out))
        # Chain of nine multiply-adds: acc_n = A[...] * w + acc_{n-1}, with
        # n = ki + 3*(kj-1), so the row offset varies fastest.
        Base.Cartesian.@nexprs 3 kj -> Base.Cartesian.@nexprs 3 ki -> acc_{ki+(kj-1)*3} = A[i+(ki-2),j+(kj-2)] * w_ki_kj + acc_{ki+(kj-1)*3-1}
        out[i,j] = acc_9
    end
    return out
end
126
# Same hand-unrolled 3×3 stencil as the untiled variant, but the `@avx` loop is
# invoked with the `tile=(3,3)` option. `out` is overwritten and returned.
# `A` must be indexable one step beyond `axes(out)` in each direction.
function avx2dunrolled3x3!(
    out::AbstractMatrix, A::AbstractMatrix, kern::SizedOffsetMatrix{T,-1,1,-1,1}
) where {T}
    rows, cols = axes(out)
    # Manually unpack the OffsetArray into nine scalars:
    # w_1_1 = kern[-1,-1], w_2_1 = kern[0,-1], ..., w_3_3 = kern[1,1].
    Base.Cartesian.@nexprs 3 kj -> Base.Cartesian.@nexprs 3 ki -> w_ki_kj = kern[ki-2,kj-2]
    @avx tile=(3,3) for j in cols, i in rows
        acc_0 = zero(eltype(out))
        # Chain of nine multiply-adds: acc_n = A[...] * w + acc_{n-1}, with
        # n = ki + 3*(kj-1), so the row offset varies fastest.
        Base.Cartesian.@nexprs 3 kj -> Base.Cartesian.@nexprs 3 ki -> acc_{ki+(kj-1)*3} = A[i+(ki-2),j+(kj-2)] * w_ki_kj + acc_{ki+(kj-1)*3-1}
        out[i,j] = acc_9
    end
    return out
end
137
+ # uq = :(for j in rng2, i in rng1
138
+ # tmp_0 = zero(eltype(out))
139
+ # Base.Cartesian.@nexprs 3 jk -> Base.Cartesian.@nexprs 3 ik -> tmp_{ik+(jk-1)*3} = A[i+(ik-2),j+(jk-2)] * kern_ik_jk + tmp_{ik+(jk-1)*3-1}
140
+ # out[i,j] = tmp_9
141
+ # end);
142
+ # lsuq = LoopVectorization.LoopSet(macroexpand(Base, uq));
143
+ # LoopVectorization.choose_order(lsuq)
144
+
101
145
102
146
for T ∈ (Float32, Float64)
103
147
@show T, @__LINE__
@@ -119,7 +163,16 @@ T = Float32
119
163
120
164
fill! (out3, NaN ); avx2douter! (out3, A, skern);
121
165
@test out1 ≈ out3
122
- end
166
+
167
+ fill! (out3, NaN ); avx2dunrolled! (out3, A, skern);
168
+ @test out1 ≈ out3
169
+
170
+ fill! (out3, NaN ); avx2dunrolled2x2! (out3, A, skern);
171
+ @test out1 ≈ out3
172
+
173
+ fill! (out3, NaN ); avx2dunrolled3x3! (out3, A, skern);
174
+ @test out1 ≈ out3
175
+ end
123
176
124
177
125
178
end
0 commit comments