Skip to content

Commit 6759dba

Browse files
committed
Bump deps to fix #63, and add tests for more transposed combinations.
1 parent 120ad94 commit 6759dba

File tree

6 files changed

+59
-37
lines changed

6 files changed

+59
-37
lines changed

Manifest.toml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,15 +49,15 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
4949

5050
[[SIMDPirates]]
5151
deps = ["VectorizationBase"]
52-
git-tree-sha1 = "95b57fa378dfa8683246bdfc222052bec9c10636"
52+
git-tree-sha1 = "34dff4f4715f871e71b38f31397d96e62621f14d"
5353
uuid = "21efa798-c60a-11e8-04d3-e1a92915a26a"
54-
version = "0.6.4"
54+
version = "0.6.5"
5555

5656
[[SLEEFPirates]]
5757
deps = ["Libdl", "SIMDPirates", "VectorizationBase"]
58-
git-tree-sha1 = "62368836fef70b461ac005ed0112315222eab5b5"
58+
git-tree-sha1 = "66014d5d3f77753a70b706aeaede1ad70a942915"
5959
uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa"
60-
version = "0.4.0"
60+
version = "0.4.1"
6161

6262
[[Serialization]]
6363
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
@@ -71,6 +71,6 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
7171

7272
[[VectorizationBase]]
7373
deps = ["CpuId", "LinearAlgebra"]
74-
git-tree-sha1 = "1efe57d135fdc28ea6583d2b276c2e2b4ac03a59"
74+
git-tree-sha1 = "006d7b7f276db8d728f8bfd70ebf2efd132f9548"
7575
uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
76-
version = "0.6.2"
76+
version = "0.7.0"

Project.toml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.6.18"
4+
version = "0.6.19"
55

66
[deps]
7+
CpuId = "adafc99b-e345-5852-983c-f28acb93d879"
78
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
89
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
910
SIMDPirates = "21efa798-c60a-11e8-04d3-e1a92915a26a"
@@ -12,9 +13,9 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1213

1314
[compat]
1415
Parameters = "0"
15-
SIMDPirates = "~0.6.4"
16+
SIMDPirates = "~0.6.5"
1617
SLEEFPirates = "~0.4"
17-
VectorizationBase = "~0.6.1"
18+
VectorizationBase = "~0.7"
1819
julia = "1.1"
1920

2021
[extras]

src/filter.jl

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,32 @@
11

2-
function vfilter!(f::F, x::Vector{T}, y::AbstractArray{T}) where {F,T <: SUPPORTED_TYPES}
3-
W, Wshift = VectorizationBase.pick_vector_width_shift(T)
4-
N = length(y)
5-
Nrep = N >>> Wshift
6-
Nrem = N & (W - 1)
7-
i = 0
8-
j = 0
9-
GC.@preserve x y begin
10-
ptr_x = pointer(x)
11-
ptr_y = pointer(y)
12-
for _ ∈ 1:Nrep
13-
vy = vload(Vec{W,T}, ptr_y, i)
14-
mask = f(SVec(vy))
2+
if (Base.libllvm_version ≥ v"7" && VectorizationBase.AVX512F) || Base.libllvm_version ≥ v"9"
3+
function vfilter!(f::F, x::Vector{T}, y::AbstractArray{T}) where {F,T <: SUPPORTED_TYPES}
4+
W, Wshift = VectorizationBase.pick_vector_width_shift(T)
5+
N = length(y)
6+
Nrep = N >>> Wshift
7+
Nrem = N & (W - 1)
8+
i = 0
9+
j = 0
10+
GC.@preserve x y begin
11+
ptr_x = pointer(x)
12+
ptr_y = pointer(y)
13+
for _ ∈ 1:Nrep
14+
vy = vload(Vec{W,T}, ptr_y, i)
15+
mask = f(SVec(vy))
16+
SIMDPirates.compressstore!(gep(ptr_x, j), vy, mask)
17+
i += W
18+
j += count_ones(mask)
19+
end
20+
rem_mask = VectorizationBase.mask(T, Nrem)
21+
vy = vload(Vec{W,T}, gep(ptr_y, i), rem_mask)
22+
mask = rem_mask & f(SVec(vy))
1523
SIMDPirates.compressstore!(gep(ptr_x, j), vy, mask)
16-
i += W
1724
j += count_ones(mask)
25+
Base._deleteend!(x, N-j) # resize!(x, j)
1826
end
19-
rem_mask = VectorizationBase.mask(T, Nrem)
20-
vy = vload(Vec{W,T}, gep(ptr_y, i), rem_mask)
21-
mask = rem_mask & f(SVec(vy))
22-
SIMDPirates.compressstore!(gep(ptr_x, j), vy, mask)
23-
j += count_ones(mask)
24-
Base._deleteend!(x, N-j) # resize!(x, j)
27+
x
2528
end
26-
x
29+
vfilter(f, y::AbstractArray{T}) where {T<:SUPPORTED_TYPES} = vfilter!(f, Vector{T}(undef, length(y)), y)
2730
end
28-
vfilter(f, y::AbstractArray{T}) where {T<:SUPPORTED_TYPES} = vfilter!(f, Vector{T}(undef, length(y)), y)
31+
vfilter(f, y) = filter(f, y)
2932

src/precompile.jl

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,6 @@ function _precompile_()
5858
precompile(Tuple{typeof(Base.Broadcast.broadcasted),Function,Array{Int64,3},LowDimArray{(false, true, true),Int64,3,Array{Int64,3}}})
5959
precompile(Tuple{typeof(Base.Broadcast.broadcasted),Function,Array{Int64,3},LowDimArray{(true, false, true),Int64,3,Array{Int64,3}}})
6060
precompile(Tuple{typeof(Base.Broadcast.broadcasted),Function,Array{Int64,3},LowDimArray{(true, true, false),Int64,3,Array{Int64,3}}})
61-
precompile(Tuple{typeof(Base.Broadcast.broadcasted),typeof(*ˡ),Array{Int32,2},Array{Int32,1}})
62-
precompile(Tuple{typeof(Base.Broadcast.broadcasted),typeof(*ˡ),Array{Int64,2},Array{Int64,1}})
6361
precompile(Tuple{typeof(LoopVectorization._avx_loopset),Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,NTuple{4,DataType}})
6462
precompile(Tuple{typeof(LoopVectorization._avx_loopset),Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,NTuple{5,DataType}})
6563
precompile(Tuple{typeof(LoopVectorization._avx_loopset),Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,NTuple{6,DataType}})
@@ -150,32 +148,36 @@ function _precompile_()
150148
precompile(Tuple{typeof(LoopVectorization.add_constant!),LoopVectorization.LoopSet,Float64,Array{Symbol,1},Symbol,Int64})
151149
precompile(Tuple{typeof(LoopVectorization.add_loop_bound!),LoopVectorization.LoopSet,Symbol,Symbol,Bool})
152150
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.PackedStridedPointer{Float32,1}}})
151+
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.PackedStridedPointer{Float64,0}}})
153152
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.RowMajorStridedPointer{Float32,1}}})
154153
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.SparseStridedPointer{Float32,1}}})
155-
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.SparseStridedPointer{Float64,1}}})
156154
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.SparseStridedPointer{Int32,1}}})
157155
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.SparseStridedPointer{Int64,1}}})
158156
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Float32,Tuple{1,73}}}})
159157
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Float32,Tuple{1,75}}}})
160158
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Float32,Tuple{1}}}})
159+
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Float32,Tuple{69,1}}}})
161160
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Float32,Tuple{73,1}}}})
162161
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Float32,Tuple{73}}}})
163162
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Float32,Tuple{75,1}}}})
164163
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Float64,Tuple{1,73}}}})
165164
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Float64,Tuple{1,75}}}})
166165
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Float64,Tuple{1}}}})
166+
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Float64,Tuple{69,1}}}})
167167
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Float64,Tuple{73,1}}}})
168168
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Float64,Tuple{73}}}})
169169
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Float64,Tuple{75,1}}}})
170170
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Int32,Tuple{1,73}}}})
171171
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Int32,Tuple{1,75}}}})
172172
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Int32,Tuple{1}}}})
173+
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Int32,Tuple{69,1}}}})
173174
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Int32,Tuple{73,1}}}})
174175
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Int32,Tuple{73}}}})
175176
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Int32,Tuple{75,1}}}})
176177
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Int64,Tuple{1,73}}}})
177178
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Int64,Tuple{1,75}}}})
178179
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Int64,Tuple{1}}}})
180+
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Int64,Tuple{69,1}}}})
179181
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Int64,Tuple{73,1}}}})
180182
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Int64,Tuple{73}}}})
181183
precompile(Tuple{typeof(LoopVectorization.add_mref!),LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Int64,Type{VectorizationBase.StaticStridedPointer{Int64,Tuple{75,1}}}})

test/gemm.jl

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,10 +524,12 @@
524524
C = Matrix{TC}(undef, M, N);
525525
A = rand(R, M, K); B = rand(R, K, N);
526526
At = copy(A');
527+
Bt = copy(B');
527528
C2 = similar(C);
528529
As = SizedMatrix{M,K}(A);
529530
Ats = SizedMatrix{K,M}(At);
530531
Bs = SizedMatrix{K,N}(B);
532+
Bts = SizedMatrix{N,K}(Bt);
531533
Cs = SizedMatrix{M,N}(C);
532534
@time @testset "avx $T dynamc gemm" begin
533535
AmulB!(C2, A, B)
@@ -539,6 +541,10 @@
539541
@test C ≈ C2
540542
fill!(C, 999.99); AmulBavx2!(C, At', B)
541543
@test C ≈ C2
544+
fill!(C, 999.99); AmulBavx2!(C, A, Bt')
545+
@test C ≈ C2
546+
fill!(C, 999.99); AmulBavx2!(C, At', Bt')
547+
@test C ≈ C2
542548
fill!(C, 999.99); AmulBavx3!(C, A, B)
543549
@test C ≈ C2
544550
fill!(C, 999.99); AmulBavx3!(C, At', B)
@@ -581,6 +587,10 @@
581587
@test C ≈ C2
582588
fill!(C, 999.99); AmulB_avx2!(C, At', B)
583589
@test C ≈ C2
590+
fill!(C, 999.99); AmulB_avx2!(C, A, Bt')
591+
@test C ≈ C2
592+
fill!(C, 999.99); AmulB_avx2!(C, At', Bt')
593+
@test C ≈ C2
584594
fill!(C, 999.99); AmulB_avx3!(C, A, B)
585595
@test C ≈ C2
586596
fill!(C, 999.99); AmulB_avx3!(C, At', B)
@@ -619,6 +629,10 @@
619629
@test Cs ≈ C2
620630
fill!(Cs, 999.99); AmulBavx2!(Cs, Ats', Bs)
621631
@test Cs ≈ C2
632+
fill!(Cs, 999.99); AmulBavx2!(Cs, As, Bts')
633+
@test Cs ≈ C2
634+
fill!(Cs, 999.99); AmulBavx2!(Cs, Ats', Bts')
635+
@test Cs ≈ C2
622636
fill!(Cs, 999.99); AmulBavx3!(Cs, As, Bs)
623637
@test Cs ≈ C2
624638
fill!(Cs, 999.99); AmulBavx3!(Cs, Ats', Bs)
@@ -661,6 +675,10 @@
661675
@test Cs ≈ C2
662676
fill!(Cs, 999.99); AmulB_avx2!(Cs, Ats', Bs)
663677
@test Cs ≈ C2
678+
fill!(Cs, 999.99); AmulB_avx2!(Cs, As, Bts')
679+
@test Cs ≈ C2
680+
fill!(Cs, 999.99); AmulB_avx2!(Cs, Ats', Bts')
681+
@test Cs ≈ C2
664682
fill!(Cs, 999.99); AmulB_avx3!(Cs, As, Bs)
665683
@test Cs ≈ C2
666684
fill!(Cs, 999.99); AmulB_avx3!(Cs, Ats', Bs)

test/runtests.jl

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,7 @@ end
3434

3535
@time include("map.jl")
3636

37-
if Base.libllvm_version > v"7"
38-
@time include("filter.jl")
39-
end
37+
@time include("filter.jl")
4038

4139
@time include("gemm.jl")
4240

0 commit comments

Comments
 (0)