@@ -30,6 +30,56 @@ function vmap(f::F, args...) where {F}
30
30
vmap! (f, dest, args... )
31
31
end
32
32
33
"""
    vmapnt!(f, y, args...)

Apply `f` to vectors of `W` elements loaded from each of `args` and store the
results into `y` using `vstorent!`, returning `y`.

`W` is the vector width chosen by `VectorizationBase` for `eltype(y)`. The main
loop is unrolled four times; any remaining full vectors are handled one at a
time, and a final partial vector (if `length(y)` is not a multiple of `W`) is
handled with masked loads and a masked `vstore!`.

The address of `y` must have all bits selected by
`VectorizationBase.REGISTER_SIZE - 1` equal to zero (this is checked with
`@assert`). Every element of `args` must support `pointer`.

NOTE(review): the number of elements read from each of `args` is driven solely
by `length(y)`; no length check on `args` is performed here — callers are
presumably expected to pass inputs at least as long as `y`.
"""
function vmapnt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,A}
    # Raw pointers do not keep their parent objects alive; root `y` and `args`
    # for as long as the pointers are in use.
    GC.@preserve y args begin
        ptry = pointer(y)
        @assert reinterpret(UInt, ptry) & (VectorizationBase.REGISTER_SIZE - 1) == 0
        W, _ = VectorizationBase.pick_vector_width_shift(T)
        ptrargs = pointer.(args)
        V = VectorizationBase.pick_vector_width_val(T)
        N = length(y)
        i = 0
        # Main loop, unrolled 4x: runs while at least 4W elements remain.
        while i < N - ((W << 2) - 1)
            vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
            vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
            vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
            vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
        end
        # Single-vector steps: runs while at least W elements remain.
        while i < N - (W - 1)
            vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
        end
        # Fewer than W elements left: masked load and masked (regular) store.
        if i < N
            m = mask(T, N & (W - 1))
            vstore!(ptry, extract_data(f(vload.(V, ptrargs, i, m)...)), i, m)
        end
    end
    return y
end
56
"""
    vmapntt!(f, y, args...)

Threaded counterpart of `vmapnt!`: apply `f` to vectors of `W` elements loaded
from each of `args` and store the results into `y` using `vstorent!`,
returning `y`.

`length(y)` is split into `Niter = length(y) >>> (Wshift + 2)` chunks of `4W`
elements; the chunks are distributed with `Threads.@threads`, each iteration
processing four consecutive vectors. Whatever is left after the last full
chunk is processed serially: full vectors one at a time, then a final partial
vector with masked loads and a masked `vstore!`.

The address of `y` must have all bits selected by
`VectorizationBase.REGISTER_SIZE - 1` equal to zero (this is checked with
`@assert`). Every element of `args` must support `pointer`.

NOTE(review): `f` is called concurrently from multiple threads, so it
presumably needs to be safe to call that way. No length check on `args` is
performed here — the number of elements read is driven solely by `length(y)`.
"""
function vmapntt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,A}
    # Raw pointers do not keep their parent objects alive; root `y` and `args`
    # for as long as the pointers are in use (including inside the threaded loop).
    GC.@preserve y args begin
        ptry = pointer(y)
        @assert reinterpret(UInt, ptry) & (VectorizationBase.REGISTER_SIZE - 1) == 0
        W, Wshift = VectorizationBase.pick_vector_width_shift(T)
        ptrargs = pointer.(args)
        V = VectorizationBase.pick_vector_width_val(T)
        N = length(y)
        # Shifting by `Wshift + 2` divides/multiplies by 4W (the unrolled chunk size).
        Wsh = Wshift + 2
        Niter = N >>> Wsh
        Base.Threads.@threads for j ∈ 0:Niter-1
            # Offset of the first element of chunk `j`; `i` is local to this iteration.
            i = j << Wsh
            vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
            vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
            vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
            vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i)
        end
        # Serial remainder. A separate name (`ii`) is used so the threaded loop
        # body above does not share a reassigned variable with this code.
        ii = Niter << Wsh
        # Single-vector steps: runs while at least W elements remain.
        while ii < N - (W - 1)
            vstorent!(ptry, extract_data(f(vload.(V, ptrargs, ii)...)), ii); ii += W
        end
        # Fewer than W elements left: masked load and masked (regular) store.
        if ii < N
            m = mask(T, N & (W - 1))
            vstore!(ptry, extract_data(f(vload.(V, ptrargs, ii, m)...)), ii, m)
        end
    end
    return y
end
82
+
33
83
# @inline vmap!(f, y, x...) = @avx y .= f.(x...)
34
84
# @inline vmap(f, x...) = @avx f.(x...)
35
85
0 commit comments