Skip to content

Commit 47f2789

Browse files
committed
One line rather than nested do concurrents in fortran benchmarks, test small and larger arrays for map.
1 parent e047ea4 commit 47f2789

File tree

3 files changed

+59
-95
lines changed

3 files changed

+59
-95
lines changed

benchmark/looptests.f90

Lines changed: 44 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,8 @@ subroutine gemm_mnk(C, A, B, M, K, N) BIND(C, name="gemm_mnk")
1313
real(C_double), dimension(K, N), intent(in) :: B
1414
integer(C_long) :: mm, kk, nn
1515
C = 0.0
16-
do concurrent(mm = 1:M)
17-
do concurrent(nn = 1:N)
18-
do concurrent(kk = 1:K)
19-
C(mm,nn) = C(mm,nn) + A(mm,kk) * B(kk,nn)
20-
end do
21-
end do
16+
do concurrent(mm = 1:M, nn = 1:N, kk = 1:K)
17+
C(mm,nn) = C(mm,nn) + A(mm,kk) * B(kk,nn)
2218
end do
2319
end subroutine gemm_mnk
2420
subroutine gemm_mkn(C, A, B, M, K, N) BIND(C, name="gemm_mkn")
@@ -28,12 +24,8 @@ subroutine gemm_mkn(C, A, B, M, K, N) BIND(C, name="gemm_mkn")
2824
real(C_double), dimension(K, N), intent(in) :: B
2925
integer(C_long) :: mm, kk, nn
3026
C = 0.0
31-
do concurrent(mm = 1:M)
32-
do concurrent(kk = 1:K)
33-
do concurrent(nn = 1:N)
34-
C(mm,nn) = C(mm,nn) + A(mm,kk) * B(kk,nn)
35-
end do
36-
end do
27+
do concurrent(mm = 1:M, kk = 1:K, nn = 1:N)
28+
C(mm,nn) = C(mm,nn) + A(mm,kk) * B(kk,nn)
3729
end do
3830
end subroutine gemm_mkn
3931
subroutine gemm_nmk(C, A, B, M, K, N) BIND(C, name="gemm_nmk")
@@ -43,12 +35,8 @@ subroutine gemm_nmk(C, A, B, M, K, N) BIND(C, name="gemm_nmk")
4335
real(C_double), dimension(K, N), intent(in) :: B
4436
integer(C_long) :: mm, kk, nn
4537
C = 0.0
46-
do concurrent(nn = 1:N)
47-
do concurrent(mm = 1:M)
48-
do concurrent(kk = 1:K)
49-
C(mm,nn) = C(mm,nn) + A(mm,kk) * B(kk,nn)
50-
end do
51-
end do
38+
do concurrent(nn = 1:N, mm = 1:M, kk = 1:K)
39+
C(mm,nn) = C(mm,nn) + A(mm,kk) * B(kk,nn)
5240
end do
5341
end subroutine gemm_nmk
5442
subroutine gemm_nkm(C, A, B, M, K, N) BIND(C, name="gemm_nkm")
@@ -58,12 +46,8 @@ subroutine gemm_nkm(C, A, B, M, K, N) BIND(C, name="gemm_nkm")
5846
real(C_double), dimension(K, N), intent(in) :: B
5947
integer(C_long) :: mm, kk, nn
6048
C = 0.0
61-
do concurrent(kk = 1:K)
62-
do concurrent(nn = 1:N)
63-
do concurrent(mm = 1:M)
64-
C(mm,nn) = C(mm,nn) + A(mm,kk) * B(kk,nn)
65-
end do
66-
end do
49+
do concurrent(kk = 1:K, nn = 1:N, mm = 1:M)
50+
C(mm,nn) = C(mm,nn) + A(mm,kk) * B(kk,nn)
6751
end do
6852
end subroutine gemm_nkm
6953
subroutine gemm_kmn(C, A, B, M, K, N) BIND(C, name="gemm_kmn")
@@ -73,12 +57,8 @@ subroutine gemm_kmn(C, A, B, M, K, N) BIND(C, name="gemm_kmn")
7357
real(C_double), dimension(K, N), intent(in) :: B
7458
integer(C_long) :: mm, kk, nn
7559
C = 0.0
76-
do concurrent(kk = 1:K)
77-
do concurrent(mm = 1:M)
78-
do concurrent(nn = 1:N)
79-
C(mm,nn) = C(mm,nn) + A(mm,kk) * B(kk,nn)
80-
end do
81-
end do
60+
do concurrent(kk = 1:K, mm = 1:M, nn = 1:N)
61+
C(mm,nn) = C(mm,nn) + A(mm,kk) * B(kk,nn)
8262
end do
8363
end subroutine gemm_kmn
8464
subroutine gemm_knm(C, A, B, M, K, N) BIND(C, name="gemm_knm")
@@ -88,12 +68,8 @@ subroutine gemm_knm(C, A, B, M, K, N) BIND(C, name="gemm_knm")
8868
real(C_double), dimension(K, N), intent(in) :: B
8969
integer(C_long) :: mm, kk, nn
9070
C = 0.0
91-
do concurrent(kk = 1:K)
92-
do concurrent(nn = 1:N)
93-
do concurrent(mm = 1:M)
94-
C(mm,nn) = C(mm,nn) + A(mm,kk) * B(kk,nn)
95-
end do
96-
end do
71+
do concurrent(kk = 1:K, nn = 1:N, mm = 1:M)
72+
C(mm,nn) = C(mm,nn) + A(mm,kk) * B(kk,nn)
9773
end do
9874
end subroutine gemm_knm
9975
subroutine gemmbuiltin(C, A, B, M, K, N) BIND(C, name="gemmbuiltin")
@@ -110,12 +86,8 @@ subroutine AtmulB(C, A, B, M, K, N) BIND(C, name="AtmulB")
11086
real(C_double), dimension(K, N), intent(in) :: B
11187
integer(C_long) :: mm, kk, nn
11288
C = 0.0
113-
do concurrent(nn = 1:N)
114-
do concurrent(mm = 1:M)
115-
do concurrent(kk = 1:K)
116-
C(mm,nn) = C(mm,nn) + A(kk,mm) * B(kk,nn)
117-
end do
118-
end do
89+
do concurrent(nn = 1:N, mm = 1:M, kk = 1:K)
90+
C(mm,nn) = C(mm,nn) + A(kk,mm) * B(kk,nn)
11991
end do
12092
end subroutine AtmulB
12193
subroutine AtmulBbuiltin(C, A, B, M, K, N) BIND(C, name="AtmulBbuiltin")
@@ -132,12 +104,8 @@ subroutine AmulBt(C, A, B, M, K, N) BIND(C, name="AmulBt")
132104
real(C_double), dimension(N, K), intent(in) :: B
133105
integer(C_long) :: mm, kk, nn
134106
C = 0.0
135-
do concurrent(kk = 1:K)
136-
do concurrent(nn = 1:N)
137-
do concurrent(mm = 1:M)
138-
C(mm,nn) = C(mm,nn) + A(mm,kk) * B(nn,kk)
139-
end do
140-
end do
107+
do concurrent(kk = 1:K, nn = 1:N, mm = 1:M)
108+
C(mm,nn) = C(mm,nn) + A(mm,kk) * B(nn,kk)
141109
end do
142110
end subroutine AmulBt
143111
subroutine AmulBtbuiltin(C, A, B, M, K, N) BIND(C, name="AmulBtbuiltin")
@@ -154,12 +122,8 @@ subroutine AtmulBt(C, A, B, M, K, N) BIND(C, name="AtmulBt")
154122
real(C_double), dimension(N, K), intent(in) :: B
155123
integer(C_long) :: mm, kk, nn
156124
C = 0.0
157-
do concurrent(nn = 1:N)
158-
do concurrent(kk = 1:K)
159-
do concurrent(mm = 1:M)
160-
C(mm,nn) = C(mm,nn) + A(kk,mm) * B(nn,kk)
161-
end do
162-
end do
125+
do concurrent(nn = 1:N, kk = 1:K, mm = 1:M)
126+
C(mm,nn) = C(mm,nn) + A(kk,mm) * B(nn,kk)
163127
end do
164128
end subroutine AtmulBt
165129
subroutine AtmulBtbuiltin(C, A, B, M, K, N) BIND(C, name="AtmulBtbuiltin")
@@ -194,10 +158,8 @@ subroutine dot3(s, x, A, y, M, N) BIND(C, name="dot3")
194158
real(C_double), intent(in) :: x(M), A(M,N), y(N)
195159
real(C_double), intent(out) :: s
196160
integer(C_long) :: mm, nn
197-
do concurrent(nn = 1:N)
198-
do concurrent(mm = 1:M)
199-
s = s + x(mm) * A(mm, nn) * y(nn)
200-
end do
161+
do concurrent(nn = 1:N, mm = 1:M)
162+
s = s + x(mm) * A(mm, nn) * y(nn)
201163
end do
202164
end subroutine dot3
203165
!GCC$ builtin (exp) attributes simd (notinbranch) if('x86_64')
@@ -226,10 +188,8 @@ subroutine gemv(y, A, x, M, K) BIND(C, name="gemv")
226188
real(C_double), dimension(M), intent(out) :: y
227189
integer(C_long) :: mm, kk
228190
y = 0.0
229-
do concurrent(kk = 1:K)
230-
do concurrent(mm = 1:M)
231-
y(mm) = y(mm) + A(mm,kk) * x(kk)
232-
end do
191+
do concurrent(kk = 1:K, mm = 1:M)
192+
y(mm) = y(mm) + A(mm,kk) * x(kk)
233193
end do
234194
end subroutine gemv
235195
subroutine gemvbuiltin(y, A, x, M, K) BIND(C, name="gemvbuiltin")
@@ -244,12 +204,9 @@ subroutine Atmulvb(y, A, x, M, K) BIND(C, name="Atmulvb")
244204
real(C_double), dimension(M), intent(out) :: y
245205
integer(C_long) :: mm, kk
246206
real(C_double) :: ymm
247-
do concurrent(mm = 1:M)
248-
ymm = 0
249-
do concurrent(kk = 1:K)
250-
ymm = ymm + A(kk,mm) * x(kk)
251-
end do
252-
y(mm) = ymm
207+
y = 0
208+
do concurrent(mm = 1:M, kk = 1:K)
209+
y(mm) = y(mm) + A(kk,mm) * x(kk)
253210
end do
254211
end subroutine Atmulvb
255212
subroutine Atmulvbbuiltin(y, A, x, M, K) BIND(C, name="Atmulvbbuiltin")
@@ -266,22 +223,18 @@ subroutine unscaledvar(s, A, x, M, N) BIND(C, name="unscaledvar")
266223
integer(C_long) :: mm, nn
267224
real(C_double) :: d
268225
s = 0.0
269-
do concurrent(nn = 1:N)
270-
do concurrent(mm = 1:M)
271-
d = A(mm,nn) - x(mm)
272-
s(mm) = s(mm) + d * d
273-
end do
226+
do concurrent(nn = 1:N, mm = 1:M)
227+
d = A(mm,nn) - x(mm)
228+
s(mm) = s(mm) + d * d
274229
end do
275230
end subroutine unscaledvar
276231
subroutine aplusBc(D, a, B, c, M, N) BIND(C, name="aplusBc")
277232
integer(C_long), intent(in) :: M, N
278233
real(C_double), intent(in) :: a(M), B(M,N), c(N)
279234
real(C_double), dimension(M,N), intent(out) :: D
280235
integer(C_long) :: mm, nn
281-
do concurrent(nn = 1:N)
282-
do concurrent(mm = 1:M)
283-
D(mm,nn) = a(mm) + B(mm,nn) * c(nn)
284-
end do
236+
do concurrent(nn = 1:N, mm = 1:M)
237+
D(mm,nn) = a(mm) + B(mm,nn) * c(nn)
285238
end do
286239
end subroutine aplusBc
287240
subroutine OLSlp(lp, y, X, b, N, P) BIND(C, name="OLSlp")
@@ -299,15 +252,25 @@ subroutine OLSlp(lp, y, X, b, N, P) BIND(C, name="OLSlp")
299252
lp = lp + d*d
300253
end do
301254
end subroutine OLSlp
255+
subroutine OLSlpsplit(lp, y, X, b, N, P) BIND(C, name="OLSlpsplit")
256+
integer(C_long), intent(in) :: N, P
257+
real(C_double), intent(in) :: y(N), X(N, P), b(P)
258+
real(C_double), intent(out) :: lp
259+
integer(C_long) :: nn, pp
260+
real(C_double) :: d(N)
261+
d = y
262+
do concurrent(nn = 1:N, pp = 1:P)
263+
d(nn) = d(nn) - X(nn,pp) * b(pp)
264+
end do
265+
lp = dot_product(d, d)
266+
end subroutine OLSlpsplit
302267
subroutine AplusAt(B, A, N) BIND(C, name="AplusAt")
303268
integer(C_long), intent(in) :: N
304269
real(C_double), dimension(N,N), intent(out) :: B
305270
real(C_double), dimension(N,N), intent(in) :: A
306271
integer(C_long) :: i, j
307-
do concurrent(i = 1:N)
308-
do concurrent(j = 1:N)
309-
B(j,i) = A(j,i) + A(i,j)
310-
end do
272+
do concurrent(i = 1:N, j = 1:N)
273+
B(j,i) = A(j,i) + A(i,j)
311274
end do
312275
end subroutine AplusAt
313276
subroutine AplusAtbuiltin(B, A, N) BIND(C, name="AplusAtbuiltin")

src/map.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ function vmapnt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,A
6868
end
6969
function vmapntt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,A}
7070
ptry, ptrargs, N = alignstores!(f, y, args...)
71+
N > 0 || return y
7172
W, Wshift = VectorizationBase.pick_vector_width_shift(T)
7273
V = VectorizationBase.pick_vector_width_val(T)
7374
Wsh = Wshift + 2

test/map.jl

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
@testset "map" begin
22
@inline foo(x, y) = exp(x) - sin(y)
3-
N = 3781
4-
53
for T (Float32,Float64)
64
@show T, @__LINE__
7-
a = rand(T, N); b = rand(T, N);
8-
c1 = map(foo, a, b);
9-
c2 = vmap(foo, a, b);
10-
@test c1 c2
11-
c2 = vmapnt(foo, a, b);
12-
@test c1 c2
13-
c2 = vmapntt(foo, a, b);
14-
@test c1 c2
15-
fill!(c2, NaN); @views vmapnt!(foo, c2[2:end], a[2:end], b[2:end]);
16-
@test @views c1[2:end] c2[2:end]
17-
fill!(c2, NaN); @views vmapntt!(foo, c2[2:end], a[2:end], b[2:end]);
18-
@test @views c1[2:end] c2[2:end]
5+
for N [ 3, 371 ]
6+
a = rand(T, N); b = rand(T, N);
7+
c1 = map(foo, a, b);
8+
c2 = vmap(foo, a, b);
9+
@test c1 c2
10+
c2 = vmapnt(foo, a, b);
11+
@test c1 c2
12+
c2 = vmapntt(foo, a, b);
13+
@test c1 c2
14+
fill!(c2, NaN); @views vmapnt!(foo, c2[2:end], a[2:end], b[2:end]);
15+
@test @views c1[2:end] c2[2:end]
16+
fill!(c2, NaN); @views vmapntt!(foo, c2[2:end], a[2:end], b[2:end]);
17+
@test @views c1[2:end] c2[2:end]
18+
end
1919

2020
c = rand(T,100); x = rand(T,10^4); y1 = similar(x); y2 = similar(x);
2121
map!(xᵢ -> clenshaw(xᵢ, c), y1, x)

0 commit comments

Comments
 (0)