Skip to content

Commit c6a70a7

Browse files
committed
Improve tile size decision making for fixed size dimensions.
1 parent 47c11cb commit c6a70a7

File tree

5 files changed

+42
-23
lines changed

5 files changed

+42
-23
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,4 @@ benchmark/#*#
1212
*.mod
1313
*.mod0
1414
*.so
15+
*.s

benchmark/looptests.c

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,12 +140,11 @@ void gemv(double* restrict y, double* restrict A, double* restrict x, long M, l
140140
}
141141
void Atmulvb(double* restrict y, double* restrict A, double* restrict x, long M, long K){
142142
for (long m = 0; m < M; m++){
143-
y[m] = 0.0;
144-
}
145-
for (long m = 0; m < M; m++){
143+
double ym = 0.0;
146144
for (long k = 0; k < K; k++){
147-
y[m] += A[k + m*K] * x[k];
145+
ym += A[k + m*K] * x[k];
148146
}
147+
y[m] = ym;
149148
}
150149
return;
151150
}

benchmark/looptests.f90

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -221,18 +221,21 @@ subroutine Atmulvb(y, A, x, M, K) BIND(C, name="Atmulvb")
221221
real(C_double), intent(in) :: A(K,M), x(K)
222222
real(C_double), dimension(M), intent(out) :: y
223223
integer(C_long) :: mm, kk
224-
y = 0.0
224+
real(C_double) :: ymm
225225
do concurrent(mm = 1:M)
226+
ymm = 0
226227
do concurrent(kk = 1:K)
227-
y(mm) = y(mm) + A(kk,mm) * x(kk)
228+
ymm = ymm + A(kk,mm) * x(kk)
228229
end do
230+
y(mm) = ymm
229231
end do
230232
end subroutine Atmulvb
231233
subroutine Atmulvbbuiltin(y, A, x, M, K) BIND(C, name="Atmulvbbuiltin")
232234
integer(C_long), intent(in) :: M, K
233-
real(C_double), intent(in) :: A(K,M), x(K)
234-
real(C_double), dimension(M), intent(out) :: y
235-
y = matmul(transpose(A), x)
235+
real(C_double), intent(in) :: A(K,M), x(1,K)
236+
real(C_double), dimension(1,M), intent(out) :: y
237+
! y = matmul(transpose(A), x)
238+
y = matmul(x, A)
236239
end subroutine Atmulvbbuiltin
237240
subroutine unscaledvar(s, A, x, M, N) BIND(C, name="unscaledvar")
238241
integer(C_long), intent(in) :: M, N

benchmark/looptests.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,11 +119,12 @@ function jgemv!(y, A, x)
119119
end
120120
@inline function jgemv!(y, Aᵀ::Adjoint, x)
121121
A = parent(Aᵀ)
122-
y .= 0.0
123122
@inbounds for i eachindex(y)
123+
yᵢ = 0.0
124124
@simd ivdep for j eachindex(x)
125-
y[i] += A[j,i] * x[j]
125+
yᵢ += A[j,i] * x[j]
126126
end
127+
y[i] = yᵢ
127128
end
128129
end
129130
@inline function jgemvavx!(y, A, x)

src/determinestrategy.jl

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -270,43 +270,58 @@ function solve_tilesize(X, R, Umax, Tmax, UL, TL)
270270
T = Tmax
271271
else # U too large, resolve T
272272
U = Umax
273-
T = solve_tilesize_constU(X, R, U)
273+
T = min(Tmax, solve_tilesize_constU(X, R, U))
274274
end
275275
elseif T_too_large
276276
T = Tmax
277-
U = solve_tilesize_constT(X, R, T)
277+
U = min(Umax, solve_tilesize_constT(X, R, T))
278278
end
279279
U, T, cost
280280
end
281281
function maybedemotesize(U::Int, N::Int)
282-
U > 1 || return 1
282+
# U > 1 || return 1
283283
Um1 = U - 1
284284
urep = num_iterations(N, U)
285285
um1rep = num_iterations(N, Um1)
286286
um1rep > urep ? U : Um1
287287
end
288+
function maybedemotesize(T::Int, N::Int, U::Int, Uloop::Loop, maxTbase::Int)
289+
T > 1 || return T
290+
T == N && return T
291+
T = maybedemotesize(T, N)
292+
if !(isstaticloop(Uloop) && length(Uloop) == U)
293+
if N % T != 0
294+
T = min(T, maxTbase)
295+
end
296+
end
297+
T
298+
end
288299
function solve_tilesize(
289300
ls::LoopSet, unrolled::Symbol, tiled::Symbol,
290-
cost_vec::AbstractVector{Float64} = @view(ls.cost_vec[:,1]),
291-
reg_pressure::AbstractVector{Int} = @view(ls.reg_pres[:,1])
301+
cost_vec::AbstractVector{Float64},
302+
reg_pressure::AbstractVector{Int},
303+
W::Int, vectorized::Symbol
292304
)
293-
maxT = 4#8
294-
maxU = 4#8
305+
maxTbase = maxUbase = 4
306+
maxT = maxTbase#8
307+
maxU = maxUbase#8
295308
tiledloop = getloop(ls, tiled)
296309
unrolledloop = getloop(ls, unrolled)
297310
if isstaticloop(tiledloop)
298311
maxT = min(4maxT, length(tiledloop))
299312
end
300313
if isstaticloop(unrolledloop)
301-
maxU = min(4maxU, length(unrolledloop))
314+
UL = length(unrolledloop)
315+
UL = unrolled === vectorized ? cld(UL,W) : UL
316+
maxU = min(4maxU, UL)
302317
end
303318
U, T, cost = solve_tilesize(cost_vec, reg_pressure, maxU, maxT, length(unrolledloop), length(tiledloop))
304319
# heuristic to more evenly divide small numbers of iterations
305-
if isstaticloop(tiledloop) & T > 1
306-
T = maybedemotesize(T, length(tiledloop))
320+
if isstaticloop(tiledloop)
321+
T = maybedemotesize(T, length(tiledloop), U, unrolledloop, maxTbase)
307322
end
308323
if isstaticloop(unrolledloop)
309-
U = maybedemotesize(U, length(unrolledloop))
324+
U = maybedemotesize(U, length(unrolledloop), T, tiledloop, maxUbase)
310325
end
311326
U, T, cost
312327
end
@@ -427,7 +442,7 @@ function evaluate_cost_tile(
427442
end
428443
# @show order, vectorized cost_vec reg_pressure
429444
# @show solve_tilesize(ls, unrolled, tiled, cost_vec, reg_pressure)
430-
U, T, tcost = solve_tilesize(ls, unrolled, tiled, cost_vec, reg_pressure)
445+
U, T, tcost = solve_tilesize(ls, unrolled, tiled, cost_vec, reg_pressure, W, vectorized)
431446
U, T, tcost + stride_penalty(ls, order)
432447
end
433448

0 commit comments

Comments
 (0)