@@ -111,7 +111,7 @@ function cost(
111
111
# cannot shuffle false means reject curly
112
112
# either false means shuffle
113
113
dont_shuffle =
114
- rejectinterleave (op) && (cannot_shuffle (op, u₁, u₂, contigind, indices))
114
+ (Wshift > 3 ) || ( rejectinterleave (op) && (cannot_shuffle (op, u₁, u₂, contigind, indices) ))
115
115
if dont_shuffle
116
116
# offset = 0.0 # gather/scatter, alignment doesn't matter
117
117
r = 1 << shifter
@@ -301,7 +301,6 @@ function unroll_no_reductions(ls, order, vloopsym)
301
301
u = if compute_rt ≤ 1
302
302
4
303
303
elseif compute_rt > memory_rt
304
- # @show load_rt, store_rt, compute_rt, compute_l, rpc, rpp
305
304
# if compute_rt > 40
306
305
# max(VectorizationBase.nextpow2( min( 4, round(Int, compute_rt / memory_rt) ) ), 1)
307
306
# else
@@ -312,7 +311,6 @@ function unroll_no_reductions(ls, order, vloopsym)
312
311
else
313
312
max (1 , min (4 , round (Int, 1.75 compute_rt / load_rt)))
314
313
end
315
- # @show load_rt, store_rt, compute_rt, compute_l, u, rpc, rpp
316
314
# u = min(u, max(1, (reg_count(ls) ÷ max(1,round(Int,rp)))))
317
315
# commented out here is to decide to align loops
318
316
# if memory_rt > compute_rt && isone(u) && (length(order) > 1) && (last(order) === vloopsym) && length(getloop(ls, last(order))) > 8W
@@ -381,7 +379,6 @@ function determine_unroll_factor(
381
379
end
382
380
recip_throughput =
383
381
max (compute_recip_throughput, load_recip_throughput, store_recip_throughput)
384
- # @show latency, recip_throughput
385
382
recip_throughput, latency
386
383
end
387
384
function count_reductions (ls:: LoopSet )
@@ -992,7 +989,6 @@ function load_elimination_cost_factor!(
992
989
# cost_vec[1] -= rt
993
990
# cost_vec[1] -= 0.5625 * iters
994
991
# cost_vec[1] -= 0.5625 * iters / 2
995
- # @show rto, 0.8rt, op
996
992
# reg_pressure[1] += 0.25rp
997
993
reg_pressure[1 ] += 0.25 rp
998
994
cost_vec[2 ] += rt
@@ -1156,7 +1152,6 @@ function evaluate_cost_tile!(
1156
1152
@unpack u₁loopsym, u₂loopsym, vloopsym = unrollsyms
1157
1153
cacheunrolled! (ls, u₁loopsym, u₂loopsym, vloopsym)
1158
1154
# println("\n")
1159
- # @show order unrollsyms
1160
1155
# u₂loopsym = order[1]
1161
1156
# u₁loopsym = order[2]
1162
1157
ops = operations (ls)
@@ -1248,9 +1243,6 @@ function evaluate_cost_tile!(
1248
1243
inner₁ = u₁reached | depends_on_u₂
1249
1244
inner₂ = u₂reached | depends_on_u₁
1250
1245
# if isconstantop(op)
1251
- # if iscompute(op)
1252
- # @show inner₁, depends_on_u₁, inner₂, depends_on_u₂, op
1253
- # end
1254
1246
reduced_by_unrolling[1 , 2 , id] = inner₁ & ! depends_on_u₁
1255
1247
reduced_by_unrolling[2 , 2 , id] = inner₂ & ! depends_on_u₂
1256
1248
# else
@@ -1285,7 +1277,6 @@ function evaluate_cost_tile!(
1285
1277
size_T,
1286
1278
opisininnerloop,
1287
1279
)
1288
- # println("constoffelim")
1289
1280
continue
1290
1281
elseif load_elimination_cost_factor! (
1291
1282
cost_vec,
@@ -1298,11 +1289,8 @@ function evaluate_cost_tile!(
1298
1289
Wshift,
1299
1290
size_T,
1300
1291
)
1301
- # println("loadelim")
1302
- # A[i,j-1], A[i,j]
1303
1292
continue
1304
1293
end
1305
- # elseif isconstant(op)
1306
1294
end
1307
1295
rt, lat, rp = cost (ls, op, (u₁loopsym, u₂loopsym), vloopsym, Wshift, size_T)
1308
1296
if isload (op) & (! prefetch_good_idea)
@@ -1315,9 +1303,8 @@ function evaluate_cost_tile!(
1315
1303
else # FIXME : hack to not go crazy
1316
1304
max (zero (rp), rp - one (rp))
1317
1305
end
1318
- rto = rt
1306
+ # rto = rt
1319
1307
rt *= iters[id]
1320
- # @show (u₁reducesrt, u₂reducesrt), (u₁reducesrp, u₂reducesrp), rto, rt, lat, rp, op
1321
1308
if isstore (op) & (! u₁reducesrt) & (! u₂reducesrt)
1322
1309
irreducible_storecosts += rt
1323
1310
end
@@ -1340,7 +1327,6 @@ function evaluate_cost_tile!(
1340
1327
end
1341
1328
end
1342
1329
end
1343
- # @show u₁reducesrp, u₂reducesrp, rp, op
1344
1330
update_reg_pres! (reg_pressure, rp, u₁reducesrp, u₂reducesrp)
1345
1331
# end
1346
1332
# update_costs!(reg_pressure, rp, u₁reducesrp, u₂reducesrp)
0 commit comments