@@ -27,7 +27,7 @@
 // avoid inadvertently using a library based abs
 template <typename T> T __kmp_abs(const T val) {
-  return (val < 0) ? -val: val;
+  return (val < 0) ? -val : val;
 }
 kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
 kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }
@@ -36,7 +36,34 @@ kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }
 // Common functions for working with rectangular and non-rectangular loops
 // ----------------------------------------------------------------------------
 
-template <typename T> int __kmp_sign(T val) { return (T(0) < val) - (val < T(0)); }
+template <typename T> int __kmp_sign(T val) {
+  return (T(0) < val) - (val < T(0));
+}
+
+template <typename T> class CollapseAllocator {
+  typedef T *pT;
+
+private:
+  static const size_t allocaSize = 32; // size limit for stack allocations
+                                       // (8 bytes x 4 nested loops)
+  char stackAlloc[allocaSize];
+  static constexpr size_t maxElemCount = allocaSize / sizeof(T);
+  pT pTAlloc;
+
+public:
+  CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
+    if (n > maxElemCount) {
+      pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
+    }
+  }
+  ~CollapseAllocator() {
+    if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
+      __kmp_free(pTAlloc);
+    }
+  }
+  T &operator[](int index) { return pTAlloc[index]; }
+  operator const pT() { return pTAlloc; }
+};
 
 
 // ----------Loop canonicalization---------------------------------------------
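The new `CollapseAllocator` is a small-buffer optimization: requests that fit in the 32-byte `stackAlloc` member (up to `maxElemCount` elements, e.g. four `kmp_uint64` induction variables) are served from the object itself, and only deeper collapse nests fall back to `__kmp_allocate`; the destructor frees the pointer only when it does not point at the stack buffer, and the `operator const pT()` conversion lets the object be passed wherever a raw `T *` is expected. Below is a minimal self-contained sketch of the same pattern, with `std::malloc`/`std::free` standing in for the kmp allocators and a hypothetical `SmallBufAllocator` name:

```cpp
#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Hypothetical stand-in for CollapseAllocator: stack buffer first,
// heap only when the request is too large.
template <typename T> class SmallBufAllocator {
  static const std::size_t allocaSize = 32; // bytes kept on the stack
  char stackAlloc[allocaSize];
  static constexpr std::size_t maxElemCount = allocaSize / sizeof(T);
  T *p;

public:
  explicit SmallBufAllocator(std::size_t n)
      : p(reinterpret_cast<T *>(stackAlloc)) {
    if (n > maxElemCount) // does not fit: spill to the heap
      p = static_cast<T *>(std::malloc(n * sizeof(T)));
  }
  ~SmallBufAllocator() { // free only if we actually spilled
    if (p != reinterpret_cast<T *>(stackAlloc))
      std::free(p);
  }
  T &operator[](std::size_t i) { return p[i]; }
  operator T *() { return p; } // pass as a raw T* at existing call sites
};

int main() {
  SmallBufAllocator<unsigned long long> iters(4); // 32 bytes: stays on stack
  for (std::size_t i = 0; i < 4; ++i)
    iters[i] = i * i;
  std::printf("%llu\n", iters[3]); // prints 9
  SmallBufAllocator<unsigned long long> big(64); // 512 bytes: heap, freed in dtor
  big[63] = 1;
  return 0;
}
```

Note that, like the patch itself, the sketch treats the buffer as raw storage: no `T` constructors run, which appears safe here because the runtime only stores integer induction variables and trivially copyable bounds records in it, and both versions implicitly assume the `char` buffer is suitably aligned for `T` (an `alignas` on the buffer would make that explicit).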
@@ -463,8 +490,7 @@ __kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
                                  /* out*/ kmp_uint64 *original_ivs,
                                  kmp_index_t n) {
 
-  kmp_iterations_t iterations =
-      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
+  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
 
   // First, calc corresponding iteration in every original loop:
   for (kmp_index_t ind = n; ind > 0;) {
@@ -485,7 +511,6 @@ __kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
 
     kmp_calc_one_iv_rectang(bounds, /* in/out*/ original_ivs, iterations, ind);
   }
-  __kmp_free(iterations);
 }
 
 // ----------------------------------------------------------------------------
@@ -924,9 +949,7 @@ bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
                                      /* out*/ kmp_point_t original_ivs) {
 
   // Iterations in the original space, multiplied by step:
-  kmp_iterations_t iterations =
-      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
-
+  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
   for (kmp_index_t ind = n; ind > 0;) {
     --ind;
     iterations[ind] = 0;
@@ -936,7 +959,6 @@ bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
   bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
                                                  /* in/out*/ original_ivs,
                                                  /* in/out*/ iterations, 0);
-  __kmp_free(iterations);
   return b;
 }
 
@@ -948,9 +970,7 @@ bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
                                 kmp_index_t n, const kmp_point_t original_ivs,
                                 /* out*/ kmp_point_t next_original_ivs) {
   // Iterations in the original space, multiplied by step (so can be negative):
-  kmp_iterations_t iterations =
-      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
-
+  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
   // First, calc corresponding iteration in every original loop:
   for (kmp_index_t ind = 0; ind < n; ++ind) {
     auto bounds = &(original_bounds_nest[ind]);
@@ -969,7 +989,6 @@ bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
   bool b = kmp_calc_original_ivs_from_iterations(
       original_bounds_nest, n, /* in/out*/ next_original_ivs, iterations, ind);
 
-  __kmp_free(iterations);
   return b;
 }
 
@@ -1132,9 +1151,7 @@ bool kmp_calc_original_ivs_for_chunk_end(
     /* out*/ kmp_point_t original_ivs) {
 
   // Iterations in the expanded space:
-  kmp_iterations_t iterations =
-      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
-
+  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
   // First, calc corresponding iteration in every modified loop:
   for (kmp_index_t ind = n; ind > 0;) {
     --ind;
@@ -1166,7 +1183,6 @@ bool kmp_calc_original_ivs_for_chunk_end(
         // Too big (or too small for >=).
         if (ind == 0) {
           // Need to reduce to the end.
-          __kmp_free(iterations);
           return false;
         } else {
           // Go to next iteration on outer loop:
@@ -1197,7 +1213,6 @@ bool kmp_calc_original_ivs_for_chunk_end(
     ++ind;
   }
 
-  __kmp_free(iterations);
   return true;
 }
 
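The `kmp_calc_original_ivs_for_chunk_end` hunks above are where the RAII conversion pays off most visibly: the old code had to repeat `__kmp_free(iterations)` before the early `return false;` as well as before the final `return true;`, and any future early exit would have had to remember the same cleanup. With the destructor owning the free, every return path is covered automatically. A minimal sketch of that guard idea, using a hypothetical `HeapGuard` and plain `malloc`/`free` rather than the kmp allocators:

```cpp
#include <cstddef>
#include <cstdlib>

// Hypothetical RAII guard: the destructor runs on every return path.
struct HeapGuard {
  void *p;
  explicit HeapGuard(std::size_t bytes) : p(std::malloc(bytes)) {}
  ~HeapGuard() { std::free(p); }
};

bool all_positive(const int *v, std::size_t n) {
  HeapGuard tmp(n * sizeof(int)); // freed no matter which return fires
  int *scratch = static_cast<int *>(tmp.p);
  for (std::size_t i = 0; i < n; ++i) {
    scratch[i] = v[i]; // scratch copy, as with `iterations` above
    if (scratch[i] <= 0)
      return false; // early exit: no manual free needed
  }
  return true; // normal exit: also covered
}
```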
@@ -1291,9 +1306,7 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
 
   kmp_canonicalize_loop_nest(loc, /* in/out*/ original_bounds_nest, n);
 
-  bounds_info_internal_t *updated_bounds_nest =
-      (bounds_info_internal_t *)__kmp_allocate(sizeof(bounds_info_internal_t) *
-                                               n);
+  CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);
 
   for (kmp_index_t i = 0; i < n; ++i) {
     updated_bounds_nest[i].b = original_bounds_nest[i];
@@ -1308,7 +1321,6 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
 
   if (total == 0) {
     // Loop won't execute:
-    __kmp_free(updated_bounds_nest);
     return FALSE;
   }
 
@@ -1322,20 +1334,11 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
 
   KMP_DEBUG_ASSERT(tid < nth);
 
-  kmp_point_t original_ivs_start =
-      (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
-  kmp_point_t original_ivs_end =
-      (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
-  kmp_point_t original_ivs_next_start =
-      (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
+  CollapseAllocator<kmp_uint64> original_ivs_start(n);
 
   if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
                                        /* out*/ original_ivs_start)) {
     // Loop won't execute:
-    __kmp_free(updated_bounds_nest);
-    __kmp_free(original_ivs_start);
-    __kmp_free(original_ivs_end);
-    __kmp_free(original_ivs_next_start);
     return FALSE;
   }
 
@@ -1354,10 +1357,6 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
   // if (plastiter != NULL) {
   //   *plastiter = TRUE;
   // }
-  // __kmp_free(updated_bounds_nest);
-  // __kmp_free(original_ivs_start);
-  // __kmp_free(original_ivs_end);
-  // __kmp_free(original_ivs_next_start);
   // return TRUE;
   // }
 
@@ -1391,6 +1390,7 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
     new_iv += curr_chunk_size - 1;
   }
 
+  CollapseAllocator<kmp_uint64> original_ivs_end(n);
   if ((nth == 1) || (new_iv >= total - 1)) {
     // Do this one till the end - just in case we miscalculated
     // and either too much is left to process or new_iv is a bit too big:
@@ -1421,17 +1421,14 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
   if (last_iter && (tid != 0)) {
     // We are done, this was last chunk, but no chunk for current thread was
     // found:
-    __kmp_free(updated_bounds_nest);
-    __kmp_free(original_ivs_start);
-    __kmp_free(original_ivs_end);
-    __kmp_free(original_ivs_next_start);
     return FALSE;
   }
 
   if (tid == 0) {
     // We found the chunk for this thread, now we need to check if it's the
     // last chunk or not:
 
+    CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
    if (last_iter ||
         !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
                                     /* out*/ original_ivs_next_start)) {
@@ -1453,10 +1450,6 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
       chunk_bounds_nest[i].ub1_u64 = 0;
     }
 
-    __kmp_free(updated_bounds_nest);
-    __kmp_free(original_ivs_start);
-    __kmp_free(original_ivs_end);
-    __kmp_free(original_ivs_next_start);
     return TRUE;
   }
 
@@ -1478,9 +1471,5 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
                                    original_ivs_start, n);
   }
 
-  __kmp_free(updated_bounds_nest);
-  __kmp_free(original_ivs_start);
-  __kmp_free(original_ivs_end);
-  __kmp_free(original_ivs_next_start);
   return FALSE;
 }
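A secondary effect, visible in the last few hunks: the three `kmp_point_t` buffers are no longer allocated up front as a batch. `original_ivs_end` is now declared just before the chunk-end calculation and `original_ivs_next_start` inside the `tid == 0` branch, so threads that return early never construct the later buffers at all. A small sketch of that declare-at-first-use scoping, with hypothetical names and `std::vector` standing in for the allocator:

```cpp
#include <cstddef>
#include <vector>

// Sketch: deferring construction to first use (hypothetical names).
// Paths that return before a buffer's declaration never allocate it,
// and destructors still run automatically on the paths that do.
int process(std::size_t n, bool need_extra) {
  std::vector<int> start(n); // needed on every path below
  if (n == 0)
    return 0; // `extra` is never constructed on this path
  if (!need_extra)
    return static_cast<int>(start.size());
  std::vector<int> extra(n); // constructed only where it is used
  return static_cast<int>(start.size() + extra.size());
}
```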