Skip to content

Commit 6789dda

Browse files
author
Vadim Paretsky
committed
[OpenMP] make small memory allocations in loop collapse code on the stack
A few places in the loop collapse support code make small dynamic allocations that introduce a noticeable performance overhead when made on the heap. This change moves allocations up to 32 bytes to the stack instead of the heap. Differential Revision: https://reviews.llvm.org/D158220
1 parent 3db0e40 commit 6789dda

File tree

1 file changed

+37
-48
lines changed

1 file changed

+37
-48
lines changed

openmp/runtime/src/kmp_collapse.cpp

Lines changed: 37 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
// avoid inadvertently using a library based abs
2929
// Avoid inadvertently using a library-based abs. Intended for signed
// arithmetic types; unsigned overloads below return the value unchanged.
template <typename T> T __kmp_abs(const T val) {
  return (val < 0) ? -val : val;
}
3232
// Unsigned overload: absolute value is the identity.
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
3333
// Unsigned overload: absolute value is the identity.
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }
@@ -36,7 +36,34 @@ kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }
3636
// Common functions for working with rectangular and non-rectangular loops
3737
//----------------------------------------------------------------------------
3838

// Returns the sign of val as an int: -1 if negative, 0 if zero, +1 if
// positive. Branch-free formulation valid for any ordered arithmetic type
// (for unsigned types the left comparison alone decides, yielding 0 or 1).
template <typename T> int __kmp_sign(T val) {
  return (T(0) < val) - (val < T(0));
}
42+
43+
template <typename T> class CollapseAllocator {
44+
typedef T *pT;
45+
46+
private:
47+
static const size_t allocaSize = 32; // size limit for stack allocations
48+
// (8 bytes x 4 nested loops)
49+
char stackAlloc[allocaSize];
50+
static constexpr size_t maxElemCount = allocaSize / sizeof(T);
51+
pT pTAlloc;
52+
53+
public:
54+
CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
55+
if (n > maxElemCount) {
56+
pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
57+
}
58+
}
59+
~CollapseAllocator() {
60+
if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
61+
__kmp_free(pTAlloc);
62+
}
63+
}
64+
T &operator[](int index) { return pTAlloc[index]; }
65+
operator const pT() { return pTAlloc; }
66+
};
4067

4168
//----------Loop canonicalization---------------------------------------------
4269

@@ -463,8 +490,7 @@ __kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
463490
/*out*/ kmp_uint64 *original_ivs,
464491
kmp_index_t n) {
465492

466-
kmp_iterations_t iterations =
467-
(kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
493+
CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
468494

469495
// First, calc corresponding iteration in every original loop:
470496
for (kmp_index_t ind = n; ind > 0;) {
@@ -485,7 +511,6 @@ __kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
485511

486512
kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
487513
}
488-
__kmp_free(iterations);
489514
}
490515

491516
//----------------------------------------------------------------------------
@@ -924,9 +949,7 @@ bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
924949
/*out*/ kmp_point_t original_ivs) {
925950

926951
// Iterations in the original space, multiplied by step:
927-
kmp_iterations_t iterations =
928-
(kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
929-
952+
CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
930953
for (kmp_index_t ind = n; ind > 0;) {
931954
--ind;
932955
iterations[ind] = 0;
@@ -936,7 +959,6 @@ bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
936959
bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
937960
/*in/out*/ original_ivs,
938961
/*in/out*/ iterations, 0);
939-
__kmp_free(iterations);
940962
return b;
941963
}
942964

@@ -948,9 +970,7 @@ bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
948970
kmp_index_t n, const kmp_point_t original_ivs,
949971
/*out*/ kmp_point_t next_original_ivs) {
950972
// Iterations in the original space, multiplied by step (so can be negative):
951-
kmp_iterations_t iterations =
952-
(kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
953-
973+
CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
954974
// First, calc corresponding iteration in every original loop:
955975
for (kmp_index_t ind = 0; ind < n; ++ind) {
956976
auto bounds = &(original_bounds_nest[ind]);
@@ -969,7 +989,6 @@ bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
969989
bool b = kmp_calc_original_ivs_from_iterations(
970990
original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);
971991

972-
__kmp_free(iterations);
973992
return b;
974993
}
975994

@@ -1132,9 +1151,7 @@ bool kmp_calc_original_ivs_for_chunk_end(
11321151
/*out*/ kmp_point_t original_ivs) {
11331152

11341153
// Iterations in the expanded space:
1135-
kmp_iterations_t iterations =
1136-
(kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
1137-
1154+
CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
11381155
// First, calc corresponding iteration in every modified loop:
11391156
for (kmp_index_t ind = n; ind > 0;) {
11401157
--ind;
@@ -1166,7 +1183,6 @@ bool kmp_calc_original_ivs_for_chunk_end(
11661183
// Too big (or too small for >=).
11671184
if (ind == 0) {
11681185
// Need to reduce to the end.
1169-
__kmp_free(iterations);
11701186
return false;
11711187
} else {
11721188
// Go to next iteration on outer loop:
@@ -1197,7 +1213,6 @@ bool kmp_calc_original_ivs_for_chunk_end(
11971213
++ind;
11981214
}
11991215

1200-
__kmp_free(iterations);
12011216
return true;
12021217
}
12031218

@@ -1291,9 +1306,7 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
12911306

12921307
kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
12931308

1294-
bounds_info_internal_t *updated_bounds_nest =
1295-
(bounds_info_internal_t *)__kmp_allocate(sizeof(bounds_info_internal_t) *
1296-
n);
1309+
CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);
12971310

12981311
for (kmp_index_t i = 0; i < n; ++i) {
12991312
updated_bounds_nest[i].b = original_bounds_nest[i];
@@ -1308,7 +1321,6 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
13081321

13091322
if (total == 0) {
13101323
// Loop won't execute:
1311-
__kmp_free(updated_bounds_nest);
13121324
return FALSE;
13131325
}
13141326

@@ -1322,20 +1334,11 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
13221334

13231335
KMP_DEBUG_ASSERT(tid < nth);
13241336

1325-
kmp_point_t original_ivs_start =
1326-
(kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
1327-
kmp_point_t original_ivs_end =
1328-
(kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
1329-
kmp_point_t original_ivs_next_start =
1330-
(kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
1337+
CollapseAllocator<kmp_uint64> original_ivs_start(n);
13311338

13321339
if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
13331340
/*out*/ original_ivs_start)) {
13341341
// Loop won't execute:
1335-
__kmp_free(updated_bounds_nest);
1336-
__kmp_free(original_ivs_start);
1337-
__kmp_free(original_ivs_end);
1338-
__kmp_free(original_ivs_next_start);
13391342
return FALSE;
13401343
}
13411344

@@ -1354,10 +1357,6 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
13541357
// if (plastiter != NULL) {
13551358
// *plastiter = TRUE;
13561359
// }
1357-
// __kmp_free(updated_bounds_nest);
1358-
// __kmp_free(original_ivs_start);
1359-
// __kmp_free(original_ivs_end);
1360-
// __kmp_free(original_ivs_next_start);
13611360
// return TRUE;
13621361
//}
13631362

@@ -1391,6 +1390,7 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
13911390
new_iv += curr_chunk_size - 1;
13921391
}
13931392

1393+
CollapseAllocator<kmp_uint64> original_ivs_end(n);
13941394
if ((nth == 1) || (new_iv >= total - 1)) {
13951395
// Do this one till the end - just in case we miscalculated
13961396
// and either too much is left to process or new_iv is a bit too big:
@@ -1421,17 +1421,14 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
14211421
if (last_iter && (tid != 0)) {
14221422
// We are done, this was last chunk, but no chunk for current thread was
14231423
// found:
1424-
__kmp_free(updated_bounds_nest);
1425-
__kmp_free(original_ivs_start);
1426-
__kmp_free(original_ivs_end);
1427-
__kmp_free(original_ivs_next_start);
14281424
return FALSE;
14291425
}
14301426

14311427
if (tid == 0) {
14321428
// We found the chunk for this thread, now we need to check if it's the
14331429
// last chunk or not:
14341430

1431+
CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
14351432
if (last_iter ||
14361433
!kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
14371434
/*out*/ original_ivs_next_start)) {
@@ -1453,10 +1450,6 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
14531450
chunk_bounds_nest[i].ub1_u64 = 0;
14541451
}
14551452

1456-
__kmp_free(updated_bounds_nest);
1457-
__kmp_free(original_ivs_start);
1458-
__kmp_free(original_ivs_end);
1459-
__kmp_free(original_ivs_next_start);
14601453
return TRUE;
14611454
}
14621455

@@ -1478,9 +1471,5 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
14781471
original_ivs_start, n);
14791472
}
14801473

1481-
__kmp_free(updated_bounds_nest);
1482-
__kmp_free(original_ivs_start);
1483-
__kmp_free(original_ivs_end);
1484-
__kmp_free(original_ivs_next_start);
14851474
return FALSE;
14861475
}

0 commit comments

Comments
 (0)