Skip to content

Commit fcd2d48

Browse files
vadikp-intelVadim Paretsky
andauthored
[OpenMP] runtime support for efficient partitioning of collapsed triangular loops (#83939)
This PR adds OMP runtime support for more efficient partitioning of certain types of collapsed loops that can be used by compilers that support loop collapsing (i.e. MSVC) to achieve more optimal thread load balancing. In particular, this PR addresses double nested upper and lower isosceles triangular loops of the following types 1. lower triangular 'less_than' for (int i=0; i<N; i++) for (int j=0; j<i; j++) 2. lower triangular 'less_than_equal' for (int i=0; i<N; j++) for (int j=0; j<=i; j++) 3. upper triangular for (int i=0; i<N; i++) for (int j=i; j<N; j++) Includes tests for the three supported loop types. --------- Co-authored-by: Vadim Paretsky <[email protected]>
1 parent 0d4978f commit fcd2d48

File tree

5 files changed

+692
-2
lines changed

5 files changed

+692
-2
lines changed

openmp/runtime/src/kmp_collapse.cpp

Lines changed: 311 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1272,6 +1272,304 @@ void kmp_calc_original_ivs_for_end(
12721272
}
12731273
}
12741274

1275+
/**************************************************************************
1276+
* Identify nested loop structure - loops come in the canonical form
1277+
* Lower triangle matrix: i = 0; i <= N; i++ {0,0}:{N,0}
1278+
* j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}
1279+
* Upper Triangle matrix
1280+
* i = 0; i <= N; i++ {0,0}:{N,0}
1281+
* j = 0+1*i; j <= N; j++ {0,1}:{N,0}
1282+
* ************************************************************************/
1283+
nested_loop_type_t
1284+
kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,
1285+
/*in*/ kmp_index_t n) {
1286+
// only 2-level nested loops are supported
1287+
if (n != 2) {
1288+
return nested_loop_type_unkown;
1289+
}
1290+
// loops must be canonical
1291+
KMP_ASSERT(
1292+
(original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
1293+
(original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
1294+
// check outer loop bounds: for triangular need to be {0,0}:{N,0}
1295+
kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1296+
original_bounds_nest[0].lb0_u64);
1297+
kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1298+
original_bounds_nest[0].ub0_u64);
1299+
kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1300+
original_bounds_nest[0].lb1_u64);
1301+
kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1302+
original_bounds_nest[0].ub1_u64);
1303+
if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {
1304+
return nested_loop_type_unkown;
1305+
}
1306+
// check inner bounds to determine triangle type
1307+
kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1308+
original_bounds_nest[1].lb0_u64);
1309+
kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1310+
original_bounds_nest[1].ub0_u64);
1311+
kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1312+
original_bounds_nest[1].lb1_u64);
1313+
kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1314+
original_bounds_nest[1].ub1_u64);
1315+
// lower triangle loop inner bounds need to be {0,0}:{0/-1,1}
1316+
if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&
1317+
(inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {
1318+
return nested_loop_type_lower_triangular_matrix;
1319+
}
1320+
// upper triangle loop inner bounds need to be {0,1}:{N,0}
1321+
if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&
1322+
inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
1323+
return nested_loop_type_upper_triangular_matrix;
1324+
}
1325+
return nested_loop_type_unkown;
1326+
}
1327+
1328+
/**************************************************************************
1329+
* SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf
1330+
* Start point is x so the result is always > sqrt(x)
1331+
* The method has uniform convergence, PRECISION is set to 0.1
1332+
* ************************************************************************/
1333+
#define level_of_precision 0.1
1334+
double sqrt_newton_approx(/*in*/ kmp_uint64 x) {
1335+
double sqrt_old = 0.;
1336+
double sqrt_new = (double)x;
1337+
do {
1338+
sqrt_old = sqrt_new;
1339+
sqrt_new = (sqrt_old + x / sqrt_old) / 2;
1340+
} while ((sqrt_old - sqrt_new) > level_of_precision);
1341+
return sqrt_new;
1342+
}
1343+
1344+
/**************************************************************************
 * Handle lower triangle matrix in the canonical form
 *   i = 0; i <= N; i++              {0,0}:{N,0}
 *   j = 0; j <= 0/-1 + 1*i; j++     {0,0}:{0/-1,1}
 * Computes this thread's (tid of nth) contiguous chunk of the collapsed
 * triangular iteration space and writes it into chunk_bounds_nest as a
 * pair of rectangular-looking per-dimension bounds (lb1/ub1 zeroed).
 * ************************************************************************/
void kmp_handle_lower_triangle_matrix(
    /*in*/ kmp_uint32 nth,
    /*in*/ kmp_uint32 tid,
    /*in */ kmp_index_t n,
    /*in/out*/ bounds_info_t *original_bounds_nest,
    /*out*/ bounds_info_t *chunk_bounds_nest) {

  // transfer loop types from the original loop to the chunks
  for (kmp_index_t i = 0; i < n; ++i) {
    chunk_bounds_nest[i] = original_bounds_nest[i];
  }
  // cleanup iv variables
  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].ub0_u64);
  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
                                    original_bounds_nest[0].lb0_u64);
  // inner_ub0 is 0 for a less_than_equal inner loop and (kmp_uint64)-1 for a
  // less_than one; it is folded into the arithmetic below as an offset.
  kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                    original_bounds_nest[1].ub0_u64);
  // calculate the chunk's lower and upper bounds
  // the total number of iterations in the loop is the sum of the arithmetic
  // progression from the outer lower to outer upper bound (inclusive since the
  // loop is canonical) note that less_than inner loops (inner_ub0 = -1)
  // effectively make the progression 1-based making N = (outer_ub0 - inner_lb0
  // + 1) -> N - 1  (the -1 arrives via unsigned wraparound of inner_ub0)
  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;
  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
  // the current thread's number of iterations:
  // each thread gets an equal number of iterations: total number of iterations
  // divided by the number of threads plus, if there's a remainder,
  // the first threads with the number up to the remainder get an additional
  // iteration each to cover it
  kmp_uint64 iter_current =
      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
  // cumulative number of iterations executed by all the previous threads:
  // threads with the tid below the remainder will have (iter_total/nth+1)
  // elements, and so will all threads before them so the cumulative number of
  // iterations executed by the all previous will be the current thread's number
  // of iterations multiplied by the number of previous threads which is equal
  // to the current thread's tid; threads with the number equal or above the
  // remainder will have (iter_total/nth) elements so the cumulative number of
  // iterations previously executed is its number of iterations multiplied by
  // the number of previous threads which is again equal to the current
  // thread's tid PLUS all the remainder iterations that will have been
  // executed by the previous threads
  kmp_uint64 iter_before_current =
      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
  // cumulative number of iterations executed with the current thread is
  // the cumulative number executed before it plus its own
  kmp_uint64 iter_with_current = iter_before_current + iter_current;
  // calculate the outer loop lower bound (lbo) which is the max outer iv value
  // that gives the number of iterations that is equal or just below the total
  // number of iterations executed by the previous threads, for less_than
  // (1-based) inner loops (inner_ub0 == -1) it will be i.e.
  // lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0
  // for less_than_equal (0-based) inner loops (inner_ub == 0) it will be:
  // i.e. lbo*(lbo+1)/2<=iter_before_current =>
  // lbo^2+lbo-2*iter_before_current<=0 both cases can be handled similarly
  // using a parameter to control the equation sign; the quadratic is inverted
  // below with the Newton sqrt approximation (which never under-estimates)
  kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;
  kmp_uint64 lower_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
                                      8 * iter_before_current) +
                   inner_adjustment) /
          2 -
      inner_adjustment;
  // calculate the inner loop lower bound which is the remaining number of
  // iterations required to hit the total number of iterations executed by the
  // previous threads giving the starting point of this thread
  kmp_uint64 lower_bound_inner =
      iter_before_current -
      ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
  // calculate the outer loop upper bound using the same approach as for the
  // inner bound except using the total number of iterations executed with the
  // current thread
  kmp_uint64 upper_bound_outer =
      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
                                      8 * iter_with_current) +
                   inner_adjustment) /
          2 -
      inner_adjustment;
  // calculate the inner loop upper bound which is the remaining number of
  // iterations required to hit the total number of iterations executed after
  // the current thread giving the starting point of the next thread
  kmp_uint64 upper_bound_inner =
      iter_with_current -
      ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;
  // adjust the upper bounds down by 1 element to point at the last iteration
  // of the current thread rather than the first iteration of the next thread
  // NOTE(review): if nth > iter_total a thread can have iter_current == 0 and
  // upper_bound_outer == 0 here, making the decrement below wrap the unsigned
  // bound — confirm the collapsed-loop executor treats lb > ub as an empty
  // chunk in that case.
  if (upper_bound_inner == 0) {
    // {n,0} => {n-1,n-1}
    upper_bound_outer -= 1;
    upper_bound_inner = upper_bound_outer;
  } else {
    // {n,m} => {n,m-1} (m!=0)
    upper_bound_inner -= 1;
  }

  // assign the values, zeroing out lb1 and ub1 values since the iteration
  // space is now one-dimensional
  chunk_bounds_nest[0].lb0_u64 = lower_bound_outer;
  chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;
  chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;
  chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;
  chunk_bounds_nest[0].lb1_u64 = 0;
  chunk_bounds_nest[0].ub1_u64 = 0;
  chunk_bounds_nest[1].lb1_u64 = 0;
  chunk_bounds_nest[1].ub1_u64 = 0;

#if 0
  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
      tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
      chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
#endif
}
1463+
1464+
/**************************************************************************
1465+
* Handle upper triangle matrix in the canonical form
1466+
* i = 0; i <= N; i++ {0,0}:{N,0}
1467+
* j = 0+1*i; j <= N; j++ {0,1}:{N,0}
1468+
* ************************************************************************/
1469+
void kmp_handle_upper_triangle_matrix(
1470+
/*in*/ kmp_uint32 nth,
1471+
/*in*/ kmp_uint32 tid,
1472+
/*in */ kmp_index_t n,
1473+
/*in/out*/ bounds_info_t *original_bounds_nest,
1474+
/*out*/ bounds_info_t *chunk_bounds_nest) {
1475+
1476+
// transfer loop types from the original loop to the chunks
1477+
for (kmp_index_t i = 0; i < n; ++i) {
1478+
chunk_bounds_nest[i] = original_bounds_nest[i];
1479+
}
1480+
// cleanup iv variables
1481+
kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1482+
original_bounds_nest[0].ub0_u64);
1483+
kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1484+
original_bounds_nest[0].lb0_u64);
1485+
kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1486+
original_bounds_nest[1].ub0_u64);
1487+
// calculate the chunk's lower and upper bounds
1488+
// the total number of iterations in the loop is the sum of the arithmetic
1489+
// progression from the outer lower to outer upper bound (inclusive since the
1490+
// loop is canonical) note that less_than inner loops (inner_ub0 = -1)
1491+
// effectively make the progression 1-based making N = (outer_ub0 - inner_lb0
1492+
// + 1) -> N - 1
1493+
kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);
1494+
kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
1495+
// the current thread's number of iterations:
1496+
// each thread gets an equal number of iterations: total number of iterations
1497+
// divided by the number of threads plus, if there's a remainder,
1498+
// the first threads with the number up to the remainder get an additional
1499+
// iteration each to cover it
1500+
kmp_uint64 iter_current =
1501+
iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
1502+
// cumulative number of iterations executed by all the previous threads:
1503+
// threads with the tid below the remainder will have (iter_total/nth+1)
1504+
// elements, and so will all threads before them so the cumulative number of
1505+
// iterations executed by the all previous will be the current thread's number
1506+
// of iterations multiplied by the number of previous threads which is equal
1507+
// to the current thread's tid; threads with the number equal or above the
1508+
// remainder will have (iter_total/nth) elements so the cumulative number of
1509+
// iterations previously executed is its number of iterations multipled by the
1510+
// number of previous threads which is again equal to the current thread's tid
1511+
// PLUS all the remainder iterations that will have been executed by the
1512+
// previous threads
1513+
kmp_uint64 iter_before_current =
1514+
tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
1515+
// cumulative number of iterations executed with the current thread is
1516+
// the cumulative number executed before it plus its own
1517+
kmp_uint64 iter_with_current = iter_before_current + iter_current;
1518+
// calculate the outer loop lower bound (lbo) which is the max outer iv value
1519+
// that gives the number of iterations that is equal or just below the total
1520+
// number of iterations executed by the previous threads, for less_than
1521+
// (1-based) inner loops (inner_ub0 == -1) it will be i.e.
1522+
// lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0
1523+
// for less_than_equal (0-based) inner loops (inner_ub == 0) it will be:
1524+
// i.e. lbo*(lbo+1)/2<=iter_before_current =>
1525+
// lbo^2+lbo-2*iter_before_current<=0 both cases can be handled similarily
1526+
// using a parameter to control the equatio sign
1527+
kmp_uint64 lower_bound_outer =
1528+
(kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
1529+
;
1530+
// calculate the inner loop lower bound which is the remaining number of
1531+
// iterations required to hit the total number of iterations executed by the
1532+
// previous threads giving the starting point of this thread
1533+
kmp_uint64 lower_bound_inner =
1534+
iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
1535+
// calculate the outer loop upper bound using the same approach as for the
1536+
// inner bound except using the total number of iterations executed with the
1537+
// current thread
1538+
kmp_uint64 upper_bound_outer =
1539+
(kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
1540+
// calculate the inner loop upper bound which is the remaining number of
1541+
// iterations required to hit the total number of iterations executed after
1542+
// the current thread giving the starting point of the next thread
1543+
kmp_uint64 upper_bound_inner =
1544+
iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;
1545+
// adjust the upper bounds down by 1 element to point at the last iteration of
1546+
// the current thread the first iteration of the next thread
1547+
if (upper_bound_inner == 0) {
1548+
// {n,0} => {n-1,n-1}
1549+
upper_bound_outer -= 1;
1550+
upper_bound_inner = upper_bound_outer;
1551+
} else {
1552+
// {n,m} => {n,m-1} (m!=0)
1553+
upper_bound_inner -= 1;
1554+
}
1555+
1556+
// assign the values, zeroing out lb1 and ub1 values since the iteration space
1557+
// is now one-dimensional
1558+
chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;
1559+
chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;
1560+
chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;
1561+
chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;
1562+
chunk_bounds_nest[0].lb1_u64 = 0;
1563+
chunk_bounds_nest[0].ub1_u64 = 0;
1564+
chunk_bounds_nest[1].lb1_u64 = 0;
1565+
chunk_bounds_nest[1].ub1_u64 = 0;
1566+
1567+
#if 0
1568+
printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
1569+
tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
1570+
chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
1571+
#endif
1572+
}
12751573
//----------Init API for non-rectangular loops--------------------------------
12761574

12771575
// Init API for collapsed loops (static, no chunks defined).
@@ -1334,6 +1632,19 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
13341632

13351633
KMP_DEBUG_ASSERT(tid < nth);
13361634

1635+
// Handle special cases
1636+
nested_loop_type_t loop_type =
1637+
kmp_identify_nested_loop_structure(original_bounds_nest, n);
1638+
if (loop_type == nested_loop_type_lower_triangular_matrix) {
1639+
kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
1640+
chunk_bounds_nest);
1641+
return TRUE;
1642+
} else if (loop_type == nested_loop_type_upper_triangular_matrix) {
1643+
kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
1644+
chunk_bounds_nest);
1645+
return TRUE;
1646+
}
1647+
13371648
CollapseAllocator<kmp_uint64> original_ivs_start(n);
13381649

13391650
if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,

openmp/runtime/src/kmp_collapse.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,13 @@ enum loop_type_t : kmp_int32 {
4545
loop_type_int64 = 7
4646
};
4747

48+
// Defining loop types to handle special cases
49+
enum nested_loop_type_t : kmp_int32 {
50+
nested_loop_type_unkown = 0,
51+
nested_loop_type_lower_triangular_matrix = 1,
52+
nested_loop_type_upper_triangular_matrix = 2
53+
};
54+
4855
/*!
4956
@ingroup WORK_SHARING
5057
* Describes the structure for rectangular nested loops.
@@ -124,14 +131,14 @@ struct bounds_info_t {
124131
// It's represented in kmp_uint64, but each dimention is calculated in
125132
// that loop IV type. Also dimentions have to be converted to those types
126133
// when used in generated code.
127-
typedef kmp_uint64* kmp_point_t;
134+
typedef kmp_uint64 *kmp_point_t;
128135

129136
// Array: Number of loop iterations on each nesting level to achieve some point,
130137
// in expanded space or in original space.
131138
// OMPTODO: move from using iterations to using offsets (iterations multiplied
132139
// by steps). For those we need to be careful with the types, as step can be
133140
// negative, but it'll remove multiplications and divisions in several places.
134-
typedef kmp_loop_nest_iv_t* kmp_iterations_t;
141+
typedef kmp_loop_nest_iv_t *kmp_iterations_t;
135142

136143
// Internal struct with additional info:
137144
template <typename T> struct bounds_info_internalXX_template {

0 commit comments

Comments
 (0)