@@ -1272,6 +1272,304 @@ void kmp_calc_original_ivs_for_end(
  }
}

+/**************************************************************************
+ * Identify nested loop structure - loops come in the canonical form
+ * Lower triangle matrix: i = 0; i <= N; i++ {0,0}:{N,0}
+ *                        j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}
+ * Upper Triangle matrix
+ *                        i = 0; i <= N; i++ {0,0}:{N,0}
+ *                        j = 0+1*i; j <= N; j++ {0,1}:{N,0}
+ *************************************************************************/
+nested_loop_type_t
+kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,
+                                   /*in*/ kmp_index_t n) {
+  // only 2-level nested loops are supported
+  if (n != 2) {
+    return nested_loop_type_unkown;
+  }
+  // loops must be canonical
+  KMP_ASSERT(
+      (original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
+      (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
+  // check outer loop bounds: for triangular need to be {0,0}:{N,0}
+  kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                        original_bounds_nest[0].lb0_u64);
+  kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                        original_bounds_nest[0].ub0_u64);
+  kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                        original_bounds_nest[0].lb1_u64);
+  kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                        original_bounds_nest[0].ub1_u64);
+  if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {
+    return nested_loop_type_unkown;
+  }
+  // check inner bounds to determine triangle type
+  kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                        original_bounds_nest[1].lb0_u64);
+  kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                        original_bounds_nest[1].ub0_u64);
+  kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                        original_bounds_nest[1].lb1_u64);
+  kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                        original_bounds_nest[1].ub1_u64);
+  // lower triangle loop inner bounds need to be {0,0}:{0/-1,1}
+  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&
+      (inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {
+    return nested_loop_type_lower_triangular_matrix;
+  }
+  // upper triangle loop inner bounds need to be {0,1}:{N,0}
+  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&
+      inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
+    return nested_loop_type_upper_triangular_matrix;
+  }
+  return nested_loop_type_unkown;
+}
+
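For reference, a minimal sketch (not part of the patch) of source-level loops that should lower to the canonical bounds recognized above, assuming the usual collapse(2) lowering into the {lb0,lb1}:{ub0,ub1} form shown in the header comment; the function name and N are illustrative placeholders:

#include <cstdio>

// Hypothetical user loops matching the two canonical triangular forms.
void triangular_examples(int N) {
  // Lower triangular: inner ub depends on the outer iv with multiplier 1,
  // i.e. inner bounds {0,0}:{0,1} ({0,0}:{-1,1} if written as j < i).
#pragma omp parallel for collapse(2)
  for (int i = 0; i <= N; ++i)
    for (int j = 0; j <= i; ++j)
      std::printf("lower %d %d\n", i, j);

  // Upper triangular: inner lb starts at the outer iv, i.e. {0,1}:{N,0}.
#pragma omp parallel for collapse(2)
  for (int i = 0; i <= N; ++i)
    for (int j = i; j <= N; ++j)
      std::printf("upper %d %d\n", i, j);
}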
+/**************************************************************************
+ * SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf
+ * The starting point is x, so the result is always > sqrt(x)
+ * The method has uniform convergence; PRECISION is set to 0.1
+ *************************************************************************/
+#define level_of_precision 0.1
+double sqrt_newton_approx(/*in*/ kmp_uint64 x) {
+  double sqrt_old = 0.;
+  double sqrt_new = (double)x;
+  do {
+    sqrt_old = sqrt_new;
+    sqrt_new = (sqrt_old + x / sqrt_old) / 2;
+  } while ((sqrt_old - sqrt_new) > level_of_precision);
+  return sqrt_new;
+}
+
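As a sanity check, here is a self-contained restatement of the same iteration (my sketch, not the runtime code): starting from x itself keeps every iterate above the true root, since Newton's method for f(t) = t^2 - x converges monotonically from above, so the loop may stop as soon as successive iterates differ by at most the chosen precision.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Standalone model of sqrt_newton_approx for comparison against libm.
static double newton_sqrt(uint64_t x) {
  double old_v = 0.0, new_v = (double)x; // start above the root (x >= 1)
  do {
    old_v = new_v;
    new_v = (old_v + x / old_v) / 2.0; // Newton step for f(t) = t^2 - x
  } while (old_v - new_v > 0.1);       // same 0.1 precision as above
  return new_v;
}

int main() {
  for (uint64_t x : {2ull, 100ull, 1000000ull})
    std::printf("x=%llu newton=%.4f libm=%.4f\n", (unsigned long long)x,
                newton_sqrt(x), std::sqrt((double)x));
  return 0;
}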
+/**************************************************************************
+ * Handle lower triangle matrix in the canonical form
+ *   i = 0; i <= N; i++ {0,0}:{N,0}
+ *   j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}
+ *************************************************************************/
+void kmp_handle_lower_triangle_matrix(
+    /*in*/ kmp_uint32 nth,
+    /*in*/ kmp_uint32 tid,
+    /*in*/ kmp_index_t n,
+    /*in/out*/ bounds_info_t *original_bounds_nest,
+    /*out*/ bounds_info_t *chunk_bounds_nest) {
+
+  // transfer loop types from the original loop to the chunks
+  for (kmp_index_t i = 0; i < n; ++i) {
+    chunk_bounds_nest[i] = original_bounds_nest[i];
+  }
+  // cleanup iv variables
+  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                    original_bounds_nest[0].ub0_u64);
+  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                    original_bounds_nest[0].lb0_u64);
+  kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                    original_bounds_nest[1].ub0_u64);
+  // calculate the chunk's lower and upper bounds
+  // the total number of iterations in the loop is the sum of the arithmetic
+  // progression from the outer lower to the outer upper bound (inclusive,
+  // since the loop is canonical); note that less_than inner loops
+  // (inner_ub0 == -1) effectively make the progression 1-based, turning
+  // N = (outer_ub0 - outer_lb0 + 1) into N - 1
+  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;
+  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
+  // the current thread's number of iterations:
+  // each thread gets an equal share: the total number of iterations divided
+  // by the number of threads; if there's a remainder, the first threads with
+  // tid up to the remainder get one additional iteration each to cover it
+  kmp_uint64 iter_current =
+      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
+  // cumulative number of iterations executed by all the previous threads:
+  // threads with a tid below the remainder have (iter_total/nth + 1)
+  // iterations each, and so do all threads before them, so the cumulative
+  // count is the current thread's iteration count multiplied by the number
+  // of previous threads, which equals the current thread's tid; threads with
+  // a tid at or above the remainder have (iter_total/nth) iterations each,
+  // so the cumulative count is again the thread's iteration count multiplied
+  // by its tid, PLUS all the remainder iterations that will have been
+  // executed by the previous threads
+  kmp_uint64 iter_before_current =
+      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
+  // cumulative number of iterations executed including the current thread is
+  // the cumulative number executed before it plus its own
+  kmp_uint64 iter_with_current = iter_before_current + iter_current;
+  // calculate the outer loop lower bound (lbo), which is the max outer iv
+  // value that yields an iteration count equal to or just below the total
+  // number of iterations executed by the previous threads; for less_than
+  // (1-based) inner loops (inner_ub0 == -1) it is
+  // lbo*(lbo-1)/2 <= iter_before_current, i.e.
+  // lbo^2 - lbo - 2*iter_before_current <= 0; for less_than_equal (0-based)
+  // inner loops (inner_ub0 == 0) it is
+  // lbo*(lbo+1)/2 <= iter_before_current, i.e.
+  // lbo^2 + lbo - 2*iter_before_current <= 0; both cases can be handled
+  // similarly, using a parameter to control the equation's sign
+  kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;
+  kmp_uint64 lower_bound_outer =
+      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
+                                      8 * iter_before_current) +
+                   inner_adjustment) /
+          2 -
+      inner_adjustment;
+  // calculate the inner loop lower bound, which is the remaining number of
+  // iterations required to hit the total number of iterations executed by
+  // the previous threads, giving the starting point of this thread
+  kmp_uint64 lower_bound_inner =
+      iter_before_current -
+      ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
+  // calculate the outer loop upper bound using the same approach as for the
+  // lower bound, except using the total number of iterations executed
+  // including the current thread
+  kmp_uint64 upper_bound_outer =
+      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
+                                      8 * iter_with_current) +
+                   inner_adjustment) /
+          2 -
+      inner_adjustment;
+  // calculate the inner loop upper bound, which is the remaining number of
+  // iterations required to hit the total number of iterations executed
+  // including the current thread, giving the starting point of the next
+  // thread
+  kmp_uint64 upper_bound_inner =
+      iter_with_current -
+      ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;
+  // adjust the upper bounds down by 1 element to point at the last iteration
+  // of the current thread rather than the first iteration of the next thread
+  if (upper_bound_inner == 0) {
+    // {n,0} => {n-1,n-1}
+    upper_bound_outer -= 1;
+    upper_bound_inner = upper_bound_outer;
+  } else {
+    // {n,m} => {n,m-1} (m != 0)
+    upper_bound_inner -= 1;
+  }
+
+  // assign the values, zeroing out lb1 and ub1 values since the iteration
+  // space is now one-dimensional
+  chunk_bounds_nest[0].lb0_u64 = lower_bound_outer;
+  chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;
+  chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;
+  chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;
+  chunk_bounds_nest[0].lb1_u64 = 0;
+  chunk_bounds_nest[0].ub1_u64 = 0;
+  chunk_bounds_nest[1].lb1_u64 = 0;
+  chunk_bounds_nest[1].ub1_u64 = 0;
+
+#if 0
+  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
+         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
+         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
+#endif
+}
+
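To see why the quadratic solve recovers the right outer row, here is a small model (illustrative only; it uses libm's sqrt rather than the runtime's over-approximating Newton sqrt, and row_of is a hypothetical helper, not a runtime function):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Model of the closed form above: the first linear iteration index of outer
// row r is r*(r + a)/2 with a = inner_adjustment (1 for j <= i, -1 for
// j < i), so the row containing `linear` is the largest r with
// r*(r + a)/2 <= linear. The runtime's sqrt deliberately returns a value
// slightly above the true root so that truncation cannot round a perfect
// square down past a row boundary; libm's exact sqrt suffices for this demo.
static uint64_t row_of(uint64_t linear, int64_t a) {
  double r = std::sqrt((double)(a * a + 8 * (int64_t)linear));
  return (uint64_t)(r + a) / 2 - a;
}

int main() {
  // Enumerate a 0-based lower triangle (j <= i, so a = 1) and verify the
  // inverse mapping for every iteration.
  uint64_t linear = 0;
  for (uint64_t i = 0; i < 10; ++i)
    for (uint64_t j = 0; j <= i; ++j, ++linear)
      if (row_of(linear, 1) != i)
        std::printf("mismatch at linear index %llu\n",
                    (unsigned long long)linear);
  std::printf("checked %llu iterations\n", (unsigned long long)linear);
  return 0;
}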
+/**************************************************************************
+ * Handle upper triangle matrix in the canonical form
+ *   i = 0; i <= N; i++ {0,0}:{N,0}
+ *   j = 0+1*i; j <= N; j++ {0,1}:{N,0}
+ *************************************************************************/
+void kmp_handle_upper_triangle_matrix(
+    /*in*/ kmp_uint32 nth,
+    /*in*/ kmp_uint32 tid,
+    /*in*/ kmp_index_t n,
+    /*in/out*/ bounds_info_t *original_bounds_nest,
+    /*out*/ bounds_info_t *chunk_bounds_nest) {
+
+  // transfer loop types from the original loop to the chunks
+  for (kmp_index_t i = 0; i < n; ++i) {
+    chunk_bounds_nest[i] = original_bounds_nest[i];
+  }
+  // cleanup iv variables
+  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                    original_bounds_nest[0].ub0_u64);
+  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                    original_bounds_nest[0].lb0_u64);
+  kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                    original_bounds_nest[1].ub0_u64);
+  // calculate the chunk's lower and upper bounds
+  // the total number of iterations in the loop is the sum of the arithmetic
+  // progression from the outer lower to the outer upper bound (inclusive,
+  // since the loop is canonical); note that less_than inner loops
+  // (inner_ub0 == -1) effectively make the progression 1-based, turning
+  // N = (outer_ub0 - outer_lb0 + 1) into N - 1
+  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);
+  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
+  // the current thread's number of iterations:
+  // each thread gets an equal share: the total number of iterations divided
+  // by the number of threads; if there's a remainder, the first threads with
+  // tid up to the remainder get one additional iteration each to cover it
+  kmp_uint64 iter_current =
+      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
+  // cumulative number of iterations executed by all the previous threads:
+  // threads with a tid below the remainder have (iter_total/nth + 1)
+  // iterations each, and so do all threads before them, so the cumulative
+  // count is the current thread's iteration count multiplied by the number
+  // of previous threads, which equals the current thread's tid; threads with
+  // a tid at or above the remainder have (iter_total/nth) iterations each,
+  // so the cumulative count is again the thread's iteration count multiplied
+  // by its tid, PLUS all the remainder iterations that will have been
+  // executed by the previous threads
+  kmp_uint64 iter_before_current =
+      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
+  // cumulative number of iterations executed including the current thread is
+  // the cumulative number executed before it plus its own
+  kmp_uint64 iter_with_current = iter_before_current + iter_current;
+  // calculate the outer loop lower bound (lbo), which is the max outer iv
+  // value that yields an iteration count equal to or just below the total
+  // number of iterations executed by the previous threads; for less_than
+  // (1-based) inner loops (inner_ub0 == -1) it is
+  // lbo*(lbo-1)/2 <= iter_before_current, i.e.
+  // lbo^2 - lbo - 2*iter_before_current <= 0; for less_than_equal (0-based)
+  // inner loops (inner_ub0 == 0) it is
+  // lbo*(lbo+1)/2 <= iter_before_current, i.e.
+  // lbo^2 + lbo - 2*iter_before_current <= 0; both cases can be handled
+  // similarly, using a parameter to control the equation's sign
+  kmp_uint64 lower_bound_outer =
+      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
+  // calculate the inner loop lower bound, which is the remaining number of
+  // iterations required to hit the total number of iterations executed by
+  // the previous threads, giving the starting point of this thread
+  kmp_uint64 lower_bound_inner =
+      iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
+  // calculate the outer loop upper bound using the same approach as for the
+  // lower bound, except using the total number of iterations executed
+  // including the current thread
+  kmp_uint64 upper_bound_outer =
+      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
+  // calculate the inner loop upper bound, which is the remaining number of
+  // iterations required to hit the total number of iterations executed
+  // including the current thread, giving the starting point of the next
+  // thread
+  kmp_uint64 upper_bound_inner =
+      iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;
+  // adjust the upper bounds down by 1 element to point at the last iteration
+  // of the current thread rather than the first iteration of the next thread
+  if (upper_bound_inner == 0) {
+    // {n,0} => {n-1,n-1}
+    upper_bound_outer -= 1;
+    upper_bound_inner = upper_bound_outer;
+  } else {
+    // {n,m} => {n,m-1} (m != 0)
+    upper_bound_inner -= 1;
+  }
+
+  // assign the values, mapping the bounds back from the reflected
+  // (lower-triangle) space and zeroing out lb1 and ub1 values since the
+  // iteration space is now one-dimensional
+  chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;
+  chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;
+  chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;
+  chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;
+  chunk_bounds_nest[0].lb1_u64 = 0;
+  chunk_bounds_nest[0].ub1_u64 = 0;
+  chunk_bounds_nest[1].lb1_u64 = 0;
+  chunk_bounds_nest[1].ub1_u64 = 0;
+
+#if 0
+  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
+         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
+         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
+#endif
+}
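The upper-triangle handler reuses the lower-triangle arithmetic through a reflection: under i' = N - i, j' = N - j the upper triangle (j from i to N) becomes a 0-based lower triangle (j' from 0 to i'), which is why the bounds are computed in reflected space and mapped back with (outer_iters - 1) - bound, swapping the roles of the lower and upper bounds since the reflection reverses iteration order. A tiny sketch of the mapping (illustrative only):

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t N = 3; // outer_iters - 1, kept small for the printout
  // Enumerate the reflected (lower-triangle) space and print the original
  // upper-triangle point each iteration corresponds to.
  for (uint64_t ip = 0; ip <= N; ++ip)    // ip = N - i
    for (uint64_t jp = 0; jp <= ip; ++jp) // jp = N - j
      std::printf("reflected (%llu,%llu) -> original (%llu,%llu)\n",
                  (unsigned long long)ip, (unsigned long long)jp,
                  (unsigned long long)(N - ip), (unsigned long long)(N - jp));
  return 0;
}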
// ----------Init API for non-rectangular loops--------------------------------

// Init API for collapsed loops (static, no chunks defined).
@@ -1334,6 +1632,19 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
  KMP_DEBUG_ASSERT(tid < nth);

+  // Handle special cases
+  nested_loop_type_t loop_type =
+      kmp_identify_nested_loop_structure(original_bounds_nest, n);
+  if (loop_type == nested_loop_type_lower_triangular_matrix) {
+    kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
+                                     chunk_bounds_nest);
+    return TRUE;
+  } else if (loop_type == nested_loop_type_upper_triangular_matrix) {
+    kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
+                                     chunk_bounds_nest);
+    return TRUE;
+  }
+
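For intuition on why taking this special-case path pays off (the numbers below are assumed purely for illustration): splitting only the outer loop of a lower triangle gives early threads far fewer iterations than late ones, while the chunking above hands every thread iter_total/nth iterations, give or take one:

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t M = 1000, nth = 4;       // rows and threads, assumed
  const uint64_t total = M * (M + 1) / 2; // triangle iteration count
  for (uint64_t t = 0; t < nth; ++t) {
    // naive: split the outer rows evenly and count triangle iterations
    uint64_t r0 = t * M / nth, r1 = (t + 1) * M / nth;
    uint64_t naive = r1 * (r1 + 1) / 2 - r0 * (r0 + 1) / 2;
    // balanced: the per-thread share computed by the handlers above
    uint64_t balanced = total / nth + (t < total % nth ? 1 : 0);
    std::printf("tid %llu: naive=%llu balanced=%llu\n",
                (unsigned long long)t, (unsigned long long)naive,
                (unsigned long long)balanced);
  }
  return 0;
}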
  CollapseAllocator<kmp_uint64> original_ivs_start(n);

  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,