Skip to content

[OpenMP] add loop collapse tests #86243

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 3 additions & 8 deletions openmp/runtime/src/kmp_collapse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1517,16 +1517,11 @@ void kmp_handle_upper_triangle_matrix(
kmp_uint64 iter_with_current = iter_before_current + iter_current;
// calculate the outer loop lower bound (lbo) which is the max outer iv value
// that gives the number of iterations that is equal or just below the total
// number of iterations executed by the previous threads, for less_than
// (1-based) inner loops (inner_ub0 == -1) it will be i.e.
// lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0
// for less_than_equal (0-based) inner loops (inner_ub == 0) it will be:
// i.e. lbo*(lbo+1)/2<=iter_before_current =>
// lbo^2+lbo-2*iter_before_current<=0 both cases can be handled similarily
// using a parameter to control the equatio sign
// number of iterations executed by the previous threads:
// lbo*(lbo+1)/2<=iter_before_current =>
// lbo^2+lbo-2*iter_before_current<=0
kmp_uint64 lower_bound_outer =
(kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
;
// calculate the inner loop lower bound which is the remaining number of
// iterations required to hit the total number of iterations executed by the
// previous threads giving the starting point of this thread
Expand Down
201 changes: 201 additions & 0 deletions openmp/runtime/test/worksharing/for/collapse_test.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
#include <omp.h>
#include <malloc.h>
#include <stdio.h>
#include <memory.h>

#define LOOP_IV_TYPE0 LOOP_TYPES
#define LOOP_TYPE0 LOOP_TYPES
#define LOOP_STYPE0 LOOP_TYPES

#define LOOP_IV_TYPE1 LOOP_TYPES
#define LOOP_TYPE1 LOOP_TYPES
#define LOOP_STYPE1 LOOP_TYPES

#define LOOP_IV_TYPE2 LOOP_TYPES
#define LOOP_TYPE2 LOOP_TYPES
#define LOOP_STYPE2 LOOP_TYPES

#define MAX_THREADS 256

#if defined VERBOSE
#define PRINTF printf
#else
#define PRINTF
#endif

LOOP_TYPE0 iLB, iUB;
LOOP_TYPE1 jA0, jB0;
LOOP_TYPE2 kA0, kB0;

LOOP_STYPE0 iStep;
LOOP_STYPE1 jA1, jB1, jStep;
LOOP_STYPE2 kA1, kB1, kStep;

// We can check <=, <, >=, > (!= has different pattern)
// Additional definition of LOOP_LEi, LOOP_LTi, etc. is helpful to build calls
// of the test from main

#if defined LOOP_LE0
#define COMPARE0 <=
#elif defined LOOP_LT0
#define COMPARE0 <
#elif defined LOOP_GE0
#define COMPARE0 >=
#elif defined LOOP_GT0
#define COMPARE0 >
#endif

#if defined LOOP_LE1
#define COMPARE1 <=
#elif defined LOOP_LT1
#define COMPARE1 <
#elif defined LOOP_GE1
#define COMPARE1 >=
#elif defined LOOP_GT1
#define COMPARE1 >
#endif

#if defined LOOP_LE2
#define COMPARE2 <=
#elif defined LOOP_LT2
#define COMPARE2 <
#elif defined LOOP_GE2
#define COMPARE2 >=
#elif defined LOOP_GT2
#define COMPARE2 >
#endif

typedef struct {
LOOP_IV_TYPE0 i;
LOOP_IV_TYPE1 j;
LOOP_IV_TYPE2 k;
} spaceType;

spaceType *AllocSpace(unsigned size) {

spaceType *p = (spaceType *)malloc(size * sizeof(spaceType));
memset(p, 0, size * sizeof(spaceType));
return p;
}

void FreeSpace(spaceType *space) { free(space); }

// record an iteration
void Set(spaceType *space, unsigned count, unsigned trueCount, LOOP_IV_TYPE0 i,
LOOP_IV_TYPE1 j, LOOP_IV_TYPE0 k) {
if (count > trueCount) {
// number of iterations exceeded
// will be reported with checks
return;
}
space[count - 1].i = i;
space[count - 1].j = j;
space[count - 1].k = k;
}
int test() {
int pass = 1;
LOOP_IV_TYPE0 i;
LOOP_IV_TYPE1 j;
LOOP_IV_TYPE2 k;

spaceType *openmpSpace;
spaceType *scalarSpace;

unsigned trueCount = 0;
unsigned openmpCount = 0;
unsigned scalarCount = 0;
unsigned uselessThreadsOpenMP = 0;
unsigned usefulThreadsOpenMP = 0;
unsigned chunkSizesOpenmp[MAX_THREADS] = {0};

unsigned num_threads = omp_get_max_threads();
if (num_threads > MAX_THREADS)
num_threads = MAX_THREADS;
omp_set_num_threads(num_threads);

// count iterations and allocate space
LOOP { ++trueCount; }

openmpSpace = AllocSpace(trueCount);
scalarSpace = AllocSpace(trueCount);

// fill the scalar (compare) space
LOOP {
++scalarCount;
Set(scalarSpace, scalarCount, trueCount, i, j, k);
}

// test run body:
// perform and record OpenMP iterations and thread use
#pragma omp parallel num_threads(num_threads)
{
#pragma omp for collapse(3) private(i, j, k)
LOOP {
unsigned count;
unsigned gtid = omp_get_thread_num();
#pragma omp atomic update
++chunkSizesOpenmp[gtid];
#pragma omp atomic capture
count = ++openmpCount;
Set(openmpSpace, count, trueCount, i, j, k);
}
}

// check for the right number of iterations processed
// (only need to check for less, greater is checked when recording)
if (openmpCount < trueCount) {
PRINTF("OpenMP FAILURE: Openmp processed fewer iterations: %d vs %d\n",
openmpCount, trueCount);
pass = 0;
} else if (openmpCount > trueCount) {
PRINTF("OpenMP FAILURE: Openmp processed more iterations: %d vs %d\n",
openmpCount, trueCount);
pass = 0;
}

// check openMP for iteration correctnes against scalar
for (unsigned i = 0; i < trueCount; i++) {
unsigned j;
for (j = 0; j < openmpCount; j++) {
if ((scalarSpace[i].i == openmpSpace[j].i) &&
(scalarSpace[i].j == openmpSpace[j].j) &&
(scalarSpace[i].k == openmpSpace[j].k)) {
break;
}
}
if (j == openmpCount) {
PRINTF("OpenMP FAILURE: (%d %d %d) not processed\n", scalarSpace[i].i,
scalarSpace[i].j, scalarSpace[i].k);
pass = 0;
}
}

// check for efficient thread use
for (unsigned i = 0; i < num_threads; ++i) {
if (chunkSizesOpenmp[i] == 0) {
++uselessThreadsOpenMP;
}
}

// a check to see if at least more than one thread was used (weakish)
if ((uselessThreadsOpenMP == num_threads - 1) && (trueCount > 1)) {
PRINTF("OpenMP FAILURE: threads are not used\n");
pass = 0;
}

#if 0
// a check to see if the load was spread more or less evenly so that
// when there was more work than threads each one got at least something
// (stronger, but may currently fail for a general collapse case)
if ((trueCount >= num_threads) && (uselessThreadsOpenMP > 0)) {
PRINTF("OpenMP FAILURE: %d threads not used with %d iterations\n",
uselessThreadsOpenMP, openmpCount);
pass = 0;
}
#endif

// clean up space
FreeSpace(openmpSpace);
FreeSpace(scalarSpace);
return pass;
}
65 changes: 65 additions & 0 deletions openmp/runtime/test/worksharing/for/omp_collapse_many_GELTGT_int.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// RUN: %libomp-compile-and-run

// Non-rectangular loop collapsing.
//
// Nested loops conform to OpenMP 5.2 standard,
// inner loops bounds may depend on outer loops induction variables.

#define LOOP_TYPES int
#define COMPARE0 >=
#define COMPARE1 <
#define COMPARE2 >
#define LOOP \
for (i = iLB; i COMPARE0 iUB; i += iStep) \
for (j = jA0; j COMPARE1 jB0; j += jStep) \
for (k = kA0; k COMPARE2 kB0; k += kStep)
#include "collapse_test.inc"

int main() {
int fail;

iLB = 3;
iUB = -2;
jA0 = -3;
jA1 = 0;
jB0 = -6;
jB1 = 0;
kA0 = -2;
kA1 = 0;
kB0 = -4;
kB1 = 0;
iStep = -1;
jStep = -1;
kStep = -4;
PRINTF("\nOne off iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; jB1=%d; kA0=%d; "
"kA1=%d; kB0=%d; kB1=%d; iStep=%d; jStep=%d; kStep=%d;\n",
iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1, iStep, jStep, kStep);
fail = (test() == 0);

if (!fail) {
for (iStep = -3; iStep >= -6; iStep -= 2) {
for (jA0 = -6; jA0 <= 6; jA0 += 3) {
for (jB0 = -3; jB0 <= 10; jB0 += 3) {
for (jStep = 1; jStep <= 10; jStep += 2) {
for (kA0 = -2; kA0 <= 4; ++kA0) {
for (kB0 = -4; kB0 <= 2; ++kB0) {
for (kStep = -2; kStep >= -10; kStep -= 4) {
{
PRINTF("\nTrying iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; "
"jB1=%d; kA0=%d; kA1=%d; kB0=%d; kB1=%d; iStep=%d; "
"jStep=%d; kStep=%d;\n",
iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1,
iStep, jStep, kStep);
fail = fail || (test() == 0);
}
}
}
}
}
}
}
}
}

return fail;
}
71 changes: 71 additions & 0 deletions openmp/runtime/test/worksharing/for/omp_collapse_many_GTGEGT_int.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// RUN: %libomp-compile-and-run

// Non-rectangular loop collapsing.
//
// Nested loops conform to OpenMP 5.2 standard,
// inner loops bounds may depend on outer loops induction variables.

#define LOOP_TYPES int
#define COMPARE0 >
#define COMPARE1 >=
#define COMPARE2 >

#define DLOOP_GT0
#define DLOOP_GE1
#define DLOOP_GT2

#define LOOP \
for (i = iLB; i COMPARE0 iUB; i += iStep) \
for (j = jA0; j COMPARE1 jB0; j += jStep) \
for (k = kA0; k COMPARE2 kB0; k += kStep)
#include "collapse_test.inc"

int main() {
int fail;

iLB = 3;
iUB = -2;
jA0 = -3;
jA1 = 0;
jB0 = -6;
jB1 = 0;
kA0 = -2;
kA1 = 0;
kB0 = -4;
kB1 = 0;
iStep = -1;
jStep = -1;
kStep = -4;
PRINTF("\nOne off iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; jB1=%d; kA0=%d; "
"kA1=%d; kB0=%d; kB1=%d; iStep=%d; jStep=%d; kStep=%d;\n",
iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1, iStep, jStep, kStep);
fail = (test() == 0);

if (!fail) {

for (iStep = -3; iStep >= -6; iStep -= 2) {
for (jA0 = -3; jA0 <= 10; jA0 += 3) {
for (jB0 = -6; jB0 <= 6; jB0 += 3) {
for (jStep = -1; jStep >= -10; jStep -= 2) {
for (kA0 = -2; kA0 <= 4; ++kA0) {
for (kB0 = -4; kB0 <= 2; ++kB0) {
for (kStep = -2; kStep >= -10; kStep -= 4) {
{
PRINTF("\nTrying iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; "
"jB1=%d; kA0=%d; kA1=%d; kB0=%d; kB1=%d; iStep=%d; "
"jStep=%d; kStep=%d;\n",
iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1,
iStep, jStep, kStep);
fail = fail || (test() == 0);
}
}
}
}
}
}
}
}
}

return fail;
}
Loading