Skip to content

Commit 7db4046

Browse files
vadikp-intelVadim Paretsky
andauthored
[OpenMP] add loop collapse tests (#86243)
This PR adds loop collapse tests ported from MSVC. --------- Co-authored-by: Vadim Paretsky <[email protected]>
1 parent b1a633b commit 7db4046

File tree

7 files changed

+511
-8
lines changed

7 files changed

+511
-8
lines changed

openmp/runtime/src/kmp_collapse.cpp

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1517,16 +1517,11 @@ void kmp_handle_upper_triangle_matrix(
15171517
kmp_uint64 iter_with_current = iter_before_current + iter_current;
15181518
// calculate the outer loop lower bound (lbo) which is the max outer iv value
15191519
// that gives the number of iterations that is equal or just below the total
1520-
// number of iterations executed by the previous threads, for less_than
1521-
// (1-based) inner loops (inner_ub0 == -1) it will be i.e.
1522-
// lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0
1523-
// for less_than_equal (0-based) inner loops (inner_ub == 0) it will be:
1524-
// i.e. lbo*(lbo+1)/2<=iter_before_current =>
1525-
// lbo^2+lbo-2*iter_before_current<=0 both cases can be handled similarily
1526-
// using a parameter to control the equatio sign
1520+
// number of iterations executed by the previous threads:
1521+
// lbo*(lbo+1)/2<=iter_before_current =>
1522+
// lbo^2+lbo-2*iter_before_current<=0
15271523
kmp_uint64 lower_bound_outer =
15281524
(kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
1529-
;
15301525
// calculate the inner loop lower bound which is the remaining number of
15311526
// iterations required to hit the total number of iterations executed by the
15321527
// previous threads giving the starting point of this thread
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
#include <omp.h>
2+
#include <malloc.h>
3+
#include <stdio.h>
4+
#include <memory.h>
5+
6+
#define LOOP_IV_TYPE0 LOOP_TYPES
7+
#define LOOP_TYPE0 LOOP_TYPES
8+
#define LOOP_STYPE0 LOOP_TYPES
9+
10+
#define LOOP_IV_TYPE1 LOOP_TYPES
11+
#define LOOP_TYPE1 LOOP_TYPES
12+
#define LOOP_STYPE1 LOOP_TYPES
13+
14+
#define LOOP_IV_TYPE2 LOOP_TYPES
15+
#define LOOP_TYPE2 LOOP_TYPES
16+
#define LOOP_STYPE2 LOOP_TYPES
17+
18+
#define MAX_THREADS 256
19+
20+
#if defined VERBOSE
21+
#define PRINTF printf
22+
#else
23+
#define PRINTF
24+
#endif
25+
26+
LOOP_TYPE0 iLB, iUB;
27+
LOOP_TYPE1 jA0, jB0;
28+
LOOP_TYPE2 kA0, kB0;
29+
30+
LOOP_STYPE0 iStep;
31+
LOOP_STYPE1 jA1, jB1, jStep;
32+
LOOP_STYPE2 kA1, kB1, kStep;
33+
34+
// We can check <=, <, >=, > (!= has different pattern)
35+
// Additional definition of LOOP_LEi, LOOP_LTi, etc. is helpful to build calls
36+
// of the test from main
37+
38+
#if defined LOOP_LE0
39+
#define COMPARE0 <=
40+
#elif defined LOOP_LT0
41+
#define COMPARE0 <
42+
#elif defined LOOP_GE0
43+
#define COMPARE0 >=
44+
#elif defined LOOP_GT0
45+
#define COMPARE0 >
46+
#endif
47+
48+
#if defined LOOP_LE1
49+
#define COMPARE1 <=
50+
#elif defined LOOP_LT1
51+
#define COMPARE1 <
52+
#elif defined LOOP_GE1
53+
#define COMPARE1 >=
54+
#elif defined LOOP_GT1
55+
#define COMPARE1 >
56+
#endif
57+
58+
#if defined LOOP_LE2
59+
#define COMPARE2 <=
60+
#elif defined LOOP_LT2
61+
#define COMPARE2 <
62+
#elif defined LOOP_GE2
63+
#define COMPARE2 >=
64+
#elif defined LOOP_GT2
65+
#define COMPARE2 >
66+
#endif
67+
68+
typedef struct {
69+
LOOP_IV_TYPE0 i;
70+
LOOP_IV_TYPE1 j;
71+
LOOP_IV_TYPE2 k;
72+
} spaceType;
73+
74+
spaceType *AllocSpace(unsigned size) {
75+
76+
spaceType *p = (spaceType *)malloc(size * sizeof(spaceType));
77+
memset(p, 0, size * sizeof(spaceType));
78+
return p;
79+
}
80+
81+
void FreeSpace(spaceType *space) { free(space); }
82+
83+
// record an iteration
84+
void Set(spaceType *space, unsigned count, unsigned trueCount, LOOP_IV_TYPE0 i,
85+
LOOP_IV_TYPE1 j, LOOP_IV_TYPE0 k) {
86+
if (count > trueCount) {
87+
// number of iterations exceeded
88+
// will be reported with checks
89+
return;
90+
}
91+
space[count - 1].i = i;
92+
space[count - 1].j = j;
93+
space[count - 1].k = k;
94+
}
95+
int test() {
96+
int pass = 1;
97+
LOOP_IV_TYPE0 i;
98+
LOOP_IV_TYPE1 j;
99+
LOOP_IV_TYPE2 k;
100+
101+
spaceType *openmpSpace;
102+
spaceType *scalarSpace;
103+
104+
unsigned trueCount = 0;
105+
unsigned openmpCount = 0;
106+
unsigned scalarCount = 0;
107+
unsigned uselessThreadsOpenMP = 0;
108+
unsigned usefulThreadsOpenMP = 0;
109+
unsigned chunkSizesOpenmp[MAX_THREADS] = {0};
110+
111+
unsigned num_threads = omp_get_max_threads();
112+
if (num_threads > MAX_THREADS)
113+
num_threads = MAX_THREADS;
114+
omp_set_num_threads(num_threads);
115+
116+
// count iterations and allocate space
117+
LOOP { ++trueCount; }
118+
119+
openmpSpace = AllocSpace(trueCount);
120+
scalarSpace = AllocSpace(trueCount);
121+
122+
// fill the scalar (compare) space
123+
LOOP {
124+
++scalarCount;
125+
Set(scalarSpace, scalarCount, trueCount, i, j, k);
126+
}
127+
128+
// test run body:
129+
// perform and record OpenMP iterations and thread use
130+
#pragma omp parallel num_threads(num_threads)
131+
{
132+
#pragma omp for collapse(3) private(i, j, k)
133+
LOOP {
134+
unsigned count;
135+
unsigned gtid = omp_get_thread_num();
136+
#pragma omp atomic update
137+
++chunkSizesOpenmp[gtid];
138+
#pragma omp atomic capture
139+
count = ++openmpCount;
140+
Set(openmpSpace, count, trueCount, i, j, k);
141+
}
142+
}
143+
144+
// check for the right number of iterations processed
145+
// (only need to check for less, greater is checked when recording)
146+
if (openmpCount < trueCount) {
147+
PRINTF("OpenMP FAILURE: Openmp processed fewer iterations: %d vs %d\n",
148+
openmpCount, trueCount);
149+
pass = 0;
150+
} else if (openmpCount > trueCount) {
151+
PRINTF("OpenMP FAILURE: Openmp processed more iterations: %d vs %d\n",
152+
openmpCount, trueCount);
153+
pass = 0;
154+
}
155+
156+
// check openMP for iteration correctnes against scalar
157+
for (unsigned i = 0; i < trueCount; i++) {
158+
unsigned j;
159+
for (j = 0; j < openmpCount; j++) {
160+
if ((scalarSpace[i].i == openmpSpace[j].i) &&
161+
(scalarSpace[i].j == openmpSpace[j].j) &&
162+
(scalarSpace[i].k == openmpSpace[j].k)) {
163+
break;
164+
}
165+
}
166+
if (j == openmpCount) {
167+
PRINTF("OpenMP FAILURE: (%d %d %d) not processed\n", scalarSpace[i].i,
168+
scalarSpace[i].j, scalarSpace[i].k);
169+
pass = 0;
170+
}
171+
}
172+
173+
// check for efficient thread use
174+
for (unsigned i = 0; i < num_threads; ++i) {
175+
if (chunkSizesOpenmp[i] == 0) {
176+
++uselessThreadsOpenMP;
177+
}
178+
}
179+
180+
// a check to see if at least more than one thread was used (weakish)
181+
if ((uselessThreadsOpenMP == num_threads - 1) && (trueCount > 1)) {
182+
PRINTF("OpenMP FAILURE: threads are not used\n");
183+
pass = 0;
184+
}
185+
186+
#if 0
187+
// a check to see if the load was spread more or less evenly so that
188+
// when there was more work than threads each one got at least something
189+
// (stronger, but may currently fail for a general collapse case)
190+
if ((trueCount >= num_threads) && (uselessThreadsOpenMP > 0)) {
191+
PRINTF("OpenMP FAILURE: %d threads not used with %d iterations\n",
192+
uselessThreadsOpenMP, openmpCount);
193+
pass = 0;
194+
}
195+
#endif
196+
197+
// clean up space
198+
FreeSpace(openmpSpace);
199+
FreeSpace(scalarSpace);
200+
return pass;
201+
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
// RUN: %libomp-compile-and-run
2+
3+
// Non-rectangular loop collapsing.
4+
//
5+
// Nested loops conform to OpenMP 5.2 standard,
6+
// inner loops bounds may depend on outer loops induction variables.
7+
8+
#define LOOP_TYPES int
9+
#define COMPARE0 >=
10+
#define COMPARE1 <
11+
#define COMPARE2 >
12+
#define LOOP \
13+
for (i = iLB; i COMPARE0 iUB; i += iStep) \
14+
for (j = jA0; j COMPARE1 jB0; j += jStep) \
15+
for (k = kA0; k COMPARE2 kB0; k += kStep)
16+
#include "collapse_test.inc"
17+
18+
int main() {
19+
int fail;
20+
21+
iLB = 3;
22+
iUB = -2;
23+
jA0 = -3;
24+
jA1 = 0;
25+
jB0 = -6;
26+
jB1 = 0;
27+
kA0 = -2;
28+
kA1 = 0;
29+
kB0 = -4;
30+
kB1 = 0;
31+
iStep = -1;
32+
jStep = -1;
33+
kStep = -4;
34+
PRINTF("\nOne off iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; jB1=%d; kA0=%d; "
35+
"kA1=%d; kB0=%d; kB1=%d; iStep=%d; jStep=%d; kStep=%d;\n",
36+
iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1, iStep, jStep, kStep);
37+
fail = (test() == 0);
38+
39+
if (!fail) {
40+
for (iStep = -3; iStep >= -6; iStep -= 2) {
41+
for (jA0 = -6; jA0 <= 6; jA0 += 3) {
42+
for (jB0 = -3; jB0 <= 10; jB0 += 3) {
43+
for (jStep = 1; jStep <= 10; jStep += 2) {
44+
for (kA0 = -2; kA0 <= 4; ++kA0) {
45+
for (kB0 = -4; kB0 <= 2; ++kB0) {
46+
for (kStep = -2; kStep >= -10; kStep -= 4) {
47+
{
48+
PRINTF("\nTrying iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; "
49+
"jB1=%d; kA0=%d; kA1=%d; kB0=%d; kB1=%d; iStep=%d; "
50+
"jStep=%d; kStep=%d;\n",
51+
iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1,
52+
iStep, jStep, kStep);
53+
fail = fail || (test() == 0);
54+
}
55+
}
56+
}
57+
}
58+
}
59+
}
60+
}
61+
}
62+
}
63+
64+
return fail;
65+
}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// RUN: %libomp-compile-and-run
2+
3+
// Non-rectangular loop collapsing.
4+
//
5+
// Nested loops conform to OpenMP 5.2 standard,
6+
// inner loops bounds may depend on outer loops induction variables.
7+
8+
#define LOOP_TYPES int
9+
#define COMPARE0 >
10+
#define COMPARE1 >=
11+
#define COMPARE2 >
12+
13+
#define DLOOP_GT0
14+
#define DLOOP_GE1
15+
#define DLOOP_GT2
16+
17+
#define LOOP \
18+
for (i = iLB; i COMPARE0 iUB; i += iStep) \
19+
for (j = jA0; j COMPARE1 jB0; j += jStep) \
20+
for (k = kA0; k COMPARE2 kB0; k += kStep)
21+
#include "collapse_test.inc"
22+
23+
int main() {
24+
int fail;
25+
26+
iLB = 3;
27+
iUB = -2;
28+
jA0 = -3;
29+
jA1 = 0;
30+
jB0 = -6;
31+
jB1 = 0;
32+
kA0 = -2;
33+
kA1 = 0;
34+
kB0 = -4;
35+
kB1 = 0;
36+
iStep = -1;
37+
jStep = -1;
38+
kStep = -4;
39+
PRINTF("\nOne off iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; jB1=%d; kA0=%d; "
40+
"kA1=%d; kB0=%d; kB1=%d; iStep=%d; jStep=%d; kStep=%d;\n",
41+
iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1, iStep, jStep, kStep);
42+
fail = (test() == 0);
43+
44+
if (!fail) {
45+
46+
for (iStep = -3; iStep >= -6; iStep -= 2) {
47+
for (jA0 = -3; jA0 <= 10; jA0 += 3) {
48+
for (jB0 = -6; jB0 <= 6; jB0 += 3) {
49+
for (jStep = -1; jStep >= -10; jStep -= 2) {
50+
for (kA0 = -2; kA0 <= 4; ++kA0) {
51+
for (kB0 = -4; kB0 <= 2; ++kB0) {
52+
for (kStep = -2; kStep >= -10; kStep -= 4) {
53+
{
54+
PRINTF("\nTrying iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; "
55+
"jB1=%d; kA0=%d; kA1=%d; kB0=%d; kB1=%d; iStep=%d; "
56+
"jStep=%d; kStep=%d;\n",
57+
iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1,
58+
iStep, jStep, kStep);
59+
fail = fail || (test() == 0);
60+
}
61+
}
62+
}
63+
}
64+
}
65+
}
66+
}
67+
}
68+
}
69+
70+
return fail;
71+
}

0 commit comments

Comments
 (0)