Skip to content

Commit 574f77a

Browse files
authored
[OpenACC][CIR] Add parallelism determ. to all acc.loops (#143751)
PR #143720 adds a requirement to the ACC dialect that every acc.loop must have a seq, independent, or auto attribute for the 'default' device_type. The standard has rules for how this can be intuited: orphan/parallel/parallel loop: independent kernels/kernels loop: auto serial/serial loop: seq, unless there is a gang/worker/vector, at which point it should be 'auto'. This patch implements all of this rule as a 'cleanup' step on the IR generation for combined/loop operations. Note that the test impact is much less since I inadvertently have my 'operation' terminating curley matching the end curley from 'attribute' instead of the front of the line, so I've added sufficient tests to ensure I captured the above.
1 parent 02b6849 commit 574f77a

File tree

7 files changed

+232
-17
lines changed

7 files changed

+232
-17
lines changed

clang/lib/CIR/CodeGen/CIRGenFunction.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@ namespace {
3434
class ScalarExprEmitter;
3535
} // namespace
3636

37+
namespace mlir {
38+
namespace acc {
39+
class LoopOp;
40+
} // namespace acc
41+
} // namespace mlir
42+
3743
namespace clang::CIRGen {
3844

3945
class CIRGenFunction : public CIRGenTypeCache {
@@ -1082,6 +1088,12 @@ class CIRGenFunction : public CIRGenTypeCache {
10821088
OpenACCDirectiveKind dirKind, SourceLocation dirLoc,
10831089
ArrayRef<const OpenACCClause *> clauses);
10841090

1091+
// The OpenACC LoopOp requires that we have auto, seq, or independent on all
1092+
// LoopOp operations for the 'none' device type case. This function checks if
1093+
// the LoopOp has one, else it updates it to have one.
1094+
void updateLoopOpParallelism(mlir::acc::LoopOp &op, bool isOrphan,
1095+
OpenACCDirectiveKind dk);
1096+
10851097
public:
10861098
mlir::LogicalResult
10871099
emitOpenACCComputeConstruct(const OpenACCComputeConstruct &s);

clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,8 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct(
102102

103103
emitOpenACCClauses(computeOp, loopOp, dirKind, dirLoc, clauses);
104104

105+
updateLoopOpParallelism(loopOp, /*isOrphan=*/false, dirKind);
106+
105107
builder.create<TermOp>(end);
106108
}
107109

clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,36 @@ using namespace clang::CIRGen;
2222
using namespace cir;
2323
using namespace mlir::acc;
2424

25+
void CIRGenFunction::updateLoopOpParallelism(mlir::acc::LoopOp &op,
26+
bool isOrphan,
27+
OpenACCDirectiveKind dk) {
28+
// Check that at least one of auto, independent, or seq is present
29+
// for the device-independent default clauses.
30+
if (op.hasParallelismFlag(mlir::acc::DeviceType::None))
31+
return;
32+
33+
switch (dk) {
34+
default:
35+
llvm_unreachable("Invalid parent directive kind");
36+
case OpenACCDirectiveKind::Invalid:
37+
case OpenACCDirectiveKind::Parallel:
38+
case OpenACCDirectiveKind::ParallelLoop:
39+
op.addIndependent(builder.getContext(), {});
40+
return;
41+
case OpenACCDirectiveKind::Kernels:
42+
case OpenACCDirectiveKind::KernelsLoop:
43+
op.addAuto(builder.getContext(), {});
44+
return;
45+
case OpenACCDirectiveKind::Serial:
46+
case OpenACCDirectiveKind::SerialLoop:
47+
if (op.hasDefaultGangWorkerVector())
48+
op.addAuto(builder.getContext(), {});
49+
else
50+
op.addSeq(builder.getContext(), {});
51+
return;
52+
};
53+
}
54+
2555
mlir::LogicalResult
2656
CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) {
2757
mlir::Location start = getLoc(s.getSourceRange().getBegin());
@@ -90,6 +120,9 @@ CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) {
90120
emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(),
91121
s.clauses());
92122

123+
updateLoopOpParallelism(op, s.isOrphanedLoopConstruct(),
124+
s.getParentComputeConstructKind());
125+
93126
mlir::LogicalResult stmtRes = mlir::success();
94127
// Emit body.
95128
{

clang/test/CIR/CodeGenOpenACC/combined.cpp

Lines changed: 62 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ extern "C" void acc_combined(int N, int cond) {
7474
// CHECK: acc.serial combined(loop) {
7575
// CHECK: acc.loop combined(serial) {
7676
// CHECK: acc.yield
77-
// CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
77+
// CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>, #acc.device_type<none>]} loc
7878
// CHECK: acc.yield
7979
// CHECK-NEXT: } loc
8080
#pragma acc kernels loop seq device_type(nvidia, radeon)
@@ -99,7 +99,7 @@ extern "C" void acc_combined(int N, int cond) {
9999
// CHECK: acc.serial combined(loop) {
100100
// CHECK: acc.loop combined(serial) {
101101
// CHECK: acc.yield
102-
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
102+
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]} loc
103103
// CHECK: acc.yield
104104
// CHECK-NEXT: } loc
105105
#pragma acc kernels loop auto device_type(nvidia, radeon)
@@ -124,7 +124,7 @@ extern "C" void acc_combined(int N, int cond) {
124124
// CHECK: acc.serial combined(loop) {
125125
// CHECK: acc.loop combined(serial) {
126126
// CHECK: acc.yield
127-
// CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
127+
// CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]} loc
128128
// CHECK: acc.yield
129129
// CHECK-NEXT: } loc
130130
#pragma acc kernels loop independent device_type(nvidia, radeon)
@@ -143,7 +143,7 @@ extern "C" void acc_combined(int N, int cond) {
143143
// CHECK: acc.parallel combined(loop) {
144144
// CHECK: acc.loop combined(parallel) {
145145
// CHECK: acc.yield
146-
// CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>]}
146+
// CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]}
147147
// CHECK: acc.yield
148148
// CHECK-NEXT: } loc
149149

@@ -154,7 +154,7 @@ extern "C" void acc_combined(int N, int cond) {
154154
// CHECK: acc.serial combined(loop) {
155155
// CHECK: acc.loop combined(serial) {
156156
// CHECK: acc.yield
157-
// CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>]}
157+
// CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]}
158158
// CHECK: acc.yield
159159
// CHECK-NEXT: } loc
160160

@@ -165,7 +165,7 @@ extern "C" void acc_combined(int N, int cond) {
165165
// CHECK: acc.kernels combined(loop) {
166166
// CHECK: acc.loop combined(kernels) {
167167
// CHECK: acc.yield
168-
// CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]}
168+
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>], collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]}
169169
// CHECK: acc.terminator
170170
// CHECK-NEXT: } loc
171171
#pragma acc parallel loop collapse(1) device_type(radeon, nvidia) collapse(2) device_type(host) collapse(3)
@@ -175,7 +175,7 @@ extern "C" void acc_combined(int N, int cond) {
175175
// CHECK: acc.parallel combined(loop) {
176176
// CHECK: acc.loop combined(parallel) {
177177
// CHECK: acc.yield
178-
// CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>]}
178+
// CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>], independent = [#acc.device_type<none>]}
179179
// CHECK: acc.yield
180180
// CHECK-NEXT: } loc
181181

@@ -1184,4 +1184,59 @@ extern "C" void acc_combined_data_clauses(int *arg1, int *arg2) {
11841184
// CHECK-NEXT: } loc
11851185
// CHECK-NEXT: acc.detach accPtr(%[[ATTACH2]] : !cir.ptr<!cir.ptr<!s32i>>) async([#acc.device_type<host>]) {dataClause = #acc<data_clause acc_attach>, name = "arg2"}
11861186
// CHECK-NEXT: acc.detach accPtr(%[[ATTACH1]] : !cir.ptr<!cir.ptr<!s32i>>) async([#acc.device_type<host>]) {dataClause = #acc<data_clause acc_attach>, name = "arg1"}
1187+
1188+
// Checking the automatic-addition of parallelism clauses.
1189+
#pragma acc parallel loop
1190+
for(unsigned I = 0; I < 5; ++I);
1191+
// CHECK-NEXT: acc.parallel combined(loop) {
1192+
// CHECK-NEXT: acc.loop combined(parallel) {
1193+
// CHECK: acc.yield
1194+
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc
1195+
// CHECK-NEXT: acc.yield
1196+
// CHECK-NEXT: } loc
1197+
1198+
#pragma acc kernels loop
1199+
for(unsigned I = 0; I < 5; ++I);
1200+
// CHECK-NEXT: acc.kernels combined(loop) {
1201+
// CHECK-NEXT: acc.loop combined(kernels) {
1202+
// CHECK: acc.yield
1203+
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
1204+
// CHECK-NEXT: acc.terminator
1205+
// CHECK-NEXT: } loc
1206+
1207+
#pragma acc serial loop
1208+
for(unsigned I = 0; I < 5; ++I);
1209+
// CHECK-NEXT: acc.serial combined(loop) {
1210+
// CHECK-NEXT: acc.loop combined(serial) {
1211+
// CHECK: acc.yield
1212+
// CHECK-NEXT: } attributes {seq = [#acc.device_type<none>]} loc
1213+
// CHECK-NEXT: acc.yield
1214+
// CHECK-NEXT: } loc
1215+
1216+
#pragma acc serial loop worker
1217+
for(unsigned I = 0; I < 5; ++I);
1218+
// CHECK-NEXT: acc.serial combined(loop) {
1219+
// CHECK-NEXT: acc.loop combined(serial) worker {
1220+
// CHECK: acc.yield
1221+
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
1222+
// CHECK-NEXT: acc.yield
1223+
// CHECK-NEXT: } loc
1224+
1225+
#pragma acc serial loop vector
1226+
for(unsigned I = 0; I < 5; ++I);
1227+
// CHECK-NEXT: acc.serial combined(loop) {
1228+
// CHECK-NEXT: acc.loop combined(serial) vector {
1229+
// CHECK: acc.yield
1230+
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
1231+
// CHECK-NEXT: acc.yield
1232+
// CHECK-NEXT: } loc
1233+
1234+
#pragma acc serial loop gang
1235+
for(unsigned I = 0; I < 5; ++I);
1236+
// CHECK-NEXT: acc.serial combined(loop) {
1237+
// CHECK-NEXT: acc.loop combined(serial) gang {
1238+
// CHECK: acc.yield
1239+
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
1240+
// CHECK-NEXT: acc.yield
1241+
// CHECK-NEXT: } loc
11871242
}

clang/test/CIR/CodeGenOpenACC/loop.cpp

Lines changed: 91 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
4141
for(unsigned I = 0; I < N; ++I);
4242
// CHECK: acc.loop {
4343
// CHECK: acc.yield
44-
// CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
44+
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>], seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
4545
#pragma acc loop device_type(radeon) seq
4646
for(unsigned I = 0; I < N; ++I);
4747
// CHECK: acc.loop {
4848
// CHECK: acc.yield
49-
// CHECK-NEXT: } attributes {seq = [#acc.device_type<radeon>]} loc
49+
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>], seq = [#acc.device_type<radeon>]} loc
5050
#pragma acc loop seq device_type(nvidia, radeon)
5151
for(unsigned I = 0; I < N; ++I);
5252
// CHECK: acc.loop {
@@ -67,12 +67,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
6767
for(unsigned I = 0; I < N; ++I);
6868
// CHECK: acc.loop {
6969
// CHECK: acc.yield
70-
// CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
70+
// CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>, #acc.device_type<none>]} loc
7171
#pragma acc loop device_type(radeon) independent
7272
for(unsigned I = 0; I < N; ++I);
7373
// CHECK: acc.loop {
7474
// CHECK: acc.yield
75-
// CHECK-NEXT: } attributes {independent = [#acc.device_type<radeon>]} loc
75+
// CHECK-NEXT: } attributes {independent = [#acc.device_type<radeon>, #acc.device_type<none>]} loc
7676
#pragma acc loop independent device_type(nvidia, radeon)
7777
for(unsigned I = 0; I < N; ++I);
7878
// CHECK: acc.loop {
@@ -93,12 +93,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
9393
for(unsigned I = 0; I < N; ++I);
9494
// CHECK: acc.loop {
9595
// CHECK: acc.yield
96-
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
96+
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>], independent = [#acc.device_type<none>]} loc
9797
#pragma acc loop device_type(radeon) auto
9898
for(unsigned I = 0; I < N; ++I);
9999
// CHECK: acc.loop {
100100
// CHECK: acc.yield
101-
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<radeon>]} loc
101+
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<radeon>], independent = [#acc.device_type<none>]} loc
102102
#pragma acc loop auto device_type(nvidia, radeon)
103103
for(unsigned I = 0; I < N; ++I);
104104
// CHECK: acc.loop {
@@ -116,30 +116,30 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
116116
for(unsigned K = 0; K < N; ++K);
117117
// CHECK: acc.loop {
118118
// CHECK: acc.yield
119-
// CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>]}
119+
// CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]}
120120

121121
#pragma acc loop collapse(1) device_type(radeon) collapse (2)
122122
for(unsigned I = 0; I < N; ++I)
123123
for(unsigned J = 0; J < N; ++J)
124124
for(unsigned K = 0; K < N; ++K);
125125
// CHECK: acc.loop {
126126
// CHECK: acc.yield
127-
// CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>]}
127+
// CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>], independent = [#acc.device_type<none>]}
128128

129129
#pragma acc loop collapse(1) device_type(radeon, nvidia) collapse (2)
130130
for(unsigned I = 0; I < N; ++I)
131131
for(unsigned J = 0; J < N; ++J)
132132
for(unsigned K = 0; K < N; ++K);
133133
// CHECK: acc.loop {
134134
// CHECK: acc.yield
135-
// CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]}
135+
// CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>], independent = [#acc.device_type<none>]}
136136
#pragma acc loop collapse(1) device_type(radeon, nvidia) collapse(2) device_type(host) collapse(3)
137137
for(unsigned I = 0; I < N; ++I)
138138
for(unsigned J = 0; J < N; ++J)
139139
for(unsigned K = 0; K < N; ++K);
140140
// CHECK: acc.loop {
141141
// CHECK: acc.yield
142-
// CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>]}
142+
// CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>], independent = [#acc.device_type<none>]}
143143

144144
#pragma acc loop tile(1, 2, 3)
145145
for(unsigned I = 0; I < N; ++I)
@@ -392,4 +392,85 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
392392
// CHECK: acc.yield
393393
// CHECK-NEXT: } loc
394394
}
395+
// CHECK-NEXT: acc.terminator
396+
// CHECK-NEXT: } loc
397+
398+
// Checking the automatic-addition of parallelism clauses.
399+
#pragma acc loop
400+
for(unsigned I = 0; I < N; ++I);
401+
// CHECK-NEXT: acc.loop {
402+
// CHECK: acc.yield
403+
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc
404+
405+
#pragma acc parallel
406+
{
407+
// CHECK-NEXT: acc.parallel {
408+
#pragma acc loop
409+
for(unsigned I = 0; I < N; ++I);
410+
// CHECK-NEXT: acc.loop {
411+
// CHECK: acc.yield
412+
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc
413+
}
414+
// CHECK-NEXT: acc.yield
415+
// CHECK-NEXT: } loc
416+
417+
#pragma acc kernels
418+
{
419+
// CHECK-NEXT: acc.kernels {
420+
#pragma acc loop
421+
for(unsigned I = 0; I < N; ++I);
422+
// CHECK-NEXT: acc.loop {
423+
// CHECK: acc.yield
424+
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
425+
}
426+
// CHECK-NEXT: acc.terminator
427+
// CHECK-NEXT: } loc
428+
429+
#pragma acc serial
430+
{
431+
// CHECK-NEXT: acc.serial {
432+
#pragma acc loop
433+
for(unsigned I = 0; I < N; ++I);
434+
// CHECK-NEXT: acc.loop {
435+
// CHECK: acc.yield
436+
// CHECK-NEXT: } attributes {seq = [#acc.device_type<none>]} loc
437+
}
438+
// CHECK-NEXT: acc.yield
439+
// CHECK-NEXT: } loc
440+
441+
#pragma acc serial
442+
{
443+
// CHECK-NEXT: acc.serial {
444+
#pragma acc loop worker
445+
for(unsigned I = 0; I < N; ++I);
446+
// CHECK-NEXT: acc.loop worker {
447+
// CHECK: acc.yield
448+
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
449+
}
450+
// CHECK-NEXT: acc.yield
451+
// CHECK-NEXT: } loc
452+
453+
#pragma acc serial
454+
{
455+
// CHECK-NEXT: acc.serial {
456+
#pragma acc loop vector
457+
for(unsigned I = 0; I < N; ++I);
458+
// CHECK-NEXT: acc.loop vector {
459+
// CHECK: acc.yield
460+
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
461+
}
462+
// CHECK-NEXT: acc.yield
463+
// CHECK-NEXT: } loc
464+
465+
#pragma acc serial
466+
{
467+
// CHECK-NEXT: acc.serial {
468+
#pragma acc loop gang
469+
for(unsigned I = 0; I < N; ++I);
470+
// CHECK-NEXT: acc.loop gang {
471+
// CHECK: acc.yield
472+
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
473+
}
474+
// CHECK-NEXT: acc.yield
475+
// CHECK-NEXT: } loc
395476
}

mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2246,6 +2246,14 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
22462246
// device_types. This is for the case where there is no expression specified
22472247
// in a 'gang'.
22482248
void addEmptyGang(MLIRContext *, llvm::ArrayRef<DeviceType>);
2249+
2250+
// Return whether this LoopOp has an auto, seq, or independent for the
2251+
// specified device-type.
2252+
bool hasParallelismFlag(DeviceType);
2253+
2254+
// Return whether this LoopOp has a gang, worker, or vector applying to the
2255+
// 'default'/None device-type.
2256+
bool hasDefaultGangWorkerVector();
22492257
}];
22502258

22512259
let hasCustomAssemblyFormat = 1;

0 commit comments

Comments
 (0)