Skip to content

Commit a783edf

Browse files
committed
[OpenACC][CIR] 'tile' lowering for combined constructs
This clause requires that we attach it to the 'loop', and can generate variables, so this is the first loop clause to require that we properly set up the insertion location. This patch does so, as a part of lowering 'tile' correctly.
1 parent 0d51247 commit a783edf

File tree

2 files changed

+72
-4
lines changed

2 files changed

+72
-4
lines changed

clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,8 @@ class OpenACCClauseCIREmitter final
138138
template <typename U = void,
139139
typename = std::enable_if_t<isCombinedType<OpTy>, U>>
140140
void applyToLoopOp(const OpenACCClause &c) {
141-
// TODO OpenACC: we have to set the insertion scope here correctly still.
141+
mlir::OpBuilder::InsertionGuard guardCase(builder);
142+
builder.setInsertionPoint(operation.loopOp);
142143
OpenACCClauseCIREmitter<mlir::acc::LoopOp> loopEmitter{
143144
operation.loopOp, cgf, builder, dirKind, dirLoc};
144145
loopEmitter.lastDeviceTypeValues = lastDeviceTypeValues;
@@ -448,10 +449,10 @@ class OpenACCClauseCIREmitter final
448449

449450
operation.setTileForDeviceTypes(builder.getContext(),
450451
lastDeviceTypeValues, values);
452+
} else if constexpr (isCombinedType<OpTy>) {
453+
applyToLoopOp(clause);
451454
} else {
452-
// TODO: When we've implemented this for everything, switch this to an
453-
// unreachable. Combined constructs remain.
454-
return clauseNotImplemented(clause);
455+
llvm_unreachable("Unknown construct kind in VisitTileClause");
455456
}
456457
}
457458

clang/test/CIR/CodeGenOpenACC/combined.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,4 +252,71 @@ extern "C" void acc_combined(int N) {
252252
// CHECK-NEXT: acc.yield
253253
// CHECK-NEXT: } loc
254254

255+
#pragma acc parallel loop tile(1, 2, 3)
256+
for(unsigned I = 0; I < N; ++I)
257+
for(unsigned J = 0; J < N; ++J)
258+
for(unsigned K = 0; K < N; ++K);
259+
// CHECK-NEXT: acc.parallel combined(loop) {
260+
// CHECK: %[[ONE_CONST:.*]] = arith.constant 1 : i64
261+
// CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64
262+
// CHECK-NEXT: %[[THREE_CONST:.*]] = arith.constant 3 : i64
263+
// CHECK-NEXT: acc.loop combined(parallel) tile({%[[ONE_CONST]] : i64, %[[TWO_CONST]] : i64, %[[THREE_CONST]] : i64}) {
264+
// CHECK: acc.yield
265+
// CHECK-NEXT: } loc
266+
// CHECK-NEXT: acc.yield
267+
// CHECK-NEXT: } loc
268+
#pragma acc serial loop tile(2) device_type(radeon)
269+
for(unsigned I = 0; I < N; ++I)
270+
for(unsigned J = 0; J < N; ++J)
271+
for(unsigned K = 0; K < N; ++K);
272+
// CHECK-NEXT: acc.serial combined(loop) {
273+
// CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64
274+
// CHECK-NEXT: acc.loop combined(serial) tile({%[[TWO_CONST]] : i64}) {
275+
// CHECK: acc.yield
276+
// CHECK-NEXT: } loc
277+
// CHECK-NEXT: acc.yield
278+
// CHECK-NEXT: } loc
279+
#pragma acc kernels loop tile(2) device_type(radeon) tile (1, *)
280+
for(unsigned I = 0; I < N; ++I)
281+
for(unsigned J = 0; J < N; ++J)
282+
for(unsigned K = 0; K < N; ++K);
283+
// CHECK-NEXT: acc.kernels combined(loop) {
284+
// CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64
285+
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
286+
// CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
287+
// CHECK-NEXT: acc.loop combined(kernels) tile({%[[TWO_CONST]] : i64}, {%[[ONE_CONST]] : i64, %[[STAR_CONST]] : i64} [#acc.device_type<radeon>]) {
288+
// CHECK: acc.yield
289+
// CHECK-NEXT: } loc
290+
// CHECK-NEXT: acc.terminator
291+
// CHECK-NEXT: } loc
292+
#pragma acc parallel loop tile(*) device_type(radeon, nvidia) tile (1, 2)
293+
for(unsigned I = 0; I < N; ++I)
294+
for(unsigned J = 0; J < N; ++J)
295+
for(unsigned K = 0; K < N; ++K);
296+
// CHECK-NEXT: acc.parallel combined(loop) {
297+
// CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
298+
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
299+
// CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64
300+
// CHECK-NEXT: acc.loop combined(parallel) tile({%[[STAR_CONST]] : i64}, {%[[ONE_CONST]] : i64, %[[TWO_CONST]] : i64} [#acc.device_type<radeon>], {%[[ONE_CONST]] : i64, %[[TWO_CONST]] : i64} [#acc.device_type<nvidia>]) {
301+
// CHECK: acc.yield
302+
// CHECK-NEXT: } loc
303+
// CHECK-NEXT: acc.yield
304+
// CHECK-NEXT: } loc
305+
#pragma acc serial loop tile(1) device_type(radeon, nvidia) tile(2, 3) device_type(host) tile(*, *, *)
306+
for(unsigned I = 0; I < N; ++I)
307+
for(unsigned J = 0; J < N; ++J)
308+
for(unsigned K = 0; K < N; ++K);
309+
// CHECK-NEXT: acc.serial combined(loop) {
310+
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
311+
// CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64
312+
// CHECK-NEXT: %[[THREE_CONST:.*]] = arith.constant 3 : i64
313+
// CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
314+
// CHECK-NEXT: %[[STAR2_CONST:.*]] = arith.constant -1 : i64
315+
// CHECK-NEXT: %[[STAR3_CONST:.*]] = arith.constant -1 : i64
316+
// CHECK-NEXT: acc.loop combined(serial) tile({%[[ONE_CONST]] : i64}, {%[[TWO_CONST]] : i64, %[[THREE_CONST]] : i64} [#acc.device_type<radeon>], {%[[TWO_CONST]] : i64, %[[THREE_CONST]] : i64} [#acc.device_type<nvidia>], {%[[STAR_CONST]] : i64, %[[STAR2_CONST]] : i64, %[[STAR3_CONST]] : i64} [#acc.device_type<host>]) {
317+
// CHECK: acc.yield
318+
// CHECK-NEXT: } loc
319+
// CHECK-NEXT: acc.yield
320+
// CHECK-NEXT: } loc
321+
255322
}

0 commit comments

Comments
 (0)