Skip to content

Commit 24db9b5

Browse files
committed
[OpenACC][CIR] Implement 'gang' for combined constructs
Mostly just adding the tests, the implementation is appropriately trivial.
1 parent f4853d7 commit 24db9b5

File tree

2 files changed

+85
-3
lines changed

2 files changed

+85
-3
lines changed

clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -517,10 +517,10 @@ class OpenACCClauseCIREmitter final
517517
operation.addGangOperands(builder.getContext(), lastDeviceTypeValues,
518518
argTypes, values);
519519
}
520+
} else if constexpr (isCombinedType<OpTy>) {
521+
applyToLoopOp(clause);
520522
} else {
521-
// TODO: When we've implemented this for everything, switch this to an
522-
// unreachable. Combined constructs remain.
523-
return clauseNotImplemented(clause);
523+
llvm_unreachable("Unknown construct kind in VisitGangClause");
524524
}
525525
}
526526
};

clang/test/CIR/CodeGenOpenACC/combined.cpp

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,4 +319,86 @@ extern "C" void acc_combined(int N) {
319319
// CHECK-NEXT: acc.yield
320320
// CHECK-NEXT: } loc
321321

322+
#pragma acc parallel loop gang
323+
for(unsigned I = 0; I < N; ++I);
324+
// CHECK-NEXT: acc.parallel combined(loop) {
325+
// CHECK-NEXT: acc.loop combined(parallel) gang {
326+
// CHECK: acc.yield
327+
// CHECK-NEXT: } loc
328+
// CHECK-NEXT: acc.yield
329+
// CHECK-NEXT: } loc
330+
#pragma acc parallel loop gang device_type(nvidia) gang
331+
for(unsigned I = 0; I < N; ++I);
332+
// CHECK-NEXT: acc.parallel combined(loop) {
333+
// CHECK-NEXT: acc.loop combined(parallel) gang([#acc.device_type<none>, #acc.device_type<nvidia>]) {
334+
// CHECK: acc.yield
335+
// CHECK-NEXT: } loc
336+
// CHECK-NEXT: acc.yield
337+
// CHECK-NEXT: } loc
338+
#pragma acc parallel loop gang(dim:1) device_type(nvidia) gang(dim:2)
339+
for(unsigned I = 0; I < N; ++I);
340+
// CHECK-NEXT: acc.parallel combined(loop) {
341+
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
342+
// CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64
343+
// CHECK-NEXT: acc.loop combined(parallel) gang({dim=%[[ONE_CONST]] : i64}, {dim=%[[TWO_CONST]] : i64} [#acc.device_type<nvidia>]) {
344+
// CHECK: acc.yield
345+
// CHECK-NEXT: } loc
346+
// CHECK-NEXT: acc.yield
347+
// CHECK-NEXT: } loc
348+
#pragma acc parallel loop gang(static:N, dim: 1) device_type(nvidia, radeon) gang(static:*, dim : 2)
349+
for(unsigned I = 0; I < N; ++I);
350+
// CHECK-NEXT: acc.parallel combined(loop) {
351+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
352+
// CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
353+
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
354+
// CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
355+
// CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64
356+
// CHECK-NEXT: acc.loop combined(parallel) gang({static=%[[N_CONV]] : si32, dim=%[[ONE_CONST]] : i64}, {static=%[[STAR_CONST]] : i64, dim=%[[TWO_CONST]] : i64} [#acc.device_type<nvidia>], {static=%[[STAR_CONST]] : i64, dim=%[[TWO_CONST]] : i64} [#acc.device_type<radeon>]) {
357+
// CHECK: acc.yield
358+
// CHECK-NEXT: } loc
359+
// CHECK-NEXT: acc.yield
360+
// CHECK-NEXT: } loc
361+
362+
#pragma acc kernels loop gang(num:N) device_type(nvidia, radeon) gang(num:N)
363+
for(unsigned I = 0; I < N; ++I);
364+
// CHECK-NEXT: acc.kernels combined(loop) {
365+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
366+
// CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
367+
// CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
368+
// CHECK-NEXT: %[[N_CONV2:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD2]] : !s32i to si32
369+
// CHECK-NEXT: acc.loop combined(kernels) gang({num=%[[N_CONV]] : si32}, {num=%[[N_CONV2]] : si32} [#acc.device_type<nvidia>], {num=%[[N_CONV2]] : si32} [#acc.device_type<radeon>]) {
370+
// CHECK: acc.yield
371+
// CHECK-NEXT: } loc
372+
// CHECK-NEXT: acc.terminator
373+
// CHECK-NEXT: } loc
374+
#pragma acc kernels loop gang(static:N) device_type(nvidia) gang(static:*)
375+
for(unsigned I = 0; I < N; ++I);
376+
// CHECK-NEXT: acc.kernels combined(loop) {
377+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
378+
// CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
379+
// CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
380+
// CHECK-NEXT: acc.loop combined(kernels) gang({static=%[[N_CONV]] : si32}, {static=%[[STAR_CONST]] : i64} [#acc.device_type<nvidia>]) {
381+
// CHECK: acc.yield
382+
// CHECK-NEXT: } loc
383+
// CHECK-NEXT: acc.terminator
384+
// CHECK-NEXT: } loc
385+
#pragma acc kernels loop gang(static:N, num: N + 1) device_type(nvidia) gang(static:*, num : N + 2)
386+
for(unsigned I = 0; I < N; ++I);
387+
// CHECK-NEXT: acc.kernels combined(loop) {
388+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
389+
// CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
390+
// CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
391+
// CHECK-NEXT: %[[CIR_ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
392+
// CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[CIR_ONE_CONST]]) nsw : !s32i
393+
// CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
394+
// CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
395+
// CHECK-NEXT: %[[N_LOAD3:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
396+
// CHECK-NEXT: %[[CIR_TWO_CONST:.*]] = cir.const #cir.int<2> : !s32i
397+
// CHECK-NEXT: %[[N_PLUS_TWO:.*]] = cir.binop(add, %[[N_LOAD3]], %[[CIR_TWO_CONST]]) nsw : !s32i
398+
// CHECK-NEXT: %[[N_PLUS_TWO_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_TWO]] : !s32i to si32
399+
// CHECK-NEXT: acc.loop combined(kernels) gang({static=%[[N_CONV]] : si32, num=%[[N_PLUS_ONE_CONV]] : si32}, {static=%[[STAR_CONST]] : i64, num=%[[N_PLUS_TWO_CONV]] : si32} [#acc.device_type<nvidia>]) {
400+
// CHECK: acc.yield
401+
// CHECK-NEXT: } loc
402+
// CHECK-NEXT: acc.terminator
403+
// CHECK-NEXT: } loc
322404
}

0 commit comments

Comments
 (0)