Skip to content

Commit c9d8ff0

Browse files
committed
[OpenACC][CIR] Implement 'num_gangs' lowering for combined constructs
Another simple one, most of the work is in writing the tests, applies to the compute operation.
1 parent 1b13849 commit c9d8ff0

File tree

2 files changed

+95
-3
lines changed

2 files changed

+95
-3
lines changed

clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -339,10 +339,10 @@ class OpenACCClauseCIREmitter final
339339

340340
operation.addNumGangsOperands(builder.getContext(), values,
341341
lastDeviceTypeValues);
342+
} else if constexpr (isCombinedType<OpTy>) {
343+
applyToComputeOp(clause);
342344
} else {
343-
// TODO: When we've implemented this for everything, switch this to an
344-
// unreachable. Combined constructs remain.
345-
return clauseNotImplemented(clause);
345+
llvm_unreachable("Unknown construct kind in VisitNumGangsClause");
346346
}
347347
}
348348

clang/test/CIR/CodeGenOpenACC/combined.cpp

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -721,4 +721,96 @@ extern "C" void acc_combined(int N, int cond) {
721721
// CHECK-NEXT: acc.yield
722722
// CHECK-NEXT: } loc
723723

724+
#pragma acc parallel loop num_gangs(1)
725+
for(unsigned I = 0; I < N; ++I);
726+
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
727+
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32
728+
// CHECK-NEXT: acc.parallel combined(loop) num_gangs({%[[ONE_CAST]] : si32}) {
729+
// CHECK-NEXT: acc.loop combined(parallel) {
730+
// CHECK: acc.yield
731+
// CHECK-NEXT: } loc
732+
// CHECK-NEXT: acc.yield
733+
// CHECK-NEXT: } loc
734+
735+
#pragma acc kernels loop num_gangs(cond)
736+
for(unsigned I = 0; I < N; ++I);
737+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
738+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
739+
// CHECK-NEXT: acc.kernels combined(loop) num_gangs({%[[CONV_CAST]] : si32}) {
740+
// CHECK-NEXT: acc.loop combined(kernels) {
741+
// CHECK: acc.yield
742+
// CHECK-NEXT: } loc
743+
// CHECK-NEXT: acc.terminator
744+
// CHECK-NEXT: } loc
745+
746+
#pragma acc parallel loop num_gangs(1, cond, 2)
747+
for(unsigned I = 0; I < N; ++I);
748+
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
749+
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32
750+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
751+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
752+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
753+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
754+
// CHECK-NEXT: acc.parallel combined(loop) num_gangs({%[[ONE_CAST]] : si32, %[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32}) {
755+
// CHECK-NEXT: acc.loop combined(parallel) {
756+
// CHECK: acc.yield
757+
// CHECK-NEXT: } loc
758+
// CHECK-NEXT: acc.yield
759+
// CHECK-NEXT: } loc
760+
761+
#pragma acc kernels loop num_gangs(1) device_type(radeon) num_gangs(cond)
762+
for(unsigned I = 0; I < N; ++I);
763+
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
764+
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32
765+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
766+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
767+
// CHECK-NEXT: acc.kernels combined(loop) num_gangs({%[[ONE_CAST]] : si32}, {%[[CONV_CAST]] : si32} [#acc.device_type<radeon>]) {
768+
// CHECK-NEXT: acc.loop combined(kernels) {
769+
// CHECK: acc.yield
770+
// CHECK-NEXT: } loc
771+
// CHECK-NEXT: acc.terminator
772+
// CHECK-NEXT: } loc
773+
774+
#pragma acc parallel loop num_gangs(1, cond, 2) device_type(radeon) num_gangs(4, 5, 6)
775+
for(unsigned I = 0; I < N; ++I);
776+
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
777+
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32
778+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
779+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
780+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
781+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
782+
// CHECK-NEXT: %[[FOUR_LITERAL:.*]] = cir.const #cir.int<4> : !s32i
783+
// CHECK-NEXT: %[[FOUR_CAST:.*]] = builtin.unrealized_conversion_cast %[[FOUR_LITERAL]] : !s32i to si32
784+
// CHECK-NEXT: %[[FIVE_LITERAL:.*]] = cir.const #cir.int<5> : !s32i
785+
// CHECK-NEXT: %[[FIVE_CAST:.*]] = builtin.unrealized_conversion_cast %[[FIVE_LITERAL]] : !s32i to si32
786+
// CHECK-NEXT: %[[SIX_LITERAL:.*]] = cir.const #cir.int<6> : !s32i
787+
// CHECK-NEXT: %[[SIX_CAST:.*]] = builtin.unrealized_conversion_cast %[[SIX_LITERAL]] : !s32i to si32
788+
// CHECK-NEXT: acc.parallel combined(loop) num_gangs({%[[ONE_CAST]] : si32, %[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32}, {%[[FOUR_CAST]] : si32, %[[FIVE_CAST]] : si32, %[[SIX_CAST]] : si32} [#acc.device_type<radeon>])
789+
// CHECK-NEXT: acc.loop combined(parallel) {
790+
// CHECK: acc.yield
791+
// CHECK-NEXT: } loc
792+
// CHECK-NEXT: acc.yield
793+
// CHECK-NEXT: } loc
794+
795+
#pragma acc parallel loop num_gangs(1, cond, 2) device_type(radeon, nvidia) num_gangs(4, 5, 6)
796+
for(unsigned I = 0; I < N; ++I);
797+
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
798+
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32
799+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
800+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
801+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
802+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
803+
// CHECK-NEXT: %[[FOUR_LITERAL:.*]] = cir.const #cir.int<4> : !s32i
804+
// CHECK-NEXT: %[[FOUR_CAST:.*]] = builtin.unrealized_conversion_cast %[[FOUR_LITERAL]] : !s32i to si32
805+
// CHECK-NEXT: %[[FIVE_LITERAL:.*]] = cir.const #cir.int<5> : !s32i
806+
// CHECK-NEXT: %[[FIVE_CAST:.*]] = builtin.unrealized_conversion_cast %[[FIVE_LITERAL]] : !s32i to si32
807+
// CHECK-NEXT: %[[SIX_LITERAL:.*]] = cir.const #cir.int<6> : !s32i
808+
// CHECK-NEXT: %[[SIX_CAST:.*]] = builtin.unrealized_conversion_cast %[[SIX_LITERAL]] : !s32i to si32
809+
// CHECK-NEXT: acc.parallel combined(loop) num_gangs({%[[ONE_CAST]] : si32, %[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32}, {%[[FOUR_CAST]] : si32, %[[FIVE_CAST]] : si32, %[[SIX_CAST]] : si32} [#acc.device_type<radeon>], {%[[FOUR_CAST]] : si32, %[[FIVE_CAST]] : si32, %[[SIX_CAST]] : si32} [#acc.device_type<nvidia>])
810+
// CHECK-NEXT: acc.loop combined(parallel) {
811+
// CHECK: acc.yield
812+
// CHECK-NEXT: } loc
813+
// CHECK-NEXT: acc.yield
814+
// CHECK-NEXT: } loc
815+
724816
}

0 commit comments

Comments
 (0)