-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[OpenACC][CIR] Implement 'async' lowering. #136626
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Async acts just like num_workers/vector_length in that it gets a new variant per device_type and is lowered as an operand. However, it has one additional complication, in that it can have a variant that has no argument, which produces an attribute with the correct devicetype. Additionally, this syncronizes us with the implementation of flang, which prohibits multiple 'async' clauses per-device_type.
@llvm/pr-subscribers-clangir @llvm/pr-subscribers-clang Author: Erich Keane (erichkeane) ChangesAsync acts just like num_workers/vector_length in that it gets a new variant per device_type and is lowered as an operand. However, it has one additional complication, in that it can have a variant that has no argument, which produces an attribute with the correct devicetype. Additionally, this syncronizes us with the implementation of flang, Full diff: https://github.com/llvm/llvm-project/pull/136626.diff 6 Files Affected:
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
index e7dd2e74b0864..82fbb49db3bc8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
@@ -97,11 +97,13 @@ class OpenACCClauseCIREmitter final
// Handle a clause affected by the 'device-type' to the point that they need
// to have the attributes added in the correct/corresponding order, such as
- // 'num_workers' or 'vector_length' on a compute construct.
- mlir::ArrayAttr
- handleDeviceTypeAffectedClause(mlir::ArrayAttr existingDeviceTypes,
- mlir::Value argument,
- mlir::MutableOperandRange &argCollection) {
+ // 'num_workers' or 'vector_length' on a compute construct. For cases where we
+ // don't have an argument that needs to be added to an additional one (such as
+ // asyncOnly) we can use this with 'argument' as std::nullopt.
+ mlir::ArrayAttr handleDeviceTypeAffectedClause(
+ mlir::ArrayAttr existingDeviceTypes,
+ std::optional<mlir::Value> argument = std::nullopt,
+ mlir::MutableOperandRange *argCollection = nullptr) {
llvm::SmallVector<mlir::Attribute> deviceTypes;
// Collect the 'existing' device-type attributes so we can re-create them
@@ -120,13 +122,19 @@ class OpenACCClauseCIREmitter final
lastDeviceTypeClause->getArchitectures()) {
deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
builder.getContext(), decodeDeviceType(arch.getIdentifierInfo())));
- argCollection.append(argument);
+ if (argument) {
+ assert(argCollection);
+ argCollection->append(*argument);
+ }
}
} else {
// Else, we just add a single for 'none'.
deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
builder.getContext(), mlir::acc::DeviceType::None));
- argCollection.append(argument);
+ if (argument) {
+ assert(argCollection);
+ argCollection->append(*argument);
+ }
}
return mlir::ArrayAttr::get(builder.getContext(), deviceTypes);
@@ -205,7 +213,7 @@ class OpenACCClauseCIREmitter final
mlir::MutableOperandRange range = operation.getNumWorkersMutable();
operation.setNumWorkersDeviceTypeAttr(handleDeviceTypeAffectedClause(
operation.getNumWorkersDeviceTypeAttr(),
- createIntExpr(clause.getIntExpr()), range));
+ createIntExpr(clause.getIntExpr()), &range));
} else if constexpr (isOneOfTypes<OpTy, SerialOp>) {
llvm_unreachable("num_workers not valid on serial");
} else {
@@ -218,7 +226,7 @@ class OpenACCClauseCIREmitter final
mlir::MutableOperandRange range = operation.getVectorLengthMutable();
operation.setVectorLengthDeviceTypeAttr(handleDeviceTypeAffectedClause(
operation.getVectorLengthDeviceTypeAttr(),
- createIntExpr(clause.getIntExpr()), range));
+ createIntExpr(clause.getIntExpr()), &range));
} else if constexpr (isOneOfTypes<OpTy, SerialOp>) {
llvm_unreachable("vector_length not valid on serial");
} else {
@@ -226,6 +234,22 @@ class OpenACCClauseCIREmitter final
}
}
+ void VisitAsyncClause(const OpenACCAsyncClause &clause) {
+ if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) {
+ if (!clause.hasIntExpr()) {
+ operation.setAsyncOnlyAttr(
+ handleDeviceTypeAffectedClause(operation.getAsyncOnlyAttr()));
+ } else {
+ mlir::MutableOperandRange range = operation.getAsyncOperandsMutable();
+ operation.setAsyncOperandsDeviceTypeAttr(handleDeviceTypeAffectedClause(
+ operation.getAsyncOperandsDeviceTypeAttr(),
+ createIntExpr(clause.getIntExpr()), &range));
+ }
+ } else {
+ return clauseNotImplemented(clause);
+ }
+ }
+
void VisitSelfClause(const OpenACCSelfClause &clause) {
if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) {
if (clause.isEmptySelfClause()) {
diff --git a/clang/lib/Sema/SemaOpenACCClause.cpp b/clang/lib/Sema/SemaOpenACCClause.cpp
index 3694a831b76de..ed437ac62e332 100644
--- a/clang/lib/Sema/SemaOpenACCClause.cpp
+++ b/clang/lib/Sema/SemaOpenACCClause.cpp
@@ -639,6 +639,9 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitVectorLengthClause(
OpenACCClause *SemaOpenACCClauseVisitor::VisitAsyncClause(
SemaOpenACC::OpenACCParsedClause &Clause) {
+ if (DisallowSinceLastDeviceType<OpenACCAsyncClause>(Clause))
+ return nullptr;
+
assert(Clause.getNumIntExprs() < 2 &&
"Invalid number of expressions for Async");
return OpenACCAsyncClause::Create(
diff --git a/clang/test/CIR/CodeGenOpenACC/kernels.c b/clang/test/CIR/CodeGenOpenACC/kernels.c
index d2da1d18f1534..1744acf0ab223 100644
--- a/clang/test/CIR/CodeGenOpenACC/kernels.c
+++ b/clang/test/CIR/CodeGenOpenACC/kernels.c
@@ -210,5 +210,51 @@ void acc_kernels(int cond) {
// CHECK-NEXT: acc.terminator
// CHECK-NEXT: } loc
+#pragma acc kernels async
+ {}
+ // CHECK-NEXT: acc.kernels {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc kernels async(cond)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.kernels async(%[[CONV_CAST]] : si32) {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels async device_type(nvidia, radeon) async
+ {}
+ // CHECK-NEXT: acc.kernels {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
+#pragma acc kernels async(3) device_type(nvidia, radeon) async(cond)
+ {}
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.kernels async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels async device_type(nvidia, radeon) async(cond)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.kernels async(%[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc kernels async(3) device_type(nvidia, radeon) async
+ {}
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: acc.kernels async(%[[THREE_CAST]] : si32) {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
// CHECK-NEXT: cir.return
}
diff --git a/clang/test/CIR/CodeGenOpenACC/parallel.c b/clang/test/CIR/CodeGenOpenACC/parallel.c
index 61dccc591c252..892d931c880e7 100644
--- a/clang/test/CIR/CodeGenOpenACC/parallel.c
+++ b/clang/test/CIR/CodeGenOpenACC/parallel.c
@@ -209,5 +209,51 @@ void acc_parallel(int cond) {
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc
+#pragma acc parallel async
+ {}
+ // CHECK-NEXT: acc.parallel {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc parallel async(cond)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.parallel async(%[[CONV_CAST]] : si32) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc parallel async device_type(nvidia, radeon) async
+ {}
+ // CHECK-NEXT: acc.parallel {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
+#pragma acc parallel async(3) device_type(nvidia, radeon) async(cond)
+ {}
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.parallel async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc parallel async device_type(nvidia, radeon) async(cond)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.parallel async(%[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc parallel async(3) device_type(nvidia, radeon) async
+ {}
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: acc.parallel async(%[[THREE_CAST]] : si32) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
// CHECK-NEXT: cir.return
}
diff --git a/clang/test/CIR/CodeGenOpenACC/serial.c b/clang/test/CIR/CodeGenOpenACC/serial.c
index b72f44a2ea473..094958f0e3b23 100644
--- a/clang/test/CIR/CodeGenOpenACC/serial.c
+++ b/clang/test/CIR/CodeGenOpenACC/serial.c
@@ -106,5 +106,51 @@ void acc_serial(int cond) {
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc
+#pragma acc serial async
+ {}
+ // CHECK-NEXT: acc.serial {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc serial async(cond)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.serial async(%[[CONV_CAST]] : si32) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc serial async device_type(nvidia, radeon) async
+ {}
+ // CHECK-NEXT: acc.serial {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
+#pragma acc serial async(3) device_type(nvidia, radeon) async(cond)
+ {}
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.serial async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc serial async device_type(nvidia, radeon) async(cond)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.serial async(%[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc serial async(3) device_type(nvidia, radeon) async
+ {}
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: acc.serial async(%[[THREE_CAST]] : si32) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
// CHECK-NEXT: cir.return
}
diff --git a/clang/test/SemaOpenACC/compute-construct-async-clause.c b/clang/test/SemaOpenACC/compute-construct-async-clause.c
index 4895d7f2209bb..4ca963713254c 100644
--- a/clang/test/SemaOpenACC/compute-construct-async-clause.c
+++ b/clang/test/SemaOpenACC/compute-construct-async-clause.c
@@ -20,6 +20,48 @@ void Test() {
#pragma acc serial async(1, 2)
while(1);
+ // expected-error@+2{{OpenACC 'async' clause cannot appear more than once on a 'kernels' directive}}
+ // expected-note@+1{{previous clause is here}}
+#pragma acc kernels async async
+ while(1);
+
+ // expected-error@+2{{OpenACC 'async' clause cannot appear more than once on a 'kernels' directive}}
+ // expected-note@+1{{previous clause is here}}
+#pragma acc kernels async(1) async(2)
+ while(1);
+
+ // expected-error@+2{{OpenACC 'async' clause cannot appear more than once on a 'parallel' directive}}
+ // expected-note@+1{{previous clause is here}}
+#pragma acc parallel async(1) async(2)
+ while(1);
+
+ // expected-error@+2{{OpenACC 'async' clause cannot appear more than once on a 'serial' directive}}
+ // expected-note@+1{{previous clause is here}}
+#pragma acc serial async(1) async(2)
+ while(1);
+
+ // expected-error@+3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'kernels' directive}}
+ // expected-note@+2{{previous clause is here}}
+ // expected-note@+1{{previous clause is here}}
+#pragma acc kernels async(1) device_type(*) async(1) async(2)
+ while(1);
+ // expected-error@+3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'parallel' directive}}
+ // expected-note@+2{{previous clause is here}}
+ // expected-note@+1{{previous clause is here}}
+#pragma acc parallel async device_type(*) async async
+ while(1);
+ // expected-error@+3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'serial' directive}}
+ // expected-note@+2{{previous clause is here}}
+ // expected-note@+1{{previous clause is here}}
+#pragma acc serial async(1) device_type(*) async async(2)
+ while(1);
+
+ // expected-error@+3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'parallel' directive}}
+ // expected-note@+2{{previous clause is here}}
+ // expected-note@+1{{previous clause is here}}
+#pragma acc parallel device_type(*) async async
+ while(1);
+
struct NotConvertible{} NC;
// expected-error@+1{{OpenACC clause 'async' requires expression of integer type ('struct NotConvertible' invalid)}}
#pragma acc parallel async(NC)
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good to me. Just a couple of nits regarding the comment block.
Async acts just like num_workers/vector_length in that it gets a new variant per device_type and is lowered as an operand. However, it has one additional complication, in that it can have a variant that has no argument, which produces an attribute with the correct devicetype. Additionally, this syncronizes us with the implementation of flang, which prohibits multiple 'async' clauses per-device_type.
Async acts just like num_workers/vector_length in that it gets a new variant per device_type and is lowered as an operand. However, it has one additional complication, in that it can have a variant that has no argument, which produces an attribute with the correct devicetype. Additionally, this syncronizes us with the implementation of flang, which prohibits multiple 'async' clauses per-device_type.
Async acts just like num_workers/vector_length in that it gets a new variant per device_type and is lowered as an operand. However, it has one additional complication, in that it can have a variant that has no argument, which produces an attribute with the correct devicetype. Additionally, this syncronizes us with the implementation of flang, which prohibits multiple 'async' clauses per-device_type.
Async acts just like num_workers/vector_length in that it gets a new variant per device_type and is lowered as an operand.
However, it has one additional complication, in that it can have a variant that has no argument, which produces an attribute with the correct devicetype.
Additionally, this syncronizes us with the implementation of flang,
which prohibits multiple 'async' clauses per-device_type.