Skip to content

[OpenACC][CIR] Add parallelism determ. to all acc.loops #143751

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions clang/lib/CIR/CodeGen/CIRGenFunction.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ namespace {
class ScalarExprEmitter;
} // namespace

namespace mlir {
namespace acc {
class LoopOp;
} // namespace acc
} // namespace mlir

namespace clang::CIRGen {

class CIRGenFunction : public CIRGenTypeCache {
Expand Down Expand Up @@ -1082,6 +1088,12 @@ class CIRGenFunction : public CIRGenTypeCache {
OpenACCDirectiveKind dirKind, SourceLocation dirLoc,
ArrayRef<const OpenACCClause *> clauses);

// The OpenACC LoopOp requires that we have auto, seq, or independent on all
// LoopOp operations for the 'none' device type case. This function checks if
// the LoopOp has one, else it updates it to have one.
void updateLoopOpParallelism(mlir::acc::LoopOp &op, bool isOrphan,
OpenACCDirectiveKind dk);

public:
mlir::LogicalResult
emitOpenACCComputeConstruct(const OpenACCComputeConstruct &s);
Expand Down
2 changes: 2 additions & 0 deletions clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct(

emitOpenACCClauses(computeOp, loopOp, dirKind, dirLoc, clauses);

updateLoopOpParallelism(loopOp, /*isOrphan=*/false, dirKind);

builder.create<TermOp>(end);
}

Expand Down
33 changes: 33 additions & 0 deletions clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,36 @@ using namespace clang::CIRGen;
using namespace cir;
using namespace mlir::acc;

void CIRGenFunction::updateLoopOpParallelism(mlir::acc::LoopOp &op,
bool isOrphan,
OpenACCDirectiveKind dk) {
// Check that at least one of auto, independent, or seq is present
// for the device-independent default clauses.
if (op.hasParallelismFlag(mlir::acc::DeviceType::None))
return;

switch (dk) {
default:
llvm_unreachable("Invalid parent directive kind");
case OpenACCDirectiveKind::Invalid:
case OpenACCDirectiveKind::Parallel:
case OpenACCDirectiveKind::ParallelLoop:
op.addIndependent(builder.getContext(), {});
return;
case OpenACCDirectiveKind::Kernels:
case OpenACCDirectiveKind::KernelsLoop:
op.addAuto(builder.getContext(), {});
return;
case OpenACCDirectiveKind::Serial:
case OpenACCDirectiveKind::SerialLoop:
if (op.hasDefaultGangWorkerVector())
op.addAuto(builder.getContext(), {});
else
op.addSeq(builder.getContext(), {});
return;
};
}

mlir::LogicalResult
CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) {
mlir::Location start = getLoc(s.getSourceRange().getBegin());
Expand Down Expand Up @@ -90,6 +120,9 @@ CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) {
emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(),
s.clauses());

updateLoopOpParallelism(op, s.isOrphanedLoopConstruct(),
s.getParentComputeConstructKind());

mlir::LogicalResult stmtRes = mlir::success();
// Emit body.
{
Expand Down
69 changes: 62 additions & 7 deletions clang/test/CIR/CodeGenOpenACC/combined.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ extern "C" void acc_combined(int N, int cond) {
// CHECK: acc.serial combined(loop) {
// CHECK: acc.loop combined(serial) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>, #acc.device_type<none>]} loc
// CHECK: acc.yield
// CHECK-NEXT: } loc
#pragma acc kernels loop seq device_type(nvidia, radeon)
Expand All @@ -99,7 +99,7 @@ extern "C" void acc_combined(int N, int cond) {
// CHECK: acc.serial combined(loop) {
// CHECK: acc.loop combined(serial) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]} loc
// CHECK: acc.yield
// CHECK-NEXT: } loc
#pragma acc kernels loop auto device_type(nvidia, radeon)
Expand All @@ -124,7 +124,7 @@ extern "C" void acc_combined(int N, int cond) {
// CHECK: acc.serial combined(loop) {
// CHECK: acc.loop combined(serial) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]} loc
// CHECK: acc.yield
// CHECK-NEXT: } loc
#pragma acc kernels loop independent device_type(nvidia, radeon)
Expand All @@ -143,7 +143,7 @@ extern "C" void acc_combined(int N, int cond) {
// CHECK: acc.parallel combined(loop) {
// CHECK: acc.loop combined(parallel) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>]}
// CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]}
// CHECK: acc.yield
// CHECK-NEXT: } loc

Expand All @@ -154,7 +154,7 @@ extern "C" void acc_combined(int N, int cond) {
// CHECK: acc.serial combined(loop) {
// CHECK: acc.loop combined(serial) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>]}
// CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]}
// CHECK: acc.yield
// CHECK-NEXT: } loc

Expand All @@ -165,7 +165,7 @@ extern "C" void acc_combined(int N, int cond) {
// CHECK: acc.kernels combined(loop) {
// CHECK: acc.loop combined(kernels) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]}
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>], collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]}
// CHECK: acc.terminator
// CHECK-NEXT: } loc
#pragma acc parallel loop collapse(1) device_type(radeon, nvidia) collapse(2) device_type(host) collapse(3)
Expand All @@ -175,7 +175,7 @@ extern "C" void acc_combined(int N, int cond) {
// CHECK: acc.parallel combined(loop) {
// CHECK: acc.loop combined(parallel) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>]}
// CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>], independent = [#acc.device_type<none>]}
// CHECK: acc.yield
// CHECK-NEXT: } loc

Expand Down Expand Up @@ -1184,4 +1184,59 @@ extern "C" void acc_combined_data_clauses(int *arg1, int *arg2) {
// CHECK-NEXT: } loc
// CHECK-NEXT: acc.detach accPtr(%[[ATTACH2]] : !cir.ptr<!cir.ptr<!s32i>>) async([#acc.device_type<host>]) {dataClause = #acc<data_clause acc_attach>, name = "arg2"}
// CHECK-NEXT: acc.detach accPtr(%[[ATTACH1]] : !cir.ptr<!cir.ptr<!s32i>>) async([#acc.device_type<host>]) {dataClause = #acc<data_clause acc_attach>, name = "arg1"}

// Checking the automatic-addition of parallelism clauses.
#pragma acc parallel loop
for(unsigned I = 0; I < 5; ++I);
// CHECK-NEXT: acc.parallel combined(loop) {
// CHECK-NEXT: acc.loop combined(parallel) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc

#pragma acc kernels loop
for(unsigned I = 0; I < 5; ++I);
// CHECK-NEXT: acc.kernels combined(loop) {
// CHECK-NEXT: acc.loop combined(kernels) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
// CHECK-NEXT: acc.terminator
// CHECK-NEXT: } loc

#pragma acc serial loop
for(unsigned I = 0; I < 5; ++I);
// CHECK-NEXT: acc.serial combined(loop) {
// CHECK-NEXT: acc.loop combined(serial) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {seq = [#acc.device_type<none>]} loc
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc

#pragma acc serial loop worker
for(unsigned I = 0; I < 5; ++I);
// CHECK-NEXT: acc.serial combined(loop) {
// CHECK-NEXT: acc.loop combined(serial) worker {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc

#pragma acc serial loop vector
for(unsigned I = 0; I < 5; ++I);
// CHECK-NEXT: acc.serial combined(loop) {
// CHECK-NEXT: acc.loop combined(serial) vector {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc

#pragma acc serial loop gang
for(unsigned I = 0; I < 5; ++I);
// CHECK-NEXT: acc.serial combined(loop) {
// CHECK-NEXT: acc.loop combined(serial) gang {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc
}
101 changes: 91 additions & 10 deletions clang/test/CIR/CodeGenOpenACC/loop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>], seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
#pragma acc loop device_type(radeon) seq
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {seq = [#acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>], seq = [#acc.device_type<radeon>]} loc
#pragma acc loop seq device_type(nvidia, radeon)
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
Expand All @@ -67,12 +67,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>, #acc.device_type<none>]} loc
#pragma acc loop device_type(radeon) independent
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {independent = [#acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {independent = [#acc.device_type<radeon>, #acc.device_type<none>]} loc
#pragma acc loop independent device_type(nvidia, radeon)
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
Expand All @@ -93,12 +93,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>], independent = [#acc.device_type<none>]} loc
#pragma acc loop device_type(radeon) auto
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<radeon>], independent = [#acc.device_type<none>]} loc
#pragma acc loop auto device_type(nvidia, radeon)
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
Expand All @@ -116,30 +116,30 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
for(unsigned K = 0; K < N; ++K);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>]}
// CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]}

#pragma acc loop collapse(1) device_type(radeon) collapse (2)
for(unsigned I = 0; I < N; ++I)
for(unsigned J = 0; J < N; ++J)
for(unsigned K = 0; K < N; ++K);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>]}
// CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>], independent = [#acc.device_type<none>]}

#pragma acc loop collapse(1) device_type(radeon, nvidia) collapse (2)
for(unsigned I = 0; I < N; ++I)
for(unsigned J = 0; J < N; ++J)
for(unsigned K = 0; K < N; ++K);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]}
// CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>], independent = [#acc.device_type<none>]}
#pragma acc loop collapse(1) device_type(radeon, nvidia) collapse(2) device_type(host) collapse(3)
for(unsigned I = 0; I < N; ++I)
for(unsigned J = 0; J < N; ++J)
for(unsigned K = 0; K < N; ++K);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>]}
// CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>], independent = [#acc.device_type<none>]}

#pragma acc loop tile(1, 2, 3)
for(unsigned I = 0; I < N; ++I)
Expand Down Expand Up @@ -392,4 +392,85 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
// CHECK: acc.yield
// CHECK-NEXT: } loc
}
// CHECK-NEXT: acc.terminator
// CHECK-NEXT: } loc

// Checking the automatic-addition of parallelism clauses.
#pragma acc loop
for(unsigned I = 0; I < N; ++I);
// CHECK-NEXT: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc

#pragma acc parallel
{
// CHECK-NEXT: acc.parallel {
#pragma acc loop
for(unsigned I = 0; I < N; ++I);
// CHECK-NEXT: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc
}
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc

#pragma acc kernels
{
// CHECK-NEXT: acc.kernels {
#pragma acc loop
for(unsigned I = 0; I < N; ++I);
// CHECK-NEXT: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
}
// CHECK-NEXT: acc.terminator
// CHECK-NEXT: } loc

#pragma acc serial
{
// CHECK-NEXT: acc.serial {
#pragma acc loop
for(unsigned I = 0; I < N; ++I);
// CHECK-NEXT: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {seq = [#acc.device_type<none>]} loc
}
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc

#pragma acc serial
{
// CHECK-NEXT: acc.serial {
#pragma acc loop worker
for(unsigned I = 0; I < N; ++I);
// CHECK-NEXT: acc.loop worker {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
}
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc

#pragma acc serial
{
// CHECK-NEXT: acc.serial {
#pragma acc loop vector
for(unsigned I = 0; I < N; ++I);
// CHECK-NEXT: acc.loop vector {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
}
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc

#pragma acc serial
{
// CHECK-NEXT: acc.serial {
#pragma acc loop gang
for(unsigned I = 0; I < N; ++I);
// CHECK-NEXT: acc.loop gang {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
}
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc
}
8 changes: 8 additions & 0 deletions mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -2246,6 +2246,14 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
// device_types. This is for the case where there is no expression specified
// in a 'gang'.
void addEmptyGang(MLIRContext *, llvm::ArrayRef<DeviceType>);

// Return whether this LoopOp has an auto, seq, or independent for the
// specified device-type.
bool hasParallelismFlag(DeviceType);

// Return whether this LoopOp has a gang, worker, or vector applying to the
// 'default'/None device-type.
bool hasDefaultGangWorkerVector();
}];

let hasCustomAssemblyFormat = 1;
Expand Down
Loading