Skip to content

Commit ac4bb42

Browse files
authored
[OpenACC][CIR] Implement 'gang' lowering for 'loop' (#138968)
This clause requires an entire additional collection to keep track of the gang 'kind' or 'type'. That work is maintained in the OpenACC dialect functions. Otherwise, this is effectively the same as the worker/vectors.
1 parent 6ade80c commit ac4bb42

File tree

4 files changed

+173
-0
lines changed

4 files changed

+173
-0
lines changed

clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,18 @@ class OpenACCClauseCIREmitter final
107107
.CaseLower("radeon", mlir::acc::DeviceType::Radeon);
108108
}
109109

110+
mlir::acc::GangArgType decodeGangType(OpenACCGangKind gk) {
111+
switch (gk) {
112+
case OpenACCGangKind::Num:
113+
return mlir::acc::GangArgType::Num;
114+
case OpenACCGangKind::Dim:
115+
return mlir::acc::GangArgType::Dim;
116+
case OpenACCGangKind::Static:
117+
return mlir::acc::GangArgType::Static;
118+
}
119+
llvm_unreachable("unknown gang kind");
120+
}
121+
110122
public:
111123
OpenACCClauseCIREmitter(OpTy &operation, CIRGen::CIRGenFunction &cgf,
112124
CIRGen::CIRGenBuilderTy &builder,
@@ -424,6 +436,42 @@ class OpenACCClauseCIREmitter final
424436
return clauseNotImplemented(clause);
425437
}
426438
}
439+
440+
void VisitGangClause(const OpenACCGangClause &clause) {
441+
if constexpr (isOneOfTypes<OpTy, mlir::acc::LoopOp>) {
442+
if (clause.getNumExprs() == 0) {
443+
operation.addEmptyGang(builder.getContext(), lastDeviceTypeValues);
444+
} else {
445+
llvm::SmallVector<mlir::Value> values;
446+
llvm::SmallVector<mlir::acc::GangArgType> argTypes;
447+
for (unsigned i : llvm::index_range(0u, clause.getNumExprs())) {
448+
auto [kind, expr] = clause.getExpr(i);
449+
mlir::Location exprLoc = cgf.cgm.getLoc(expr->getBeginLoc());
450+
argTypes.push_back(decodeGangType(kind));
451+
if (kind == OpenACCGangKind::Dim) {
452+
llvm::APInt curValue =
453+
expr->EvaluateKnownConstInt(cgf.cgm.getASTContext());
454+
// The value is 1, 2, or 3, but the type isn't necessarily smaller
455+
// than 64.
456+
curValue = curValue.sextOrTrunc(64);
457+
values.push_back(
458+
createConstantInt(exprLoc, 64, curValue.getSExtValue()));
459+
} else if (isa<OpenACCAsteriskSizeExpr>(expr)) {
460+
values.push_back(createConstantInt(exprLoc, 64, -1));
461+
} else {
462+
values.push_back(createIntExpr(expr));
463+
}
464+
}
465+
466+
operation.addGangOperands(builder.getContext(), lastDeviceTypeValues,
467+
argTypes, values);
468+
}
469+
} else {
470+
// TODO: When we've implemented this for everything, switch this to an
471+
// unreachable. Combined constructs remain.
472+
return clauseNotImplemented(clause);
473+
}
474+
}
427475
};
428476

429477
template <typename OpTy>

clang/test/CIR/CodeGenOpenACC/loop.cpp

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,4 +323,73 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
323323
// CHECK: acc.yield
324324
// CHECK-NEXT: } loc
325325
}
326+
327+
#pragma acc parallel
328+
// CHECK: acc.parallel {
329+
{
330+
#pragma acc loop gang
331+
for(unsigned I = 0; I < N; ++I);
332+
// CHECK-NEXT: acc.loop gang {
333+
// CHECK: acc.yield
334+
// CHECK-NEXT: } loc
335+
#pragma acc loop gang device_type(nvidia) gang
336+
for(unsigned I = 0; I < N; ++I);
337+
// CHECK-NEXT: acc.loop gang([#acc.device_type<none>, #acc.device_type<nvidia>]) {
338+
// CHECK: acc.yield
339+
// CHECK-NEXT: } loc
340+
#pragma acc loop gang(dim:1) device_type(nvidia) gang(dim:2)
341+
for(unsigned I = 0; I < N; ++I);
342+
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
343+
// CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64
344+
// CHECK-NEXT: acc.loop gang({dim=%[[ONE_CONST]] : i64}, {dim=%[[TWO_CONST]] : i64} [#acc.device_type<nvidia>]) {
345+
// CHECK: acc.yield
346+
// CHECK-NEXT: } loc
347+
#pragma acc loop gang(static:N, dim: 1) device_type(nvidia, radeon) gang(static:*, dim : 2)
348+
for(unsigned I = 0; I < N; ++I);
349+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
350+
// CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
351+
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
352+
// CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
353+
// CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64
354+
// CHECK-NEXT: acc.loop gang({static=%[[N_CONV]] : si32, dim=%[[ONE_CONST]] : i64}, {static=%[[STAR_CONST]] : i64, dim=%[[TWO_CONST]] : i64} [#acc.device_type<nvidia>], {static=%[[STAR_CONST]] : i64, dim=%[[TWO_CONST]] : i64} [#acc.device_type<radeon>]) {
355+
// CHECK: acc.yield
356+
// CHECK-NEXT: } loc
357+
}
358+
#pragma acc kernels
359+
// CHECK: acc.kernels {
360+
{
361+
#pragma acc loop gang(num:N) device_type(nvidia, radeon) gang(num:N)
362+
for(unsigned I = 0; I < N; ++I);
363+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
364+
// CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
365+
// CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
366+
// CHECK-NEXT: %[[N_CONV2:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD2]] : !s32i to si32
367+
// CHECK-NEXT: acc.loop gang({num=%[[N_CONV]] : si32}, {num=%[[N_CONV2]] : si32} [#acc.device_type<nvidia>], {num=%[[N_CONV2]] : si32} [#acc.device_type<radeon>]) {
368+
// CHECK: acc.yield
369+
// CHECK-NEXT: } loc
370+
#pragma acc loop gang(static:N) device_type(nvidia) gang(static:*)
371+
for(unsigned I = 0; I < N; ++I);
372+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
373+
// CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
374+
// CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
375+
// CHECK-NEXT: acc.loop gang({static=%[[N_CONV]] : si32}, {static=%[[STAR_CONST]] : i64} [#acc.device_type<nvidia>]) {
376+
// CHECK: acc.yield
377+
// CHECK-NEXT: } loc
378+
#pragma acc loop gang(static:N, num: N + 1) device_type(nvidia) gang(static:*, num : N + 2)
379+
for(unsigned I = 0; I < N; ++I);
380+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
381+
// CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
382+
// CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
383+
// CHECK-NEXT: %[[CIR_ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
384+
// CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[CIR_ONE_CONST]]) nsw : !s32i
385+
// CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
386+
// CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
387+
// CHECK-NEXT: %[[N_LOAD3:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
388+
// CHECK-NEXT: %[[CIR_TWO_CONST:.*]] = cir.const #cir.int<2> : !s32i
389+
// CHECK-NEXT: %[[N_PLUS_TWO:.*]] = cir.binop(add, %[[N_LOAD3]], %[[CIR_TWO_CONST]]) nsw : !s32i
390+
// CHECK-NEXT: %[[N_PLUS_TWO_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_TWO]] : !s32i to si32
391+
// CHECK-NEXT: acc.loop gang({static=%[[N_CONV]] : si32, num=%[[N_PLUS_ONE_CONV]] : si32}, {static=%[[STAR_CONST]] : i64, num=%[[N_PLUS_TWO_CONV]] : si32} [#acc.device_type<nvidia>]) {
392+
// CHECK: acc.yield
393+
// CHECK-NEXT: } loc
394+
}
326395
}

mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2231,6 +2231,16 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
22312231
// device_types. This is for the case where there is no expression specified
22322232
// in a 'worker'.
22332233
void addEmptyWorker(MLIRContext *, llvm::ArrayRef<DeviceType>);
2234+
2235+
// Adds a collection of operands for a 'gang' clause that has various types
2236+
// corresponding to each operand.
2237+
void addGangOperands(MLIRContext *, llvm::ArrayRef<DeviceType>,
2238+
llvm::ArrayRef<GangArgType>, mlir::ValueRange);
2239+
2240+
// Add an empty value to the 'gang' list with a current list of
2241+
// device_types. This is for the case where there is no expression specified
2242+
// in a 'gang'.
2243+
void addEmptyGang(MLIRContext *, llvm::ArrayRef<DeviceType>);
22342244
}];
22352245

22362246
let hasCustomAssemblyFormat = 1;

mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2748,6 +2748,52 @@ void acc::LoopOp::addEmptyWorker(
27482748
effectiveDeviceTypes));
27492749
}
27502750

2751+
void acc::LoopOp::addEmptyGang(
2752+
MLIRContext *context, llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
2753+
setGangAttr(addDeviceTypeAffectedOperandHelper(context, getGangAttr(),
2754+
effectiveDeviceTypes));
2755+
}
2756+
2757+
void acc::LoopOp::addGangOperands(
2758+
MLIRContext *context, llvm::ArrayRef<DeviceType> effectiveDeviceTypes,
2759+
llvm::ArrayRef<GangArgType> argTypes, mlir::ValueRange values) {
2760+
llvm::SmallVector<int32_t> segments;
2761+
if (std::optional<ArrayRef<int32_t>> existingSegments =
2762+
getGangOperandsSegments())
2763+
llvm::copy(*existingSegments, std::back_inserter(segments));
2764+
2765+
unsigned beforeCount = segments.size();
2766+
2767+
setGangOperandsDeviceTypeAttr(addDeviceTypeAffectedOperandHelper(
2768+
context, getGangOperandsDeviceTypeAttr(), effectiveDeviceTypes, values,
2769+
getGangOperandsMutable(), segments));
2770+
2771+
setGangOperandsSegments(segments);
2772+
2773+
// This is a bit of extra work to make sure we update the 'types' correctly by
2774+
// adding to the types collection the correct number of times. We could
2775+
// potentially add something similar to the
2776+
// addDeviceTypeAffectedOperandHelper, but it seems that would be pretty
2777+
// excessive for a one-off case.
2778+
unsigned numAdded = segments.size() - beforeCount;
2779+
2780+
if (numAdded > 0) {
2781+
llvm::SmallVector<mlir::Attribute> gangTypes;
2782+
if (getGangOperandsArgTypeAttr())
2783+
llvm::copy(getGangOperandsArgTypeAttr(), std::back_inserter(gangTypes));
2784+
2785+
for (auto i : llvm::index_range(0u, numAdded)) {
2786+
llvm::transform(argTypes, std::back_inserter(gangTypes),
2787+
[=](mlir::acc::GangArgType gangTy) {
2788+
return mlir::acc::GangArgTypeAttr::get(context, gangTy);
2789+
});
2790+
(void)i;
2791+
}
2792+
2793+
setGangOperandsArgTypeAttr(mlir::ArrayAttr::get(context, gangTypes));
2794+
}
2795+
}
2796+
27512797
//===----------------------------------------------------------------------===//
27522798
// DataOp
27532799
//===----------------------------------------------------------------------===//

0 commit comments

Comments
 (0)