Skip to content

Commit 3d73d9b

Browse files
authored
[NVPTX][AMDGPU] Move annotation creation out of clang (#14634)
This patch refactors the way we lower SYCL attributes and properties to NVVM annotations, through function attributes and metadata. It unifies the flow better with the SPIR-V paths at the same time. Previously we had: 1. Clang handling function attributes in two places: 1. CodeGenFunction did generic lowering of attributes to function metadata 2. NVPTXTargetCodeGenInfo did its own additional lowering of attributes to NVVM annotations 2. Kernel properties being handled in clang. NVPTXTargetCodeGenInfo lowered kernel properties, which had already been converted to function attributes, to NVVM annotations. 3. Kernel properties for HIP/CUDA *not* being turned into function metadata. Because function metadata is how the ComputeModuleRuntimeInfo library creates runtime info, this meant that target backends couldn't act on properties, as they were lost during lowering. Fundamentally, clang is not the correct place for handling SYCL kernel properties as it unnecessarily touches the front-end for what is really library code. With this patch, we now have: 1. Clang handles function attributes in one place, lowering to function metadata 2. SYCLLowerIR lowers kernel properties to the same function metadata: now working for HIP and CUDA kernels too. Because function metadata is consistently present for kernel properties, they behave as expected on HIP/CUDA targets now. 3. A new pass converts function metadata to NVVM annotations, replacing the code we've taken out of clang.
1 parent 727e085 commit 3d73d9b

File tree

18 files changed

+318
-184
lines changed

18 files changed

+318
-184
lines changed

clang/lib/CodeGen/BackendUtil.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
#include "llvm/SYCLLowerIR/RecordSYCLAspectNames.h"
5757
#include "llvm/SYCLLowerIR/SYCLAddOptLevelAttribute.h"
5858
#include "llvm/SYCLLowerIR/SYCLConditionalCallOnDevice.h"
59+
#include "llvm/SYCLLowerIR/SYCLCreateNVVMAnnotations.h"
5960
#include "llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h"
6061
#include "llvm/SYCLLowerIR/SYCLPropagateJointMatrixUsage.h"
6162
#include "llvm/SYCLLowerIR/SYCLVirtualFunctionsAnalysis.h"
@@ -1151,6 +1152,9 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
11511152
// and before cleaning up metadata)
11521153
MPM.addPass(RecordSYCLAspectNamesPass());
11531154

1155+
if (TargetTriple.isNVPTX())
1156+
MPM.addPass(SYCLCreateNVVMAnnotationsPass());
1157+
11541158
// Remove SYCL metadata added by the frontend, like sycl_aspects
11551159
// Note, this pass should be at the end of the pipeline
11561160
MPM.addPass(CleanupSYCLMetadataPass());

clang/lib/CodeGen/Targets/NVPTX.cpp

Lines changed: 0 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -291,80 +291,6 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
291291
addNVVMMetadata(F, "grid_constant", GridConstantParamIdxs);
292292
}
293293
}
294-
bool HasMaxWorkGroupSize = false;
295-
bool HasMinWorkGroupPerCU = false;
296-
if (const auto *MWGS = FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
297-
HasMaxWorkGroupSize = true;
298-
// We must index-flip between SYCL's notation, X,Y,Z (aka dim0,dim1,dim2)
299-
// with the fastest-moving dimension rightmost, to CUDA's, where X is the
300-
// fastest-moving dimension.
301-
addNVVMMetadata(F, "maxntidx", MWGS->getZDimVal());
302-
addNVVMMetadata(F, "maxntidy", MWGS->getYDimVal());
303-
addNVVMMetadata(F, "maxntidz", MWGS->getXDimVal());
304-
}
305-
306-
if (const auto *RWGS = FD->getAttr<SYCLReqdWorkGroupSizeAttr>()) {
307-
llvm::SmallVector<std::optional<int64_t>, 3> Ops;
308-
// Index-flip and pad out any missing elements. Note the misleading
309-
// nomenclature of the methods: getXDimVal doesn't return the X dimension;
310-
// it returns the left-most dimension (dim0). This could correspond to
311-
// CUDA's X, Y, or Z, depending on the number of operands provided.
312-
if (auto Dim0 = RWGS->getXDimVal())
313-
Ops.push_back(Dim0->getExtValue());
314-
if (auto Dim1 = RWGS->getYDimVal())
315-
Ops.push_back(Dim1->getExtValue());
316-
if (auto Dim2 = RWGS->getZDimVal())
317-
Ops.push_back(Dim2->getExtValue());
318-
std::reverse(Ops.begin(), Ops.end());
319-
Ops.append(3 - Ops.size(), std::nullopt);
320-
321-
// Work-group sizes (in NVVM annotations) must be positive and less than
322-
// INT32_MAX, whereas SYCL can allow for larger work-group sizes (see
323-
// -fno-sycl-id-queries-fit-in-int). If any dimension is too large for
324-
// NVPTX, don't emit any annotation at all.
325-
if (llvm::all_of(Ops, [](std::optional<int64_t> V) {
326-
return !V || llvm::isUInt<31>(*V);
327-
})) {
328-
if (auto X = Ops[0])
329-
addNVVMMetadata(F, "reqntidx", *X);
330-
if (auto Y = Ops[1])
331-
addNVVMMetadata(F, "reqntidy", *Y);
332-
if (auto Z = Ops[2])
333-
addNVVMMetadata(F, "reqntidz", *Z);
334-
}
335-
}
336-
337-
auto attrValue = [&](Expr *E) {
338-
const auto *CE = cast<ConstantExpr>(E);
339-
std::optional<llvm::APInt> Val = CE->getResultAsAPSInt();
340-
return Val->getZExtValue();
341-
};
342-
343-
if (const auto *MWGPCU =
344-
FD->getAttr<SYCLIntelMinWorkGroupsPerComputeUnitAttr>()) {
345-
if (!HasMaxWorkGroupSize && FD->hasAttr<OpenCLKernelAttr>()) {
346-
M.getDiags().Report(D->getLocation(),
347-
diag::warn_launch_bounds_missing_attr)
348-
<< MWGPCU << 0;
349-
} else {
350-
// The value is guaranteed to be > 0, pass it to the metadata.
351-
addNVVMMetadata(F, "minctasm", attrValue(MWGPCU->getValue()));
352-
HasMinWorkGroupPerCU = true;
353-
}
354-
}
355-
356-
if (const auto *MWGPMP =
357-
FD->getAttr<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>()) {
358-
if ((!HasMaxWorkGroupSize || !HasMinWorkGroupPerCU) &&
359-
FD->hasAttr<OpenCLKernelAttr>()) {
360-
M.getDiags().Report(D->getLocation(),
361-
diag::warn_launch_bounds_missing_attr)
362-
<< MWGPMP << 1;
363-
} else {
364-
// The value is guaranteed to be > 0, pass it to the metadata.
365-
addNVVMMetadata(F, "maxclusterrank", attrValue(MWGPMP->getValue()));
366-
}
367-
}
368294
}
369295

370296
// Perform special handling in CUDA mode.

clang/lib/Sema/SemaDeclAttr.cpp

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4089,11 +4089,17 @@ bool static check32BitInt(const Expr *E, Sema &S, llvm::APSInt &I,
40894089

40904090
void Sema::AddSYCLIntelMinWorkGroupsPerComputeUnitAttr(
40914091
Decl *D, const AttributeCommonInfo &CI, Expr *E) {
4092-
if (Context.getLangOpts().SYCLIsDevice &&
4093-
!Context.getTargetInfo().getTriple().isNVPTX()) {
4094-
Diag(E->getBeginLoc(), diag::warn_launch_bounds_is_cuda_specific)
4095-
<< CI << E->getSourceRange();
4096-
return;
4092+
if (Context.getLangOpts().SYCLIsDevice) {
4093+
if (!Context.getTargetInfo().getTriple().isNVPTX()) {
4094+
Diag(E->getBeginLoc(), diag::warn_launch_bounds_is_cuda_specific)
4095+
<< CI << E->getSourceRange();
4096+
return;
4097+
}
4098+
4099+
if (!D->hasAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
4100+
Diag(CI.getLoc(), diag::warn_launch_bounds_missing_attr) << CI << 0;
4101+
return;
4102+
}
40974103
}
40984104
if (!E->isValueDependent()) {
40994105
// Validate that we have an integer constant expression and then store the
@@ -4154,6 +4160,12 @@ void Sema::AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(
41544160
<< CudaArchToString(SM) << CI << E->getSourceRange();
41554161
return;
41564162
}
4163+
4164+
if (!D->hasAttr<SYCLIntelMaxWorkGroupSizeAttr>() ||
4165+
!D->hasAttr<SYCLIntelMinWorkGroupsPerComputeUnitAttr>()) {
4166+
Diag(CI.getLoc(), diag::warn_launch_bounds_missing_attr) << CI << 1;
4167+
return;
4168+
}
41574169
}
41584170
if (!E->isValueDependent()) {
41594171
// Validate that we have an integer constant expression and then store the

clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -48,37 +48,6 @@ int main() {
4848
// CHECK: define dso_local void @{{.*}}kernel_name2() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]]
4949
// CHECK: define dso_local void @{{.*}}kernel_name3() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC_MWGPM:[0-9]+]] !max_work_groups_per_mp ![[MWGPC_MWGPM]] !max_work_group_size ![[MWGS_2:[0-9]+]]
5050

51-
// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 8}
52-
// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidy", i32 4}
53-
// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidz", i32 2}
54-
// CHECK: {{.*}}@{{.*}}kernel_name1, !"minctasm", i32 2}
55-
// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxclusterrank", i32 4}
56-
// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 8}
57-
// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidy", i32 4}
58-
// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidz", i32 2}
59-
// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"minctasm", i32 2}
60-
// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxclusterrank", i32 4}
61-
// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 8}
62-
// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidy", i32 4}
63-
// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidz", i32 2}
64-
// CHECK: {{.*}}@{{.*}}kernel_name2, !"minctasm", i32 2}
65-
// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxclusterrank", i32 4}
66-
// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 8}
67-
// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidy", i32 4}
68-
// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidz", i32 2}
69-
// CHECK: {{.*}}@{{.*}}main{{.*}}, !"minctasm", i32 2}
70-
// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxclusterrank", i32 4}
71-
// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 8}
72-
// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidy", i32 4}
73-
// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidz", i32 6}
74-
// CHECK: {{.*}}@{{.*}}kernel_name3, !"minctasm", i32 6}
75-
// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxclusterrank", i32 6}
76-
// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 8}
77-
// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidy", i32 4}
78-
// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidz", i32 6}
79-
// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"minctasm", i32 6}
80-
// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxclusterrank", i32 6}
81-
8251
// CHECK: ![[MWGPC]] = !{i32 2}
8352
// CHECK: ![[MWGPM]] = !{i32 4}
8453
// CHECK: ![[MWGS]] = !{i32 8, i32 4, i32 2}

clang/test/CodeGenSYCL/reqd-work-group-size.cpp

Lines changed: 2 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
22
// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple amdgcn-amd-amdhsa -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
3-
// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx-nvidia-cuda -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-NVPTX
4-
// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx64-nvidia-cuda -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-NVPTX
3+
// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx-nvidia-cuda -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
4+
// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple nvptx64-nvidia-cuda -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
55

66
#include "sycl.hpp"
77

@@ -123,55 +123,6 @@ int main() {
123123
// CHECK: define {{.*}} void @{{.*}}kernel_name22() #0 {{.*}} !work_group_num_dim ![[NDRWGS1D:[0-9]+]] !reqd_work_group_size ![[WGSIZE1D22:[0-9]+]]
124124
// CHECK: define {{.*}} void @{{.*}}kernel_name24() #0 {{.*}} !work_group_num_dim ![[NDRWGS1D:[0-9]+]] !reqd_work_group_size ![[WGSIZE1D2:[0-9]+]]
125125

126-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name1, !"reqntidx", i32 16}
127-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name1, !"reqntidy", i32 16}
128-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name1, !"reqntidz", i32 32}
129-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name3, !"reqntidx", i32 8}
130-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name3, !"reqntidy", i32 8}
131-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name3, !"reqntidz", i32 8}
132-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name4, !"reqntidx", i32 2}
133-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name4, !"reqntidy", i32 2}
134-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name4, !"reqntidz", i32 2}
135-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name6, !"reqntidx", i32 2}
136-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name6, !"reqntidy", i32 8}
137-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name6, !"reqntidz", i32 1}
138-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name7, !"reqntidx", i32 16}
139-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name7, !"reqntidy", i32 16}
140-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name7, !"reqntidz", i32 32}
141-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name9, !"reqntidx", i32 8}
142-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name9, !"reqntidy", i32 8}
143-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name9, !"reqntidz", i32 8}
144-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name10, !"reqntidx", i32 2}
145-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name10, !"reqntidy", i32 2}
146-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name10, !"reqntidz", i32 2}
147-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name12, !"reqntidx", i32 2}
148-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name12, !"reqntidy", i32 8}
149-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name12, !"reqntidz", i32 1}
150-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name13, !"reqntidx", i32 16}
151-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name13, !"reqntidy", i32 32}
152-
// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name13, !"reqntidz"
153-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name15, !"reqntidx", i32 8}
154-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name15, !"reqntidy", i32 8}
155-
// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name15, !"reqntidz"
156-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name16, !"reqntidx", i32 2}
157-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name16, !"reqntidy", i32 2}
158-
// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name16, !"reqntidz"
159-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name18, !"reqntidx", i32 8}
160-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name18, !"reqntidy", i32 1}
161-
// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name18, !"reqntidz"
162-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name19, !"reqntidx", i32 32}
163-
// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name19, !"reqntidy",
164-
// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name19, !"reqntidz",
165-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name21, !"reqntidx", i32 8}
166-
// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name21, !"reqntidy",
167-
// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name21, !"reqntidz",
168-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name22, !"reqntidx", i32 2}
169-
// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name22, !"reqntidy",
170-
// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name22, !"reqntidz",
171-
// CHECK-NVPTX: = !{ptr @{{.*}}kernel_name24, !"reqntidx", i32 1}
172-
// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name24, !"reqntidy",
173-
// CHECK-NVPTX-NOT: = !{ptr @{{.*}}kernel_name24, !"reqntidz",
174-
175126
// CHECK: ![[NDRWGS3D]] = !{i32 3}
176127
// CHECK: ![[WGSIZE3D32]] = !{i32 16, i32 16, i32 32}
177128
// CHECK: ![[WGSIZE3D88]] = !{i32 8, i32 8, i32 8}

clang/test/SemaSYCL/lb_sm_70.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,14 @@ int main() {
4848
intel::min_work_groups_per_cu(4),
4949
intel::min_work_groups_per_cu(8)]] () { volatile int A = 42; });
5050

51-
// expected-error@+2 {{'min_work_groups_per_cu' attribute requires a non-negative integral compile time constant expression}}
51+
// expected-warning@+2 {{'min_work_groups_per_cu' attribute ignored, as it requires: maximum work group size to be also specified}}
5252
cgh.single_task<class T5>(
53-
[=] [[intel::min_work_groups_per_cu(-8)]] () { volatile int A = 42; });
53+
[=] [[intel::min_work_groups_per_cu(8)]] () { volatile int A = 42; });
54+
55+
// expected-error@+3 {{'min_work_groups_per_cu' attribute requires a non-negative integral compile time constant expression}}
56+
cgh.single_task<class T5>(
57+
[=] [[intel::max_work_group_size(1, 1, 256),
58+
intel::min_work_groups_per_cu(-8)]] () { volatile int A = 42; });
5459
});
5560

5661
Q.submit([&](sycl::handler &cgh) {

clang/test/SemaSYCL/lb_sm_90.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@ template <int N1, int N2, int N3> class Functor {
1515
operator()() const {}
1616
};
1717

18-
// expected-warning@+1 {{'max_work_groups_per_mp' attribute ignored, as it requires: maximum work group size and minimum work groups per compute unit to be also specified}}
1918
template <int N1, int N2> class Functor_2 {
2019
public:
20+
// expected-warning@+2 {{'max_work_groups_per_mp' attribute ignored, as it requires: maximum work group size and minimum work groups per compute unit to be also specified}}
2121
[[intel::max_work_group_size(1, 1, N1),
2222
intel::max_work_groups_per_mp(N2)]] void
2323
operator()() const {}
@@ -32,7 +32,7 @@ int main() {
3232
intel::min_work_groups_per_cu(2),
3333
intel::max_work_groups_per_mp(4)]] () { volatile int A = 42; });
3434

35-
// expected-warning@+2 {{'max_work_groups_per_mp' attribute ignored, as it requires: maximum work group size and minimum work groups per compute unit to be also specified}}
35+
// expected-warning@+3 {{'max_work_groups_per_mp' attribute ignored, as it requires: maximum work group size and minimum work groups per compute unit to be also specified}}
3636
cgh.single_task<class T2>(
3737
[=] [[intel::max_work_group_size(1, 1, 256),
3838
intel::max_work_groups_per_mp(4)]] () { volatile int A = 42; });
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
//===- SYCLCreateNVVMAnnotations.h - SYCLCreateNVVMAnnotationsPass --------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This pass lowers function metadata to NVVM annotations
10+
//
11+
//===----------------------------------------------------------------------===//
12+
//
13+
#ifndef LLVM_SYCL_CREATE_NVVM_ANNOTATIONS_H
14+
#define LLVM_SYCL_CREATE_NVVM_ANNOTATIONS_H
15+
16+
#include "llvm/IR/PassManager.h"
17+
18+
namespace llvm {
19+
20+
class SYCLCreateNVVMAnnotationsPass
21+
: public PassInfoMixin<SYCLCreateNVVMAnnotationsPass> {
22+
public:
23+
PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
24+
25+
static bool isRequired() { return true; }
26+
};
27+
28+
} // namespace llvm
29+
30+
#endif // LLVM_SYCL_CREATE_NVVM_ANNOTATIONS_H

llvm/lib/Passes/PassBuilder.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@
128128
#include "llvm/SYCLLowerIR/RecordSYCLAspectNames.h"
129129
#include "llvm/SYCLLowerIR/SYCLAddOptLevelAttribute.h"
130130
#include "llvm/SYCLLowerIR/SYCLConditionalCallOnDevice.h"
131+
#include "llvm/SYCLLowerIR/SYCLCreateNVVMAnnotations.h"
131132
#include "llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h"
132133
#include "llvm/SYCLLowerIR/SYCLPropagateJointMatrixUsage.h"
133134
#include "llvm/SYCLLowerIR/SYCLVirtualFunctionsAnalysis.h"

llvm/lib/Passes/PassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ MODULE_PASS("sycl-propagate-joint-matrix-usage", SYCLPropagateJointMatrixUsagePa
160160
MODULE_PASS("sycl-add-opt-level-attribute", SYCLAddOptLevelAttributePass())
161161
MODULE_PASS("compile-time-properties", CompileTimePropertiesPass())
162162
MODULE_PASS("cleanup-sycl-metadata", CleanupSYCLMetadataPass())
163+
MODULE_PASS("sycl-create-nvvm-annotations", SYCLCreateNVVMAnnotationsPass())
163164
MODULE_PASS("lower-slm-reservation-calls", ESIMDLowerSLMReservationCalls())
164165
MODULE_PASS("record-sycl-aspect-names", RecordSYCLAspectNamesPass())
165166
MODULE_PASS("sycl-virtual-functions-analysis",

llvm/lib/SYCLLowerIR/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ add_llvm_component_library(LLVMSYCLLowerIR
6060
SpecConstants.cpp
6161
SYCLAddOptLevelAttribute.cpp
6262
SYCLConditionalCallOnDevice.cpp
63+
SYCLCreateNVVMAnnotations.cpp
6364
SYCLDeviceLibReqMask.cpp
6465
SYCLDeviceRequirements.cpp
6566
SYCLKernelParamOptInfo.cpp

llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "llvm/SYCLLowerIR/DeviceGlobals.h"
1313
#include "llvm/SYCLLowerIR/ESIMD/ESIMDUtils.h"
1414
#include "llvm/SYCLLowerIR/HostPipes.h"
15+
#include "llvm/SYCLLowerIR/TargetHelpers.h"
1516

1617
#include "llvm/ADT/APInt.h"
1718
#include "llvm/ADT/StringMap.h"
@@ -367,7 +368,8 @@ attributeToExecModeMetadata(const Attribute &Attr, Function &F) {
367368
SmallVector<StringRef, 3> ValStrs;
368369
Attr.getValueAsString().split(ValStrs, ',');
369370

370-
assert(ValStrs.size() <= 3 &&
371+
size_t NumDims = ValStrs.size();
372+
assert(NumDims <= 3 &&
371373
"sycl-work-group-size and sycl-work-group-size-hint currently only "
372374
"support up to three values");
373375

@@ -384,6 +386,16 @@ attributeToExecModeMetadata(const Attribute &Attr, Function &F) {
384386
for (StringRef ValStr : ValStrs)
385387
MDVals.push_back(ConstantAsMetadata::get(
386388
Constant::getIntegerValue(SizeTTy, APInt(SizeTBitSize, ValStr, 10))));
389+
while (MDVals.size() < 3)
390+
MDVals.push_back(ConstantAsMetadata::get(
391+
Constant::getIntegerValue(SizeTTy, APInt(SizeTBitSize, 1, 10))));
392+
393+
if (NumDims < 3) {
394+
if (!F.hasMetadata("work_group_num_dim"))
395+
F.setMetadata("work_group_num_dim",
396+
MDNode::get(Ctx, ConstantAsMetadata::get(ConstantInt::get(
397+
Type::getInt32Ty(Ctx), NumDims))));
398+
}
387399

388400
const char *MDName = (AttrKindStr == "sycl-work-group-size")
389401
? "reqd_work_group_size"
@@ -588,9 +600,13 @@ PreservedAnalyses CompileTimePropertiesPass::run(Module &M,
588600
}
589601

590602
// Process all properties on kernels.
603+
TargetHelpers::KernelCache HIPCUDAKCache;
604+
HIPCUDAKCache.populateKernels(M);
605+
591606
for (Function &F : M) {
592607
// Only consider kernels.
593-
if (F.getCallingConv() != CallingConv::SPIR_KERNEL)
608+
if (F.getCallingConv() != CallingConv::SPIR_KERNEL &&
609+
!HIPCUDAKCache.isKernel(F))
594610
continue;
595611

596612
// Compile time properties on kernel arguments

0 commit comments

Comments
 (0)