Skip to content

Commit f235c44

Browse files
committed
[SYCL] Split device images based on accuracy level provided in option
This PR reuses optional kernel features mechanism to provide this splitting logic based on accuracy level: 1. When frontend emits fp intrinsic call and attaches the maximum error attribute we also attach "sycl_used_aspects" metadata to the call instruction with a value which corresponds to high, medium, low, sycl or cuda. Mapping for those values is needed to be visible for SYCL device compiler only and we intentionally don't put those values to aspects enum because we don't need aspects because of the reasons I described above. 2. Make SYCLPropagateAspectsUsage to propagate sycl_used_aspects metadata from instructions to kernel. 3. Don't add internal aspects into the requirements, because we don't need processing of these fake aspects (with negative values) in the SYCL RT. After these changes splitting functionality based on sycl_used_aspects metadata is available for free. More details: Currently accruracy level can be controlled using the following options. For entire translation unit: -ffp-accuracy=high -ffp-accuracy=medium -ffp-accuracy=low -ffp-accuracy=sycl -ffp-accuracy=cuda For particular funcions in the translation unit: -ffp-accuracy=low:sin,cos Whenever frontend sees a math function in a kernel or a device function it emits fp intrinsic call with attached callsite attribute indicating value of the maximum error. llvm-spirv is going to translate this builtins to regular __ocl intrinsics and translate callsite attribute to decorator (which is a new spirv extension). If that extension is not supported by the backend, it is going to emit an error. Error is emitted also in the case if backend supports the extension but can't compile the kernel because it doesn't have corresponding implemenation of math function complying with required maximum error. Aspects corrsponding to different levels of accuracy are not suitable in this case because aforementioned options are sycl program compilation options, i.e. it doesn't make sense to provide an opportunity to the user to write something like this: if (dev.has(aspect::ext_oneapi_fp_intrinsic_accuracy_high)) { /* submit kernel using high accuracy intrinsics */ } But on our side we still would like to put kernels and device functions to different images based on required accuracy level. It is necessary because some backends may support, for example, low and medium accuracy but don't support high accuracy. In this case we want to make kernels using low and medium accuracy levels buildable, so we can't put kernels requiring high accuracy and low/medidum accuracy together.
1 parent 94ac8d5 commit f235c44

File tree

8 files changed

+207
-18
lines changed

8 files changed

+207
-18
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "CGObjCRuntime.h"
1717
#include "CGOpenCLRuntime.h"
1818
#include "CGRecordLayout.h"
19+
#include "CGSYCLRuntime.h"
1920
#include "CodeGenFunction.h"
2021
#include "CodeGenModule.h"
2122
#include "ConstantEmitter.h"
@@ -513,12 +514,17 @@ static CallInst *CreateBuiltinCallWithAttr(CodeGenFunction &CGF, StringRef Name,
513514
// TODO: Replace AttrList with a single attribute. The call can only have a
514515
// single FPAccuracy attribute.
515516
llvm::AttributeList AttrList;
517+
// "sycl_used_aspects" metadata associated with the call.
518+
SmallVector<llvm::Metadata *, 4> AspectsMD;
516519
// sincos() doesn't return a value, but it still has a type associated with
517520
// it that corresponds to the operand type.
518521
CGF.CGM.getFPAccuracyFuncAttributes(
519-
Name, AttrList, ID,
522+
Name, AttrList, AspectsMD, ID,
520523
Name == "sincos" ? Args[0]->getType() : FPBuiltinF->getReturnType());
521524
CI->setAttributes(AttrList);
525+
if (!AspectsMD.empty())
526+
CI->setMetadata("sycl_used_aspects",
527+
llvm::MDNode::get(CGF.CGM.getLLVMContext(), AspectsMD));
522528
return CI;
523529
}
524530

clang/lib/CodeGen/CGCall.cpp

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "CGCXXABI.h"
1818
#include "CGCleanup.h"
1919
#include "CGRecordLayout.h"
20+
#include "CGSYCLRuntime.h"
2021
#include "CodeGenFunction.h"
2122
#include "CodeGenModule.h"
2223
#include "TargetInfo.h"
@@ -1846,8 +1847,18 @@ static llvm::fp::FPAccuracy convertFPAccuracy(StringRef FPAccuracyStr) {
18461847
.Case("cuda", llvm::fp::FPAccuracy::CUDA);
18471848
}
18481849

1850+
static int32_t convertFPAccuracyToAspect(StringRef FPAccuracyStr) {
1851+
return llvm::StringSwitch<int32_t>(FPAccuracyStr)
1852+
.Case("high", SYCLInternalAspect::fp_intrinsic_accuracy_high)
1853+
.Case("medium", SYCLInternalAspect::fp_intrinsic_accuracy_medium)
1854+
.Case("low", SYCLInternalAspect::fp_intrinsic_accuracy_low)
1855+
.Case("sycl", SYCLInternalAspect::fp_intrinsic_accuracy_sycl)
1856+
.Case("cuda", SYCLInternalAspect::fp_intrinsic_accuracy_cuda);
1857+
}
1858+
18491859
void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
1850-
StringRef Name, llvm::AttrBuilder &FuncAttrs, unsigned ID,
1860+
StringRef Name, llvm::AttrBuilder &FuncAttrs,
1861+
SmallVector<llvm::Metadata *, 4> &MDs, unsigned ID,
18511862
const llvm::Type *FuncType) {
18521863
// Priority is given to to the accuracy specific to the function.
18531864
// So, if the command line is something like this:
@@ -1864,6 +1875,9 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
18641875
ID, FuncType, convertFPAccuracy(FuncMapIt->second));
18651876
assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected");
18661877
FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal);
1878+
if (getLangOpts().SYCLIsDevice)
1879+
MDs.push_back(llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
1880+
Int32Ty, convertFPAccuracyToAspect(FuncMapIt->second))));
18671881
}
18681882
}
18691883
if (FuncAttrs.attrs().size() == 0)
@@ -1872,6 +1886,9 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
18721886
ID, FuncType, convertFPAccuracy(getLangOpts().FPAccuracyVal));
18731887
assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected");
18741888
FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal);
1889+
if (getLangOpts().SYCLIsDevice)
1890+
MDs.push_back(llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
1891+
Int32Ty, convertFPAccuracyToAspect(getLangOpts().FPAccuracyVal))));
18751892
}
18761893
}
18771894

clang/lib/CodeGen/CGSYCLRuntime.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,19 @@ namespace CodeGen {
2323

2424
class CodeGenModule;
2525

26+
// These aspects are internal and used for device image splitting purposes only.
27+
// They are not exposed to the DPCPP users through "aspect" enum. That's why
28+
// they are intentionally assigned negative values to filter them out at the
29+
// stage of embedding used aspects as device requirements to the executable.
30+
// We don't pass these internal aspects to the DPCPP RT.
31+
enum SYCLInternalAspect : int32_t {
32+
fp_intrinsic_accuracy_high = -1,
33+
fp_intrinsic_accuracy_medium = -2,
34+
fp_intrinsic_accuracy_low = -3,
35+
fp_intrinsic_accuracy_sycl = -4,
36+
fp_intrinsic_accuracy_cuda = -5,
37+
};
38+
2639
class CGSYCLRuntime {
2740
protected:
2841
CodeGenModule &CGM;

clang/lib/CodeGen/CodeGenModule.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7882,12 +7882,12 @@ void CodeGenModule::moveLazyEmissionStates(CodeGenModule *NewBuilder) {
78827882
NewBuilder->ABI->MangleCtx = std::move(ABI->MangleCtx);
78837883
}
78847884

7885-
void CodeGenModule::getFPAccuracyFuncAttributes(StringRef Name,
7886-
llvm::AttributeList &AttrList,
7887-
unsigned ID,
7888-
const llvm::Type *FuncType) {
7885+
void CodeGenModule::getFPAccuracyFuncAttributes(
7886+
StringRef Name, llvm::AttributeList &AttrList,
7887+
SmallVector<llvm::Metadata *, 4> &MDs, unsigned ID,
7888+
const llvm::Type *FuncType) {
78897889
llvm::AttrBuilder FuncAttrs(getLLVMContext());
7890-
getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, ID, FuncType);
7890+
getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, MDs, ID, FuncType);
78917891
AttrList = llvm::AttributeList::get(
78927892
getLLVMContext(), llvm::AttributeList::FunctionIndex, FuncAttrs);
78937893
}

clang/lib/CodeGen/CodeGenModule.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1594,8 +1594,9 @@ class CodeGenModule : public CodeGenTypeCache {
15941594
void moveLazyEmissionStates(CodeGenModule *NewBuilder);
15951595

15961596
void getFPAccuracyFuncAttributes(StringRef Name,
1597-
llvm::AttributeList &AttrList, unsigned ID,
1598-
const llvm::Type *FuncType);
1597+
llvm::AttributeList &AttrList,
1598+
SmallVector<llvm::Metadata *, 4> &MDs,
1599+
unsigned ID, const llvm::Type *FuncType);
15991600

16001601
private:
16011602
llvm::Constant *GetOrCreateLLVMFunction(
@@ -1791,10 +1792,10 @@ class CodeGenModule : public CodeGenTypeCache {
17911792
bool AttrOnCallSite,
17921793
llvm::AttrBuilder &FuncAttrs);
17931794

1794-
void getDefaultFunctionFPAccuracyAttributes(StringRef Name,
1795-
llvm::AttrBuilder &FuncAttrs,
1796-
unsigned ID,
1797-
const llvm::Type *FuncType);
1795+
void getDefaultFunctionFPAccuracyAttributes(
1796+
StringRef Name, llvm::AttrBuilder &FuncAttrs,
1797+
SmallVector<llvm::Metadata *, 4> &MDs, unsigned ID,
1798+
const llvm::Type *FuncType);
17981799

17991800
llvm::Metadata *CreateMetadataIdentifierImpl(QualType T, MetadataTypeMap &Map,
18001801
StringRef Suffix);

llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,13 @@ AspectsSetTy getAspectsUsedByInstruction(const Instruction &I,
255255
Result.insert(Aspects.begin(), Aspects.end());
256256
}
257257

258+
if (const MDNode *InstApsects = I.getMetadata("sycl_used_aspects")) {
259+
for (const MDOperand &MDOp : InstApsects->operands()) {
260+
const Constant *C = cast<ConstantAsMetadata>(MDOp)->getValue();
261+
Result.insert(cast<ConstantInt>(C)->getSExtValue());
262+
}
263+
}
264+
258265
return Result;
259266
}
260267

llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,10 @@ void llvm::getSYCLDeviceRequirements(
2222
const module_split::ModuleDesc &MD,
2323
std::map<StringRef, util::PropertyValue> &Requirements) {
2424
auto ExtractIntegerFromMDNodeOperand = [=](const MDNode *N,
25-
unsigned OpNo) -> unsigned {
25+
unsigned OpNo) -> int32_t {
2626
Constant *C =
2727
cast<ConstantAsMetadata>(N->getOperand(OpNo).get())->getValue();
28-
return static_cast<uint32_t>(C->getUniqueInteger().getZExtValue());
28+
return static_cast<int32_t>(C->getUniqueInteger().getSExtValue());
2929
};
3030

3131
// { LLVM-IR metadata name , [SYCL/Device requirements] property name }, see:
@@ -41,10 +41,16 @@ void llvm::getSYCLDeviceRequirements(
4141
std::set<uint32_t> Values;
4242
for (const Function &F : MD.getModule()) {
4343
if (const MDNode *MDN = F.getMetadata(MDName)) {
44-
for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I)
45-
Values.insert(ExtractIntegerFromMDNodeOperand(MDN, I));
44+
for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) {
45+
// Don't put internal aspects (with negative integer value) into the
46+
// requirements, they are used only for device image splitting.
47+
auto Val = ExtractIntegerFromMDNodeOperand(MDN, I);
48+
if (Val >= 0)
49+
Values.insert(Val);
50+
}
4651
}
4752
}
53+
4854
// We don't need the "fixed_target" property if it's empty
4955
if (std::string(MDName) == "sycl_fixed_targets" && Values.empty())
5056
continue;
@@ -64,10 +70,11 @@ void llvm::getSYCLDeviceRequirements(
6470
if (auto *MDN = F->getMetadata("intel_reqd_sub_group_size")) {
6571
assert(MDN->getNumOperands() == 1);
6672
auto MDValue = ExtractIntegerFromMDNodeOperand(MDN, 0);
73+
assert(MDValue >= 0);
6774
if (!SubGroupSize)
6875
SubGroupSize = MDValue;
6976
else
70-
assert(*SubGroupSize == MDValue);
77+
assert(*SubGroupSize == static_cast<uint32_t>(MDValue));
7178
}
7279
}
7380
// Do not attach reqd_sub_group_size if there is no attached metadata
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
// RUN: %clangxx %s -o %test.bc -ffp-accuracy=high:sin,sqrt -ffp-accuracy=medium:cos -ffp-accuracy=low:tan -ffp-accuracy=cuda:exp,acos -ffp-accuracy=sycl:log,asin -fno-math-errno -fsycl -fsycl-device-only
2+
// RUN: sycl-post-link -split=auto -symbols %test.bc -o %test.table
3+
// RUN: FileCheck %s -input-file=%test.table --check-prefixes CHECK-TABLE
4+
// RUN: FileCheck %s -input-file=%test_0.sym --check-prefixes CHECK-M0-SYMS
5+
// RUN: FileCheck %s -input-file=%test_1.sym --check-prefixes CHECK-M1-SYMS
6+
// RUN: FileCheck %s -input-file=%test_2.sym --check-prefixes CHECK-M2-SYMS
7+
// RUN: FileCheck %s -input-file=%test_3.sym --check-prefixes CHECK-M3-SYMS
8+
// RUN: FileCheck %s -input-file=%test_4.sym --check-prefixes CHECK-M4-SYMS
9+
// RUN: FileCheck %s -input-file=%test_5.sym --check-prefixes CHECK-M5-SYMS
10+
11+
// Tests that kernels which use different fp-accuracy level end up in different
12+
// device images.
13+
14+
// CHECK-TABLE: Code
15+
// CHECK-TABLE-NEXT: _0.sym
16+
// CHECK-TABLE-NEXT: _1.sym
17+
// CHECK-TABLE-NEXT: _2.sym
18+
// CHECK-TABLE-NEXT: _3.sym
19+
// CHECK-TABLE-NEXT: _4.sym
20+
// CHECK-TABLE-NEXT: _5.sym
21+
// CHECK-TABLE-NEXT: _6.sym
22+
// CHECK-TABLE-EMPTY:
23+
24+
// CHECK-M0-SYMS: __pf_kernel_wrapper{{.*}}Kernel1
25+
// CHECK-M0-SYMS-NEXT: Kernel1
26+
// CHECK-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel7
27+
// CHECK-M0-SYMS-NEXT: Kernel7
28+
// CHECK-M0-SYMS-EMPTY:
29+
30+
// CHECK-M1-SYMS: __pf_kernel_wrapper{{.*}}Kernel2
31+
// CHECK-M1-SYMS-NEXT: Kernel2
32+
// CHECK-M1-SYMS-EMPTY:
33+
34+
// CHECK-M2-SYMS: __pf_kernel_wrapper{{.*}}Kernel3
35+
// CHECK-M2-SYMS-NEXT: Kernel3
36+
// CHECK-M2-SYMS-EMPTY:
37+
38+
// CHECK-M3-SYMS: __pf_kernel_wrapper{{.*}}Kernel6
39+
// CHECK-M3-SYMS-NEXT: Kernel6
40+
// CHECK-M3-SYMS-EMPTY:
41+
42+
// CHECK-M4-SYMS: __pf_kernel_wrapper{{.*}}Kernel4
43+
// CHECK-M4-SYMS-NEXT: Kernel4
44+
// CHECK-M4-SYMS-EMPTY:
45+
46+
// CHECK-M5-SYMS: __pf_kernel_wrapper{{.*}}Kernel5
47+
// CHECK-M5-SYMS-NEXT: Kernel5
48+
// CHECK-M5-SYMS-EMPTY:
49+
50+
// CHECK-M6-SYMS: __pf_kernel_wrapper{{.*}}Kernel0
51+
// CHECK-M6-SYMS-NEXT: Kernel0
52+
// CHECK-M6-SYMS-EMPTY:
53+
54+
#include <array>
55+
#include <cmath>
56+
#include <iostream>
57+
#include <sycl/sycl.hpp>
58+
59+
using namespace sycl;
60+
61+
constexpr access::mode sycl_read = access::mode::read;
62+
constexpr access::mode sycl_write = access::mode::write;
63+
64+
int main() {
65+
const size_t array_size = 4;
66+
std::array<double, array_size> D = {{1., 2., 3., 4.}}, E;
67+
queue deviceQueue;
68+
range<1> numOfItems{array_size};
69+
double Value = 5.;
70+
buffer<double, 1> bufferOut(E.data(), numOfItems);
71+
72+
// Kernel0 doesn't use math functions.
73+
deviceQueue.submit([&](handler &cgh) {
74+
auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
75+
76+
cgh.parallel_for<class Kernel0>(
77+
numOfItems, [=](id<1> wiID) { accessorOut[wiID] = Value; });
78+
});
79+
80+
// Kernel1 uses high-accuracy sin.
81+
deviceQueue.submit([&](handler &cgh) {
82+
auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
83+
84+
cgh.parallel_for<class Kernel1>(
85+
numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::sin(Value); });
86+
});
87+
88+
// Kernel2 uses medium-accuracy cos.
89+
deviceQueue.submit([&](handler &cgh) {
90+
auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
91+
92+
cgh.parallel_for<class Kernel2>(
93+
numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::cos(Value); });
94+
});
95+
96+
// Kernel3 uses low-accuracy tan.
97+
deviceQueue.submit([&](handler &cgh) {
98+
auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
99+
100+
cgh.parallel_for<class Kernel3>(
101+
numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::tan(Value); });
102+
});
103+
104+
// Kernel4 uses cuda-accuracy exp and sycl-accuracy log.
105+
deviceQueue.submit([&](handler &cgh) {
106+
auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
107+
108+
cgh.parallel_for<class Kernel4>(numOfItems, [=](id<1> wiID) {
109+
accessorOut[wiID] = std::log(std::exp(Value));
110+
});
111+
});
112+
113+
// Kernel5 uses cuda-accuracy acos.
114+
deviceQueue.submit([&](handler &cgh) {
115+
auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
116+
117+
cgh.parallel_for<class Kernel5>(
118+
numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::acos(Value); });
119+
});
120+
121+
// Kernel6 uses sycl-accuracy asin.
122+
deviceQueue.submit([&](handler &cgh) {
123+
auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
124+
125+
cgh.parallel_for<class Kernel6>(
126+
numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::asin(Value); });
127+
});
128+
129+
// Kernel7 uses high-accuracy sqrt.
130+
deviceQueue.submit([&](handler &cgh) {
131+
auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
132+
133+
cgh.parallel_for<class Kernel7>(
134+
numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::sqrt(Value); });
135+
});
136+
137+
return 0;
138+
}

0 commit comments

Comments
 (0)