[SYCL] Split device images based on accuracy level provided in option

againull · againull · commit f235c4441c75 · 2023-07-06T15:19:25.000-07:00
This PR reuses the optional kernel features mechanism to provide
splitting logic based on the accuracy level:
1. When the frontend emits an fp intrinsic call and attaches the maximum error
   attribute, we also attach "sycl_used_aspects" metadata to the call
   instruction with a value which corresponds to high, medium, low, sycl
   or cuda. The mapping for those values needs to be visible to the SYCL
   device compiler only, and we intentionally don't add those values to the
   aspects enum, because real aspects are not suitable here for the reasons
   described below.
2. Make SYCLPropagateAspectsUsage propagate sycl_used_aspects
   metadata from instructions to kernels.
3. Don't add internal aspects to the requirements, because we don't need
   these fake aspects (with negative values) to be processed in the SYCL RT.
After these changes, splitting functionality based on sycl_used_aspects
metadata is available for free.

More details:
Currently the accuracy level can be controlled using the following options.
For entire translation unit:
-ffp-accuracy=high
-ffp-accuracy=medium
-ffp-accuracy=low
-ffp-accuracy=sycl
-ffp-accuracy=cuda

For particular functions in the translation unit:
-ffp-accuracy=low:sin,cos

Whenever the frontend sees a math function in a kernel or a device function,
it emits an fp intrinsic call with an attached callsite attribute indicating
the value of the maximum error. llvm-spirv is going to translate these
builtins to regular __ocl intrinsics and translate the callsite attribute to a
decorator (which is a new spirv extension). If that extension is not supported
by the backend, it is going to emit an error. An error is also emitted if
the backend supports the extension but can't compile the kernel because
it doesn't have a corresponding implementation of the math function complying
with the required maximum error.

Aspects corresponding to different levels of accuracy are not suitable in
this case because the aforementioned options are sycl program compilation
options, i.e. it doesn't make sense to give the user an opportunity to write
something like this:
if (dev.has(aspect::ext_oneapi_fp_intrinsic_accuracy_high)) {
  /* submit kernel using high accuracy intrinsics */
}

But on our side we would still like to put kernels and device functions
into different images based on the required accuracy level. It is necessary
because some backends may support, for example, low and medium accuracy but
not high accuracy. In this case we want to make kernels using low
and medium accuracy levels buildable, so we can't put kernels requiring
high accuracy and low/medium accuracy together.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16,6 +16,7 @@
 #include "CGObjCRuntime.h"
 #include "CGOpenCLRuntime.h"
 #include "CGRecordLayout.h"
+#include "CGSYCLRuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "ConstantEmitter.h"
@@ -513,12 +514,17 @@ static CallInst *CreateBuiltinCallWithAttr(CodeGenFunction &CGF, StringRef Name,
   // TODO: Replace AttrList with a single attribute. The call can only have a
   // single FPAccuracy attribute.
   llvm::AttributeList AttrList;
+  // "sycl_used_aspects" metadata associated with the call.
+  SmallVector<llvm::Metadata *, 4> AspectsMD;
   // sincos() doesn't return a value, but it still has a type associated with
   // it that corresponds to the operand type.
   CGF.CGM.getFPAccuracyFuncAttributes(
-      Name, AttrList, ID,
+      Name, AttrList, AspectsMD, ID,
       Name == "sincos" ? Args[0]->getType() : FPBuiltinF->getReturnType());
   CI->setAttributes(AttrList);
+  if (!AspectsMD.empty())
+    CI->setMetadata("sycl_used_aspects",
+                    llvm::MDNode::get(CGF.CGM.getLLVMContext(), AspectsMD));
   return CI;
 }
 
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
@@ -17,6 +17,7 @@
 #include "CGCXXABI.h"
 #include "CGCleanup.h"
 #include "CGRecordLayout.h"
+#include "CGSYCLRuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "TargetInfo.h"
@@ -1846,8 +1847,18 @@ static llvm::fp::FPAccuracy convertFPAccuracy(StringRef FPAccuracyStr) {
       .Case("cuda", llvm::fp::FPAccuracy::CUDA);
 }
 
+static int32_t convertFPAccuracyToAspect(StringRef FPAccuracyStr) { // Maps an -ffp-accuracy level name to an internal aspect value.
+  return llvm::StringSwitch<int32_t>(FPAccuracyStr)
+      .Case("high", SYCLInternalAspect::fp_intrinsic_accuracy_high)
+      .Case("medium", SYCLInternalAspect::fp_intrinsic_accuracy_medium)
+      .Case("low", SYCLInternalAspect::fp_intrinsic_accuracy_low)
+      .Case("sycl", SYCLInternalAspect::fp_intrinsic_accuracy_sycl)
+      .Case("cuda", SYCLInternalAspect::fp_intrinsic_accuracy_cuda); // NOTE(review): no .Default(); assumes only these five names arrive - confirm.
+}
+
 void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
-    StringRef Name, llvm::AttrBuilder &FuncAttrs, unsigned ID,
+    StringRef Name, llvm::AttrBuilder &FuncAttrs,
+    SmallVector<llvm::Metadata *, 4> &MDs, unsigned ID,
     const llvm::Type *FuncType) {
   // Priority is given to to the accuracy specific to the function.
   // So, if the command line is something like this:
@@ -1864,6 +1875,9 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
           ID, FuncType, convertFPAccuracy(FuncMapIt->second));
       assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected");
       FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal);
+      if (getLangOpts().SYCLIsDevice)
+        MDs.push_back(llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+            Int32Ty, convertFPAccuracyToAspect(FuncMapIt->second))));
     }
   }
   if (FuncAttrs.attrs().size() == 0)
@@ -1872,6 +1886,9 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
           ID, FuncType, convertFPAccuracy(getLangOpts().FPAccuracyVal));
       assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected");
       FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal);
+      if (getLangOpts().SYCLIsDevice)
+        MDs.push_back(llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+            Int32Ty, convertFPAccuracyToAspect(getLangOpts().FPAccuracyVal))));
     }
 }
 
diff --git a/clang/lib/CodeGen/CGSYCLRuntime.h b/clang/lib/CodeGen/CGSYCLRuntime.h
@@ -23,6 +23,19 @@ namespace CodeGen {
 
 class CodeGenModule;
 
+// These aspects are internal and used for device image splitting purposes only.
+// They are not exposed to the DPCPP users through "aspect" enum. That's why
+// they are intentionally assigned negative values to filter them out at the
+// stage of embedding used aspects as device requirements to the executable.
+// We don't pass these internal aspects to the DPCPP RT.
+enum SYCLInternalAspect : int32_t {
+  fp_intrinsic_accuracy_high = -1,   // -ffp-accuracy=high
+  fp_intrinsic_accuracy_medium = -2, // -ffp-accuracy=medium
+  fp_intrinsic_accuracy_low = -3,    // -ffp-accuracy=low
+  fp_intrinsic_accuracy_sycl = -4,   // -ffp-accuracy=sycl
+  fp_intrinsic_accuracy_cuda = -5,   // -ffp-accuracy=cuda
+};
+
 class CGSYCLRuntime {
 protected:
   CodeGenModule &CGM;
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -7882,12 +7882,12 @@ void CodeGenModule::moveLazyEmissionStates(CodeGenModule *NewBuilder) {
   NewBuilder->ABI->MangleCtx = std::move(ABI->MangleCtx);
 }
 
-void CodeGenModule::getFPAccuracyFuncAttributes(StringRef Name,
-                                                llvm::AttributeList &AttrList,
-                                                unsigned ID,
-                                                const llvm::Type *FuncType) {
+void CodeGenModule::getFPAccuracyFuncAttributes(
+    StringRef Name, llvm::AttributeList &AttrList,
+    SmallVector<llvm::Metadata *, 4> &MDs, unsigned ID,
+    const llvm::Type *FuncType) {
   llvm::AttrBuilder FuncAttrs(getLLVMContext());
-  getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, ID, FuncType);
+  getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, MDs, ID, FuncType);
   AttrList = llvm::AttributeList::get(
       getLLVMContext(), llvm::AttributeList::FunctionIndex, FuncAttrs);
 }
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
@@ -1594,8 +1594,9 @@ class CodeGenModule : public CodeGenTypeCache {
   void moveLazyEmissionStates(CodeGenModule *NewBuilder);
 
   void getFPAccuracyFuncAttributes(StringRef Name,
-                                   llvm::AttributeList &AttrList, unsigned ID,
-                                   const llvm::Type *FuncType);
+                                   llvm::AttributeList &AttrList,
+                                   SmallVector<llvm::Metadata *, 4> &MDs,
+                                   unsigned ID, const llvm::Type *FuncType);
 
 private:
   llvm::Constant *GetOrCreateLLVMFunction(
@@ -1791,10 +1792,10 @@ class CodeGenModule : public CodeGenTypeCache {
                                     bool AttrOnCallSite,
                                     llvm::AttrBuilder &FuncAttrs);
 
-  void getDefaultFunctionFPAccuracyAttributes(StringRef Name,
-                                              llvm::AttrBuilder &FuncAttrs,
-                                              unsigned ID,
-                                              const llvm::Type *FuncType);
+  void getDefaultFunctionFPAccuracyAttributes(
+      StringRef Name, llvm::AttrBuilder &FuncAttrs,
+      SmallVector<llvm::Metadata *, 4> &MDs, unsigned ID,
+      const llvm::Type *FuncType);
 
   llvm::Metadata *CreateMetadataIdentifierImpl(QualType T, MetadataTypeMap &Map,
                                                StringRef Suffix);
diff --git a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp
@@ -255,6 +255,13 @@ AspectsSetTy getAspectsUsedByInstruction(const Instruction &I,
     Result.insert(Aspects.begin(), Aspects.end());
   }
 
+  if (const MDNode *InstApsects = I.getMetadata("sycl_used_aspects")) {
+    for (const MDOperand &MDOp : InstApsects->operands()) {
+      const Constant *C = cast<ConstantAsMetadata>(MDOp)->getValue();
+      Result.insert(cast<ConstantInt>(C)->getSExtValue());
+    }
+  }
+
   return Result;
 }
 
diff --git a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp
@@ -22,10 +22,10 @@ void llvm::getSYCLDeviceRequirements(
     const module_split::ModuleDesc &MD,
     std::map<StringRef, util::PropertyValue> &Requirements) {
   auto ExtractIntegerFromMDNodeOperand = [=](const MDNode *N,
-                                             unsigned OpNo) -> unsigned {
+                                             unsigned OpNo) -> int32_t {
     Constant *C =
         cast<ConstantAsMetadata>(N->getOperand(OpNo).get())->getValue();
-    return static_cast<uint32_t>(C->getUniqueInteger().getZExtValue());
+    return static_cast<int32_t>(C->getUniqueInteger().getSExtValue());
   };
 
   // { LLVM-IR metadata name , [SYCL/Device requirements] property name }, see:
@@ -41,10 +41,16 @@ void llvm::getSYCLDeviceRequirements(
     std::set<uint32_t> Values;
     for (const Function &F : MD.getModule()) {
       if (const MDNode *MDN = F.getMetadata(MDName)) {
-        for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I)
-          Values.insert(ExtractIntegerFromMDNodeOperand(MDN, I));
+        for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) {
+          // Don't put internal aspects (with negative integer value) into the
+          // requirements, they are used only for device image splitting.
+          auto Val = ExtractIntegerFromMDNodeOperand(MDN, I);
+          if (Val >= 0)
+            Values.insert(Val);
+        }
       }
     }
+
     // We don't need the "fixed_target" property if it's empty
     if (std::string(MDName) == "sycl_fixed_targets" && Values.empty())
       continue;
@@ -64,10 +70,11 @@ void llvm::getSYCLDeviceRequirements(
     if (auto *MDN = F->getMetadata("intel_reqd_sub_group_size")) {
       assert(MDN->getNumOperands() == 1);
       auto MDValue = ExtractIntegerFromMDNodeOperand(MDN, 0);
+      assert(MDValue >= 0);
       if (!SubGroupSize)
         SubGroupSize = MDValue;
       else
-        assert(*SubGroupSize == MDValue);
+        assert(*SubGroupSize == static_cast<uint32_t>(MDValue));
     }
   }
   // Do not attach reqd_sub_group_size if there is no attached metadata
diff --git a/sycl/test/optional_kernel_features/fp-accuracy.cpp b/sycl/test/optional_kernel_features/fp-accuracy.cpp
@@ -0,0 +1,138 @@
+// RUN: %clangxx %s -o %test.bc -ffp-accuracy=high:sin,sqrt -ffp-accuracy=medium:cos -ffp-accuracy=low:tan -ffp-accuracy=cuda:exp,acos -ffp-accuracy=sycl:log,asin  -fno-math-errno  -fsycl -fsycl-device-only
+// RUN: sycl-post-link -split=auto -symbols %test.bc -o %test.table
+// RUN: FileCheck %s -input-file=%test.table --check-prefixes CHECK-TABLE
+// RUN: FileCheck %s -input-file=%test_0.sym --check-prefixes CHECK-M0-SYMS
+// RUN: FileCheck %s -input-file=%test_1.sym --check-prefixes CHECK-M1-SYMS
+// RUN: FileCheck %s -input-file=%test_2.sym --check-prefixes CHECK-M2-SYMS
+// RUN: FileCheck %s -input-file=%test_3.sym --check-prefixes CHECK-M3-SYMS
+// RUN: FileCheck %s -input-file=%test_4.sym --check-prefixes CHECK-M4-SYMS
+// RUN: FileCheck %s -input-file=%test_5.sym --check-prefixes CHECK-M5-SYMS
+
+// Tests that kernels which use different fp-accuracy level end up in different
+// device images.
+
+// CHECK-TABLE: Code
+// CHECK-TABLE-NEXT: _0.sym
+// CHECK-TABLE-NEXT: _1.sym
+// CHECK-TABLE-NEXT: _2.sym
+// CHECK-TABLE-NEXT: _3.sym
+// CHECK-TABLE-NEXT: _4.sym
+// CHECK-TABLE-NEXT: _5.sym
+// CHECK-TABLE-NEXT: _6.sym
+// CHECK-TABLE-EMPTY:
+
+// CHECK-M0-SYMS: __pf_kernel_wrapper{{.*}}Kernel1
+// CHECK-M0-SYMS-NEXT: Kernel1
+// CHECK-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel7
+// CHECK-M0-SYMS-NEXT: Kernel7
+// CHECK-M0-SYMS-EMPTY:
+
+// CHECK-M1-SYMS: __pf_kernel_wrapper{{.*}}Kernel2
+// CHECK-M1-SYMS-NEXT: Kernel2
+// CHECK-M1-SYMS-EMPTY:
+
+// CHECK-M2-SYMS: __pf_kernel_wrapper{{.*}}Kernel3
+// CHECK-M2-SYMS-NEXT: Kernel3
+// CHECK-M2-SYMS-EMPTY:
+
+// CHECK-M3-SYMS: __pf_kernel_wrapper{{.*}}Kernel6
+// CHECK-M3-SYMS-NEXT: Kernel6
+// CHECK-M3-SYMS-EMPTY:
+
+// CHECK-M4-SYMS: __pf_kernel_wrapper{{.*}}Kernel4
+// CHECK-M4-SYMS-NEXT: Kernel4
+// CHECK-M4-SYMS-EMPTY:
+
+// CHECK-M5-SYMS: __pf_kernel_wrapper{{.*}}Kernel5
+// CHECK-M5-SYMS-NEXT: Kernel5
+// CHECK-M5-SYMS-EMPTY:
+
+// CHECK-M6-SYMS: __pf_kernel_wrapper{{.*}}Kernel0
+// CHECK-M6-SYMS-NEXT: Kernel0
+// CHECK-M6-SYMS-EMPTY:
+
+#include <array>
+#include <cmath>
+#include <iostream>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+constexpr access::mode sycl_read = access::mode::read; // Not referenced anywhere below.
+constexpr access::mode sycl_write = access::mode::write;
+
+int main() { // Submits Kernel0..Kernel7; each one writes a single value per work-item to bufferOut.
+  const size_t array_size = 4;
+  std::array<double, array_size> D = {{1., 2., 3., 4.}}, E; // D is never read below; E backs bufferOut.
+  queue deviceQueue;
+  range<1> numOfItems{array_size};
+  double Value = 5.; // Captured by value ([=]) in every kernel lambda.
+  buffer<double, 1> bufferOut(E.data(), numOfItems);
+
+  // Kernel0 doesn't use math functions.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel0>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = Value; });
+  });
+
+  // Kernel1 uses high-accuracy sin.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel1>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::sin(Value); });
+  });
+
+  // Kernel2 uses medium-accuracy cos.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel2>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::cos(Value); });
+  });
+
+  // Kernel3 uses low-accuracy tan.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel3>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::tan(Value); });
+  });
+
+  // Kernel4 uses cuda-accuracy exp and sycl-accuracy log.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel4>(numOfItems, [=](id<1> wiID) {
+      accessorOut[wiID] = std::log(std::exp(Value));
+    });
+  });
+
+  // Kernel5 uses cuda-accuracy acos.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel5>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::acos(Value); });
+  });
+
+  // Kernel6 uses sycl-accuracy asin.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel6>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::asin(Value); });
+  });
+
+  // Kernel7 uses high-accuracy sqrt.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel7>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::sqrt(Value); });
+  });
+
+  return 0;
+}

Original file line number	Diff line number	Diff line change
`@@ -255,6 +255,13 @@ AspectsSetTy getAspectsUsedByInstruction(const Instruction &I,`
`255`	`255`	`Result.insert(Aspects.begin(), Aspects.end());`
`256`	`256`	`}`
`257`	`257`
	`258`	`+ if (const MDNode *InstApsects = I.getMetadata("sycl_used_aspects")) {`
	`259`	`+ for (const MDOperand &MDOp : InstApsects->operands()) {`
	`260`	`+ const Constant *C = cast<ConstantAsMetadata>(MDOp)->getValue();`
	`261`	`+ Result.insert(cast<ConstantInt>(C)->getSExtValue());`
	`262`	`+ }`
	`263`	`+ }`
	`264`	`+`
`258`	`265`	`return Result;`
`259`	`266`	`}`
`260`	`267`