intel
diff --git a/‎clang/lib/Sema/SemaSYCL.cpp
Lines changed: 36 additions & 24 deletions b/‎clang/lib/Sema/SemaSYCL.cpp
Lines changed: 36 additions & 24 deletions
diff --git a/‎clang/test/CodeGenSYCL/sycl-pf-work-item.cpp
Lines changed: 2 additions & 0 deletions b/‎clang/test/CodeGenSYCL/sycl-pf-work-item.cpp
Lines changed: 2 additions & 0 deletions
diff --git a/‎clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
Lines changed: 3 additions & 1 deletion b/‎clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
Lines changed: 3 additions & 1 deletion
diff --git a/‎llvm/include/llvm/Passes/CodeGenPassBuilder.h
Lines changed: 1 addition & 1 deletion b/‎llvm/include/llvm/Passes/CodeGenPassBuilder.h
Lines changed: 1 addition & 1 deletion
diff --git a/‎llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td
Lines changed: 7 additions & 8 deletions b/‎llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td
Lines changed: 7 additions & 8 deletions
diff --git a/‎llvm/include/llvm/CodeGen/FPBuiltinFnSelection.h renamed to ‎llvm/include/llvm/Transforms/Scalar/FPBuiltinFnSelection.h
Lines changed: 4 additions & 4 deletions b/‎llvm/include/llvm/CodeGen/FPBuiltinFnSelection.h renamed to ‎llvm/include/llvm/Transforms/Scalar/FPBuiltinFnSelection.h
Lines changed: 4 additions & 4 deletions
diff --git a/‎llvm/lib/CodeGen/CMakeLists.txt
Lines changed: 0 additions & 1 deletion b/‎llvm/lib/CodeGen/CMakeLists.txt
Lines changed: 0 additions & 1 deletion
diff --git a/‎llvm/lib/SYCLLowerIR/LowerWGScope.cpp
Lines changed: 50 additions & 18 deletions b/‎llvm/lib/SYCLLowerIR/LowerWGScope.cpp
Lines changed: 50 additions & 18 deletions
diff --git a/‎llvm/lib/Transforms/Scalar/CMakeLists.txt
Lines changed: 2 additions & 0 deletions b/‎llvm/lib/Transforms/Scalar/CMakeLists.txt
Lines changed: 2 additions & 0 deletions
diff --git a/‎llvm/lib/CodeGen/FPBuiltinFnSelection.cpp renamed to ‎llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
Lines changed: 2 additions & 2 deletions b/‎llvm/lib/CodeGen/FPBuiltinFnSelection.cpp renamed to ‎llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/‎sycl/doc/design/DeviceConfigFile.md
Lines changed: 0 additions & 1 deletion b/‎sycl/doc/design/DeviceConfigFile.md
Lines changed: 0 additions & 1 deletion
diff --git a/‎sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc
Lines changed: 6 additions & 15 deletions b/‎sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc
Lines changed: 6 additions & 15 deletions
@@ -754,6 +754,30 @@ static bool isDeclaredInSYCLNamespace(const Decl *D) {
   return ND && ND->getName() == "sycl";
 }
 
+/// Returns true if SemaSYCL::isSyclType reports the type of \p VD as being
+/// marked with SYCLTypeAttr::private_memory.
+static bool isSYCLPrivateMemoryVar(VarDecl *VD) {
+  QualType VarTy = VD->getType();
+  return SemaSYCL::isSyclType(VarTy, SYCLTypeAttr::private_memory);
+}
+
+/// Attaches an implicit SYCLScopeAttr to every automatic-storage variable
+/// found among the declarations of \p F. Parameters and variables with any
+/// other storage duration are left untouched.
+static void addScopeAttrToLocalVars(FunctionDecl &F) {
+  ASTContext &Ctx = F.getASTContext();
+  for (Decl *D : F.decls()) {
+    auto *Var = dyn_cast<VarDecl>(D);
+    if (!Var)
+      continue;
+    if (isa<ParmVarDecl>(Var))
+      continue;
+    if (Var->getStorageDuration() != StorageDuration::SD_Automatic)
+      continue;
+
+    // Local variables of private_memory type in the WG scope still have WI
+    // scope, all the rest - WG scope. The simple rule
+    // "if no scope then it is WG scope" won't work, because the compiler may
+    // add locals not declared in user code (lambda object parameter, byval
+    // arguments) which will result in an alloca w/o any attribute, so an
+    // explicit WI scope is needed too.
+    SYCLScopeAttr::Level Scope = SYCLScopeAttr::Level::WorkGroup;
+    if (isSYCLPrivateMemoryVar(Var))
+      Scope = SYCLScopeAttr::Level::WorkItem;
+    Var->addAttr(SYCLScopeAttr::CreateImplicit(Ctx, Scope));
+  }
+}
+
 // This type does the heavy lifting for the management of device functions,
 // recursive function detection, and attribute collection for a single
 // kernel/external function. It walks the callgraph to find all functions that
@@ -803,12 +827,24 @@ class SingleDeviceFunctionTracker {
     // Note: Here, we assume that this is called from within a
     // parallel_for_work_group; it is undefined to call it otherwise.
     // We deliberately do not diagnose a violation.
+    // The following changes have also been added:
+    // 1. The function inside which the parallel_for_work_item exists is
+    //    marked with WorkGroup scope attribute, if not present already.
+    // 2. The local variables inside the function are marked with appropriate
+    //    scope.
     if (CurrentDecl->getIdentifier() &&
         CurrentDecl->getIdentifier()->getName() == "parallel_for_work_item" &&
         isDeclaredInSYCLNamespace(CurrentDecl) &&
         !CurrentDecl->hasAttr<SYCLScopeAttr>()) {
       CurrentDecl->addAttr(SYCLScopeAttr::CreateImplicit(
           Parent.SemaSYCLRef.getASTContext(), SYCLScopeAttr::Level::WorkItem));
+      FunctionDecl *Caller = CallStack.back();
+      if (!Caller->hasAttr<SYCLScopeAttr>()) {
+        Caller->addAttr(
+            SYCLScopeAttr::CreateImplicit(Parent.SemaSYCLRef.getASTContext(),
+                                          SYCLScopeAttr::Level::WorkGroup));
+        addScopeAttrToLocalVars(*Caller);
+      }
     }
 
     // We previously thought we could skip this function if we'd seen it before,
@@ -1001,30 +1037,6 @@ class MarkWIScopeFnVisitor : public RecursiveASTVisitor<MarkWIScopeFnVisitor> {
   ASTContext &Ctx;
 };
 
-static bool isSYCLPrivateMemoryVar(VarDecl *VD) {
-  return SemaSYCL::isSyclType(VD->getType(), SYCLTypeAttr::private_memory);
-}
-
-static void addScopeAttrToLocalVars(CXXMethodDecl &F) {
-  for (Decl *D : F.decls()) {
-    VarDecl *VD = dyn_cast<VarDecl>(D);
-
-    if (!VD || isa<ParmVarDecl>(VD) ||
-        VD->getStorageDuration() != StorageDuration::SD_Automatic)
-      continue;
-    // Local variables of private_memory type in the WG scope still have WI
-    // scope, all the rest - WG scope. Simple logic
-    // "if no scope than it is WG scope" won't work, because compiler may add
-    // locals not declared in user code (lambda object parameter, byval
-    // arguments) which will result in alloca w/o any attribute, so need WI
-    // scope too.
-    SYCLScopeAttr::Level L = isSYCLPrivateMemoryVar(VD)
-                                 ? SYCLScopeAttr::Level::WorkItem
-                                 : SYCLScopeAttr::Level::WorkGroup;
-    VD->addAttr(SYCLScopeAttr::CreateImplicit(F.getASTContext(), L));
-  }
-}
-
 /// Return method by name
 static CXXMethodDecl *getMethodByName(const CXXRecordDecl *CRD,
                                       StringRef MethodName) {
 
@@ -1,6 +1,7 @@
 // RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -internal-isystem %S/Inputs -emit-llvm %s -o - | FileCheck %s
 // This test checks if the parallel_for_work_item called indirectly from
 // parallel_for_work_group gets the work_item_scope marker on it.
+// It also checks if the calling function gets the work_group_scope marker on it.
 #include <sycl.hpp>
 
 void foo(sycl::group<1> work_group) {
@@ -18,4 +19,5 @@ int main(int argc, char **argv) {
   return 0;
 }
 
+// CHECK: define {{.*}} void {{.*}}foo{{.*}} !work_group_scope
 // CHECK: define {{.*}} void @{{.*}}sycl{{.*}}group{{.*}}parallel_for_work_item{{.*}}(ptr addrspace(4) noundef align 1 dereferenceable_or_null(1) %this) {{.*}}!work_item_scope {{.*}}!parallel_for_work_item
@@ -213,7 +213,9 @@ Expected<StringRef> createOutputFile(const Twine &Prefix, StringRef Extension) {
   std::scoped_lock<decltype(TempFilesMutex)> Lock(TempFilesMutex);
   SmallString<128> OutputFile;
   if (SaveTemps) {
-    (Prefix + "." + Extension).toNullTerminatedStringRef(OutputFile);
+    // Generate a unique path name without creating a file
+    sys::fs::createUniquePath(Prefix + "-%%%%%%." + Extension, OutputFile,
+                              /*MakeAbsolute=*/false);
   } else {
     if (std::error_code EC =
             sys::fs::createTemporaryFile(Prefix, Extension, OutputFile))
 
@@ -29,7 +29,6 @@
 #include "llvm/CodeGen/DwarfEHPrepare.h"
 #include "llvm/CodeGen/ExpandMemCmp.h"
 #include "llvm/CodeGen/ExpandReductions.h"
-#include "llvm/CodeGen/FPBuiltinFnSelection.h"
 #include "llvm/CodeGen/FinalizeISel.h"
 #include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/GlobalMerge.h"
@@ -68,6 +67,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/CFGuard.h"
 #include "llvm/Transforms/Scalar/ConstantHoisting.h"
+#include "llvm/Transforms/Scalar/FPBuiltinFnSelection.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
 #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
 
@@ -43,7 +43,6 @@ def AspectExt_oneapi_native_assert : Aspect<"ext_oneapi_native_assert">;
 def AspectHost_debuggable : Aspect<"host_debuggable">;
 def AspectExt_intel_gpu_hw_threads_per_eu : Aspect<"ext_intel_gpu_hw_threads_per_eu">;
 def AspectExt_oneapi_cuda_async_barrier : Aspect<"ext_oneapi_cuda_async_barrier">;
-def AspectExt_oneapi_bfloat16_math_functions : Aspect<"ext_oneapi_bfloat16_math_functions">;
 def AspectExt_intel_free_memory : Aspect<"ext_intel_free_memory">;
 def AspectExt_intel_device_id : Aspect<"ext_intel_device_id">;
 def AspectExt_intel_memory_clock_rate : Aspect<"ext_intel_memory_clock_rate">;
@@ -125,7 +124,7 @@ def : TargetInfo<"__TestAspectList",
     AspectExt_intel_max_mem_bandwidth, AspectExt_intel_mem_channel, AspectUsm_atomic_host_allocations,
     AspectUsm_atomic_shared_allocations, AspectAtomic64, AspectExt_intel_device_info_uuid, AspectExt_oneapi_srgb,
     AspectExt_oneapi_native_assert, AspectHost_debuggable, AspectExt_intel_gpu_hw_threads_per_eu,
-    AspectExt_oneapi_cuda_async_barrier, AspectExt_oneapi_bfloat16_math_functions, AspectExt_intel_free_memory,
+    AspectExt_oneapi_cuda_async_barrier, AspectExt_intel_free_memory,
     AspectExt_intel_device_id, AspectExt_intel_memory_clock_rate, AspectExt_intel_memory_bus_width, AspectEmulated,
     AspectExt_intel_legacy_image, AspectExt_oneapi_bindless_images,
     AspectExt_oneapi_bindless_images_shared_usm, AspectExt_oneapi_bindless_images_1d_usm, AspectExt_oneapi_bindless_images_2d_usm,
@@ -198,17 +197,17 @@ def : CudaTargetInfo<"nvidia_gpu_sm_70", !listconcat(CudaMinAspects, CudaBindles
 def : CudaTargetInfo<"nvidia_gpu_sm_72", !listconcat(CudaMinAspects, CudaBindlessImagesAspects, [AspectFp16, AspectAtomic64])>;
 def : CudaTargetInfo<"nvidia_gpu_sm_75", !listconcat(CudaMinAspects, CudaBindlessImagesAspects, [AspectFp16, AspectAtomic64])>;
 def : CudaTargetInfo<"nvidia_gpu_sm_80", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
-    [AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>;
+    [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
 def : CudaTargetInfo<"nvidia_gpu_sm_86", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
-    [AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>;
+    [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
 def : CudaTargetInfo<"nvidia_gpu_sm_87", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
-    [AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>;
+    [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
 def : CudaTargetInfo<"nvidia_gpu_sm_89", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
-    [AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>;
+    [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
 def : CudaTargetInfo<"nvidia_gpu_sm_90", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
-    [AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>;
+    [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
 def : CudaTargetInfo<"nvidia_gpu_sm_90a", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
-    [AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>;
+    [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
 
 //
 // HIP / AMDGPU device aspects
 
@@ -1,4 +1,4 @@
-//===- FPBuiltinFnSelection.h - Pre-ISel intrinsic lowering pass ----------===//
+//===- FPBuiltinFnSelection.h - fpbuiltin intrinsic lowering pass ---------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,8 +10,8 @@
 // llvm.fpbuiltin.* intrinsics.
 //
 //===----------------------------------------------------------------------===//
-#ifndef LLVM_CODEGEN_FPBUILTINFNSELECTION_H
-#define LLVM_CODEGEN_FPBUILTINFNSELECTION_H
+#ifndef LLVM_TRANSFORMS_SCALAR_FPBUILTINFNSELECTION_H
+#define LLVM_TRANSFORMS_SCALAR_FPBUILTINFNSELECTION_H
 
 #include "llvm/IR/PassManager.h"
 
@@ -25,4 +25,4 @@ struct FPBuiltinFnSelectionPass : PassInfoMixin<FPBuiltinFnSelectionPass> {
 
 } // end namespace llvm
 
-#endif // LLVM_CODEGEN_FPBUILTINFNSELECTION_H
+#endif // LLVM_TRANSFORMS_SCALAR_FPBUILTINFNSELECTION_H
@@ -63,7 +63,6 @@ add_llvm_component_library(LLVMCodeGen
   ExpandVectorPredication.cpp
   FaultMaps.cpp
   FEntryInserter.cpp
-  FPBuiltinFnSelection.cpp
   FinalizeISel.cpp
   FixupStatepointCallerSaved.cpp
   FuncletLayout.cpp
 
@@ -65,19 +65,6 @@
 // (1) - materialization of a PFWI object
 // (2) - "fixup" of the private variable address.
 //
-// TODO: add support for the case when there are other functions between
-// parallel_for_work_group and parallel_for_work_item in the call stack.
-// For example:
-//
-// void foo(sycl::group<1> group, ...) {
-//   group.parallel_for_work_item(range<1>(), [&](h_item<1> i) { ... });
-// }
-// ...
-//   cgh.parallel_for_work_group<class kernel>(
-//     range<1>(...), range<1>(...), [=](group<1> g) {
-//       foo(g, ...);
-//     });
-//
 // TODO The approach employed by this pass generates lots of barriers and data
 // copying between private and local memory, which might not be efficient. There
 // are optimization opportunities listed below. Also other approaches can be
@@ -209,11 +196,36 @@ static bool isCallToAFuncMarkedWithMD(const Instruction *I, const char *MD) {
   return F && F->getMetadata(MD);
 }
 
-// Checks is this is a call to parallel_for_work_item.
+// Searches F and, transitively, every function reachable from it through
+// direct calls for a call to a function that has WG_SCOPE_MD metadata.
+//
+// The traversal uses an explicit worklist and a visited set instead of plain
+// recursion, so it terminates on recursive (cyclic) call graphs and scans a
+// callee shared by several callers only once.
+static bool hasCallToAFuncWithWGMetadata(Function &F) {
+  SmallPtrSet<const Function *, 8> Visited;
+  SmallVector<Function *, 8> Worklist;
+  Visited.insert(&F);
+  Worklist.push_back(&F);
+  while (!Worklist.empty()) {
+    Function *Current = Worklist.pop_back_val();
+    for (auto &BB : *Current)
+      for (auto &I : BB) {
+        if (isCallToAFuncMarkedWithMD(&I, WG_SCOPE_MD))
+          return true;
+        const auto *Call = dyn_cast<CallInst>(&I);
+        if (!Call)
+          continue;
+        // getCalledFunction() yields null when the callee is not statically
+        // known (e.g. an indirect call); such calls cannot be followed.
+        Function *Callee = Call->getCalledFunction();
+        if (Callee && Visited.insert(Callee).second)
+          Worklist.push_back(Callee);
+      }
+  }
+  return false;
+}
+
+// Checks if I is a call to a function marked with PFWI_MD metadata, i.e. a
+// call to parallel_for_work_item.
 static bool isPFWICall(const Instruction *I) {
   return isCallToAFuncMarkedWithMD(I, PFWI_MD);
 }
 
+// Returns true if at least one instruction in the body of F is a call to a
+// function marked with PFWI_MD metadata. Only F itself is scanned; callees
+// are not inspected.
+static bool hasPFWICall(Function &F) {
+  for (auto &Block : F) {
+    for (auto &Inst : Block) {
+      if (!isPFWICall(&Inst))
+        continue;
+      return true;
+    }
+  }
+  return false;
+}
+
+
 // Checks if given instruction must be executed by all work items.
 static bool isWIScopeInst(const Instruction *I) {
   if (I->isTerminator())
@@ -425,6 +437,17 @@ static void copyBetweenPrivateAndShadow(Value *L, GlobalVariable *Shadow,
   }
 }
 
+// Returns the first instruction of BB, scanning forward from the front, that
+// is not an alloca, an addrspacecast or a debug/pseudo instruction.
+// NOTE: every leading addrspacecast is skipped, not only those that are
+// associated with an alloca.
+// NOTE(review): presumably BB always contains an instruction that stops the
+// scan (such as a terminator) - confirm for all callers.
+static Instruction *getFirstInstToProcess(BasicBlock *BB) {
+  auto IsSkippable = [](const Instruction &Inst) {
+    const unsigned Opcode = Inst.getOpcode();
+    return Opcode == Instruction::Alloca ||
+           Opcode == Instruction::AddrSpaceCast || Inst.isDebugOrPseudoInst();
+  };
+  Instruction *Cur = &BB->front();
+  while (IsSkippable(*Cur))
+    Cur = Cur->getNextNode();
+  return Cur;
+}
+
 // Performs the following transformation for each basic block in the input map:
 //
 // BB:
@@ -462,7 +485,11 @@ static void materializeLocalsInWIScopeBlocksImpl(
   for (auto &P : BB2MatLocals) {
     // generate LeaderBB and private<->shadow copies in proper BBs
     BasicBlock *LeaderBB = P.first;
-    BasicBlock *BB = LeaderBB->splitBasicBlock(&LeaderBB->front(), "LeaderMat");
+    // Skip allocas, addrspacecasts associated with allocas and debug insts.
+    // Alloca instructions and their associated instructions must be in the
+    // beginning of the function.
+    Instruction *LeaderBBFront = getFirstInstToProcess(LeaderBB);
+    BasicBlock *BB = LeaderBB->splitBasicBlock(LeaderBBFront, "LeaderMat");
     // Add a barrier to the original block:
     Instruction *At =
         spirv::genWGBarrier(*BB->getFirstNonPHI(), TT)->getNextNode();
@@ -476,7 +503,8 @@ static void materializeLocalsInWIScopeBlocksImpl(
       // fill the leader BB:
       // fetch data from leader's private copy (which is always up to date) into
       // the corresponding shadow variable
-      Builder.SetInsertPoint(&LeaderBB->front());
+      LeaderBBFront = getFirstInstToProcess(LeaderBB);
+      Builder.SetInsertPoint(LeaderBBFront);
       copyBetweenPrivateAndShadow(L, Shadow, Builder, true /*private->shadow*/);
       // store data to the local variable - effectively "refresh" the value of
       // the local in each work item in the work group
@@ -485,8 +513,8 @@ static void materializeLocalsInWIScopeBlocksImpl(
                                   false /*shadow->private*/);
     }
     // now generate the TestBB and the leader WI guard
-    BasicBlock *TestBB =
-        LeaderBB->splitBasicBlock(&LeaderBB->front(), "TestMat");
+    LeaderBBFront = getFirstInstToProcess(LeaderBB);
+    BasicBlock *TestBB = LeaderBB->splitBasicBlock(LeaderBBFront, "TestMat");
     std::swap(TestBB, LeaderBB);
     guardBlockWithIsLeaderCheck(TestBB, LeaderBB, BB, At->getDebugLoc(), TT);
   }
@@ -752,6 +780,10 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F,
                                             FunctionAnalysisManager &FAM) {
   if (!F.getMetadata(WG_SCOPE_MD))
     return PreservedAnalyses::all();
+  // If a function does not have any PFWI calls and it has calls to a function
+  // that has work_group metadata, then we do not need to lower such functions.
+  if (!hasPFWICall(F) && hasCallToAFuncWithWGMetadata(F))
+    return PreservedAnalyses::all();
   LLVM_DEBUG(llvm::dbgs() << "Function name: " << F.getName() << "\n");
   const auto &TT = llvm::Triple(F.getParent()->getTargetTriple());
   // Ranges of "side effect" instructions
 
@@ -14,6 +14,7 @@ add_llvm_component_library(LLVMScalarOpts
   EarlyCSE.cpp
   FlattenCFGPass.cpp
   Float2Int.cpp
+  FPBuiltinFnSelection.cpp
   GuardWidening.cpp
   GVN.cpp
   GVNHoist.cpp
@@ -97,4 +98,5 @@ add_llvm_component_library(LLVMScalarOpts
   InstCombine
   Support
   TransformUtils
+  TargetParser
   )
@@ -1,4 +1,4 @@
-//===- FPBuiltinFnSelection.cpp - Pre-ISel intrinsic lowering pass --------===//
+//===- FPBuiltinFnSelection.cpp - fpbuiltin intrinsic lowering pass -------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/FPBuiltinFnSelection.h"
+#include "llvm/Transforms/Scalar/FPBuiltinFnSelection.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
 
@@ -176,7 +176,6 @@ def AspectExt_oneapi_native_assert : Aspect<"ext_oneapi_native_assert">;
 def AspectHost_debuggable : Aspect<"host_debuggable">;
 def AspectExt_intel_gpu_hw_threads_per_eu : Aspect<"ext_intel_gpu_hw_threads_per_eu">;
 def AspectExt_oneapi_cuda_async_barrier : Aspect<"ext_oneapi_cuda_async_barrier">;
-def AspectExt_oneapi_bfloat16_math_functions : Aspect<"ext_oneapi_bfloat16_math_functions">;
 def AspectExt_intel_free_memory : Aspect<"ext_intel_free_memory">;
 def AspectExt_intel_device_id : Aspect<"ext_intel_device_id">;
 def AspectExt_intel_memory_clock_rate : Aspect<"ext_intel_memory_clock_rate">;
 
@@ -67,6 +67,12 @@ The descriptions of the `fma`, `fmin`, `fmax`, `fabs`, `isnan`, `ceil`, `floor`,
 specification:
 https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_math_functions.
 
+[NOTE]
+The bfloat16 type is supported on all devices. DPC++ currently supports some
+bfloat16 type math functions natively on Intel Xe HP GPUs and Nvidia GPUs with
+Compute Capability >= SM80. On other devices, and in host code, such functions
+are emulated in software.
+
 == Specification
 
 === Feature test macro
@@ -86,21 +92,6 @@ supports.
 |1     |The APIs of this experimental extension are not versioned, so the feature-test macro always has this value.
 |===   
 
-=== Extension to `enum class aspect`
-
-[source]
-----
-namespace sycl {
-enum class aspect {
-  ...
-  sycl_ext_oneapi_bfloat16_math_functions
-}
-}
-----
-
-If a SYCL device has the `sycl_ext_oneapi_bfloat16_math_functions` aspect,
-then it supports the `bfloat16` math functions described in the next section.
-
 === Math Functions
 
 ==== isnan