Skip to content

Commit 14abd9b

Browse files
committed
Merge branch 'sycl' into llvmspirv_pulldown
Conflicts: llvm/include/llvm/Passes/CodeGenPassBuilder.h
2 parents 3d2a5ba + b7e8523 commit 14abd9b

File tree

61 files changed

+556
-197
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

61 files changed

+556
-197
lines changed

clang/lib/Sema/SemaSYCL.cpp

Lines changed: 36 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -754,6 +754,30 @@ static bool isDeclaredInSYCLNamespace(const Decl *D) {
754754
return ND && ND->getName() == "sycl";
755755
}
756756

757+
static bool isSYCLPrivateMemoryVar(VarDecl *VD) {
758+
return SemaSYCL::isSyclType(VD->getType(), SYCLTypeAttr::private_memory);
759+
}
760+
761+
static void addScopeAttrToLocalVars(FunctionDecl &F) {
762+
for (Decl *D : F.decls()) {
763+
VarDecl *VD = dyn_cast<VarDecl>(D);
764+
765+
if (!VD || isa<ParmVarDecl>(VD) ||
766+
VD->getStorageDuration() != StorageDuration::SD_Automatic)
767+
continue;
768+
// Local variables of private_memory type in the WG scope still have WI
769+
// scope, all the rest - WG scope. Simple logic
770+
// "if no scope than it is WG scope" won't work, because compiler may add
771+
// locals not declared in user code (lambda object parameter, byval
772+
// arguments) which will result in alloca w/o any attribute, so need WI
773+
// scope too.
774+
SYCLScopeAttr::Level L = isSYCLPrivateMemoryVar(VD)
775+
? SYCLScopeAttr::Level::WorkItem
776+
: SYCLScopeAttr::Level::WorkGroup;
777+
VD->addAttr(SYCLScopeAttr::CreateImplicit(F.getASTContext(), L));
778+
}
779+
}
780+
757781
// This type does the heavy lifting for the management of device functions,
758782
// recursive function detection, and attribute collection for a single
759783
// kernel/external function. It walks the callgraph to find all functions that
@@ -803,12 +827,24 @@ class SingleDeviceFunctionTracker {
803827
// Note: Here, we assume that this is called from within a
804828
// parallel_for_work_group; it is undefined to call it otherwise.
805829
// We deliberately do not diagnose a violation.
830+
// The following changes have also been added:
831+
// 1. The function inside which the parallel_for_work_item exists is
832+
// marked with WorkGroup scope attribute, if not present already.
833+
// 2. The local variables inside the function are marked with appropriate
834+
// scope.
806835
if (CurrentDecl->getIdentifier() &&
807836
CurrentDecl->getIdentifier()->getName() == "parallel_for_work_item" &&
808837
isDeclaredInSYCLNamespace(CurrentDecl) &&
809838
!CurrentDecl->hasAttr<SYCLScopeAttr>()) {
810839
CurrentDecl->addAttr(SYCLScopeAttr::CreateImplicit(
811840
Parent.SemaSYCLRef.getASTContext(), SYCLScopeAttr::Level::WorkItem));
841+
FunctionDecl *Caller = CallStack.back();
842+
if (!Caller->hasAttr<SYCLScopeAttr>()) {
843+
Caller->addAttr(
844+
SYCLScopeAttr::CreateImplicit(Parent.SemaSYCLRef.getASTContext(),
845+
SYCLScopeAttr::Level::WorkGroup));
846+
addScopeAttrToLocalVars(*Caller);
847+
}
812848
}
813849

814850
// We previously thought we could skip this function if we'd seen it before,
@@ -1001,30 +1037,6 @@ class MarkWIScopeFnVisitor : public RecursiveASTVisitor<MarkWIScopeFnVisitor> {
10011037
ASTContext &Ctx;
10021038
};
10031039

1004-
static bool isSYCLPrivateMemoryVar(VarDecl *VD) {
1005-
return SemaSYCL::isSyclType(VD->getType(), SYCLTypeAttr::private_memory);
1006-
}
1007-
1008-
static void addScopeAttrToLocalVars(CXXMethodDecl &F) {
1009-
for (Decl *D : F.decls()) {
1010-
VarDecl *VD = dyn_cast<VarDecl>(D);
1011-
1012-
if (!VD || isa<ParmVarDecl>(VD) ||
1013-
VD->getStorageDuration() != StorageDuration::SD_Automatic)
1014-
continue;
1015-
// Local variables of private_memory type in the WG scope still have WI
1016-
// scope, all the rest - WG scope. Simple logic
1017-
// "if no scope than it is WG scope" won't work, because compiler may add
1018-
// locals not declared in user code (lambda object parameter, byval
1019-
// arguments) which will result in alloca w/o any attribute, so need WI
1020-
// scope too.
1021-
SYCLScopeAttr::Level L = isSYCLPrivateMemoryVar(VD)
1022-
? SYCLScopeAttr::Level::WorkItem
1023-
: SYCLScopeAttr::Level::WorkGroup;
1024-
VD->addAttr(SYCLScopeAttr::CreateImplicit(F.getASTContext(), L));
1025-
}
1026-
}
1027-
10281040
/// Return method by name
10291041
static CXXMethodDecl *getMethodByName(const CXXRecordDecl *CRD,
10301042
StringRef MethodName) {

clang/test/CodeGenSYCL/sycl-pf-work-item.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -internal-isystem %S/Inputs -emit-llvm %s -o - | FileCheck %s
22
// This test checks if the parallel_for_work_item called indirecly from
33
// parallel_for_work_group gets the work_item_scope marker on it.
4+
// It also checks if the calling function gets the work_group_scope marker on it.
45
#include <sycl.hpp>
56

67
void foo(sycl::group<1> work_group) {
@@ -18,4 +19,5 @@ int main(int argc, char **argv) {
1819
return 0;
1920
}
2021

22+
// CHECK: define {{.*}} void {{.*}}foo{{.*}} !work_group_scope
2123
// CHECK: define {{.*}} void @{{.*}}sycl{{.*}}group{{.*}}parallel_for_work_item{{.*}}(ptr addrspace(4) noundef align 1 dereferenceable_or_null(1) %this) {{.*}}!work_item_scope {{.*}}!parallel_for_work_item

clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,9 @@ Expected<StringRef> createOutputFile(const Twine &Prefix, StringRef Extension) {
213213
std::scoped_lock<decltype(TempFilesMutex)> Lock(TempFilesMutex);
214214
SmallString<128> OutputFile;
215215
if (SaveTemps) {
216-
(Prefix + "." + Extension).toNullTerminatedStringRef(OutputFile);
216+
// Generate a unique path name without creating a file
217+
sys::fs::createUniquePath(Prefix + "-%%%%%%." + Extension, OutputFile,
218+
/*MakeAbsolute=*/false);
217219
} else {
218220
if (std::error_code EC =
219221
sys::fs::createTemporaryFile(Prefix, Extension, OutputFile))

llvm/include/llvm/Passes/CodeGenPassBuilder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
#include "llvm/CodeGen/DwarfEHPrepare.h"
3030
#include "llvm/CodeGen/ExpandMemCmp.h"
3131
#include "llvm/CodeGen/ExpandReductions.h"
32-
#include "llvm/CodeGen/FPBuiltinFnSelection.h"
3332
#include "llvm/CodeGen/FinalizeISel.h"
3433
#include "llvm/CodeGen/GCMetadata.h"
3534
#include "llvm/CodeGen/GlobalMerge.h"
@@ -68,6 +67,7 @@
6867
#include "llvm/Target/TargetMachine.h"
6968
#include "llvm/Transforms/CFGuard.h"
7069
#include "llvm/Transforms/Scalar/ConstantHoisting.h"
70+
#include "llvm/Transforms/Scalar/FPBuiltinFnSelection.h"
7171
#include "llvm/Transforms/Scalar/LoopPassManager.h"
7272
#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
7373
#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"

llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ def AspectExt_oneapi_native_assert : Aspect<"ext_oneapi_native_assert">;
4343
def AspectHost_debuggable : Aspect<"host_debuggable">;
4444
def AspectExt_intel_gpu_hw_threads_per_eu : Aspect<"ext_intel_gpu_hw_threads_per_eu">;
4545
def AspectExt_oneapi_cuda_async_barrier : Aspect<"ext_oneapi_cuda_async_barrier">;
46-
def AspectExt_oneapi_bfloat16_math_functions : Aspect<"ext_oneapi_bfloat16_math_functions">;
4746
def AspectExt_intel_free_memory : Aspect<"ext_intel_free_memory">;
4847
def AspectExt_intel_device_id : Aspect<"ext_intel_device_id">;
4948
def AspectExt_intel_memory_clock_rate : Aspect<"ext_intel_memory_clock_rate">;
@@ -125,7 +124,7 @@ def : TargetInfo<"__TestAspectList",
125124
AspectExt_intel_max_mem_bandwidth, AspectExt_intel_mem_channel, AspectUsm_atomic_host_allocations,
126125
AspectUsm_atomic_shared_allocations, AspectAtomic64, AspectExt_intel_device_info_uuid, AspectExt_oneapi_srgb,
127126
AspectExt_oneapi_native_assert, AspectHost_debuggable, AspectExt_intel_gpu_hw_threads_per_eu,
128-
AspectExt_oneapi_cuda_async_barrier, AspectExt_oneapi_bfloat16_math_functions, AspectExt_intel_free_memory,
127+
AspectExt_oneapi_cuda_async_barrier, AspectExt_intel_free_memory,
129128
AspectExt_intel_device_id, AspectExt_intel_memory_clock_rate, AspectExt_intel_memory_bus_width, AspectEmulated,
130129
AspectExt_intel_legacy_image, AspectExt_oneapi_bindless_images,
131130
AspectExt_oneapi_bindless_images_shared_usm, AspectExt_oneapi_bindless_images_1d_usm, AspectExt_oneapi_bindless_images_2d_usm,
@@ -198,17 +197,17 @@ def : CudaTargetInfo<"nvidia_gpu_sm_70", !listconcat(CudaMinAspects, CudaBindles
198197
def : CudaTargetInfo<"nvidia_gpu_sm_72", !listconcat(CudaMinAspects, CudaBindlessImagesAspects, [AspectFp16, AspectAtomic64])>;
199198
def : CudaTargetInfo<"nvidia_gpu_sm_75", !listconcat(CudaMinAspects, CudaBindlessImagesAspects, [AspectFp16, AspectAtomic64])>;
200199
def : CudaTargetInfo<"nvidia_gpu_sm_80", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
201-
[AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>;
200+
[AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
202201
def : CudaTargetInfo<"nvidia_gpu_sm_86", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
203-
[AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>;
202+
[AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
204203
def : CudaTargetInfo<"nvidia_gpu_sm_87", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
205-
[AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>;
204+
[AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
206205
def : CudaTargetInfo<"nvidia_gpu_sm_89", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
207-
[AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>;
206+
[AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
208207
def : CudaTargetInfo<"nvidia_gpu_sm_90", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
209-
[AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>;
208+
[AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
210209
def : CudaTargetInfo<"nvidia_gpu_sm_90a", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
211-
[AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>;
210+
[AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
212211

213212
//
214213
// HIP / AMDGPU device aspects

llvm/include/llvm/CodeGen/FPBuiltinFnSelection.h renamed to llvm/include/llvm/Transforms/Scalar/FPBuiltinFnSelection.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
//===- FPBuiltinFnSelection.h - Pre-ISel intrinsic lowering pass ----------===//
1+
//===- FPBuiltinFnSelection.h - fpbuiltin intrinsic lowering pass ---------===//
22
//
33
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
44
// See https://llvm.org/LICENSE.txt for license information.
@@ -10,8 +10,8 @@
1010
// llvm.fpbuiltin.* intrinsics.
1111
//
1212
//===----------------------------------------------------------------------===//
13-
#ifndef LLVM_CODEGEN_FPBUILTINFNSELECTION_H
14-
#define LLVM_CODEGEN_FPBUILTINFNSELECTION_H
13+
#ifndef LLVM_TRANSFORMS_SCALAR_FPBUILTINFNSELECTION_H
14+
#define LLVM_TRANSFORMS_SCALAR_FPBUILTINFNSELECTION_H
1515

1616
#include "llvm/IR/PassManager.h"
1717

@@ -25,4 +25,4 @@ struct FPBuiltinFnSelectionPass : PassInfoMixin<FPBuiltinFnSelectionPass> {
2525

2626
} // end namespace llvm
2727

28-
#endif // LLVM_CODEGEN_FPBUILTINFNSELECTION_H
28+
#endif // LLVM_TRANSFORMS_SCALAR_FPBUILTINFNSELECTION_H

llvm/lib/CodeGen/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@ add_llvm_component_library(LLVMCodeGen
6363
ExpandVectorPredication.cpp
6464
FaultMaps.cpp
6565
FEntryInserter.cpp
66-
FPBuiltinFnSelection.cpp
6766
FinalizeISel.cpp
6867
FixupStatepointCallerSaved.cpp
6968
FuncletLayout.cpp

llvm/lib/SYCLLowerIR/LowerWGScope.cpp

Lines changed: 50 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -65,19 +65,6 @@
6565
// (1) - materialization of a PFWI object
6666
// (2) - "fixup" of the private variable address.
6767
//
68-
// TODO: add support for the case when there are other functions between
69-
// parallel_for_work_group and parallel_for_work_item in the call stack.
70-
// For example:
71-
//
72-
// void foo(sycl::group<1> group, ...) {
73-
// group.parallel_for_work_item(range<1>(), [&](h_item<1> i) { ... });
74-
// }
75-
// ...
76-
// cgh.parallel_for_work_group<class kernel>(
77-
// range<1>(...), range<1>(...), [=](group<1> g) {
78-
// foo(g, ...);
79-
// });
80-
//
8168
// TODO The approach employed by this pass generates lots of barriers and data
8269
// copying between private and local memory, which might not be efficient. There
8370
// are optimization opportunities listed below. Also other approaches can be
@@ -209,11 +196,36 @@ static bool isCallToAFuncMarkedWithMD(const Instruction *I, const char *MD) {
209196
return F && F->getMetadata(MD);
210197
}
211198

212-
// Checks is this is a call to parallel_for_work_item.
199+
// Recursively searches for a call to a function with work_group
200+
// metadata inside F.
201+
static bool hasCallToAFuncWithWGMetadata(Function &F) {
202+
for (auto &BB : F)
203+
for (auto &I : BB) {
204+
if (isCallToAFuncMarkedWithMD(&I, WG_SCOPE_MD))
205+
return true;
206+
const CallInst *Call = dyn_cast<CallInst>(&I);
207+
Function *F = dyn_cast_or_null<Function>(Call ? Call->getCalledFunction()
208+
: nullptr);
209+
if (F && hasCallToAFuncWithWGMetadata(*F))
210+
return true;
211+
}
212+
return false;
213+
}
214+
215+
// Checks if this is a call to parallel_for_work_item.
213216
static bool isPFWICall(const Instruction *I) {
214217
return isCallToAFuncMarkedWithMD(I, PFWI_MD);
215218
}
216219

220+
// Checks if F has any calls to function marked with PFWI_MD metadata.
221+
static bool hasPFWICall(Function &F) {
222+
for (auto &BB : F)
223+
for (auto &I : BB)
224+
if (isPFWICall(&I))
225+
return true;
226+
return false;
227+
}
228+
217229
// Checks if given instruction must be executed by all work items.
218230
static bool isWIScopeInst(const Instruction *I) {
219231
if (I->isTerminator())
@@ -425,6 +437,17 @@ static void copyBetweenPrivateAndShadow(Value *L, GlobalVariable *Shadow,
425437
}
426438
}
427439

440+
// Skip allocas, addrspacecasts associated with allocas and debug insts.
441+
static Instruction *getFirstInstToProcess(BasicBlock *BB) {
442+
Instruction *I = &BB->front();
443+
for (;
444+
I->getOpcode() == Instruction::Alloca ||
445+
I->getOpcode() == Instruction::AddrSpaceCast || I->isDebugOrPseudoInst();
446+
I = I->getNextNode()) {
447+
}
448+
return I;
449+
}
450+
428451
// Performs the following transformation for each basic block in the input map:
429452
//
430453
// BB:
@@ -462,7 +485,11 @@ static void materializeLocalsInWIScopeBlocksImpl(
462485
for (auto &P : BB2MatLocals) {
463486
// generate LeaderBB and private<->shadow copies in proper BBs
464487
BasicBlock *LeaderBB = P.first;
465-
BasicBlock *BB = LeaderBB->splitBasicBlock(&LeaderBB->front(), "LeaderMat");
488+
// Skip allocas, addrspacecasts associated with allocas and debug insts.
489+
// Alloca instructions and it's associated instructions must be in the
490+
// beginning of the function.
491+
Instruction *LeaderBBFront = getFirstInstToProcess(LeaderBB);
492+
BasicBlock *BB = LeaderBB->splitBasicBlock(LeaderBBFront, "LeaderMat");
466493
// Add a barrier to the original block:
467494
Instruction *At =
468495
spirv::genWGBarrier(*BB->getFirstNonPHI(), TT)->getNextNode();
@@ -476,7 +503,8 @@ static void materializeLocalsInWIScopeBlocksImpl(
476503
// fill the leader BB:
477504
// fetch data from leader's private copy (which is always up to date) into
478505
// the corresponding shadow variable
479-
Builder.SetInsertPoint(&LeaderBB->front());
506+
LeaderBBFront = getFirstInstToProcess(LeaderBB);
507+
Builder.SetInsertPoint(LeaderBBFront);
480508
copyBetweenPrivateAndShadow(L, Shadow, Builder, true /*private->shadow*/);
481509
// store data to the local variable - effectively "refresh" the value of
482510
// the local in each work item in the work group
@@ -485,8 +513,8 @@ static void materializeLocalsInWIScopeBlocksImpl(
485513
false /*shadow->private*/);
486514
}
487515
// now generate the TestBB and the leader WI guard
488-
BasicBlock *TestBB =
489-
LeaderBB->splitBasicBlock(&LeaderBB->front(), "TestMat");
516+
LeaderBBFront = getFirstInstToProcess(LeaderBB);
517+
BasicBlock *TestBB = LeaderBB->splitBasicBlock(LeaderBBFront, "TestMat");
490518
std::swap(TestBB, LeaderBB);
491519
guardBlockWithIsLeaderCheck(TestBB, LeaderBB, BB, At->getDebugLoc(), TT);
492520
}
@@ -752,6 +780,10 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F,
752780
FunctionAnalysisManager &FAM) {
753781
if (!F.getMetadata(WG_SCOPE_MD))
754782
return PreservedAnalyses::all();
783+
// If a function does not have any PFWI calls and it has calls to a function
784+
// that has work_group metadata, then we do not need to lower such functions.
785+
if (!hasPFWICall(F) && hasCallToAFuncWithWGMetadata(F))
786+
return PreservedAnalyses::all();
755787
LLVM_DEBUG(llvm::dbgs() << "Function name: " << F.getName() << "\n");
756788
const auto &TT = llvm::Triple(F.getParent()->getTargetTriple());
757789
// Ranges of "side effect" instructions

llvm/lib/Transforms/Scalar/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ add_llvm_component_library(LLVMScalarOpts
1414
EarlyCSE.cpp
1515
FlattenCFGPass.cpp
1616
Float2Int.cpp
17+
FPBuiltinFnSelection.cpp
1718
GuardWidening.cpp
1819
GVN.cpp
1920
GVNHoist.cpp
@@ -97,4 +98,5 @@ add_llvm_component_library(LLVMScalarOpts
9798
InstCombine
9899
Support
99100
TransformUtils
101+
TargetParser
100102
)

llvm/lib/CodeGen/FPBuiltinFnSelection.cpp renamed to llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
//===- FPBuiltinFnSelection.cpp - Pre-ISel intrinsic lowering pass --------===//
1+
//===- FPBuiltinFnSelection.cpp - fpbuiltin intrinsic lowering pass -------===//
22
//
33
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
44
// See https://llvm.org/LICENSE.txt for license information.
@@ -11,7 +11,7 @@
1111
//
1212
//===----------------------------------------------------------------------===//
1313

14-
#include "llvm/CodeGen/FPBuiltinFnSelection.h"
14+
#include "llvm/Transforms/Scalar/FPBuiltinFnSelection.h"
1515
#include "llvm/Analysis/TargetLibraryInfo.h"
1616
#include "llvm/Analysis/TargetTransformInfo.h"
1717
#include "llvm/CodeGen/Passes.h"

sycl/doc/design/DeviceConfigFile.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,6 @@ def AspectExt_oneapi_native_assert : Aspect<"ext_oneapi_native_assert">;
176176
def AspectHost_debuggable : Aspect<"host_debuggable">;
177177
def AspectExt_intel_gpu_hw_threads_per_eu : Aspect<"ext_intel_gpu_hw_threads_per_eu">;
178178
def AspectExt_oneapi_cuda_async_barrier : Aspect<"ext_oneapi_cuda_async_barrier">;
179-
def AspectExt_oneapi_bfloat16_math_functions : Aspect<"ext_oneapi_bfloat16_math_functions">;
180179
def AspectExt_intel_free_memory : Aspect<"ext_intel_free_memory">;
181180
def AspectExt_intel_device_id : Aspect<"ext_intel_device_id">;
182181
def AspectExt_intel_memory_clock_rate : Aspect<"ext_intel_memory_clock_rate">;

sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,12 @@ The descriptions of the `fma`, `fmin`, `fmax`, `fabs`, `isnan`, `ceil`, `floor`,
6767
specification:
6868
https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#_math_functions.
6969

70+
[NOTE]
71+
The bfloat16 type is supported on all devices. DPC++ currently supports some
72+
bfloat16 type math functions natively on Intel Xe HP GPUs and Nvidia GPUs with
73+
Compute Capability >= SM80. On other devices, and in host code, such functions
74+
are emulated in software.
75+
7076
== Specification
7177

7278
=== Feature test macro
@@ -86,21 +92,6 @@ supports.
8692
|1 |The APIs of this experimental extension are not versioned, so the feature-test macro always has this value.
8793
|===
8894

89-
=== Extension to `enum class aspect`
90-
91-
[source]
92-
----
93-
namespace sycl {
94-
enum class aspect {
95-
...
96-
sycl_ext_oneapi_bfloat16_math_functions
97-
}
98-
}
99-
----
100-
101-
If a SYCL device has the `sycl_ext_oneapi_bfloat16_math_functions` aspect,
102-
then it supports the `bfloat16` math functions described in the next section.
103-
10495
=== Math Functions
10596

10697
==== isnan

0 commit comments

Comments
 (0)