Skip to content

Commit c60ec69

Browse files
committed
Merge remote-tracking branch 'upstream/sycl' into opt_math_builtins
2 parents fc6aa6a + 0eeae2a commit c60ec69

File tree

61 files changed

+2060
-1620
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

61 files changed

+2060
-1620
lines changed

clang/lib/Driver/Driver.cpp

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1225,11 +1225,36 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
12251225
continue;
12261226
}
12271227

1228-
if (!isValidSYCLTriple(MakeSYCLDeviceTriple(UserTargetName))) {
1228+
llvm::Triple DeviceTriple(MakeSYCLDeviceTriple(UserTargetName));
1229+
if (!isValidSYCLTriple(DeviceTriple)) {
12291230
Diag(clang::diag::err_drv_invalid_sycl_target) << Val;
12301231
continue;
12311232
}
12321233

1234+
// For any -fsycl-targets=spir64_gen additions, we will scan the
1235+
// additional -X* options for potential -device settings. These
1236+
// need to be added as a known Arch to the packager.
1237+
if (DeviceTriple.isSPIRAOT() && Arch.empty() &&
1238+
DeviceTriple.getSubArch() == llvm::Triple::SPIRSubArch_gen) {
1239+
const ToolChain *HostTC =
1240+
C.getSingleOffloadToolChain<Action::OFK_Host>();
1241+
auto DeviceTC = std::make_unique<toolchains::SYCLToolChain>(
1242+
*this, DeviceTriple, *HostTC, C.getInputArgs());
1243+
assert(DeviceTC && "Device toolchain not defined.");
1244+
ArgStringList TargetArgs;
1245+
DeviceTC->TranslateBackendTargetArgs(DeviceTC->getTriple(),
1246+
C.getInputArgs(), TargetArgs);
1247+
// Look for -device <string> and use that as the known arch to
1248+
// be associated with the current spir64_gen entry. Grab the
1249+
// right most entry.
1250+
for (int i = TargetArgs.size() - 2; i >= 0; --i) {
1251+
if (StringRef(TargetArgs[i]) == "-device") {
1252+
Arch = TargetArgs[i + 1];
1253+
break;
1254+
}
1255+
}
1256+
}
1257+
12331258
// Make sure we don't have a duplicate triple.
12341259
std::string NormalizedName = MakeSYCLDeviceTriple(Val).normalize();
12351260
auto Duplicate = FoundNormalizedTriples.find(NormalizedName);
@@ -1242,7 +1267,6 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
12421267
// Store the current triple so that we can check for duplicates in
12431268
// the following iterations.
12441269
FoundNormalizedTriples[NormalizedName] = Val;
1245-
llvm::Triple DeviceTriple(MakeSYCLDeviceTriple(UserTargetName));
12461270
SYCLTriples.insert(DeviceTriple.normalize());
12471271
if (!Arch.empty())
12481272
DerivedArchs[DeviceTriple.getTriple()].insert(Arch);

clang/test/Driver/sycl-offload-new-driver.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,17 @@
9191
// RUN: | FileCheck -check-prefix=CHK_ARCH \
9292
// RUN: -DTRIPLE=spir64_gen-unknown-unknown -DARCH=pvc %s
9393
// RUN: %clangxx -### --target=x86_64-unknown-linux-gnu -fsycl \
94+
// RUN: -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen \
95+
// RUN: "-device pvc" --offload-new-driver %s 2>&1 \
96+
// RUN: | FileCheck -check-prefix=CHK_ARCH \
97+
// RUN: -DTRIPLE=spir64_gen-unknown-unknown -DARCH=pvc %s
98+
// RUN: %clangxx -### --target=x86_64-unknown-linux-gnu -fsycl \
99+
// RUN: -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen \
100+
// RUN: "-device pvc" -Xsycl-target-backend=spir64_gen "-device dg1" \
101+
// RUN: --offload-new-driver %s 2>&1 \
102+
// RUN: | FileCheck -check-prefix=CHK_ARCH \
103+
// RUN: -DTRIPLE=spir64_gen-unknown-unknown -DARCH=dg1 %s
104+
// RUN: %clangxx -### --target=x86_64-unknown-linux-gnu -fsycl \
94105
// RUN: -fno-sycl-libspirv -fsycl-targets=amd_gpu_gfx900 \
95106
// RUN: -nogpulib --offload-new-driver %s 2>&1 \
96107
// RUN: | FileCheck -check-prefix=CHK_ARCH \

devops/containers/ubuntu2204_build.Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM nvidia/cuda:12.5.0-devel-ubuntu22.04
1+
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04
22

33
ENV DEBIAN_FRONTEND=noninteractive
44

devops/cts_exclude_filter

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,3 @@ marray
55
math_builtin_api
66
# https://github.com/intel/llvm/issues/13574
77
hierarchical
8-
accessor

devops/dependencies-igc-dev.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
{
22
"linux": {
33
"igc_dev": {
4-
"github_tag": "igc-dev-3bd1d5e",
5-
"version": "3bd1d5e",
6-
"updated_at": "2024-06-08T23:45:49Z",
7-
"url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/1582291042/zip",
4+
"github_tag": "igc-dev-480f8b6",
5+
"version": "480f8b6",
6+
"updated_at": "2024-06-12T22:42:55Z",
7+
"url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/1595870554/zip",
88
"root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu"
99
}
1010
}

libclc/amdgcn-amdhsa/libspirv/synchronization/barrier.cl

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,42 +10,56 @@
1010
#include <spirv/spirv.h>
1111
#include <spirv/spirv_types.h>
1212

13-
#define BUILTIN_FENCE(semantics, scope_memory) \
14-
if (semantics & Acquire) \
15-
return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, scope_memory); \
16-
else if (semantics & Release) \
17-
return __builtin_amdgcn_fence(__ATOMIC_RELEASE, scope_memory); \
18-
else if (semantics & AcquireRelease) \
19-
return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, scope_memory); \
20-
else if (semantics & SequentiallyConsistent) \
21-
return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, scope_memory); \
22-
else \
23-
return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, scope_memory);
2413

25-
_CLC_DEF _CLC_OVERLOAD void __mem_fence(unsigned int scope_memory,
26-
unsigned int semantics) {
14+
#define BUILTIN_FENCE(order, scope_memory) \
15+
/* None implies Monotonic (for llvm/AMDGPU), or relaxed in C++. \
16+
* This does not make sense as ordering argument for a fence instruction \
17+
* and is not part of the supported orderings for a fence in AMDGPU. */ \
18+
if (order != None) { \
19+
switch (order) { \
20+
case Acquire: \
21+
return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, scope_memory); \
22+
case Release: \
23+
return __builtin_amdgcn_fence(__ATOMIC_RELEASE, scope_memory); \
24+
case AcquireRelease: \
25+
return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, scope_memory); \
26+
case SequentiallyConsistent: \
27+
return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, scope_memory); \
28+
default: \
29+
__builtin_trap(); \
30+
__builtin_unreachable(); \
31+
} \
32+
}
33+
34+
_CLC_INLINE void builtin_fence_order(unsigned int scope_memory,
35+
unsigned int order) {
2736
switch ((enum Scope)scope_memory) {
2837
case CrossDevice:
29-
BUILTIN_FENCE(semantics, "")
38+
BUILTIN_FENCE(order, "")
3039
case Device:
31-
BUILTIN_FENCE(semantics, "agent")
40+
BUILTIN_FENCE(order, "agent")
3241
case Workgroup:
33-
BUILTIN_FENCE(semantics, "workgroup")
42+
BUILTIN_FENCE(order, "workgroup")
3443
case Subgroup:
35-
BUILTIN_FENCE(semantics, "wavefront")
44+
BUILTIN_FENCE(order, "wavefront")
3645
case Invocation:
37-
BUILTIN_FENCE(semantics, "singlethread")
46+
BUILTIN_FENCE(order, "singlethread")
3847
}
3948
}
4049
#undef BUILTIN_FENCE
4150

51+
_CLC_DEF _CLC_OVERLOAD void __mem_fence(unsigned int scope_memory,
52+
unsigned int semantics) {
53+
builtin_fence_order(scope_memory, semantics & 0x1F);
54+
}
55+
4256
_CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int scope_memory,
4357
unsigned int semantics) {
4458
__mem_fence(scope_memory, semantics);
4559
}
4660

4761
_CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void
48-
__spirv_ControlBarrier(unsigned int scope_execution, unsigned scope_memory,
62+
__spirv_ControlBarrier(unsigned int scope_execution, unsigned int scope_memory,
4963
unsigned int semantics) {
5064
if (semantics) {
5165
__mem_fence(scope_memory, semantics);

libdevice/cmath_wrapper.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,18 @@ extern "C" SYCL_EXTERNAL float __devicelib_fminf(float, float);
3939
DEVICE_EXTERN_C_INLINE
4040
float fminf(float x, float y) { return __devicelib_fminf(x, y); }
4141

42+
DEVICE_EXTERN_C_INLINE
43+
float truncf(float x) { return __devicelib_truncf(x); }
44+
45+
DEVICE_EXTERN_C_INLINE
46+
float sinpif(float x) { return __devicelib_sinpif(x); }
47+
48+
DEVICE_EXTERN_C_INLINE
49+
float rsqrtf(float x) { return __devicelib_rsqrtf(x); }
50+
51+
DEVICE_EXTERN_C_INLINE
52+
float exp10f(float x) { return __devicelib_exp10f(x); }
53+
4254
DEVICE_EXTERN_C_INLINE
4355
div_t div(int x, int y) { return __devicelib_div(x, y); }
4456

libdevice/cmath_wrapper_fp64.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,18 @@ extern "C" SYCL_EXTERNAL double __devicelib_fmin(double, double);
3636
DEVICE_EXTERN_C_INLINE
3737
double fmin(double x, double y) { return __devicelib_fmin(x, y); }
3838

39+
DEVICE_EXTERN_C_INLINE
40+
double trunc(double x) { return __devicelib_trunc(x); }
41+
42+
DEVICE_EXTERN_C_INLINE
43+
double sinpi(double x) { return __devicelib_sinpi(x); }
44+
45+
DEVICE_EXTERN_C_INLINE
46+
double rsqrt(double x) { return __devicelib_rsqrt(x); }
47+
48+
DEVICE_EXTERN_C_INLINE
49+
double exp10(double x) { return __devicelib_exp10(x); }
50+
3951
DEVICE_EXTERN_C_INLINE
4052
double log(double x) { return __devicelib_log(x); }
4153

libdevice/device_math.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,30 @@ float __devicelib_fminf(float x, float y);
7676
DEVICE_EXTERN_C
7777
double __devicelib_fmin(double x, double y);
7878

79+
DEVICE_EXTERN_C
80+
float __devicelib_truncf(float x);
81+
82+
DEVICE_EXTERN_C
83+
double __devicelib_trunc(double x);
84+
85+
DEVICE_EXTERN_C
86+
double __devicelib_sinpi(double x);
87+
88+
DEVICE_EXTERN_C
89+
float __devicelib_sinpif(float x);
90+
91+
DEVICE_EXTERN_C
92+
double __devicelib_rsqrt(double x);
93+
94+
DEVICE_EXTERN_C
95+
float __devicelib_rsqrtf(float x);
96+
97+
DEVICE_EXTERN_C
98+
double __devicelib_exp10(double x);
99+
100+
DEVICE_EXTERN_C
101+
float __devicelib_exp10f(float x);
102+
79103
DEVICE_EXTERN_C
80104
div_t __devicelib_div(int x, int y);
81105

libdevice/fallback-cmath-fp64.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,18 @@ double __devicelib_fmax(double x, double y) { return __spirv_ocl_fmax(x, y); }
3535
DEVICE_EXTERN_C_INLINE
3636
double __devicelib_fmin(double x, double y) { return __spirv_ocl_fmin(x, y); }
3737

38+
DEVICE_EXTERN_C_INLINE
39+
double __devicelib_trunc(double x) { return __spirv_ocl_trunc(x); }
40+
41+
DEVICE_EXTERN_C_INLINE
42+
double __devicelib_sinpi(double x) { return __spirv_ocl_sinpi(x); }
43+
44+
DEVICE_EXTERN_C_INLINE
45+
double __devicelib_rsqrt(double x) { return __spirv_ocl_rsqrt(x); }
46+
47+
DEVICE_EXTERN_C_INLINE
48+
double __devicelib_exp10(double x) { return __spirv_ocl_exp10(x); }
49+
3850
DEVICE_EXTERN_C_INLINE
3951
double __devicelib_log(double x) { return __spirv_ocl_log(x); }
4052

libdevice/fallback-cmath.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,18 @@ float __devicelib_fmaxf(float x, float y) { return __spirv_ocl_fmax(x, y); }
4545
DEVICE_EXTERN_C_INLINE
4646
float __devicelib_fminf(float x, float y) { return __spirv_ocl_fmin(x, y); }
4747

48+
DEVICE_EXTERN_C_INLINE
49+
float __devicelib_truncf(float x) { return __spirv_ocl_trunc(x); }
50+
51+
DEVICE_EXTERN_C_INLINE
52+
float __devicelib_sinpif(float x) { return __spirv_ocl_sinpi(x); }
53+
54+
DEVICE_EXTERN_C_INLINE
55+
float __devicelib_rsqrtf(float x) { return __spirv_ocl_rsqrt(x); }
56+
57+
DEVICE_EXTERN_C_INLINE
58+
float __devicelib_exp10f(float x) { return __spirv_ocl_exp10(x); }
59+
4860
DEVICE_EXTERN_C_INLINE
4961
div_t __devicelib_div(int x, int y) { return {x / y, x % y}; }
5062

mlir/utils/vscode/package-lock.json

Lines changed: 14 additions & 14 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

sycl/doc/design/CommandGraph.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,24 @@ Level Zero:
438438
Future work will include exploring L0 API extensions to improve the mapping of
439439
UR command-buffer to L0 command-list.
440440

441+
#### Copy Engine
442+
443+
For performance considerations, the Unified Runtime Level Zero adapter uses
444+
different Level Zero command-queues to submit compute kernels and memory
445+
operations when the device has a dedicated copy engine. To take advantage of the
446+
copy engine when available, the graph workload can also be split between memory
447+
operations and compute kernels. To achieve this, two graph workload
448+
command-lists live simultaneously in a command-buffer.
449+
450+
When the command-buffer is finalized, memory operations (e.g. buffer copy,
451+
buffer fill, ...) are enqueued in the *copy* command-list while the other
452+
commands are enqueued in the compute command-list. On submission, if not empty,
453+
the *copy* command-list is sent to the main copy command-queue while the compute
454+
command-list is sent to the compute command-queue.
455+
456+
Both are executed concurrently. Synchronization between the command-lists is
457+
handled by Level Zero events.
458+
441459
### CUDA
442460

443461
The SYCL Graph CUDA backend relies on the

sycl/doc/developer/ContributeToDPCPP.md

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,3 +164,39 @@ These tests verify SYCL specification conformance. All implementation details
164164
are out of scope for the tests.
165165
See DPC++ compiler invocation definitions at
166166
[FindIntel_SYCL](https://github.com/KhronosGroup/SYCL-CTS/blob/SYCL-1.2.1/master/cmake/FindIntel_SYCL.cmake))
167+
168+
## Unified Runtime Updates
169+
170+
To integrate changes from the [Unified Runtime][ur] project into DPC++ there
171+
are two main options which depend on the scope of those changes and the current
172+
state of DPC++.
173+
174+
1. Synchronized update:
175+
* When: If the Unified Runtime change touches the API/ABI, more than one
176+
adapter, or common code such as the loader.
177+
* How: Update the `UNIFIED_RUNTIME_TAG` to point at the desired commit or tag
178+
name in the Unified Runtime repository and ensure that any tags for specific
179+
adapters are set to use `${UNIFIED_RUNTIME_TAG}`.
180+
181+
2. Decoupled update:
182+
* When: If only a single Unified Runtime adapter has changed.
183+
* How: Update the tag used in the `fetch_adapter_source()` call for a
184+
specific Unified Runtime adapter, e.g. Level Zero, OpenCL, CUDA, HIP, or
185+
Native CPU.
186+
187+
In general, a synchronized update should be the default. However, when there
188+
are a lot of changes in flight in parallel, always synchronizing the tag can be
189+
troublesome. This is when a decoupled update can help sustain the merge
190+
velocity of Unified Runtime changes.
191+
192+
The [intel/unified-runtime-reviewers][ur-reviewers-team] team is responsible
193+
for ensuring that the Unified Runtime tag is updated correctly and will only
194+
provide code owner approval to pull requests once the following criteria are
195+
met:
196+
197+
* Tags are pointing to a valid commit or tag on Unified Runtime main branch.
198+
* Changes to additional code owned files are in a good state.
199+
* GitHub Actions checks are passing.
200+
201+
[ur]: https://github.com/oneapi-src/unified-runtime
202+
[ur-reviewers-team]: https://github.com/orgs/intel/teams/unified-runtime-reviewers

0 commit comments

Comments
 (0)