Skip to content

Commit 12e4dc7

Browse files
author
iclsrc
committed
Merge from 'sycl' to 'sycl-web' (#2)
2 parents f88df8c + 0aac708 commit 12e4dc7

File tree

12 files changed

+420
-1
lines changed

12 files changed

+420
-1
lines changed

clang/include/clang/Basic/Attr.td

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1236,6 +1236,19 @@ def SYCLIntelNumSimdWorkItems : InheritableAttr {
12361236
let PragmaAttributeSupport = 0;
12371237
}
12381238

1239+
def SYCLIntelStallEnable : InheritableAttr {
1240+
let Spellings = [CXX11<"intel","stall_enable">];
1241+
let LangOpts = [SYCLIsHost, SYCLIsDevice];
1242+
let Subjects = SubjectList<[Function], ErrorDiag>;
1243+
let AdditionalMembers = [{
1244+
static const char *getName() {
1245+
return "stall_enable";
1246+
}
1247+
}];
1248+
let Documentation = [SYCLIntelStallEnableAttrDocs];
1249+
let PragmaAttributeSupport = 0;
1250+
}
1251+
12391252
def SYCLIntelSchedulerTargetFmaxMhz : InheritableAttr {
12401253
let Spellings = [CXX11<"intelfpga","scheduler_target_fmax_mhz">,
12411254
CXX11<"intel","scheduler_target_fmax_mhz">];

clang/include/clang/Basic/AttrDocs.td

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2229,6 +2229,39 @@ device kernel, the attribute is ignored and it is not propagated to a kernel.
22292229
}];
22302230
}
22312231

2232+
def SYCLIntelStallEnableAttrDocs : Documentation {
2233+
let Category = DocCatFunction;
2234+
let Heading = "intel::stall_enable";
2235+
let Content = [{
2236+
When applied to a lambda or function call operator (of a function object)
2237+
on device, this requests, to the extent possible, that statically-scheduled
2238+
clusters handle stalls using a stall-enable signal to freeze computation
2239+
within the cluster. This attribute is ignored on the host.
2240+
2241+
If ``intel::stall_enable`` is applied to a function called from a device
2242+
kernel, the attribute is ignored and it is not propagated to a kernel.
2243+
2244+
The ``intel::stall_enable`` attribute takes no argument and has an effect
2245+
when applied to a function, and no effect otherwise.
2246+
2247+
.. code-block:: c++
2248+
2249+
class Functor
2250+
{
2251+
[[intel::stall_enable]] void operator()(item<1> item)
2252+
{
2253+
/* kernel code */
2254+
}
2255+
}
2256+
2257+
kernel<class kernel_name>(
2258+
[]() [[intel::stall_enable]] {
2259+
/* kernel code */
2260+
});
2261+
2262+
}];
2263+
}
2264+
22322265
def ReqdWorkGroupSizeAttrDocs : Documentation {
22332266
let Category = DocCatFunction;
22342267
let Heading = "reqd_work_group_size";

clang/include/clang/Basic/AttributeCommonInfo.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,8 @@ class AttributeCommonInfo {
165165
ParsedAttr == AT_SYCLIntelSchedulerTargetFmaxMhz ||
166166
ParsedAttr == AT_SYCLIntelMaxWorkGroupSize ||
167167
ParsedAttr == AT_SYCLIntelMaxGlobalWorkDim ||
168-
ParsedAttr == AT_SYCLIntelNoGlobalWorkOffset)
168+
ParsedAttr == AT_SYCLIntelNoGlobalWorkOffset ||
169+
ParsedAttr == AT_SYCLIntelStallEnable)
169170
return true;
170171

171172
return false;

clang/lib/CodeGen/CodeGenFunction.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,12 @@ void CodeGenFunction::EmitOpenCLKernelMetadata(const FunctionDecl *FD,
693693
if (A->getEnabled())
694694
Fn->setMetadata("no_global_work_offset", llvm::MDNode::get(Context, {}));
695695
}
696+
697+
if (FD->hasAttr<SYCLIntelStallEnableAttr>()) {
698+
llvm::Metadata *AttrMDArgs[] = {
699+
llvm::ConstantAsMetadata::get(Builder.getInt32(1))};
700+
Fn->setMetadata("stall_enable", llvm::MDNode::get(Context, AttrMDArgs));
701+
}
696702
}
697703

698704
/// Determine whether the function F ends with a return stmt.

clang/lib/Sema/SemaDeclAttr.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3029,6 +3029,20 @@ static void handleNumSimdWorkItemsAttr(Sema &S, Decl *D,
30293029
E);
30303030
}
30313031

3032+
// Handles stall_enable
3033+
static void handleStallEnableAttr(Sema &S, Decl *D, const ParsedAttr &Attr) {
3034+
if (D->isInvalidDecl())
3035+
return;
3036+
3037+
unsigned NumArgs = Attr.getNumArgs();
3038+
if (NumArgs > 0) {
3039+
S.Diag(Attr.getLoc(), diag::warn_attribute_too_many_arguments) << Attr << 0;
3040+
return;
3041+
}
3042+
3043+
handleSimpleAttribute<SYCLIntelStallEnableAttr>(S, D, Attr);
3044+
}
3045+
30323046
// Add scheduler_target_fmax_mhz
30333047
void Sema::addSYCLIntelSchedulerTargetFmaxMhzAttr(
30343048
Decl *D, const AttributeCommonInfo &Attr, Expr *E) {
@@ -8401,6 +8415,9 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D,
84018415
case ParsedAttr::AT_SYCLIntelNoGlobalWorkOffset:
84028416
handleNoGlobalWorkOffsetAttr(S, D, AL);
84038417
break;
8418+
case ParsedAttr::AT_SYCLIntelStallEnable:
8419+
handleStallEnableAttr(S, D, AL);
8420+
break;
84048421
case ParsedAttr::AT_VecTypeHint:
84058422
handleVecTypeHint(S, D, AL);
84068423
break;

clang/lib/Sema/SemaSYCL.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,20 @@ class MarkDeviceFunction : public RecursiveASTVisitor<MarkDeviceFunction> {
545545

546546
if (auto *A = FD->getAttr<SYCLSimdAttr>())
547547
Attrs.insert(A);
548+
549+
// Allow the kernel attribute "stall_enable" only on lambda functions
550+
// and function objects that are called directly from a kernel
551+
// (i.e. the one passed to the single_task or parallel_for functions).
552+
// For all other cases, emit a warning and ignore.
553+
if (auto *A = FD->getAttr<SYCLIntelStallEnableAttr>()) {
554+
if (ParentFD == SYCLKernel) {
555+
Attrs.insert(A);
556+
} else {
557+
SemaRef.Diag(A->getLocation(), diag::warn_attribute_ignored) << A;
558+
FD->dropAttr<SYCLIntelStallEnableAttr>();
559+
}
560+
}
561+
548562
// Propagate the explicit SIMD attribute through call graph - it is used
549563
// to distinguish ESIMD code in ESIMD LLVM passes.
550564
if (KernelBody && KernelBody->hasAttr<SYCLSimdAttr>() &&
@@ -3222,6 +3236,7 @@ void Sema::MarkDevice(void) {
32223236
case attr::Kind::SYCLIntelSchedulerTargetFmaxMhz:
32233237
case attr::Kind::SYCLIntelMaxGlobalWorkDim:
32243238
case attr::Kind::SYCLIntelNoGlobalWorkOffset:
3239+
case attr::Kind::SYCLIntelStallEnable:
32253240
case attr::Kind::SYCLSimd: {
32263241
if ((A->getKind() == attr::Kind::SYCLSimd) && KernelBody &&
32273242
!KernelBody->getAttr<SYCLSimdAttr>()) {
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// RUN: %clang_cc1 -fsycl -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown-sycldevice -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
2+
3+
#include "sycl.hpp"
4+
5+
using namespace cl::sycl;
6+
queue q;
7+
8+
class Foo {
9+
public:
10+
[[intel::stall_enable]] void operator()() const {}
11+
};
12+
13+
int main() {
14+
q.submit([&](handler &h) {
15+
Foo f;
16+
h.single_task<class test_kernel1>(f);
17+
18+
h.single_task<class test_kernel2>(
19+
[]() [[intel::stall_enable]]{});
20+
});
21+
return 0;
22+
}
23+
24+
// CHECK: define spir_kernel void @"{{.*}}test_kernel1"() #0 {{.*}} !stall_enable ![[NUM5:[0-9]+]]
25+
// CHECK: define spir_kernel void @"{{.*}}test_kernel2"() #0 {{.*}} !stall_enable ![[NUM5]]
26+
// CHECK: ![[NUM5]] = !{i32 1}

clang/test/SemaSYCL/stall_enable.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// RUN: %clang_cc1 %s -fsyntax-only -fsycl -internal-isystem %S/Inputs -fsycl-is-device -Wno-sycl-2017-compat -DTRIGGER_ERROR -verify
2+
// RUN: %clang_cc1 -fsycl -fsycl-is-device -internal-isystem %S/Inputs -fsyntax-only -ast-dump -Wno-sycl-2017-compat %s | FileCheck %s
3+
4+
#include "sycl.hpp"
5+
6+
using namespace cl::sycl;
7+
queue q;
8+
9+
[[intel::stall_enable]] void test() {} //expected-warning{{'stall_enable' attribute ignored}}
10+
11+
#ifdef TRIGGER_ERROR
12+
[[intel::stall_enable(1)]] void bar1() {} // expected-error{{'stall_enable' attribute takes no arguments}}
13+
[[intel::stall_enable]] int N; // expected-error{{'stall_enable' attribute only applies to functions}}
14+
#endif
15+
16+
struct FuncObj {
17+
[[intel::stall_enable]] void operator()() const {}
18+
};
19+
20+
int main() {
21+
q.submit([&](handler &h) {
22+
// CHECK-LABEL: FunctionDecl {{.*}}test_kernel1
23+
// CHECK: SYCLIntelStallEnableAttr {{.*}}
24+
h.single_task<class test_kernel1>(
25+
FuncObj());
26+
27+
// CHECK-LABEL: FunctionDecl {{.*}}test_kernel2
28+
// CHECK: SYCLIntelStallEnableAttr {{.*}}
29+
h.single_task<class test_kernel2>(
30+
[]() [[intel::stall_enable]]{});
31+
32+
// CHECK-LABEL: FunctionDecl {{.*}}test_kernel3
33+
// CHECK-NOT: SYCLIntelStallEnableAttr {{.*}}
34+
h.single_task<class test_kernel3>(
35+
[]() { test(); });
36+
});
37+
return 0;
38+
}

llvm/lib/SYCLLowerIR/LowerESIMD.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,15 @@ class ESIMDIntrinDescTable {
252252
{"flat_write", {"svm.scatter", {ai1(3), a(2), a(0), a(1)}}},
253253
{"flat_write4",
254254
{"svm.scatter4.scaled", {ai1(2), t(2), c16(0), c64(0), a(0), a(1)}}},
255+
256+
// surface index-based gather/scatter:
257+
// num blocks, scale, surface index, global offset, elem offsets
258+
{"surf_read", {"gather.scaled2", {t(3), c16(0), aSI(1), a(2), a(3)}}},
259+
// pred, num blocks, scale, surface index, global offset, elem offsets,
260+
// data to write
261+
{"surf_write",
262+
{"scatter.scaled", {ai1(0), t(3), c16(0), aSI(2), a(3), a(4), a(5)}}},
263+
255264
// intrinsics to query thread's coordinates:
256265
{"group_id_x", {"group.id.x", {}}},
257266
{"group_id_y", {"group.id.y", {}}},

sycl/include/CL/sycl/INTEL/esimd/detail/esimd_memory_intrin.hpp

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,92 @@ SYCL_EXTERNAL void __esimd_flat_write4(
8181
sycl::INTEL::gpu::vector_type_t<Ty, N * NumChannels(Mask)> vals,
8282
sycl::INTEL::gpu::vector_type_t<uint16_t, N> pred = 1);
8383

84+
// Low-level surface-based gather. Collects elements located at given offsets in
85+
// a surface and returns them as a single \ref simd object. Element can be
86+
// 1, 2 or 4-byte value, but is always returned as a 4-byte value within the
87+
// resulting simd object, with upper 2 or 3 bytes undefined.
88+
// Template (compile-time constant) parameters:
89+
// @tparam Ty - element type; can only be a 4-byte integer or \c float,
90+
// @tparam N - the number of elements
91+
// @tparam SurfIndAliasTy - "surface index alias" type - internal type in the
92+
// accessor used to denote the surface
93+
// @tparam TySizeLog2 - Log2 of the number of bytes read per element:
94+
// 0 - 1 byte, 1 - 2 bytes, 2 - 4 bytes
95+
// @tparam L1H - L1 cache hint
96+
// @tparam L3H - L3 cache hint
97+
//
98+
// Formal parameters:
99+
// @param scale - the scale; must be 0
100+
// @param surf_ind - the surface index, taken from the SYCL memory object
101+
// @param global_offset - offset added to each individual element's offset to
102+
// compute actual memory access offset for that element
103+
// @param elem_offsets - per-element offsets
104+
//
105+
template <typename Ty, int N, typename SurfIndAliasTy, int TySizeLog2,
106+
sycl::INTEL::gpu::CacheHint L1H = sycl::INTEL::gpu::CacheHint::None,
107+
sycl::INTEL::gpu::CacheHint L3H = sycl::INTEL::gpu::CacheHint::None>
108+
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<Ty, N>
109+
__esimd_surf_read(int16_t scale, SurfIndAliasTy surf_ind,
110+
uint32_t global_offset,
111+
sycl::INTEL::gpu::vector_type_t<uint32_t, N> elem_offsets)
112+
#ifdef __SYCL_DEVICE_ONLY__
113+
;
114+
#else
115+
{
116+
static_assert(N == 1 || N == 8 || N == 16);
117+
static_assert(TySizeLog2 <= 2);
118+
static_assert(std::is_integral<Ty>::value || TySizeLog2 == 2);
119+
throw cl::sycl::feature_not_supported();
120+
}
121+
#endif // __SYCL_DEVICE_ONLY__
122+
123+
// Low-level surface-based scatter. Writes elements of a \ref simd object into a
124+
// surface at given offsets. Element can be a 1, 2 or 4-byte value, but it is
125+
// always represented as a 4-byte value within the input simd object,
126+
// unused (not written) upper bytes are ignored.
127+
// Template (compile-time constant) parameters:
128+
// @tparam Ty - element type; can only be a 4-byte integer or \c float,
129+
// @tparam N - the number of elements to write
130+
// @tparam SurfIndAliasTy - "surface index alias" type - internal type in the
131+
// accessor used to denote the surface
132+
// @tparam TySizeLog2 - Log2 of the number of bytes written per element:
133+
// 0 - 1 byte, 1 - 2 bytes, 2 - 4 bytes
134+
// @tparam L1H - L1 cache hint
135+
// @tparam L3H - L3 cache hint
136+
//
137+
// Formal parameters:
138+
// @param pred - per-element predicates; elements with zero corresponding
139+
// predicates are not written
140+
// @param scale - the scale; must be 0
141+
// @param surf_ind - the surface index, taken from the SYCL memory object
142+
// @param global_offset - offset added to each individual element's offset to
143+
// compute actual memory access offset for that element
144+
// @param elem_offsets - per-element offsets
145+
// @param vals - values to write
146+
//
147+
template <typename Ty, int N, typename SurfIndAliasTy, int TySizeLog2,
148+
sycl::INTEL::gpu::CacheHint L1H = sycl::INTEL::gpu::CacheHint::None,
149+
sycl::INTEL::gpu::CacheHint L3H = sycl::INTEL::gpu::CacheHint::None>
150+
SYCL_EXTERNAL void
151+
__esimd_surf_write(sycl::INTEL::gpu::vector_type_t<uint16_t, N> pred,
152+
int16_t scale, SurfIndAliasTy surf_ind,
153+
uint32_t global_offset,
154+
sycl::INTEL::gpu::vector_type_t<uint32_t, N> elem_offsets,
155+
sycl::INTEL::gpu::vector_type_t<Ty, N> vals)
156+
#ifdef __SYCL_DEVICE_ONLY__
157+
;
158+
#else
159+
{
160+
static_assert(N == 1 || N == 8 || N == 16);
161+
static_assert(TySizeLog2 <= 2);
162+
static_assert(std::is_integral<Ty>::value || TySizeLog2 == 2);
163+
throw cl::sycl::feature_not_supported();
164+
}
165+
#endif // __SYCL_DEVICE_ONLY__
166+
167+
// TODO bring the parameter order of __esimd* intrinsics in accordance with the
168+
// correponsing BE intrinsicics parameter order.
169+
84170
// flat_atomic: flat-address atomic
85171
template <sycl::INTEL::gpu::EsimdAtomicOpType Op, typename Ty, int N,
86172
sycl::INTEL::gpu::CacheHint L1H = sycl::INTEL::gpu::CacheHint::None,

0 commit comments

Comments
 (0)