Skip to content

Commit 7303f8e

Browse files
frasercrmck authored and jsji committed
[libclc] Support the generic address space (#137183)
This commit provides definitions of builtins with the generic address space.

One concept to consider is the difference between supporting the generic address space from the user's perspective and the requirement for libclc, as a compiler implementation detail, to define separate generic address space builtins. In practice a target (like NVPTX) might notionally support the generic address space while mapping it to the same LLVM target address space as another address space (often the private one). In such cases libclc must be careful not to define both private and generic overloads of the same builtin.

We track these two concepts separately, and make the assumption that if the generic address space does clash with another, it's with the private one. We track the concepts separately because there are some builtins, such as atomics, that are defined for the generic address space but not the private address space.

Conflicts:
    libclc/clc/include/clc/clcfunc.h
    libclc/clc/include/clc/math/remquo_decl.inc
    libclc/clc/include/clc/math/unary_decl_with_int_ptr.inc
    libclc/clc/include/clc/math/unary_decl_with_ptr.inc
    libclc/clc/lib/generic/math/clc_remquo.cl
    libclc/opencl/include/clc/opencl/shared/vload.h
    libclc/opencl/include/clc/opencl/shared/vstore.h
    libclc/opencl/lib/generic/shared/vload.cl
    libclc/opencl/lib/generic/shared/vload_half.inc
1 parent 2133dc5 commit 7303f8e

File tree

17 files changed

+107
-292
lines changed

17 files changed

+107
-292
lines changed

libclc/CMakeLists.txt

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,32 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
531531
list( APPEND build_flags -mcpu=${cpu} )
532532
endif()
533533

534+
# Generic address space support.
535+
# Note: when declaring builtins, we must consider that even if a target
536+
# formally/nominally supports the generic address space, in practice that
537+
# target may map it to the same target address space as another address
538+
# space (often the private one). In such cases we must be careful not to
539+
# multiply-define a builtin in a single target address space, as it would
540+
# result in a mangling clash.
541+
# For this reason we must consider the target support of the generic
542+
# address space separately from the *implementation* decision about whether
543+
# to declare certain builtins in that address space.
544+
# Note: we assume that if there is no distinct generic address space, it
545+
# maps to the private address space.
546+
set ( private_addrspace_val 0 )
547+
set ( generic_addrspace_val 0 )
548+
if( ARCH STREQUAL amdgcn OR ARCH STREQUAL r600 OR ARCH STREQUAL amdgcn-amdhsa )
549+
set ( private_addrspace_val 5 )
550+
endif()
551+
if( ARCH STREQUAL spirv OR ARCH STREQUAL spirv64
552+
OR ARCH STREQUAL clspv OR ARCH STREQUAL clspv64 )
553+
set ( generic_addrspace_val 4 )
554+
endif()
555+
list( APPEND build_flags
556+
-D__CLC_PRIVATE_ADDRSPACE_VAL=${private_addrspace_val}
557+
-D__CLC_GENERIC_ADDRSPACE_VAL=${generic_addrspace_val}
558+
)
559+
534560
set( clc_build_flags ${build_flags} -DCLC_INTERNAL )
535561

536562
add_libclc_builtin_set(

libclc/clc/include/clc/clcfunc.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,7 @@
3030
(__OPENCL_C_VERSION__ >= CL_VERSION_3_0 && \
3131
defined(__opencl_c_generic_address_space))
3232
#define _CLC_GENERIC_AS_SUPPORTED 1
33-
// Note that we hard-code the assumption that a non-distinct address space means
34-
// that the target maps the generic address space to the private address space.
35-
#ifdef __CLC_DISTINCT_GENERIC_ADDRSPACE__
33+
#if __CLC_PRIVATE_ADDRSPACE_VAL != __CLC_GENERIC_ADDRSPACE_VAL
3634
#define _CLC_DISTINCT_GENERIC_AS_SUPPORTED 1
3735
#else
3836
#define _CLC_DISTINCT_GENERIC_AS_SUPPORTED 0

libclc/clc/include/clc/math/remquo_decl.inc

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,19 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9-
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(
10-
__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_ADDRESS_SPACE __CLC_INTN *q);
9+
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
10+
__CLC_GENTYPE y,
11+
private __CLC_INTN *q);
12+
13+
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
14+
__CLC_GENTYPE y,
15+
global __CLC_INTN *q);
16+
17+
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
18+
__CLC_GENTYPE y,
19+
local __CLC_INTN *q);
20+
#if _CLC_GENERIC_AS_SUPPORTED
21+
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
22+
__CLC_GENTYPE y,
23+
generic __CLC_INTN *q);
24+
#endif

libclc/clc/include/clc/math/unary_decl_with_int_ptr.inc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
1212
local __CLC_INTN *iptr);
1313
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
1414
private __CLC_INTN *iptr);
15-
#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
15+
#if _CLC_GENERIC_AS_SUPPORTED
1616
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
1717
generic __CLC_INTN *iptr);
1818
#endif

libclc/clc/include/clc/math/unary_decl_with_ptr.inc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
1313
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE
1414
__CLC_FUNCTION(__CLC_GENTYPE x, private __CLC_GENTYPE *ptr);
1515

16-
#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
16+
#if _CLC_GENERIC_AS_SUPPORTED
1717
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE
1818
__CLC_FUNCTION(__CLC_GENTYPE x, generic __CLC_GENTYPE *ptr);
1919
#endif

libclc/clc/lib/generic/math/clc_fract.inc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,5 +34,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_fract(__CLC_GENTYPE x,
3434

3535
FRACT_DEF(local);
3636
FRACT_DEF(global);
37+
#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
38+
FRACT_DEF(generic);
39+
#endif
3740

3841
#undef MIN_CONSTANT

libclc/clc/lib/generic/math/clc_frexp.cl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
//===----------------------------------------------------------------------===//
88

99
#include <clc/clc_convert.h>
10+
#include <clc/clcfunc.h>
1011
#include <clc/internal/clc.h>
1112
#include <clc/math/math.h>
1213
#include <clc/relational/clc_isinf.h>

libclc/clc/lib/generic/math/clc_remquo.cl

Lines changed: 16 additions & 258 deletions
Original file line numberDiff line numberDiff line change
@@ -18,262 +18,20 @@
1818
#include <clc/math/math.h>
1919
#include <clc/shared/clc_max.h>
2020

21-
_CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y,
22-
__private int *quo) {
23-
x = __clc_flush_denormal_if_not_supported(x);
24-
y = __clc_flush_denormal_if_not_supported(y);
25-
int ux = __clc_as_int(x);
26-
int ax = ux & EXSIGNBIT_SP32;
27-
float xa = __clc_as_float(ax);
28-
int sx = ux ^ ax;
29-
int ex = ax >> EXPSHIFTBITS_SP32;
30-
31-
int uy = __clc_as_int(y);
32-
int ay = uy & EXSIGNBIT_SP32;
33-
float ya = __clc_as_float(ay);
34-
int sy = uy ^ ay;
35-
int ey = ay >> EXPSHIFTBITS_SP32;
36-
37-
float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff));
38-
float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff));
39-
int c;
40-
int k = ex - ey;
41-
42-
uint q = 0;
43-
44-
while (k > 0) {
45-
c = xr >= yr;
46-
q = (q << 1) | c;
47-
xr -= c ? yr : 0.0f;
48-
xr += xr;
49-
--k;
50-
}
51-
52-
c = xr > yr;
53-
q = (q << 1) | c;
54-
xr -= c ? yr : 0.0f;
55-
56-
int lt = ex < ey;
57-
58-
q = lt ? 0 : q;
59-
xr = lt ? xa : xr;
60-
yr = lt ? ya : yr;
61-
62-
c = (yr < 2.0f * xr) | ((yr == 2.0f * xr) & ((q & 0x1) == 0x1));
63-
xr -= c ? yr : 0.0f;
64-
q += c;
65-
66-
float s = __clc_as_float(ey << EXPSHIFTBITS_SP32);
67-
xr *= lt ? 1.0f : s;
68-
69-
int qsgn = sx == sy ? 1 : -1;
70-
int quot = (q & 0x7f) * qsgn;
71-
72-
c = ax == ay;
73-
quot = c ? qsgn : quot;
74-
xr = c ? 0.0f : xr;
75-
76-
xr = __clc_as_float(sx ^ __clc_as_int(xr));
77-
78-
c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 |
79-
ay == 0;
80-
quot = c ? 0 : quot;
81-
xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr;
82-
83-
*quo = quot;
84-
85-
return xr;
86-
}
87-
// remquo signature is special, we don't have macro for this
88-
#define __VEC_REMQUO(TYPE, VEC_SIZE, HALF_VEC_SIZE) \
89-
_CLC_DEF _CLC_OVERLOAD TYPE##VEC_SIZE __clc_remquo( \
90-
TYPE##VEC_SIZE x, TYPE##VEC_SIZE y, __private int##VEC_SIZE *quo) { \
91-
int##HALF_VEC_SIZE lo, hi; \
92-
TYPE##VEC_SIZE ret; \
93-
ret.lo = __clc_remquo(x.lo, y.lo, &lo); \
94-
ret.hi = __clc_remquo(x.hi, y.hi, &hi); \
95-
(*quo).lo = lo; \
96-
(*quo).hi = hi; \
97-
return ret; \
98-
}
99-
100-
#define __VEC3_REMQUO(TYPE) \
101-
_CLC_DEF _CLC_OVERLOAD TYPE##3 __clc_remquo(TYPE##3 x, TYPE##3 y, \
102-
__private int##3 * quo) { \
103-
int2 lo; \
104-
int hi; \
105-
TYPE##3 ret; \
106-
ret.s01 = __clc_remquo(x.s01, y.s01, &lo); \
107-
ret.s2 = __clc_remquo(x.s2, y.s2, &hi); \
108-
(*quo).s01 = lo; \
109-
(*quo).s2 = hi; \
110-
return ret; \
111-
}
112-
__VEC_REMQUO(float, 2, )
113-
__VEC3_REMQUO(float)
114-
__VEC_REMQUO(float, 4, 2)
115-
__VEC_REMQUO(float, 8, 4)
116-
__VEC_REMQUO(float, 16, 8)
117-
118-
#ifdef cl_khr_fp64
119-
120-
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
121-
122-
_CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y,
123-
__private int *pquo) {
124-
ulong ux = __clc_as_ulong(x);
125-
ulong ax = ux & ~SIGNBIT_DP64;
126-
ulong xsgn = ux ^ ax;
127-
double dx = __clc_as_double(ax);
128-
int xexp = __clc_convert_int(ax >> EXPSHIFTBITS_DP64);
129-
int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64);
130-
xexp1 = xexp < 1 ? xexp1 : xexp;
131-
132-
ulong uy = __clc_as_ulong(y);
133-
ulong ay = uy & ~SIGNBIT_DP64;
134-
double dy = __clc_as_double(ay);
135-
int yexp = __clc_convert_int(ay >> EXPSHIFTBITS_DP64);
136-
int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64);
137-
yexp1 = yexp < 1 ? yexp1 : yexp;
138-
139-
int qsgn = ((ux ^ uy) & SIGNBIT_DP64) == 0UL ? 1 : -1;
140-
141-
// First assume |x| > |y|
142-
143-
// Set ntimes to the number of times we need to do a
144-
// partial remainder. If the exponent of x is an exact multiple
145-
// of 53 larger than the exponent of y, and the mantissa of x is
146-
// less than the mantissa of y, ntimes will be one too large
147-
// but it doesn't matter - it just means that we'll go round
148-
// the loop below one extra time.
149-
int ntimes = __clc_max(0, (xexp1 - yexp1) / 53);
150-
double w = __clc_ldexp(dy, ntimes * 53);
151-
w = ntimes == 0 ? dy : w;
152-
double scale = ntimes == 0 ? 1.0 : 0x1.0p-53;
153-
154-
// Each time round the loop we compute a partial remainder.
155-
// This is done by subtracting a large multiple of w
156-
// from x each time, where w is a scaled up version of y.
157-
// The subtraction must be performed exactly in quad
158-
// precision, though the result at each stage can
159-
// fit exactly in a double precision number.
160-
int i;
161-
double t, v, p, pp;
162-
163-
for (i = 0; i < ntimes; i++) {
164-
// Compute integral multiplier
165-
t = __clc_trunc(dx / w);
166-
167-
// Compute w * t in quad precision
168-
p = w * t;
169-
pp = __clc_fma(w, t, -p);
170-
171-
// Subtract w * t from dx
172-
v = dx - p;
173-
dx = v + (((dx - v) - p) - pp);
174-
175-
// If t was one too large, dx will be negative. Add back one w.
176-
dx += dx < 0.0 ? w : 0.0;
177-
178-
// Scale w down by 2^(-53) for the next iteration
179-
w *= scale;
180-
}
181-
182-
// One more time
183-
// Variable todd says whether the integer t is odd or not
184-
t = __clc_floor(dx / w);
185-
long lt = (long)t;
186-
int todd = lt & 1;
187-
188-
p = w * t;
189-
pp = __clc_fma(w, t, -p);
190-
v = dx - p;
191-
dx = v + (((dx - v) - p) - pp);
192-
i = dx < 0.0;
193-
todd ^= i;
194-
dx += i ? w : 0.0;
195-
196-
lt -= i;
197-
198-
// At this point, dx lies in the range [0,dy)
199-
200-
// For the remainder function, we need to adjust dx
201-
// so that it lies in the range (-y/2, y/2] by carefully
202-
// subtracting w (== dy == y) if necessary. The rigmarole
203-
// with todd is to get the correct sign of the result
204-
// when x/y lies exactly half way between two integers,
205-
// when we need to choose the even integer.
206-
207-
int al = (2.0 * dx > w) | (todd & (2.0 * dx == w));
208-
double dxl = dx - (al ? w : 0.0);
209-
210-
int ag = (dx > 0.5 * w) | (todd & (dx == 0.5 * w));
211-
double dxg = dx - (ag ? w : 0.0);
212-
213-
dx = dy < 0x1.0p+1022 ? dxl : dxg;
214-
lt += dy < 0x1.0p+1022 ? al : ag;
215-
int quo = ((int)lt & 0x7f) * qsgn;
216-
217-
double ret = __clc_as_double(xsgn ^ __clc_as_ulong(dx));
218-
dx = __clc_as_double(ax);
219-
220-
// Now handle |x| == |y|
221-
int c = dx == dy;
222-
t = __clc_as_double(xsgn);
223-
quo = c ? qsgn : quo;
224-
ret = c ? t : ret;
225-
226-
// Next, handle |x| < |y|
227-
c = dx < dy;
228-
quo = c ? 0 : quo;
229-
ret = c ? x : ret;
230-
231-
c &= (yexp<1023 & 2.0 * dx> dy) | (dx > 0.5 * dy);
232-
quo = c ? qsgn : quo;
233-
// we could use a conversion here instead since qsgn = +-1
234-
p = qsgn == 1 ? -1.0 : 1.0;
235-
t = __clc_fma(y, p, x);
236-
ret = c ? t : ret;
237-
238-
// We don't need anything special for |x| == 0
239-
240-
// |y| is 0
241-
c = dy == 0.0;
242-
quo = c ? 0 : quo;
243-
ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret;
244-
245-
// y is +-Inf, NaN
246-
c = yexp > BIASEDEMAX_DP64;
247-
quo = c ? 0 : quo;
248-
t = y == y ? x : y;
249-
ret = c ? t : ret;
250-
251-
// x is +=Inf, NaN
252-
c = xexp > BIASEDEMAX_DP64;
253-
quo = c ? 0 : quo;
254-
ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret;
255-
256-
*pquo = quo;
257-
return ret;
258-
}
259-
__VEC_REMQUO(double, 2, )
260-
__VEC3_REMQUO(double)
261-
__VEC_REMQUO(double, 4, 2)
262-
__VEC_REMQUO(double, 8, 4)
263-
__VEC_REMQUO(double, 16, 8)
264-
#endif
265-
266-
#ifdef cl_khr_fp16
267-
268-
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
269-
270-
_CLC_OVERLOAD _CLC_DEF half __clc_remquo(half x, half y, __private int *pquo) {
271-
return (half)__clc_remquo((float)x, (float)y, pquo);
272-
}
273-
__VEC_REMQUO(half, 2, )
274-
__VEC3_REMQUO(half)
275-
__VEC_REMQUO(half, 4, 2)
276-
__VEC_REMQUO(half, 8, 4)
277-
__VEC_REMQUO(half, 16, 8)
278-
21+
#define __CLC_ADDRESS_SPACE private
22+
#include <clc_remquo.inc>
23+
#undef __CLC_ADDRESS_SPACE
24+
25+
#define __CLC_ADDRESS_SPACE global
26+
#include <clc_remquo.inc>
27+
#undef __CLC_ADDRESS_SPACE
28+
29+
#define __CLC_ADDRESS_SPACE local
30+
#include <clc_remquo.inc>
31+
#undef __CLC_ADDRESS_SPACE
32+
33+
#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
34+
#define __CLC_ADDRESS_SPACE generic
35+
#include <clc_remquo.inc>
36+
#undef __CLC_ADDRESS_SPACE
27937
#endif

0 commit comments

Comments
 (0)