#include <clc/clc.h>
- #define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
- typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
- _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
- return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&x[2*offset])); \
- } \
- \
- typedef PRIM_TYPE##3 less_aligned_##ADDR_SPACE##PRIM_TYPE##3 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
- _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
- PRIM_TYPE##2 vec = *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&x[3*offset])); \
- return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset*3+2]); \
- } \
- \
- typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
- _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
- return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4*) (&x[4*offset])); \
- } \
- \
- typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
- _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
- return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8*) (&x[8*offset])); \
- } \
- \
- typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 __attribute__ ((aligned (sizeof(PRIM_TYPE))));\
- _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
- return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16*) (&x[16*offset])); \
- } \
+ #define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
+ typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
+ __attribute__((aligned(sizeof(PRIM_TYPE)))); \
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, \
+ const ADDR_SPACE PRIM_TYPE *x) { \
+ return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
+ *)(&x[2 * offset])); \
+ } \
+ \
+ typedef PRIM_TYPE##3 less_aligned_##ADDR_SPACE##PRIM_TYPE##3 \
+ __attribute__((aligned(sizeof(PRIM_TYPE)))); \
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, \
+ const ADDR_SPACE PRIM_TYPE *x) { \
+ PRIM_TYPE##2 vec = \
+ *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
+ *)(&x[3 * offset])); \
+ return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset * 3 + 2]); \
+ } \
+ \
+ typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \
+ __attribute__((aligned(sizeof(PRIM_TYPE)))); \
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, \
+ const ADDR_SPACE PRIM_TYPE *x) { \
+ return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \
+ *)(&x[4 * offset])); \
+ } \
+ \
+ typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \
+ __attribute__((aligned(sizeof(PRIM_TYPE)))); \
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, \
+ const ADDR_SPACE PRIM_TYPE *x) { \
+ return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \
+ *)(&x[8 * offset])); \
+ } \
+ \
+ typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \
+ __attribute__((aligned(sizeof(PRIM_TYPE)))); \
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16( \
+ size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \
+ *)(&x[16 * offset])); \
+ }
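/* For reference, here is roughly what the vload2 part of
   VLOAD_VECTORIZE(float, __global) expands to (a sketch of the macro's own
   token-pastes, not a line from the diff). The less_aligned_* typedef drops
   the vector type's natural alignment down to the scalar's, because vload
   only guarantees element alignment of the source pointer: */
typedef float2 less_aligned___globalfloat2
    __attribute__((aligned(sizeof(float))));
_CLC_OVERLOAD _CLC_DEF float2 vload2(size_t offset, const __global float *x) {
  return *((const __global less_aligned___globalfloat2 *)(&x[2 * offset]));
}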
- #define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \
- VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \
- VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \
- VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \
- VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \
+ #define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global)
- #define VLOAD_TYPES() \
- VLOAD_ADDR_SPACES(char) \
- VLOAD_ADDR_SPACES(uchar) \
- VLOAD_ADDR_SPACES(short) \
- VLOAD_ADDR_SPACES(ushort) \
- VLOAD_ADDR_SPACES(int) \
- VLOAD_ADDR_SPACES(uint) \
- VLOAD_ADDR_SPACES(long) \
- VLOAD_ADDR_SPACES(ulong) \
- VLOAD_ADDR_SPACES(float) \
+ #define VLOAD_TYPES() \
+ VLOAD_ADDR_SPACES(char) \
+ VLOAD_ADDR_SPACES(uchar) \
+ VLOAD_ADDR_SPACES(short) \
+ VLOAD_ADDR_SPACES(ushort) \
+ VLOAD_ADDR_SPACES(int) \
+ VLOAD_ADDR_SPACES(uint) \
+ VLOAD_ADDR_SPACES(long) \
+ VLOAD_ADDR_SPACES(ulong) \
+ VLOAD_ADDR_SPACES(float)
VLOAD_TYPES()
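/* Usage sketch (hypothetical kernel, relying on the overloads generated by
   VLOAD_TYPES() above): vload4 lets a buffer with only element alignment be
   read a vector at a time. */
__kernel void scale(__global const float *in, __global float *out, float s) {
  size_t i = get_global_id(0);
  float4 v = vload4(i, in); /* reads in[4 * i] .. in[4 * i + 3] */
  vstore4(v * s, i, out);
}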
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-   VLOAD_ADDR_SPACES(double)
+ VLOAD_ADDR_SPACES(double)
#endif
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-   VLOAD_ADDR_SPACES(half)
+ VLOAD_ADDR_SPACES(half)
#endif
/* vload_half are legal even without cl_khr_fp16 */
@@ -71,43 +87,45 @@ float __clc_vload_half_float_helper__global(const __global half *);
float __clc_vload_half_float_helper__local(const __local half *);
float __clc_vload_half_float_helper__private(const __private half *);
- #define VEC_LOAD1(val, AS) val = __clc_vload_half_float_helper##AS(&mem[offset++]);
+ #define VEC_LOAD1(val, AS) \
+ val = __clc_vload_half_float_helper##AS(&mem[offset++]);
#else
#define VEC_LOAD1(val, AS) val = __builtin_load_halff(&mem[offset++]);
#endif
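/* Sketch of the two VEC_LOAD1 branches: without cl_khr_fp16 the macro pastes
   the address space into a helper name, so VEC_LOAD1(val, __global) expands to
     val = __clc_vload_half_float_helper__global(&mem[offset++]);
   using the helper declarations above for the half-to-float conversion; with
   the extension it calls the clang builtin __builtin_load_halff directly. */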
- #define VEC_LOAD2(val, AS) \
- VEC_LOAD1(val.lo, AS) \
- VEC_LOAD1(val.hi, AS)
- #define VEC_LOAD3(val, AS) \
- VEC_LOAD1(val.s0, AS) \
- VEC_LOAD1(val.s1, AS) \
- VEC_LOAD1(val.s2, AS)
- #define VEC_LOAD4(val, AS) \
- VEC_LOAD2(val.lo, AS) \
- VEC_LOAD2(val.hi, AS)
- #define VEC_LOAD8(val, AS) \
- VEC_LOAD4(val.lo, AS) \
- VEC_LOAD4(val.hi, AS)
- #define VEC_LOAD16(val, AS) \
- VEC_LOAD8(val.lo, AS) \
- VEC_LOAD8(val.hi, AS)
+ #define VEC_LOAD2(val, AS) \
+ VEC_LOAD1(val.lo, AS) \
+ VEC_LOAD1(val.hi, AS)
+ #define VEC_LOAD3(val, AS) \
+ VEC_LOAD1(val.s0, AS) \
+ VEC_LOAD1(val.s1, AS) \
+ VEC_LOAD1(val.s2, AS)
+ #define VEC_LOAD4(val, AS) \
+ VEC_LOAD2(val.lo, AS) \
+ VEC_LOAD2(val.hi, AS)
+ #define VEC_LOAD8(val, AS) \
+ VEC_LOAD4(val.lo, AS) \
+ VEC_LOAD4(val.hi, AS)
+ #define VEC_LOAD16(val, AS) \
+ VEC_LOAD8(val.lo, AS) \
+ VEC_LOAD8(val.hi, AS)
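/* The VEC_LOADn macros recurse through a vector's .lo/.hi halves (and through
   .s0/.s1/.s2 for 3-element vectors), so VEC_LOAD4(v, __global) unrolls to
   four scalar half loads, roughly:
     v.lo.lo = __clc_vload_half_float_helper__global(&mem[offset++]);
     v.lo.hi = __clc_vload_half_float_helper__global(&mem[offset++]);
     v.hi.lo = __clc_vload_half_float_helper__global(&mem[offset++]);
     v.hi.hi = __clc_vload_half_float_helper__global(&mem[offset++]); */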
- #define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \
- _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset, const AS half *mem) { \
- offset *= VEC_SIZE; \
- TYPE __tmp; \
- VEC_LOAD##VEC_SIZE( __tmp, AS) \
- return __tmp; \
- } \
- _CLC_OVERLOAD _CLC_DEF TYPE vloada_half##SUFFIX(size_t offset, const AS half *mem) { \
- offset *= OFFSET_SIZE; \
- TYPE __tmp; \
- VEC_LOAD##VEC_SIZE( __tmp, AS) \
- return __tmp; \
+ #define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \
+ _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset, \
+ const AS half *mem) { \
+ offset *= VEC_SIZE; \
+ TYPE __tmp; \
+ VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp; \
+ } \
+ _CLC_OVERLOAD _CLC_DEF TYPE vloada_half##SUFFIX(size_t offset, \
+ const AS half *mem) { \
+ offset *= OFFSET_SIZE; \
+ TYPE __tmp; \
+ VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp; \
}
- #define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)
+ #define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \
+ __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)
#define __CLC_BODY "vload_half.inc"
#include <clc/math/gentype.inc>
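/* gentype.inc repeatedly re-includes the file named by __CLC_BODY
   (vload_half.inc here) with the gentype macros set for each scalar and
   vector type, and vload_half.inc invokes FUNC for every width, producing the
   full vload_half/vloada_half overload set. Usage sketch (hypothetical
   kernel): */
__kernel void widen(__global const half *h, __global float4 *out) {
  size_t i = get_global_id(0);
  out[i] = vload_half4(i, h); /* reads h[4 * i] .. h[4 * i + 3] as a float4 */
  /* vloada_half3, by contrast, steps by 4 elements per offset (OFFSET_SIZE),
     while vload_half3 steps by 3 (VEC_SIZE). */
}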