@@ -15,37 +15,39 @@ namespace {
 // Values are taken from the LSC_OP enum in
 // Source/visa/include/visa_igc_common_header.h
 enum class AtomicOp : char {
+  Inc = 0x08,
+  Dec = 0x09,
+  Load = 0x0A,
+  Xchg = 0x0B,
   Add = 0x0C,
   Sub = 0x0D,
-  Xchg = 0x0B,
-  And = 0x18,
-  Or = 0x19,
-  Xor = 0x1A,
   SMin = 0x0E,
   SMax = 0x0F,
   UMin = 0x10,
   UMax = 0x11,
-  Dec = 0x09,
-  Inc = 0x08,
-  Load = 0x0A,
+  Cas = 0x12,
+  Fadd = 0x13,
+  Fsub = 0x14,
+  Fmin = 0x15,
+  Fmax = 0x16,
+  Fcas = 0x17,
+  And = 0x18,
+  Or = 0x19,
+  Xor = 0x1A,
 };
 
 template <int N>
 CM_NODEBUG CM_INLINE vector<uint64_t, N>
-__impl_atomic_local_binop(mask<N> pred, AtomicOp op, char l1cachecontrol,
-                          char l3cachecontrol, int base, vector<int, N> index,
-                          short scale, int offset, vector<uint64_t, N> src,
-                          vector<uint64_t, N> passthru) {
+__impl_atomic_local(mask<N> pred, AtomicOp op, char l1cachecontrol,
+                    char l3cachecontrol, int base, vector<int, N> index,
+                    short scale, int offset, vector<uint64_t, N> src1,
+                    vector<uint64_t, N> src2, vector<uint64_t, N> passthru) {
   vector<int, N> addr = base + index * scale + offset;
   vector<uint64_t, N> laddr = addr;
   vector<uint64_t, N> orig =
       detail::__cm_cl_gather(3, laddr.cl_vector(), sizeof(uint64_t),
                              pred.cl_vector(), passthru.cl_vector());
 
-  // Value should be equal to LSC_ATOMIC_ICAS from
-  // Source/visa/include/visa_igc_common_header.h
-  constexpr char OpcodeICAS = 0x12;
-
   // Value should be equal to LSC_ADDR_SIZE_32b from
   // Source/visa/include/visa_igc_common_header.h
   constexpr char AddrSize = 2;
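
(Editor's note: with the enum reordered to mirror the full LSC_OP atomic opcode block, the hard-coded OpcodeICAS constant becomes redundant and the CAS opcode is taken straight from AtomicOp::Cas. A minimal sketch of a compile-time cross-check one could keep next to the enum; the 0x12 value comes from the removed LSC_ATOMIC_ICAS comment, and the asserts themselves are a hypothetical addition, not part of this patch:

  // Hypothetical guard: keep AtomicOp in sync with the vISA LSC_OP values.
  static_assert(static_cast<char>(AtomicOp::Cas) == 0x12,
                "AtomicOp::Cas must equal LSC_ATOMIC_ICAS");
  static_assert(static_cast<char>(AtomicOp::Inc) == 0x08 &&
                    static_cast<char>(AtomicOp::Xor) == 0x1A,
                "AtomicOp must stay in sync with the LSC_OP atomic block");
)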
@@ -58,38 +60,38 @@ __impl_atomic_local_binop(mask<N> pred, AtomicOp op, char l1cachecontrol,
     vector<uint64_t, N> newval = orig;
     switch (op) {
     case AtomicOp::Add:
-      newval += src;
+      newval += src1;
       break;
     case AtomicOp::Sub:
-      newval -= src;
+      newval -= src1;
       break;
     case AtomicOp::And:
-      newval &= src;
+      newval &= src1;
       break;
     case AtomicOp::Or:
-      newval |= src;
+      newval |= src1;
       break;
     case AtomicOp::Xor:
-      newval ^= src;
+      newval ^= src1;
       break;
     case AtomicOp::Xchg:
-      newval = src;
+      newval = src1;
       break;
     case AtomicOp::SMin: {
-      vector<int64_t, N> ssrc = src.template format<int64_t>();
+      vector<int64_t, N> ssrc1 = src1.template format<int64_t>();
       vector<int64_t, N> snewval = newval.template format<int64_t>();
-      newval.merge(src, ssrc < snewval);
+      newval.merge(src1, ssrc1 < snewval);
     } break;
     case AtomicOp::SMax: {
-      vector<int64_t, N> ssrc = src.template format<int64_t>();
+      vector<int64_t, N> ssrc1 = src1.template format<int64_t>();
       vector<int64_t, N> snewval = newval.template format<int64_t>();
-      newval.merge(src, ssrc > snewval);
+      newval.merge(src1, ssrc1 > snewval);
     } break;
     case AtomicOp::UMin:
-      newval.merge(src, src < newval);
+      newval.merge(src1, src1 < newval);
       break;
     case AtomicOp::UMax:
-      newval.merge(src, src > newval);
+      newval.merge(src1, src1 > newval);
       break;
     case AtomicOp::Inc:
       newval = newval + 1;
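
(Editor's note: the src to src1 rename and the new src2 parameter line __impl_atomic_local up for two-operand atomics. A hedged sketch of how an integer Cas case could slot into this switch, mirroring the Fcas handling added in the double overload below; the actual case, if it exists, lies outside this hunk:

    case AtomicOp::Cas:
      // Lanes whose current value matches the expected value (src1)
      // take the replacement value (src2).
      newval.merge(src2, src1 == newval);
      break;
)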
@@ -104,32 +106,184 @@ __impl_atomic_local_binop(mask<N> pred, AtomicOp op, char l1cachecontrol,
     }
 
     vector<uint64_t, N> res = detail::__cm_cl_vector_atomic_slm(
-        pred.cl_vector(), OpcodeICAS, AddrSize, DataSize, l1cachecontrol,
-        l3cachecontrol, 0, addr.cl_vector(), 1, 0, orig.cl_vector(),
-        newval.cl_vector(), orig.cl_vector());
+        pred.cl_vector(), static_cast<char>(AtomicOp::Cas), AddrSize, DataSize,
+        l1cachecontrol, l3cachecontrol, 0, addr.cl_vector(), 1, 0,
+        orig.cl_vector(), newval.cl_vector(), orig.cl_vector());
     pred &= res != orig;
     orig = res;
   } while (pred.any());
 
   return orig;
 }
 
+template <int N>
+CM_NODEBUG CM_INLINE vector<double, N>
+__impl_atomic_global(mask<N> pred, AtomicOp op, char l1cachecontrol,
+                     char l3cachecontrol, long base, vector<long, N> index,
+                     short scale, int offset, vector<double, N> src1,
+                     vector<double, N> src2, vector<double, N> passthru) {
+  vector<long, N> addr = base + index * scale + offset;
+  vector<uint64_t, N> laddr = addr;
+  vector<double, N> orig =
+      detail::__cm_cl_gather(1, laddr.cl_vector(), sizeof(double),
+                             pred.cl_vector(), passthru.cl_vector());
+
+  // Value should be equal to LSC_ADDR_SIZE_64b from
+  // Source/visa/include/visa_igc_common_header.h
+  constexpr char AddrSize = 3;
+
+  // Value should be equal to LSC_DATA_SIZE_64b from
+  // Source/visa/include/visa_igc_common_header.h
+  constexpr char DataSize = 4;
+
+  do {
+    vector<double, N> newval = orig;
+    switch (op) {
+    case AtomicOp::Fadd:
+      newval += src1;
+      break;
+    case AtomicOp::Fsub:
+      newval -= src1;
+      break;
+    case AtomicOp::Fmin:
+      newval.merge(src1, src1 < newval);
+      break;
+    case AtomicOp::Fmax:
+      newval.merge(src1, src1 > newval);
+      break;
+    case AtomicOp::Fcas:
+      newval.merge(src2, src1 == newval);
+      break;
+    default:
+      break;
+    }
+
+    vector<uint64_t, N> iorig = orig.template format<uint64_t>();
+    vector<uint64_t, N> inewval = newval.template format<uint64_t>();
+
+    vector<uint64_t, N> res = detail::__cm_cl_vector_atomic_ugm(
+        pred.cl_vector(), static_cast<char>(AtomicOp::Cas), AddrSize, DataSize,
+        l1cachecontrol, l3cachecontrol, 0, addr.cl_vector(), 1, 0,
+        iorig.cl_vector(), inewval.cl_vector(), iorig.cl_vector());
+    vector<double, N> fres = res.template format<double>();
+    pred &= fres != orig;
+    orig = fres;
+  } while (pred.any());
+
+  return orig;
+}
+
+template <int N>
+CM_NODEBUG CM_INLINE vector<uint32_t, N>
+__impl_atomic_global(mask<N> pred, AtomicOp op, char l1cachecontrol,
+                     char l3cachecontrol, long base, vector<long, N> index,
+                     short scale, int offset, vector<uint32_t, N> src1,
+                     vector<uint32_t, N> src2, vector<uint32_t, N> passthru) {
+  vector<long, N> addr = base + index * scale + offset;
+  vector<uint64_t, N> laddr = addr;
+
+  vector<half, N> hpassthru =
+      passthru.template format<half>().template select<N, 2>(0);
+
+  vector<half, N> orig =
+      detail::__cm_cl_gather(1, laddr.cl_vector(), sizeof(half),
+                             pred.cl_vector(), hpassthru.cl_vector());
+
+  vector<uint32_t, N> iorig;
+
+  // Value should be equal to LSC_ADDR_SIZE_64b from
+  // Source/visa/include/visa_igc_common_header.h
+  constexpr char AddrSize = 3;
+
+  // Value should be equal to LSC_DATA_SIZE_16c32b from
+  // Source/visa/include/visa_igc_common_header.h
+  constexpr char DataSize = 6;
+
+  vector<half, N> hsrc = src1.template format<half>().template select<N, 2>(0);
+
+  do {
+    vector<half, N> newval = orig;
+    switch (op) {
+    case AtomicOp::Fadd:
+      newval += hsrc;
+      break;
+    case AtomicOp::Fsub:
+      newval -= hsrc;
+      break;
+    default:
+      break;
+    }
+
+    iorig.template format<half>().template select<N, 2>(0) = orig;
+
+    vector<uint32_t, N> inewval;
+    inewval.template format<half>().template select<N, 2>(0) = newval;
+
+    vector<uint32_t, N> res = detail::__cm_cl_vector_atomic_ugm(
+        pred.cl_vector(), static_cast<char>(AtomicOp::Cas), AddrSize, DataSize,
+        l1cachecontrol, l3cachecontrol, 0, addr.cl_vector(), 1, 0,
+        iorig.cl_vector(), inewval.cl_vector(), iorig.cl_vector());
+
+    vector<half, N> hres = res.template format<half>().template select<N, 2>(0);
+
+    pred &= hres != orig;
+    orig = hres;
+  } while (pred.any());
+
+  return iorig;
+}
+
 } // namespace
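
(Editor's note: all three helpers emulate the requested operation with the same compare-and-swap retry loop: gather the current values, compute each lane's new value, attempt a CAS, and keep retrying only the lanes whose CAS observed an intervening write. A minimal scalar sketch of that pattern using std::atomic, for illustration only; the function name is hypothetical and the builtins above use the vector LSC CAS, not std::atomic:

  #include <atomic>
  #include <cstdint>

  // Scalar analogue of the retry loop: returns the pre-operation value.
  inline uint64_t emulated_atomic_add(std::atomic<uint64_t> &slot,
                                      uint64_t src1) {
    uint64_t orig = slot.load();
    uint64_t newval;
    do {
      // Recompute from the freshly observed value, just as the vector code
      // recomputes newval from orig on every iteration.
      newval = orig + src1;
      // On failure, compare_exchange_weak reloads the current value into
      // orig, matching "pred &= res != orig; orig = res;" above.
    } while (!slot.compare_exchange_weak(orig, newval));
    return orig;
  }
)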
 
 #define ATOMIC(WIDTH) \
   CM_NODEBUG CM_INLINE extern "C" cl_vector<uint64_t, WIDTH> \
       __vc_builtin_atomic_slm_v##WIDTH##i64( \
           cl_vector<char, WIDTH> pred, AtomicOp op, char l1cachecontrol, \
           char l3cachecontrol, int base, cl_vector<int, WIDTH> index, \
-          short scale, int offset, cl_vector<uint64_t, WIDTH> src, \
+          short scale, int offset, cl_vector<uint64_t, WIDTH> src1, \
+          cl_vector<uint64_t, WIDTH> src2, \
           cl_vector<uint64_t, WIDTH> passthru) { \
     mask<WIDTH> vpred{pred}; \
     vector<int, WIDTH> vindex{index}; \
-    vector<uint64_t, WIDTH> vsrc{src}; \
+    vector<uint64_t, WIDTH> vsrc1{src1}; \
+    vector<uint64_t, WIDTH> vsrc2{src2}; \
     vector<uint64_t, WIDTH> vpassthru{passthru}; \
-    return __impl_atomic_local_binop<WIDTH>(vpred, op, l1cachecontrol, \
-                                            l3cachecontrol, base, vindex, \
-                                            scale, offset, vsrc, vpassthru) \
+    return __impl_atomic_local<WIDTH>(vpred, op, l1cachecontrol, \
+                                      l3cachecontrol, base, vindex, scale, \
+                                      offset, vsrc1, vsrc2, vpassthru) \
+        .cl_vector(); \
+  } \
+  CM_NODEBUG CM_INLINE extern "C" cl_vector<double, WIDTH> \
+      __vc_builtin_atomic_ugm_v##WIDTH##f64( \
+          cl_vector<char, WIDTH> pred, AtomicOp op, char l1cachecontrol, \
+          char l3cachecontrol, long base, cl_vector<long, WIDTH> index, \
+          short scale, int offset, cl_vector<double, WIDTH> src1, \
+          cl_vector<double, WIDTH> src2, cl_vector<double, WIDTH> passthru) { \
+    mask<WIDTH> vpred{pred}; \
+    vector<long, WIDTH> vindex{index}; \
+    vector<double, WIDTH> vsrc1{src1}; \
+    vector<double, WIDTH> vsrc2{src2}; \
+    vector<double, WIDTH> vpassthru{passthru}; \
+    return __impl_atomic_global<WIDTH>(vpred, op, l1cachecontrol, \
+                                       l3cachecontrol, base, vindex, scale, \
+                                       offset, vsrc1, vsrc2, vpassthru) \
+        .cl_vector(); \
+  } \
+  CM_NODEBUG CM_INLINE extern "C" cl_vector<uint32_t, WIDTH> \
+      __vc_builtin_atomic_ugm_v##WIDTH##i32( \
+          cl_vector<char, WIDTH> pred, AtomicOp op, char l1cachecontrol, \
+          char l3cachecontrol, long base, cl_vector<long, WIDTH> index, \
+          short scale, int offset, cl_vector<uint32_t, WIDTH> src1, \
+          cl_vector<uint32_t, WIDTH> src2, \
+          cl_vector<uint32_t, WIDTH> passthru) { \
+    mask<WIDTH> vpred{pred}; \
+    vector<long, WIDTH> vindex{index}; \
+    vector<uint32_t, WIDTH> vsrc1{src1}; \
+    vector<uint32_t, WIDTH> vsrc2{src2}; \
+    vector<uint32_t, WIDTH> vpassthru{passthru}; \
+    return __impl_atomic_global<WIDTH>(vpred, op, l1cachecontrol, \
+                                       l3cachecontrol, base, vindex, scale, \
+                                       offset, vsrc1, vsrc2, vpassthru) \
         .cl_vector(); \
   }
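
(Editor's note: each ATOMIC(WIDTH) invocation now stamps out all three extern "C" entry points, the slm i64, ugm f64, and ugm i32 builtins, for one SIMD width. The instantiation list lies outside this hunk; a hypothetical example of how the macro would typically be used, with the width list being an assumption:

  ATOMIC(1)
  ATOMIC(2)
  ATOMIC(4)
  ATOMIC(8)
  ATOMIC(16)
  ATOMIC(32)
)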