@@ -190,64 +190,58 @@ static __device__ void no_device_code(
 #define NO_DEVICE_CODE // GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
 #endif // __CUDA_ARCH__
 
+template <int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }
 
+template <int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
 }
 
+template <int width = WARP_SIZE>
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, width);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, width);
     }
     return a;
 }
 
+template <int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #ifdef FP16_AVAILABLE
-
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        const half2 a_other = __shfl_xor_sync(0xffffffff, a, offset, 32);
-        reinterpret_cast<half&>(a.x) += __low2half(a_other);
-        reinterpret_cast<half&>(a.y) += __high2half(a_other);
-    }
-    return a;
-#else
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, 32));
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, width));
     }
     return a;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 
 #else
     NO_DEVICE_CODE;
     return a;
 #endif // FP16_AVAILABLE
 }
 
+template <int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width));
    }
     return x;
 }
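The reductions above now take a compile-time width that defaults to WARP_SIZE, so existing call sites keep the full 32-lane behaviour while new code can reduce over a sub-warp segment. A minimal sketch of such a call, assuming this header is included; the kernel, its name, and the half-warp layout are illustrative only and not part of this patch:

    // Hypothetical example, not part of this patch: two independent 16-lane
    // sums inside one 32-thread warp. With width = 16 the butterfly offsets
    // are 8, 4, 2, 1 and __shfl_xor_sync only exchanges values within each
    // 16-lane segment, so lanes 0-15 and 16-31 produce separate row sums.
    // Assumes blockDim.x == 32 and gridDim.x == nrows/2.
    static __global__ void row_sum_halfwarp(const float * x, float * dst, const int ncols) {
        const int row  = blockIdx.x*2 + threadIdx.x/16;
        const int lane = threadIdx.x % 16;

        float sum = 0.0f;
        for (int col = lane; col < ncols; col += 16) {
            sum += x[row*ncols + col];
        }
        sum = warp_reduce_sum<16>(sum); // reduce within the 16-lane half-warp

        if (lane == 0) {
            dst[row] = sum;
        }
    }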
@@ -269,35 +263,34 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
 }
 
 static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
-
-#if CUDART_VERSION >= CUDART_HMAX
+#if defined(GGML_USE_HIP) && HIP_VERSION >= 50700000
+    return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
+#elif !defined(GGML_USE_HIP) && CUDART_VERSION >= CUDART_HMAX
     return __hmax2(a, b);
-#else
+#elif !defined(GGML_USE_HIP)
     half2 ret;
     reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
     reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
     return ret;
-#endif // CUDART_VERSION >= CUDART_HMAX
-
 #else
     GGML_UNUSED(a);
     GGML_UNUSED(b);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif
 }
 
+template <int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
     }
     return x;
 #else
     GGML_UNUSED(x);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 }
 
 #if CUDART_VERSION < CUDART_HMASK
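For the packed half2 path, a similar hedged sketch of how warp_reduce_max and ggml_cuda_hmax2 might be used together, assuming this header is included and an FP16-capable target that satisfies the condition above (Pascal or newer on CUDA, or ROCm with HIP_VERSION >= 5.7); the kernel and its launch layout are illustrative only and not part of this patch:

    // Hypothetical example, not part of this patch: one warp computes the
    // maximum of a row of packed half2 values. Assumes blockDim.x == WARP_SIZE
    // and ncols2 half2 elements per row; calling warp_reduce_max without an
    // explicit template argument uses the default width = WARP_SIZE.
    static __global__ void row_max_half2(const half2 * x, half2 * dst, const int ncols2) {
        const int row  = blockIdx.x;
        const int lane = threadIdx.x;

        half2 vmax = __float2half2_rn(-65504.0f); // most negative finite half as the identity
        for (int col = lane; col < ncols2; col += WARP_SIZE) {
            vmax = ggml_cuda_hmax2(vmax, x[row*ncols2 + col]);
        }
        vmax = warp_reduce_max(vmax); // every lane now holds the row maximum

        if (lane == 0) {
            dst[row] = vmax;
        }
    }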