Commit 6af1ca4

HIP: Prepare reduction operators for wave 64

1 parent c300e68
2 files changed: 28 additions & 35 deletions

ggml/src/ggml-cuda/common.cuh

Lines changed: 26 additions & 33 deletions
@@ -190,64 +190,58 @@ static __device__ void no_device_code(
 #define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
 #endif // __CUDA_ARCH__

+template<int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }

+template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
 }

+template<int width = WARP_SIZE>
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, 32);
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, width);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, width);
     }
     return a;
 }

+template<int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #ifdef FP16_AVAILABLE
-
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        const half2 a_other = __shfl_xor_sync(0xffffffff, a, offset, 32);
-        reinterpret_cast<half&>(a.x) += __low2half(a_other);
-        reinterpret_cast<half&>(a.y) += __high2half(a_other);
-    }
-    return a;
-#else
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, 32));
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, width));
     }
     return a;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)

 #else
     NO_DEVICE_CODE;
     return a;
 #endif // FP16_AVAILABLE
 }

+template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width));
     }
     return x;
 }
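The hunk above is the heart of the change: every reduction loop that hard-coded a 32-lane warp (offset = 16, shuffle width 32) now derives both from a width template parameter defaulting to WARP_SIZE, so existing CUDA call sites compile unchanged while HIP builds can instantiate the same helpers at AMD's 64-lane wavefront. Below is a minimal sketch of a caller, assuming a block of exactly width threads and assuming (as this port does) that the *_sync shuffle intrinsics map onto HIP's 64-lane shuffles on AMD; the kernel itself is hypothetical, not part of this commit.

// Hypothetical kernel: sums n floats with a single warp/wavefront.
// width = 32 matches an NVIDIA warp; a HIP wave-64 build could instantiate
// width = 64, making the butterfly offsets run 32, 16, 8, 4, 2, 1.
template <int width>
static __global__ void row_sum_example(const float * x, float * dst, const int n) {
    float sum = 0.0f;
    for (int i = threadIdx.x; i < n; i += width) {
        sum += x[i];
    }
    sum = warp_reduce_sum<width>(sum); // XOR butterfly over `width` lanes
    if (threadIdx.x == 0) {
        *dst = sum;
    }
}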
@@ -269,35 +263,34 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
 }

 static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
-
-#if CUDART_VERSION >= CUDART_HMAX
+#if defined(GGML_USE_HIP) && HIP_VERSION >= 50700000
+    return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
+#elif !defined(GGML_USE_HIP) && CUDART_VERSION >= CUDART_HMAX
     return __hmax2(a, b);
-#else
+#elif !defined(GGML_USE_HIP)
     half2 ret;
     reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
     reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
     return ret;
-#endif // CUDART_VERSION >= CUDART_HMAX
-
 #else
     GGML_UNUSED(a);
     GGML_UNUSED(b);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif
 }

+template<int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
-    for (int offset = 16; offset > 0; offset >>= 1) {
-        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
+    for (int offset = width/2; offset > 0; offset >>= 1) {
+        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
     }
     return x;
 #else
     GGML_UNUSED(x);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 }

 #if CUDART_VERSION < CUDART_HMASK
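A note on the new guards: HIP packs its version into a single integer, HIP_VERSION = MAJOR * 10000000 + MINOR * 100000 + PATCH, so HIP_VERSION >= 50700000 reads as "HIP 5.7.0 or newer", the point from which this commit is willing to take the half-precision __hmax path on AMD. A small sanity-check sketch of the encoding; the helper macro is illustrative, not part of the HIP headers.

// HIP_VERSION encoding: MAJOR * 10000000 + MINOR * 100000 + PATCH.
// Illustrative helper, not HIP's own:
#define HIP_VERSION_ENCODE(major, minor, patch) \
    ((major) * 10000000 + (minor) * 100000 + (patch))

static_assert(HIP_VERSION_ENCODE(5, 7, 0) == 50700000,
              "the guard in ggml_cuda_hmax2 targets HIP 5.7.0 and newer");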

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 2 additions & 2 deletions
@@ -240,8 +240,8 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;

-        info.devices[id].nsm   = prop.multiProcessorCount;
-        info.devices[id].smpb  = prop.sharedMemPerBlock;
+        info.devices[id].nsm       = prop.multiProcessorCount;
+        info.devices[id].smpb      = prop.sharedMemPerBlock;
         info.devices[id].warp_size = prop.warpSize;
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
         info.devices[id].smpbo = prop.sharedMemPerBlock;
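The warp_size field kept as context here is what the templated reductions are being prepared for: prop.warpSize reports 32 on NVIDIA hardware and 64 on AMD's wave-64 architectures (GCN, CDNA). For reference, a standalone HIP probe for the same value; this is an illustrative program using the HIP runtime API, not part of the commit.

// Prints each device's wavefront size via the HIP runtime API.
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
    int count = 0;
    (void) hipGetDeviceCount(&count);
    for (int id = 0; id < count; ++id) {
        hipDeviceProp_t prop;
        (void) hipGetDeviceProperties(&prop, id);
        // 32 on NVIDIA (via HIP's CUDA backend), 64 on wave-64 AMD GPUs
        printf("device %d: warpSize = %d\n", id, prop.warpSize);
    }
    return 0;
}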
