@@ -233,6 +233,98 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif // GGML_CUDA_F16

+#if defined(GGML_USE_HIPBLAS)
+#define __CUDA_ARCH__ 1300
+
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+    defined(__gfx1150__) || defined(__gfx1151__)
+#define RDNA3
+#endif
+
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+#define RDNA2
+#endif
+
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+    const int8x4_t va = reinterpret_cast<const int8x4_t &>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t &>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
+    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+    return reinterpret_cast<const int &>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if (tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if (tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int &>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
+}
+
+static __device__ __forceinline__ int __vsub4(const int a, const int b) {
+    return __vsubss4(a, b);
+}
+
+static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
+    const uint8x4_t & va = reinterpret_cast<const uint8x4_t &>(a);
+    const uint8x4_t & vb = reinterpret_cast<const uint8x4_t &>(b);
+    unsigned int c;
+    uint8x4_t & vc = reinterpret_cast<uint8x4_t &>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
+    }
+    return c;
+}
+
+static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(RDNA3)
+    c = __builtin_amdgcn_sudot4(true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t &>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t &>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+}
+#endif // defined(GGML_USE_HIPBLAS)
+
+#define FP16_AVAILABLE     defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
+    defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL
+
+#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+
+static bool fp16_mma_available(const int cc) {
+    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
+}
+
 [[noreturn]]
 static __device__ void no_device_code(
     const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
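Note on the __dp4a shim above: its generic #else branch computes a 4-way dot product of packed signed 8-bit lanes accumulated into a 32-bit integer. A minimal host-side sketch of the same semantics, using only standard C++; the helper name dp4a_ref is made up for illustration and is not part of the patch:

    #include <cstdint>

    // Reference semantics of dp4a: treat a and b as four packed signed 8-bit
    // lanes, multiply lane-wise, and accumulate the sum of products into c.
    static int dp4a_ref(int a, int b, int c) {
        for (int i = 0; i < 4; ++i) {
            const int8_t va = static_cast<int8_t>(static_cast<unsigned>(a) >> (8 * i));
            const int8_t vb = static_cast<int8_t>(static_cast<unsigned>(b) >> (8 * i));
            c += static_cast<int>(va) * static_cast<int>(vb);
        }
        return c;
    }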
@@ -274,16 +366,28 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 }

 static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#if FP16_AVAILABLE
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
-    }
-    return a;
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
+        reinterpret_cast<half&>(a.x) += __low2half(a_other);
+        reinterpret_cast<half&>(a.y) += __high2half(a_other);
+    }
+    return a;
 #else
-    GGML_UNUSED(a);
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+    }
+    return a;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+#else
+    NO_DEVICE_CODE;
+    return a;
+#endif // FP16_AVAILABLE
 }

 static __device__ __forceinline__ float warp_reduce_max(float x) {
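The HIP branch of warp_reduce_sum(half2) above adds the shuffled value half by half instead of calling __hadd2, but both branches implement the same XOR-mask butterfly reduction. A host-side sketch of that pattern, modelling the 32 warp lanes as an array (illustrative only, not code from the patch):

    #include <cstdio>

    int main() {
        // Each step pairs lane i with lane i ^ mask; after log2(32) = 5 steps
        // every lane holds the full sum, just like the __shfl_xor_sync loop.
        float lane[32];
        for (int i = 0; i < 32; ++i) lane[i] = float(i + 1);   // 1..32, sum = 528
        for (int mask = 16; mask > 0; mask >>= 1) {
            float next[32];
            for (int i = 0; i < 32; ++i) next[i] = lane[i] + lane[i ^ mask];
            for (int i = 0; i < 32; ++i) lane[i] = next[i];
        }
        printf("%.1f\n", lane[0]);   // prints 528.0; every lane ends with the same total
        return 0;
    }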
@@ -295,29 +399,30 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 }

 static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#if FP16_AVAILABLE

-#if CUDART_VERSION >= CUDART_HMAX
-    return __hmax(a, b);
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+    return __float2half(fmaxf(__half2float(a), __half2float(b)));
 #else
-    return __half2float(a) > __half2float(b) ? a : b;
-#endif // CUDART_VERSION >= CUDART_HMAX
+    return __hmax(a, b);
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX

 #else
-    GGML_UNUSED(a);
-    GGML_UNUSED(b);
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+    NO_DEVICE_CODE;
+    GGML_UNUSED(b);
+    return a;
+#endif // FP16_AVAILABLE
 }
+
 static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))

 #if CUDART_VERSION >= CUDART_HMAX
     return __hmax2(a, b);
 #else
     half2 ret;
-    reinterpret_cast<half&>(ret.x) = __low2float(a) > __low2float(b) ? __low2half(a) : __low2half(b);
-    reinterpret_cast<half&>(ret.y) = __high2float(a) > __high2float(b) ? __high2half(a) : __high2half(b);
+    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf(__low2float(a), __low2float(b)));
+    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
     return ret;
 #endif // CUDART_VERSION >= CUDART_HMAX

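The ggml_cuda_hmax/ggml_cuda_hmax2 fallbacks above now go through fmaxf on the float-converted halves rather than a hand-written comparison. A small host-side model of the hmax2 fallback, with plain float standing in for half; the names half2_model and hmax2_model are invented for this sketch:

    #include <math.h>
    #include <stdio.h>

    // Component-wise maximum of two (x, y) pairs via float math, mirroring
    // the __low2float/__high2float + fmaxf fallback path.
    struct half2_model { float x, y; };

    static half2_model hmax2_model(half2_model a, half2_model b) {
        return { fmaxf(a.x, b.x), fmaxf(a.y, b.y) };
    }

    int main() {
        const half2_model r = hmax2_model({1.0f, -2.0f}, {0.5f, 3.0f});
        printf("%.1f %.1f\n", r.x, r.y);   // prints: 1.0 3.0
        return 0;
    }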
@@ -349,94 +454,6 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 }
 #endif // CUDART_VERSION < 12000

-#if defined(GGML_USE_HIPBLAS)
-#define __CUDA_ARCH__ 1300
-
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
-    defined(__gfx1150__) || defined(__gfx1151__)
-#define RDNA3
-#endif
-
-#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
-    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
-#define RDNA2
-#endif
-
-#ifndef __has_builtin
-    #define __has_builtin(x) 0
-#endif
-
-typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
-typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
-static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
-    const int8x4_t va = reinterpret_cast<const int8x4_t &>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t &>(b);
-#if __has_builtin(__builtin_elementwise_sub_sat)
-    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
-    return reinterpret_cast<const int &>(c);
-#else
-    int8x4_t c;
-    int16_t tmp;
-#pragma unroll
-    for (int i = 0; i < 4; i++) {
-        tmp = va[i] - vb[i];
-        if (tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
-        if (tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
-        c[i] = tmp;
-    }
-    return reinterpret_cast<int &>(c);
-#endif // __has_builtin(__builtin_elementwise_sub_sat)
-}
-
-static __device__ __forceinline__ int __vsub4(const int a, const int b) {
-    return __vsubss4(a, b);
-}
-
-static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
-    const uint8x4_t & va = reinterpret_cast<const uint8x4_t &>(a);
-    const uint8x4_t & vb = reinterpret_cast<const uint8x4_t &>(b);
-    unsigned int c;
-    uint8x4_t & vc = reinterpret_cast<uint8x4_t &>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
-    }
-    return c;
-}
-
-static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
-    c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(RDNA3)
-    c = __builtin_amdgcn_sudot4(true, a, true, b, c, false);
-#elif defined(__gfx1010__) || defined(__gfx900__)
-    int tmp1;
-    int tmp2;
-    asm("\n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        "
-        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
-        : "v"(a), "v"(b)
-    );
-#else
-    const int8x4_t va = reinterpret_cast<const int8x4_t &>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t &>(b);
-    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
-#endif
-    return c;
-}
-#endif // defined(GGML_USE_HIPBLAS)
-
-#define FP16_AVAILABLE     defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
-    defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL
-
-#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
-
 // TODO: move to ggml-common.h
 static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

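For context, the new host-side helper fp16_mma_available(cc) introduced in the first hunk filters out AMD devices, whose compute capability appears offset in this backend, and otherwise requires at least Volta. A self-contained usage sketch with stand-in constant values; the real CC_VOLTA and CC_OFFSET_AMD come from this header and may differ:

    #include <stdio.h>

    // Stand-in values for illustration only.
    #define CC_VOLTA      700
    #define CC_OFFSET_AMD 1000000

    static bool fp16_mma_available(const int cc) {
        return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;   // same body as the helper in the patch
    }

    int main() {
        printf("%d\n", fp16_mma_available(750));                   // 1: Turing-class NVIDIA GPU
        printf("%d\n", fp16_mma_available(610));                   // 0: pre-Volta, no FP16 mma
        printf("%d\n", fp16_mma_available(CC_OFFSET_AMD + 1030));  // 0: AMD device
        return 0;
    }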