Skip to content

Commit ff067db

Browse files
committed
ggml : simplify Arm fp16 CPU logic (ggml/1177)
* ggml : simplify Arm fp16 CPU logic ggml-ci * cont : bring back CUDA/MUSA checks ggml-ci
1 parent 36ca8b3 commit ff067db

File tree

3 files changed

+23
-42
lines changed

3 files changed

+23
-42
lines changed

ggml/src/ggml-cpu/ggml-cpu-impl.h

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@
44

55
#include "ggml.h"
66
#include "ggml-impl.h"
7+
78
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
89
//#include <stddef.h>
910
#include <stdbool.h>
1011
#include <string.h> // memcpy
1112
#include <math.h> // fabsf
1213

13-
1414
#ifdef __cplusplus
1515
extern "C" {
1616
#endif
@@ -69,33 +69,16 @@ struct ggml_compute_params {
6969
#endif
7070

7171
#if defined(__ARM_FEATURE_SVE)
72-
#include <arm_sve.h>
7372
#include <sys/prctl.h>
7473
#endif
7574

76-
// 16-bit float
77-
// on Arm, we use __fp16
78-
// on x86, we use uint16_t
7975
#if defined(__ARM_NEON)
8076

81-
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
82-
//
83-
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
84-
//
85-
#include <arm_neon.h>
86-
77+
// ref: https://github.com/ggml-org/llama.cpp/pull/5404
8778
#ifdef _MSC_VER
88-
89-
typedef uint16_t ggml_fp16_internal_t;
90-
9179
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
92-
9380
#else
94-
95-
typedef __fp16 ggml_fp16_internal_t;
96-
9781
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
98-
9982
#endif // _MSC_VER
10083

10184
#if !defined(__aarch64__)

ggml/src/ggml-cpu/simd-mappings.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@
7171
#define GGML_F16x8 float16x8_t
7272
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
7373
#define GGML_F16x8_SET1(x) vdupq_n_f16(x)
74-
#define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x))
74+
#define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
7575
#define GGML_F16x8_STORE vst1q_f16
7676
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
7777
#define GGML_F16x8_ADD vaddq_f16
@@ -99,7 +99,7 @@
9999
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
100100
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
101101
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
102-
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
102+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
103103
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
104104
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
105105
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
@@ -114,7 +114,7 @@
114114
#define GGML_F32Cx4 float32x4_t
115115
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
116116
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
117-
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
117+
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
118118
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
119119
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
120120
#define GGML_F32Cx4_ADD vaddq_f32
@@ -125,7 +125,7 @@
125125
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
126126
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
127127
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
128-
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i])
128+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
129129
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
130130
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
131131
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL

ggml/src/ggml-impl.h

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,6 @@
1616
#include <arm_sve.h>
1717
#endif // __ARM_FEATURE_SVE
1818

19-
#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
20-
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
21-
//
22-
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
23-
//
24-
#include <arm_neon.h>
25-
#endif
26-
2719
#if defined(__F16C__)
2820
#include <immintrin.h>
2921
#endif
@@ -311,29 +303,35 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
311303

312304
// FP16 to FP32 conversion
313305

314-
#if defined(__ARM_NEON)
315-
#if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
316-
typedef uint16_t ggml_fp16_internal_t;
317-
#else
318-
typedef __fp16 ggml_fp16_internal_t;
319-
#endif
320-
#endif
306+
// 16-bit float
307+
// on Arm, we use __fp16
308+
// on x86, we use uint16_t
309+
//
310+
// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
311+
// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
312+
//
313+
#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
314+
315+
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
316+
//
317+
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
318+
//
319+
#include <arm_neon.h>
321320

322-
#if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
323321
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
324322
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
325323

326324
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
327325

328326
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
329-
ggml_fp16_internal_t tmp;
327+
__fp16 tmp;
330328
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
331329
return (float)tmp;
332330
}
333331

334332
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
335333
ggml_fp16_t res;
336-
ggml_fp16_internal_t tmp = f;
334+
__fp16 tmp = f;
337335
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
338336
return res;
339337
}
@@ -485,7 +483,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
485483
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
486484
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
487485

488-
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
486+
#endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
489487

490488
// precomputed f32 table for f16 (256 KB)
491489
// defined in ggml.c, initialized in ggml_init()

0 commit comments

Comments
 (0)