
Commit 4b6575c

SongXiaoXi authored and pockers21 committed
ggml: move fp16/bf16 conversion optimizations to CPU backend + export conversion APIs (ggml-org#13107)

* ggml: dynamic x86_64 feature detection for FP32 <-> FP16/BF16 conversion
* move fp converter to ggml-cpu
* switch ggml_compute_forward_get_rows_f16/bf16 to the new ggml_cpu_fp16/bf16_to_fp32
1 parent dd23581 · commit 4b6575c

File tree

4 files changed: +101 −50 lines

* ggml/include/ggml-cpu.h
* ggml/src/ggml-cpu/ggml-cpu.c
* ggml/src/ggml-cpu/ops.cpp
* ggml/src/ggml.c


ggml/include/ggml-cpu.h

Lines changed: 5 additions & 0 deletions
```diff
@@ -133,6 +133,11 @@ extern "C" {
 
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
+
 #ifdef __cplusplus
 }
 #endif
```
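The four converters are now part of the public CPU-backend API. A minimal usage sketch (the `demo` function and buffer contents are illustrative, not from the patch):

```c
#include "ggml.h"      // ggml_fp16_t, ggml_bf16_t
#include "ggml-cpu.h"  // the ggml_cpu_* conversion API added by this commit

static void demo(void) {
    float       src[8]  = {0.0f, 1.0f, -1.0f, 0.5f, 2.0f, -2.0f, 3.14f, 1e-3f};
    ggml_fp16_t half[8];
    float       back[8];

    ggml_cpu_fp32_to_fp16(src, half, 8);   // FP32 -> FP16, SIMD where available
    ggml_cpu_fp16_to_fp32(half, back, 8);  // FP16 -> FP32
}
```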

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 89 additions & 2 deletions
```diff
@@ -215,7 +215,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows = 1,
     },
     [GGML_TYPE_F16] = {
-        .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
         .vec_dot_type = GGML_TYPE_F16,
         .nrows = 1,
@@ -356,7 +356,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .from_float = quantize_row_q8_K,
     },
     [GGML_TYPE_BF16] = {
-        .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
         .vec_dot_type = GGML_TYPE_BF16,
         .nrows = 1,
@@ -3166,6 +3166,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
     return ggml_graph_compute(cgraph, &cplan);
 }
 
+void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m512 x_vec = _mm512_loadu_ps(x + i);
+        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
+        __m512 y_vec = _mm512_cvtph_ps(x_vec);
+        _mm512_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
+        __m256 y_vec = _mm256_cvtph_ps(x_vec);
+        _mm256_storeu_ps(y + i, y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
+        __m128 y_vec = _mm_cvtph_ps(x_vec);
+        _mm_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+}
+
+void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = GGML_FP32_TO_BF16(x[i]);
+    }
+}
+
+void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__AVX2__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i,
+                         _mm512_castsi512_ps(
+                             _mm512_slli_epi32(
+                                 _mm512_cvtepu16_epi32(
+                                     _mm256_loadu_si256(
+                                         (const __m256i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i,
+                         _mm256_castsi256_ps(
+                             _mm256_slli_epi32(
+                                 _mm256_cvtepu16_epi32(
+                                     _mm_loadu_si128(
+                                         (const __m128i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i < n; i++) {
+        y[i] = GGML_BF16_TO_FP32(x[i]);
+    }
+}
 
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
```
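The bf16 loops above rely on bf16 being exactly the top half of an IEEE-754 binary32 value. A scalar sketch of the same bit trick the AVX2/AVX512 paths perform (the helper name is hypothetical, not from the patch):

```c
#include <stdint.h>
#include <string.h>

/* bf16 keeps only the high 16 bits of a float's bit pattern, so widening to
   32 bits and shifting left by 16 reconstructs the float exactly (modulo the
   precision already lost when the value was truncated to bf16). */
static float bf16_bits_to_fp32(uint16_t h) {
    uint32_t bits = (uint32_t) h << 16; // same shift as _mm*_slli_epi32(..., 16)
    float f;
    memcpy(&f, &bits, sizeof f);        // bit reinterpretation, no numeric conversion
    return f;
}
```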

ggml/src/ggml-cpu/ops.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16(
 
             GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
-            ggml_fp16_to_fp32_row(
+            ggml_cpu_fp16_to_fp32(
                 (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
         }
@@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16(
 
             GGML_ASSERT(i01 >= 0 && i01 < ne01);
 
-            ggml_bf16_to_fp32_row(
+            ggml_cpu_bf16_to_fp32(
                 (const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
         }
```
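For readers new to ggml's tensor layout: `nb01`/`nb02`/`nb03` are byte strides, so the pointer arithmetic in these calls selects one source row per `(i01, i11, i12)` index before converting it. An illustration (the `row_ptr` helper is hypothetical, not from the patch):

```c
#include <stddef.h>
#include <stdint.h>

/* The i01-th row of the (i11, i12)-th matrix starts at
   base + i01*nb01 + i11*nb02 + i12*nb03, where nb0x are byte strides. */
static const void * row_ptr(const void * base, int64_t i01, int64_t i11, int64_t i12,
                            size_t nb01, size_t nb02, size_t nb03) {
    return (const char *) base + i01*nb01 + i11*nb02 + i12*nb03;
}
```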

ggml/src/ggml.c

Lines changed: 5 additions & 46 deletions
```diff
@@ -4,6 +4,7 @@
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include "ggml-threading.h"
+#include "ggml-cpu.h"
 #include "ggml.h"
 
 // FIXME: required here for quantization functions
@@ -382,58 +383,16 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
     }
 }
 
-// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library
-// currently, the ggml_cpu_has_* functions are entirely compile-time
 void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
-    int64_t i = 0;
-#if defined(__F16C__)
-    //if (ggml_cpu_has_f16c()) {
-        for (; i + 7 < n; i += 8) {
-            __m256 x_vec = _mm256_loadu_ps(x + i);
-            __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-            _mm_storeu_si128((__m128i *)(y + i), y_vec);
-        }
-        for (; i + 3 < n; i += 4) {
-            __m128 x_vec = _mm_loadu_ps(x + i);
-            __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-            _mm_storel_epi64((__m128i *)(y + i), y_vec);
-        }
-    //}
-#endif
-    for (; i < n; i++) {
+    int i = 0;
+    for (; i < n; ++i) {
         y[i] = GGML_FP32_TO_FP16(x[i]);
     }
 }
 
 void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
-    int64_t i = 0;
-#if defined(__AVX512F__)
-    //if (ggml_cpu_has_avx512()) {
-        for (; i + 16 <= n; i += 16) {
-            _mm512_storeu_ps(y + i,
-                             _mm512_castsi512_ps(
-                                 _mm512_slli_epi32(
-                                     _mm512_cvtepu16_epi32(
-                                         _mm256_loadu_si256(
-                                             (const __m256i *)(x + i))),
-                                     16)));
-        }
-    //}
-#endif
-#if defined(__AVX2__)
-    //if (ggml_cpu_has_avx2()) {
-        for (; i + 8 <= n; i += 8) {
-            _mm256_storeu_ps(y + i,
-                             _mm256_castsi256_ps(
-                                 _mm256_slli_epi32(
-                                     _mm256_cvtepu16_epi32(
-                                         _mm_loadu_si128(
-                                             (const __m128i *)(x + i))),
-                                     16)));
-        }
-    //}
-#endif
-    for (; i < n; i++) {
+    int i = 0;
+    for (; i < n; ++i) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
     }
 }
```
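With the SIMD loops moved out, the core-library `*_row` helpers become plain scalar fallbacks and the CPU backend owns the vectorized versions. Since both the scalar macros and the F16C path round to nearest even, the two paths should agree bit-for-bit; a hypothetical sanity check of that expectation (not part of the patch):

```c
#include <assert.h>
#include <string.h>
#include "ggml.h"
#include "ggml-cpu.h"

static void check_fp16_paths(const float * src, int64_t n) {
    ggml_fp16_t a[64], b[64];
    assert(n <= 64);
    ggml_fp32_to_fp16_row(src, a, n);  // scalar loop kept in ggml.c
    ggml_cpu_fp32_to_fp16(src, b, n);  // SIMD loop in the CPU backend
    // both round to nearest even, so the outputs should match bit-for-bit
    assert(memcmp(a, b, n * sizeof(ggml_fp16_t)) == 0);
}
```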
