Skip to content

Commit 59084ff

Browse files
committed
ggml: dynamic x86_64 feature detection for FP32 <-> FP16/BF16 conversion
1 parent 13be08d commit 59084ff

File tree

1 file changed

+174
-46
lines changed

1 file changed

+174
-46
lines changed

ggml/src/ggml.c

Lines changed: 174 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,10 @@
4242
#include <TargetConditionals.h>
4343
#endif
4444

45+
// x86-64 SIMD intrinsics for the F16C / AVX2 / AVX-512 kernels below.
// NOTE(review): MSVC defines _M_X64 rather than __x86_64__; without covering
// it here, the MSVC cpuid path added later in this file is unreachable.
#if defined(__x86_64__) || defined(_M_X64)
#include <immintrin.h>
#endif
48+
4549
#if defined(_WIN32)
4650
#define WIN32_LEAN_AND_MEAN
4751
#ifndef NOMINMAX
@@ -382,61 +386,185 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
382386
}
383387
}
384388

385-
// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library
386-
// currently, the ggml_cpu_has_* functions are entirely compile-time
387-
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
388-
int64_t i = 0;
389-
#if defined(__F16C__)
390-
//if (ggml_cpu_has_f16c()) {
391-
for (; i + 7 < n; i += 8) {
392-
__m256 x_vec = _mm256_loadu_ps(x + i);
393-
__m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
394-
_mm_storeu_si128((__m128i *)(y + i), y_vec);
395-
}
396-
for(; i + 3 < n; i += 4) {
397-
__m128 x_vec = _mm_loadu_ps(x + i);
398-
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
399-
_mm_storel_epi64((__m128i *)(y + i), y_vec);
400-
}
401-
//}
389+
#if defined(__x86_64__)
390+
391+
#if defined(_MSC_VER)
#include <intrin.h>
// Query a CPUID leaf/subleaf through the MSVC intrinsic.
static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
    int regs[4];
    __cpuidex(regs, leaf, subleaf);
    *eax = regs[0];
    *ebx = regs[1];
    *ecx = regs[2];
    *edx = regs[3];
}
#elif defined(__GNUC__) || defined(__clang__)
// Query a CPUID leaf/subleaf via inline assembly (GCC/Clang).
static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
    __asm__ volatile (
        "cpuid"
        : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
        : "a"(leaf), "c"(subleaf)
    );
}
#else
#error Unsupported compiler
#endif
403-
for (; i < n; i++) {
412+
413+
// Return true when the F16C conversion kernels may be used safely.
// The kernels operate on 256-bit YMM registers, so the raw F16C feature bit
// (CPUID.1:ECX bit 29) is not sufficient: AVX (bit 28) must be present and
// the OS must have enabled XSAVE (OSXSAVE, bit 27) with XMM+YMM state in
// XCR0 — otherwise executing the kernel faults even though CPUID says F16C.
static bool x86_64_supports_f16c(void) {
    int eax, ebx, ecx, edx;
    cpuid(1, 0, &eax, &ebx, &ecx, &edx);

    const int needed = (1 << 27) | (1 << 28) | (1 << 29); // OSXSAVE | AVX | F16C
    if ((ecx & needed) != needed) {
        return false;
    }

    // OSXSAVE is set, so XGETBV is available; read XCR0 to confirm the OS
    // saves/restores XMM (bit 1) and YMM (bit 2) register state.
    unsigned int xcr0_lo, xcr0_hi;
#if defined(_MSC_VER)
    xcr0_lo = (unsigned int) _xgetbv(0);
    xcr0_hi = 0;
#else
    __asm__ volatile ("xgetbv" : "=a"(xcr0_lo), "=d"(xcr0_hi) : "c"(0));
#endif
    (void) xcr0_hi;
    return (xcr0_lo & 0x6) == 0x6;
}
418+
419+
// Return true when AVX2 kernels may be used safely. Besides the AVX2 feature
// bit (CPUID.7:EBX bit 5) the OS must have enabled XSAVE for XMM+YMM state,
// otherwise 256-bit instructions fault at runtime.
static bool x86_64_supports_avx2(void) {
    int eax, ebx, ecx, edx;

    // OSXSAVE (CPUID.1:ECX bit 27) gates the XGETBV instruction below.
    cpuid(1, 0, &eax, &ebx, &ecx, &edx);
    if ((ecx & (1 << 27)) == 0) {
        return false;
    }

    // XCR0 must enable XMM (bit 1) and YMM (bit 2) state.
    unsigned int xcr0_lo, xcr0_hi;
#if defined(_MSC_VER)
    xcr0_lo = (unsigned int) _xgetbv(0);
    xcr0_hi = 0;
#else
    __asm__ volatile ("xgetbv" : "=a"(xcr0_lo), "=d"(xcr0_hi) : "c"(0));
#endif
    (void) xcr0_hi;
    if ((xcr0_lo & 0x6) != 0x6) {
        return false;
    }

    // Leaf 7 must exist before we may query it.
    cpuid(0, 0, &eax, &ebx, &ecx, &edx);
    if (eax < 7) {
        return false;
    }
    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
    return (ebx & (1 << 5)) != 0; // CPUID.7:EBX bit 5 = AVX2
}
427+
428+
// Return true when AVX-512F kernels may be used safely. The CPUID feature bit
// (CPUID.7:EBX bit 16) alone is not enough: the OS must save opmask and ZMM
// register state, which is reported in XCR0 (bits 5..7) in addition to the
// XMM/YMM bits.
static bool x86_64_supports_avx512f(void) {
    int eax, ebx, ecx, edx;

    // OSXSAVE (CPUID.1:ECX bit 27) gates the XGETBV instruction below.
    cpuid(1, 0, &eax, &ebx, &ecx, &edx);
    if ((ecx & (1 << 27)) == 0) {
        return false;
    }

    // XCR0 needs XMM|YMM (0x06) plus opmask|ZMM_Hi256|Hi16_ZMM (0xE0).
    unsigned int xcr0_lo, xcr0_hi;
#if defined(_MSC_VER)
    xcr0_lo = (unsigned int) _xgetbv(0);
    xcr0_hi = 0;
#else
    __asm__ volatile ("xgetbv" : "=a"(xcr0_lo), "=d"(xcr0_hi) : "c"(0));
#endif
    (void) xcr0_hi;
    if ((xcr0_lo & 0xE6) != 0xE6) {
        return false;
    }

    // Leaf 7 must exist before we may query it.
    cpuid(0, 0, &eax, &ebx, &ecx, &edx);
    if (eax < 7) {
        return false;
    }
    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
    return (ebx & (1 << 16)) != 0; // CPUID.7:EBX bit 16 = AVX512F
}
435+
436+
static struct ggml_type_traits type_traits[GGML_TYPE_COUNT];
437+
438+
// Scalar fallback: convert FP32 -> FP16 one element at a time through the
// portable conversion macro. Used directly when no SIMD path is available
// and for the tail elements of the vectorized kernels.
static inline void ggml_fp32_to_fp16_generic(const float * x, ggml_fp16_t * y, int64_t n) {
    for (int64_t idx = 0; idx < n; ++idx) {
        y[idx] = GGML_FP32_TO_FP16(x[idx]);
    }
}
407443

408-
void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
444+
// Vectorized FP32 -> FP16 row conversion using F16C: 8 elements per
// iteration, then 4, then a scalar tail via the generic fallback.
// The 256-bit _mm256_loadu_ps requires AVX, so the target list enables both
// "avx" and "f16c" — target("f16c") alone does not guarantee AVX codegen.
// NOTE(review): __attribute__((target(...))) is GCC/Clang-only; the MSVC
// cpuid path earlier in this file suggests MSVC builds are intended —
// confirm how this function is expected to compile under MSVC.
static inline void __attribute__((target("avx,f16c"))) ggml_fp32_to_fp16_row_f16c(const float * x, ggml_fp16_t * y, int64_t n) {
    int64_t i = 0;
    for (; i + 7 < n; i += 8) {
        __m256  x_vec = _mm256_loadu_ps(x + i);
        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
        _mm_storeu_si128((__m128i *)(y + i), y_vec);
    }
    for (; i + 3 < n; i += 4) {
        __m128  x_vec = _mm_loadu_ps(x + i);
        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
        _mm_storel_epi64((__m128i *)(y + i), y_vec);
    }
    ggml_fp32_to_fp16_generic(x + i, y + i, n - i);
}
458+
459+
// Vectorized FP32 -> FP16 row conversion using AVX-512F: 16 elements per
// iteration, remaining elements handled by the F16C kernel.
// Fix: GCC/Clang expect ONE target attribute with a comma-separated ISA list;
// writing target("avx512f"), target("f16c") as two attributes lets one
// silently override the other, so the function could be built without the
// required ISA enabled.
static inline void __attribute__((target("avx512f,f16c"))) ggml_fp32_to_fp16_row_avx512f(const float * x, ggml_fp16_t * y, int64_t n) {
    int64_t i = 0;
    for (; i + 15 < n; i += 16) {
        __m512  x_vec = _mm512_loadu_ps(x + i);
        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
    }
    ggml_fp32_to_fp16_row_f16c(x + i, y + i, n - i);
}
468+
469+
// Convert a row of FP32 values to FP16, dispatching to the best kernel the
// running CPU supports. The choice is made once and cached.
// NOTE(review): the lazy init below is a plain (non-atomic) static write —
// a concurrent first call is a data race; confirm first use is single-threaded
// or make the initialization atomic.
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
    static ggml_from_float_t dispatch = NULL;

    if (dispatch == NULL) {
        const bool f16c    = x86_64_supports_f16c();
        const bool avx512f = x86_64_supports_avx512f();

        if (avx512f && f16c) {
            // use AVX512F
            dispatch = (ggml_from_float_t) ggml_fp32_to_fp16_row_avx512f;
        } else if (f16c) {
            // use F16C
            dispatch = (ggml_from_float_t) ggml_fp32_to_fp16_row_f16c;
        } else {
            // fallback to generic implementation
            dispatch = (ggml_from_float_t) ggml_fp32_to_fp16_generic;
        }

        // Publish the resolved kernel into the type traits table as well.
        // NOTE(review): mutating type_traits from a conversion routine is a
        // surprising side effect — verify from_float_ref consumers expect it.
        type_traits[GGML_TYPE_F16].from_float_ref = dispatch;
    }

    dispatch(x, y, n);
}
491+
492+
#else
493+
// Portable scalar FP32 -> FP16 row conversion for non-x86_64 builds.
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
    for (int64_t idx = 0; idx < n; ++idx) {
        y[idx] = GGML_FP32_TO_FP16(x[idx]);
    }
}
498+
435499
#endif
436-
for (; i < n; i++) {
500+
501+
#if defined(__x86_64__)
502+
503+
504+
// Scalar fallback: expand BF16 -> FP32 one element at a time through the
// portable conversion macro. Also used for the tails of the SIMD kernels.
static inline void ggml_bf16_to_fp32_generic(const ggml_bf16_t * x, float * y, int64_t n) {
    for (int64_t idx = 0; idx < n; ++idx) {
        y[idx] = GGML_BF16_TO_FP32(x[idx]);
    }
}
509+
510+
// BF16 -> FP32 row conversion using AVX2: widen 8 bf16 values per iteration
// by zero-extending to 32 bits, shifting into the high half of each lane,
// and bit-casting the result to float. Tail handled by the scalar fallback.
static inline void __attribute__((target("avx2"))) ggml_bf16_to_fp32_row_avx2(const ggml_bf16_t * x, float * y, int64_t n) {
    int64_t i = 0;
    for (; i + 7 < n; i += 8) {
        __m128i raw  = _mm_loadu_si128((const __m128i *)(x + i));
        __m256i wide = _mm256_cvtepu16_epi32(raw);
        __m256i bits = _mm256_slli_epi32(wide, 16);
        _mm256_storeu_ps(y + i, _mm256_castsi256_ps(bits));
    }
    ggml_bf16_to_fp32_generic(x + i, y + i, n - i);
}
523+
524+
// BF16 -> FP32 row conversion using AVX-512F: widen 16 bf16 values per
// iteration (zero-extend, shift into the high half, bit-cast to float).
// Remaining elements are delegated to the 8-wide AVX2 kernel.
static inline void __attribute__((target("avx512f"))) ggml_bf16_to_fp32_row_avx512f(const ggml_bf16_t * x, float * y, int64_t n) {
    int64_t i = 0;
    for (; i + 15 < n; i += 16) {
        __m256i raw  = _mm256_loadu_si256((const __m256i *)(x + i));
        __m512i wide = _mm512_cvtepu16_epi32(raw);
        __m512i bits = _mm512_slli_epi32(wide, 16);
        _mm512_storeu_ps(y + i, _mm512_castsi512_ps(bits));
    }
    ggml_bf16_to_fp32_row_avx2(x + i, y + i, n - i);
}
537+
538+
void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
539+
static ggml_to_float_t to_float = NULL;
540+
if (to_float != NULL) {
541+
to_float(x, y, n);
542+
return;
543+
}
544+
bool has_avx512f = x86_64_supports_avx512f();
545+
bool has_avx2 = x86_64_supports_avx2();
546+
if (has_avx512f) {
547+
// use AVX512F
548+
to_float = (ggml_to_float_t)ggml_bf16_to_fp32_row_avx512f;
549+
} else if (has_avx2) {
550+
// use AVX2
551+
to_float = (ggml_to_float_t)ggml_bf16_to_fp32_row_avx2;
552+
} else {
553+
// fallback to generic implementation
554+
to_float = (ggml_to_float_t)ggml_bf16_to_fp32_generic;
555+
}
556+
type_traits[GGML_TYPE_BF16].to_float = to_float;
557+
to_float(x, y, n);
558+
}
559+
560+
#else
561+
562+
// Portable scalar BF16 -> FP32 row conversion for non-x86_64 builds.
void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
    for (int64_t idx = 0; idx < n; ++idx) {
        y[idx] = GGML_BF16_TO_FP32(x[idx]);
    }
}
567+
#endif
440568

441569
void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
442570
for (int i = 0; i < n; i++) {
@@ -569,7 +697,7 @@ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const fl
569697
static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
570698
static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
571699

572-
static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
700+
static struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
573701
[GGML_TYPE_I8] = {
574702
.type_name = "i8",
575703
.blck_size = 1,

0 commit comments

Comments
 (0)