 #include <TargetConditionals.h>
 #endif
 
+#if defined(__x86_64__)
+#include <immintrin.h>
+#endif
+
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
 #ifndef NOMINMAX
@@ -382,61 +386,185 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
     }
 }
 
-// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library
-// currently, the ggml_cpu_has_* functions are entirely compile-time
-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
-    int64_t i = 0;
-#if defined(__F16C__)
-    //if (ggml_cpu_has_f16c()) {
-        for (; i + 7 < n; i += 8) {
-            __m256 x_vec = _mm256_loadu_ps(x + i);
-            __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-            _mm_storeu_si128((__m128i *)(y + i), y_vec);
-        }
-        for(; i + 3 < n; i += 4) {
-            __m128 x_vec = _mm_loadu_ps(x + i);
-            __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-            _mm_storel_epi64((__m128i *)(y + i), y_vec);
-        }
-    //}
+#if defined(__x86_64__)
+
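+// portable CPUID wrapper: MSVC exposes the __cpuidex intrinsic, while
+// GCC/Clang can issue the instruction directly through inline asm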
+#if defined(_MSC_VER)
+#include <intrin.h>
+static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
+    int regs[4];
+    __cpuidex(regs, leaf, subleaf);
+    *eax = regs[0];
+    *ebx = regs[1];
+    *ecx = regs[2];
+    *edx = regs[3];
+}
+#elif defined(__GNUC__) || defined(__clang__)
+static void cpuid(int leaf, int subleaf, int *eax, int *ebx, int *ecx, int *edx) {
+    __asm__ volatile (
+        "cpuid"
+        : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
+        : "a"(leaf), "c"(subleaf)
+    );
+}
+#else
+#error Unsupported compiler
 #endif
-    for (; i < n; i++) {
+
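+// CPUID feature bits: leaf 1 ECX bit 29 reports F16C; leaf 7 subleaf 0
+// EBX bit 5 reports AVX2 and bit 16 AVX512F. Leaf 0 returns the highest
+// supported leaf, so it is checked before querying leaf 7.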
+static bool x86_64_supports_f16c(void) {
+    int eax, ebx, ecx, edx;
+    cpuid(1, 0, &eax, &ebx, &ecx, &edx);
+    return (ecx & (1 << 29)) != 0;
+}
+
+static bool x86_64_supports_avx2(void) {
+    int eax, ebx, ecx, edx;
+    cpuid(0, 0, &eax, &ebx, &ecx, &edx);
+    if (eax < 7) {
+        return false;
+    }
+    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
+    return (ebx & (1 << 5)) != 0;
+}
+
+static bool x86_64_supports_avx512f(void) {
+    int eax, ebx, ecx, edx;
+    cpuid(0, 0, &eax, &ebx, &ecx, &edx);
+    if (eax < 7) {
+        return false;
+    }
+    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
+    return (ebx & (1 << 16)) != 0;
+}
+
+// tentative definition; the table is defined with its initializers further down
+static struct ggml_type_traits type_traits[GGML_TYPE_COUNT];
+
+static inline void ggml_fp32_to_fp16_generic(const float * x, ggml_fp16_t * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
         y[i] = GGML_FP32_TO_FP16(x[i]);
     }
 }
 
-void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
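+// each SIMD variant below carries a function-level target attribute, so this
+// translation unit can be compiled without -mf16c/-mavx2/-mavx512f while the
+// kernels are still emitted; the choice between them happens at runtime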
+static inline void __attribute__((target("f16c"))) ggml_fp32_to_fp16_row_f16c(const float * x, ggml_fp16_t * y, int64_t n) {
     int64_t i = 0;
-#if defined(__AVX512F__)
-    //if (ggml_cpu_has_avx512()) {
-        for (; i + 16 <= n; i += 16) {
-            _mm512_storeu_ps(y + i,
-                             _mm512_castsi512_ps(
-                                 _mm512_slli_epi32(
-                                     _mm512_cvtepu16_epi32(
-                                         _mm256_loadu_si256(
-                                             (const __m256i *)(x + i))),
-                                     16)));
-        }
-    //}
-#endif
-#if defined(__AVX2__)
-    //if (ggml_cpu_has_avx2()) {
-        for (; i + 8 <= n; i += 8) {
-            _mm256_storeu_ps(y + i,
-                             _mm256_castsi256_ps(
-                                 _mm256_slli_epi32(
-                                     _mm256_cvtepu16_epi32(
-                                         _mm_loadu_si128(
-                                             (const __m128i *)(x + i))),
-                                     16)));
-        }
-    //}
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+    ggml_fp32_to_fp16_generic(x + i, y + i, n - i);
+}
+
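+// a single target attribute with a comma-separated list enables both ISAs;
+// the vector tail cascades into the F16C kernel and then the scalar loop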
+static inline void __attribute__((target("avx512f,f16c"))) ggml_fp32_to_fp16_row_avx512f(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i + 15 < n; i += 16) {
+        __m512 x_vec = _mm512_loadu_ps(x + i);
+        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
+    }
+    ggml_fp32_to_fp16_row_f16c(x + i, y + i, n - i);
+}
+
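+// first-call dispatch: probe CPUID once, cache the chosen kernel in a static
+// function pointer, and patch the type_traits entry so conversions routed
+// through the traits table skip this wrapper on subsequent calls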
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
+    static ggml_from_float_t from_float_ref = NULL;
+    if (from_float_ref != NULL) {
+        from_float_ref(x, y, n);
+        return;
+    }
+
+    bool has_avx512f = x86_64_supports_avx512f();
+    bool has_f16c = x86_64_supports_f16c();
+    if (has_avx512f && has_f16c) {
+        // use AVX512F
+        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_row_avx512f;
+    } else if (has_f16c) {
+        // use F16C
+        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_row_f16c;
+    } else {
+        // fall back to the generic implementation
+        from_float_ref = (ggml_from_float_t)ggml_fp32_to_fp16_generic;
+    }
+    type_traits[GGML_TYPE_F16].from_float_ref = from_float_ref;
+    from_float_ref(x, y, n);
+}
+
+#else
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
 #endif
-    for (; i < n; i++) {
+
+#if defined(__x86_64__)
+
+static inline void ggml_bf16_to_fp32_generic(const ggml_bf16_t * x, float * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
+        y[i] = GGML_BF16_TO_FP32(x[i]);
+    }
+}
+
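+// bf16 is the upper half of an IEEE fp32 bit pattern: widen each 16-bit lane
+// to 32 bits and shift left by 16 to reconstruct the float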
+static inline void __attribute__((target("avx2"))) ggml_bf16_to_fp32_row_avx2(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i,
+                         _mm256_castsi256_ps(
+                             _mm256_slli_epi32(
+                                 _mm256_cvtepu16_epi32(
+                                     _mm_loadu_si128(
+                                         (const __m128i *)(x + i))),
+                                 16)));
+    }
+    ggml_bf16_to_fp32_generic(x + i, y + i, n - i);
+}
+
+static inline void __attribute__((target("avx512f"))) ggml_bf16_to_fp32_row_avx512f(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i,
+                         _mm512_castsi512_ps(
+                             _mm512_slli_epi32(
+                                 _mm512_cvtepu16_epi32(
+                                     _mm256_loadu_si256(
+                                         (const __m256i *)(x + i))),
+                                 16)));
+    }
+    ggml_bf16_to_fp32_row_avx2(x + i, y + i, n - i);
+}
+
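+// same first-call dispatch pattern as ggml_fp32_to_fp16_row above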
+void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
+    static ggml_to_float_t to_float = NULL;
+    if (to_float != NULL) {
+        to_float(x, y, n);
+        return;
+    }
+
+    bool has_avx512f = x86_64_supports_avx512f();
+    bool has_avx2 = x86_64_supports_avx2();
+    if (has_avx512f) {
+        // use AVX512F
+        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_row_avx512f;
+    } else if (has_avx2) {
+        // use AVX2
+        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_row_avx2;
+    } else {
+        // fall back to the generic implementation
+        to_float = (ggml_to_float_t)ggml_bf16_to_fp32_generic;
+    }
+    type_traits[GGML_TYPE_BF16].to_float = to_float;
+    to_float(x, y, n);
+}
+
+#else
+
+void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
     }
 }
+#endif
 
 void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
     for (int i = 0; i < n; i++) {
@@ -569,7 +697,7 @@ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const fl
 static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
 static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
 
-static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
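+// no longer const: the runtime dispatchers above overwrite .from_float_ref
+// and .to_float with the kernels detected via CPUID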
+static struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
         .type_name = "i8",
         .blck_size = 1,