Skip to content

ggml : always define ggml_fp16_t as uint16_t #5666

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Feb 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions ggml-cuda.cu
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
#include "ggml-cuda.h"
#include "ggml.h"
#include "ggml-backend-impl.h"

#include <algorithm>
#include <assert.h>
#include <atomic>
Expand Down Expand Up @@ -121,11 +125,6 @@

#endif // defined(GGML_USE_HIPBLAS)

// ggml-cuda need half type so keep ggml headers include at last
#include "ggml-cuda.h"
#include "ggml.h"
#include "ggml-backend-impl.h"

#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)

#define CC_PASCAL 600
Expand Down
27 changes: 20 additions & 7 deletions ggml-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,23 @@ extern "C" {
//
#include <arm_neon.h>

#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)

static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
__fp16 tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp;
}

#define GGML_FP16_TO_FP32(x) ((float) (x))
#define GGML_FP32_TO_FP16(x) (x)
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
__fp16 tmp = f;
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
return res;
}

#else

Expand Down Expand Up @@ -214,17 +226,18 @@ extern float ggml_table_f32_f16[1 << 16];
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)

#if !defined(GGML_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return ggml_table_f32_f16[s];
}

#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif

#if !defined(GGML_FP32_TO_FP16)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif

#define GGML_HASHTABLE_FULL ((size_t)-1)
Expand Down
30 changes: 15 additions & 15 deletions ggml-quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -5629,8 +5629,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r

for (int i = 0; i < nb; ++i) {

const float d = y[i].d * (float)x[i].d;
const float dmin = -y[i].d * (float)x[i].dmin;
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

const uint8_t * restrict q2 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
Expand Down Expand Up @@ -5779,8 +5779,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r

for (int i = 0; i < nb; ++i) {

const float d = y[i].d * (float)x[i].d;
const float dmin = -y[i].d * (float)x[i].dmin;
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

const uint8_t * restrict q2 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
Expand Down Expand Up @@ -6433,7 +6433,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r

int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);

const float d = y[i].d * (float)x[i].d;
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);

const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1));
q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2));
Expand Down Expand Up @@ -6635,7 +6635,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r

int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);

const float d = y[i].d * (float)x[i].d;
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);

vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);

Expand Down Expand Up @@ -7138,9 +7138,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
aux16[1] = (a[0] >> 4) & 0x0f0f;

const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]);
sum_mins += y[i].d * (float)x[i].d[1] * summi;
sum_mins += y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * summi;

const float d = y[i].d * (float)x[i].d[0];
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);

const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);

Expand Down Expand Up @@ -7798,7 +7798,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r

for (int i = 0; i < nb; ++i) {

const float d = y[i].d * (float)x[i].d;
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const int8_t * sc = x[i].scales;

const uint8_t * restrict q5 = x[i].qs;
Expand Down Expand Up @@ -7940,7 +7940,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r

for (int i = 0; i < nb; ++i) {

const float d = y[i].d * (float)x[i].d;
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const int8_t * sc = x[i].scales;

const uint8_t * restrict q5 = x[i].qs;
Expand Down Expand Up @@ -8508,7 +8508,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r

for (int i = 0; i < nb; ++i) {

const float d_all = (float)x[i].d;
const float d_all = GGML_FP16_TO_FP32(x[i].d);

const uint8_t * restrict q6 = x[i].ql;
const uint8_t * restrict qh = x[i].qh;
Expand Down Expand Up @@ -8679,7 +8679,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r

for (int i = 0; i < nb; ++i) {

const float d_all = (float)x[i].d;
const float d_all = GGML_FP16_TO_FP32(x[i].d);

const uint8_t * restrict q6 = x[i].ql;
const uint8_t * restrict qh = x[i].qh;
Expand Down Expand Up @@ -9498,7 +9498,6 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
float sumf = 0;

for (int ib = 0; ib < nb; ib += 2) {

q4bits.val[0] = vld1q_u8(x[ib+0].qs);
q4bits.val[1] = vld1q_u8(x[ib+1].qs);
q8b.val[0] = vld1q_s8(y[ib+0].qs);
Expand All @@ -9514,8 +9513,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);

sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2);

sumf +=
GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
}

*s = sumf;
Expand Down
6 changes: 3 additions & 3 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ float ggml_table_f32_f16[1 << 16];
// note: do not use these inside ggml.c
// these are meant to be used via the ggml.h API
float ggml_fp16_to_fp32(ggml_fp16_t x) {
return (float) GGML_FP16_TO_FP32(x);
return GGML_FP16_TO_FP32(x);
}

ggml_fp16_t ggml_fp32_to_fp16(float x) {
Expand Down Expand Up @@ -798,7 +798,7 @@ inline static float vaddvq_f32(float32x4_t v) {
#define GGML_F16x8 float16x8_t
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
#define GGML_F16x8_SET1(x) vdupq_n_f16(x)
#define GGML_F16x8_LOAD vld1q_f16
#define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
#define GGML_F16x8_STORE vst1q_f16
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
#define GGML_F16x8_ADD vaddq_f16
Expand Down Expand Up @@ -841,7 +841,7 @@ inline static float vaddvq_f32(float32x4_t v) {
#define GGML_F32Cx4 float32x4_t
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x))
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32Cx4_ADD vaddq_f32
Expand Down
6 changes: 0 additions & 6 deletions ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -315,13 +315,7 @@
extern "C" {
#endif

#if defined(__ARM_NEON) && defined(__CUDACC__)
typedef half ggml_fp16_t;
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;
#endif

// convert FP16 <-> FP32
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
Expand Down