
ggml : fix loongson compile warnings #7537


Merged
merged 3 commits on May 31, 2024
ggml-quants.c (26 changes: 23 additions & 3 deletions)
@@ -6088,6 +6088,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r

const uint8_t * restrict q2 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;

const __m128i mins_and_scales = __lsx_vld((const __m128i*)x[i].scales, 0);
const __m128i scales8 = __lsx_vand_v(mins_and_scales, m4);
const __m128i mins8 = __lsx_vand_v(__lsx_vsrli_h(mins_and_scales, 4), m4);
@@ -6807,6 +6808,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
for (int i = 0; i < nb; ++i) {

const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const uint8_t * restrict q3 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
// Set up scales
memcpy(aux, x[i].scales, 12);
__m128i scales128 = lsx_set_w(
@@ -6830,8 +6833,6 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
int is = 0;
__m256i xvbit;

const uint8_t * restrict q3 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;

for (int j = 0; j < QK_K/128; ++j) {
// load low 2 bits
@@ -7404,6 +7405,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
*s = vec_extract(vsumf0, 0);

#elif defined __loongarch_asx
GGML_UNUSED(kmask1);
GGML_UNUSED(kmask2);
GGML_UNUSED(kmask3);

const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);

@@ -7416,6 +7420,11 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

memcpy(utmp, x[i].scales, 12);
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
const uint32_t uaux = utmp[1] & kmask1;
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
utmp[2] = uaux;
utmp[0] &= kmask1;

const uint8_t * restrict q4 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
@@ -7455,16 +7464,17 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r

__m256 vd = __lasx_xvreplfr2vr_s(d);
acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);

}

acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee));
__m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0);
acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);


ft_union fi;
fi.i = __lsx_vpickve2gr_w(acc_m, 0);
*s = hsum_float_8(acc) + fi.f ;

#else

const uint8_t * scales = (const uint8_t*)&utmp[0];
@@ -8002,6 +8012,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
*s = vec_extract(vsumf0, 0);

#elif defined __loongarch_asx
GGML_UNUSED(kmask1);
GGML_UNUSED(kmask2);
GGML_UNUSED(kmask3);

const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
const __m128i mzero = __lsx_vldi(0);
@@ -8020,6 +8033,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

memcpy(utmp, x[i].scales, 12);
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
const uint32_t uaux = utmp[1] & kmask1;
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
utmp[2] = uaux;
utmp[0] &= kmask1;

const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]));

@@ -8069,10 +8087,12 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
p16_1 = lasx_madd_h(scale_1, p16_1);

sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));

}

__m256 vd = __lasx_xvreplfr2vr_s(d);
acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);

}

*s = hsum_float_8(acc) + summs;
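The ggml-quants.c hunks above do two things on the LoongArch (LASX) path: they wrap the kmask1/kmask2/kmask3 constants in GGML_UNUSED so they no longer trigger unused-variable warnings, and for q4_K and q5_K they add the same utmp scale unpacking used by the other SIMD paths (plus a small hoist of the q3/q8 pointer declarations in the q3_K kernel). The snippet below is a minimal standalone sketch of that unpacking, not ggml code: the kmask values are the ones I believe ggml-quants.c defines for these kernels, the input bytes are made up, and the scales/mins labeling is my reading of the layout.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        /* kmask constants as (I believe) defined in ggml-quants.c for q4_K/q5_K */
        const uint32_t kmask1 = 0x3f3f3f3f, kmask2 = 0x0f0f0f0f, kmask3 = 0x03030303;

        uint8_t scales[12];                              /* 12 packed bytes from a block header */
        for (int i = 0; i < 12; ++i) scales[i] = (uint8_t)(i * 17 + 3);  /* dummy data */

        uint32_t utmp[4];
        memcpy(utmp, scales, 12);
        /* same rearrangement as the lines added in the diff; afterwards
           utmp[0..1] hold the 6-bit scales and utmp[2..3] the 6-bit mins */
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        for (int i = 0; i < 4; ++i) printf("utmp[%d] = %08x\n", i, (unsigned) utmp[i]);
        return 0;
    }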
ggml.c (13 changes: 7 additions & 6 deletions)
@@ -1576,11 +1576,11 @@ do { \

// F16 arithmetic is not supported by AVX, so we use F32 instead

#define GGML_F32Cx8 __m256
#define GGML_F32Cx8 __m256
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))

static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t *x) {
static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
float tmp[8];

for (int i = 0; i < 8; i++) {
@@ -1589,13 +1589,14 @@ static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t *x) {

return (__m256)__lasx_xvld(tmp, 0);
}
static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
float arr[8];

__lasx_xvst(y, arr, 0);

for (int i = 0; i < 8; i++)
for (int i = 0; i < 8; i++) {
x[i] = GGML_FP32_TO_FP16(arr[i]);
}
}
#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
@@ -1671,7 +1672,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
#define GGML_F16_STEP 32
#define GGML_F16_EPR 4

static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
float tmp[4];

tmp[0] = GGML_FP16_TO_FP32(x[0]);
Expand All @@ -1682,7 +1683,7 @@ static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
return __lsx_vld(tmp, 0);
}

static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) {
static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
float arr[4];

__lsx_vst(y, arr, 0);
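The ggml.c changes are cosmetic cleanups around the LoongArch F16 helpers: consistent pointer spacing, braces around the single-statement store loop, and a const-qualified input pointer for __lsx_f16x4_load, which only reads its argument. (As the surrounding comment notes, these helpers go through a small float buffer because F16 arithmetic is not available here.) The sketch below illustrates just the const-correctness point with made-up names (fake_fp16_t, load_first_*); it is not the ggml API.

    #include <stdio.h>

    typedef unsigned short fake_fp16_t;   /* stand-in for ggml_fp16_t, illustrative only */

    /* before: a read-only helper taking a non-const pointer forces callers that
       hold const data to drop qualifiers (a warning such as -Wdiscarded-qualifiers,
       or an explicit cast) */
    static float load_first_before(fake_fp16_t * x)      { return (float) x[0]; }

    /* after: a const parameter matches the read-only use */
    static float load_first_after(const fake_fp16_t * x) { return (float) x[0]; }

    int main(void) {
        fake_fp16_t       writable[4] = { 1, 2, 3, 4 };
        const fake_fp16_t readonly[4] = { 5, 6, 7, 8 };

        printf("%.1f\n", load_first_before(writable));   /* fine: non-const data */
        /* load_first_before(readonly); */               /* would warn: discards 'const' */
        printf("%.1f\n", load_first_after(readonly));    /* compiles cleanly */
        return 0;
    }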
tests/CMakeLists.txt (7 changes: 5 additions & 2 deletions)
@@ -129,8 +129,11 @@ llama_target_and_test(test-rope.cpp)
llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
llama_target_and_test(test-autorelease.cpp LABEL "model")

llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
endif()

# dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE)