Skip to content

Commit d5c0582

Browse files
ggml : fix loongarch build (O2 issue) (#7636)
1 parent 972b555 commit d5c0582

File tree

2 files changed: 15 additions and 7 deletions

ggml-quants.c

Lines changed: 14 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -6828,6 +6828,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
68286828

68296829
int bit = 0;
68306830
int is = 0;
6831+
__m256i xvbit;
68316832

68326833
const uint8_t * restrict q3 = x[i].qs;
68336834
const int8_t * restrict q8 = y[i].qs;
@@ -6836,21 +6837,25 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
68366837
// load low 2 bits
68376838
const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32;
68386839

6840+
xvbit = __lasx_xvreplgr2vr_h(bit);
68396841
// prepare low and high bits
68406842
const __m256i q3l_0 = __lasx_xvand_v(q3bits, m3);
6841-
const __m256i q3h_0 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2);
6843+
const __m256i q3h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
68426844
++bit;
68436845

6846+
xvbit = __lasx_xvreplgr2vr_h(bit);
68446847
const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 2), m3);
6845-
const __m256i q3h_1 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2);
6848+
const __m256i q3h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
68466849
++bit;
68476850

6851+
xvbit = __lasx_xvreplgr2vr_h(bit);
68486852
const __m256i q3l_2 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 4), m3);
6849-
const __m256i q3h_2 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2);
6853+
const __m256i q3h_2 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
68506854
++bit;
68516855

6856+
xvbit = __lasx_xvreplgr2vr_h(bit);
68526857
const __m256i q3l_3 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 6), m3);
6853-
const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2);
6858+
const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
68546859
++bit;
68556860

68566861
// load Q8 quants
@@ -8033,6 +8038,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
80338038
__m256i sumi = __lasx_xvldi(0);
80348039

80358040
int bit = 0;
8041+
__m256i xvbit;
80368042

80378043
for (int j = 0; j < QK_K/64; ++j) {
80388044

@@ -8041,13 +8047,15 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
80418047

80428048
const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32;
80438049

8050+
xvbit = __lasx_xvreplgr2vr_h(bit++);
80448051
const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4);
8045-
const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvand_v(hbits, hmask), bit++), 4);
8052+
const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
80468053
const __m256i q5_0 = __lasx_xvadd_b(q5l_0, q5h_0);
80478054
hmask = __lasx_xvslli_h(hmask, 1);
80488055

8056+
xvbit = __lasx_xvreplgr2vr_h(bit++);
80498057
const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4);
8050-
const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvand_v(hbits, hmask), bit++), 4);
8058+
const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
80518059
const __m256i q5_1 = __lasx_xvadd_b(q5l_1, q5h_1);
80528060
hmask = __lasx_xvslli_h(hmask, 1);
80538061

ggml.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1580,7 +1580,7 @@ do { \
15801580
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
15811581
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
15821582

1583-
static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
1583+
static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t *x) {
15841584
float tmp[8];
15851585

15861586
for (int i = 0; i < 8; i++) {

0 commit comments

Comments (0)