@@ -11745,6 +11745,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
 
     const int nb = n / QK4_NL;
 
+    int ib = 0;
+    float sumf = 0;
+
 #if defined __ARM_NEON
     const int8x16_t values = vld1q_s8(kvalues_iq4nl);
     const uint8x16_t m4b = vdupq_n_u8(0x0f);
@@ -11753,16 +11756,14 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
     int8x16x4_t q8b;
     int32x4_t prod_1, prod_2;
 
-    float sumf = 0;
-
-    for (int ib = 0; ib < nb; ib += 2) {
+    for (; ib + 1 < nb; ib += 2) {
 
-        q4bits.val[0] = vld1q_u8(x[ib+ 0].qs);
-        q4bits.val[1] = vld1q_u8(x[ib+ 1].qs);
-        q8b.val[0] = vld1q_s8(y[ib+ 0].qs);
-        q8b.val[1] = vld1q_s8(y[ib+ 0].qs + 16);
-        q8b.val[2] = vld1q_s8(y[ib+ 1].qs);
-        q8b.val[3] = vld1q_s8(y[ib+ 1].qs + 16);
+        q4bits.val[0] = vld1q_u8(x[ib + 0].qs);
+        q4bits.val[1] = vld1q_u8(x[ib + 1].qs);
+        q8b.val[0] = vld1q_s8(y[ib + 0].qs);
+        q8b.val[1] = vld1q_s8(y[ib + 0].qs + 16);
+        q8b.val[2] = vld1q_s8(y[ib + 1].qs);
+        q8b.val[3] = vld1q_s8(y[ib + 1].qs + 16);
 
         q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
         q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
@@ -11773,12 +11774,10 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
         prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
 
         sumf +=
-            GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+ 0].d) * vaddvq_s32(prod_1) +
-            GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+ 1].d) * vaddvq_s32(prod_2);
+            GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
+            GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
     }
 
-    *s = sumf;
-
 #elif defined __AVX2__
 
     const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
@@ -11787,11 +11786,11 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
 
     __m256 accum1 = _mm256_setzero_ps();
     __m256 accum2 = _mm256_setzero_ps();
-    for (int ib = 0; ib < nb; ib += 2) {
-        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[0].qs);
-        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
-        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
-        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
+        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
+        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
+        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);
         const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
                                               _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
         const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
@@ -11800,16 +11799,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
         const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
         const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
         const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
-        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
+        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
                                  _mm256_cvtepi32_ps(p_1), accum1);
-        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
+        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
                                  _mm256_cvtepi32_ps(p_2), accum2);
-
-        y += 2;
-        x += 2;
     }
 
-    *s = hsum_float_8(_mm256_add_ps(accum1, accum2));
+    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
 
 #elif defined __AVX__
     const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
@@ -11818,13 +11814,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
 
     __m256 accum1 = _mm256_setzero_ps();
     __m256 accum2 = _mm256_setzero_ps();
-    for (int ib = 0; ib < nb; ib += 2) {
-        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
-        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[1].qs);
-        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[0].qs);
-        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[0].qs + 1);
-        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[1].qs);
-        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[1].qs + 1);
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
+        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
+        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
+        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
 
         const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
         const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
@@ -11838,16 +11834,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
         const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
         const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
         const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
-        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
+        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
                                              _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
-        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
+        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
                                              _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
-
-        y += 2;
-        x += 2;
     }
 
-    *s = hsum_float_8(_mm256_add_ps(accum1, accum2));
+    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
 
 #elif defined(__POWER9_VECTOR__)
     const vector signed char lowMask = vec_splats((signed char)0xF);
@@ -11860,7 +11853,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
     const vector signed char values = vec_xl( 0, kvalues_iq4nl);
 
 #pragma GCC unroll 4
-    for (int ib = 0 ; ib < nb; ++ib) {
+    for (; ib < nb; ++ib) {
         __builtin_prefetch(x[ib].qs, 0, 1);
         __builtin_prefetch(y[ib].qs, 0, 1);
 
@@ -11897,7 +11890,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
     vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
     vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
 
-    *s = vec_extract(vsumf0, 0);
+    sumf = vec_extract(vsumf0, 0);
 
 #elif defined (__loongarch_asx)
 
@@ -11907,11 +11900,11 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
 
     __m256 accum1 = (__m256)__lasx_xvldi(0);
     __m256 accum2 = (__m256)__lasx_xvldi(0);
-    for (int ib = 0; ib < nb; ib += 2) {
-        const __m128i q4bits_1 = __lsx_vld((const __m128i*)x[0].qs, 0);
-        const __m128i q4bits_2 = __lsx_vld((const __m128i*)x[1].qs, 0);
-        const __m256i q8b_1 = __lasx_xvld((const __m256i *)y[0].qs, 0);
-        const __m256i q8b_2 = __lasx_xvld((const __m256i *)y[1].qs, 0);
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i q4bits_1 = __lsx_vld((const __m128i*)x[ib + 0].qs, 0);
+        const __m128i q4bits_2 = __lsx_vld((const __m128i*)x[ib + 1].qs, 0);
+        const __m256i q8b_1 = __lasx_xvld((const __m256i *)y[ib + 0].qs, 0);
+        const __m256i q8b_2 = __lasx_xvld((const __m256i *)y[ib + 1].qs, 0);
         const __m256i q4b_1 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b)),
                                               lsx_shuffle_b(values128, __lsx_vand_v(q4bits_1, m4b)));
         const __m256i q4b_2 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b)),
@@ -11920,20 +11913,16 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
         const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
         const __m256i p_1 = lasx_madd_h(p16_1, mone);
         const __m256i p_2 = lasx_madd_h(p16_2, mone);
-        accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
+        accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
                                   __lasx_xvffint_s_w(p_1), accum1);
-        accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
+        accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
                                   __lasx_xvffint_s_w(p_2), accum2);
-
-        y += 2;
-        x += 2;
     }
 
-    *s = hsum_float_8(__lasx_xvfadd_s(accum1, accum2));
+    sumf = hsum_float_8(__lasx_xvfadd_s(accum1, accum2));
 
-#else
-    float sumf = 0;
-    for (int ib = 0; ib < nb; ++ib) {
+#endif
+    for (; ib < nb; ++ib) {
         const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
         int sumi1 = 0, sumi2 = 0;
         for (int j = 0; j < QK4_NL/2; ++j) {
@@ -11943,7 +11932,6 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
         sumf += d * (sumi1 + sumi2);
     }
     *s = sumf;
-#endif
 }
 
 void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
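
The diff above applies a common "SIMD pairs plus scalar tail" restructuring: the block index ib and the accumulator sumf are hoisted out of the architecture-specific #if branches, each vectorized loop now only consumes pairs of blocks while ib + 1 < nb, and the scalar loop that used to live under #else runs unconditionally after #endif to finish any remaining block, so nb no longer has to be even and every path funnels into the single *s = sumf store. The sketch below illustrates just that control flow in plain C with ordinary float arrays; the dot helper and its arguments are hypothetical placeholders, not ggml API.

#include <stdio.h>

// Minimal sketch of the loop structure introduced by the commit, with plain
// floats standing in for the iq4_nl/q8_0 blocks (hypothetical example).
static float dot(const float * x, const float * y, int nb) {
    int ib = 0;      // shared block index, declared before any #if branch
    float sumf = 0;  // shared accumulator, written by whichever path runs

    // Main loop: handles two "blocks" per iteration, like the NEON/AVX2/AVX/
    // LoongArch branches in the diff, and stops before a lone trailing block.
    for (; ib + 1 < nb; ib += 2) {
        sumf += x[ib + 0] * y[ib + 0];
        sumf += x[ib + 1] * y[ib + 1];
    }

    // Scalar tail after #endif: picks up the remaining block, if any,
    // so nb no longer needs to be a multiple of two.
    for (; ib < nb; ++ib) {
        sumf += x[ib] * y[ib];
    }
    return sumf;
}

int main(void) {
    const float x[5] = {1, 2, 3, 4, 5};
    const float y[5] = {2, 2, 2, 2, 2};
    printf("%g\n", dot(x, y, 5));  // prints 30; the fifth element is handled by the tail loop
    return 0;
}

This also shows why the old per-branch loops had to drop the `y += 2; x += 2;` pointer bumps: indexing with the shared ib keeps the main loop and the tail loop looking at the same arrays.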