@@ -689,94 +689,6 @@ float vmaxvq_f32(float32x4_t v) {
             MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
 }
 
-int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
-    int8x8_t res;
-
-    res[0] = a[0]; res[1] = b[0];
-    res[2] = a[1]; res[3] = b[1];
-    res[4] = a[2]; res[5] = b[2];
-    res[6] = a[3]; res[7] = b[3];
-
-    return res;
-}
-
-int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
-    int8x8_t res;
-
-    res[0] = a[4]; res[1] = b[4];
-    res[2] = a[5]; res[3] = b[5];
-    res[4] = a[6]; res[5] = b[6];
-    res[6] = a[7]; res[7] = b[7];
-
-    return res;
-}
-
-uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[0]; res[1] = b[0];
-    res[2] = a[1]; res[3] = b[1];
-    res[4] = a[2]; res[5] = b[2];
-    res[6] = a[3]; res[7] = b[3];
-
-    return res;
-}
-
-uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[4]; res[1] = b[4];
-    res[2] = a[5]; res[3] = b[5];
-    res[4] = a[6]; res[5] = b[6];
-    res[6] = a[7]; res[7] = b[7];
-
-    return res;
-}
-
-int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
-    int8x16_t res;
-
-    res[0]  = a[0]; res[1]  = b[0]; res[2]  = a[1]; res[3]  = b[1];
-    res[4]  = a[2]; res[5]  = b[2]; res[6]  = a[3]; res[7]  = b[3];
-    res[8]  = a[4]; res[9]  = b[4]; res[10] = a[5]; res[11] = b[5];
-    res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
-
-    return res;
-}
-
-int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
-    int8x16_t res;
-
-    res[0]  = a[8];  res[1]  = b[8];  res[2]  = a[9];  res[3]  = b[9];
-    res[4]  = a[10]; res[5]  = b[10]; res[6]  = a[11]; res[7]  = b[11];
-    res[8]  = a[12]; res[9]  = b[12]; res[10] = a[13]; res[11] = b[13];
-    res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
-
-    return res;
-}
-
-uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
-    uint8x16_t res;
-
-    res[0]  = a[0]; res[1]  = b[0]; res[2]  = a[1]; res[3]  = b[1];
-    res[4]  = a[2]; res[5]  = b[2]; res[6]  = a[3]; res[7]  = b[3];
-    res[8]  = a[4]; res[9]  = b[4]; res[10] = a[5]; res[11] = b[5];
-    res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
-
-    return res;
-}
-
-uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
-    uint8x16_t res;
-
-    res[0]  = a[8];  res[1]  = b[8];  res[2]  = a[9];  res[3]  = b[9];
-    res[4]  = a[10]; res[5]  = b[10]; res[6]  = a[11]; res[7]  = b[11];
-    res[8]  = a[12]; res[9]  = b[12]; res[10] = a[13]; res[11] = b[13];
-    res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
-
-    return res;
-}
-
 int32x4_t vcvtnq_s32_f32(float32x4_t v) {
     int32x4_t res;
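Note: the functions removed above are ARM NEON compatibility shims that emulate the vzip1/vzip2 "zip" (interleave) intrinsics; they appear to become unused once the interleave step is dropped from the dot-product paths later in this commit. A minimal plain-C sketch of the semantics they provide, written against ordinary arrays rather than the vector-extension types used in the file (the names below are illustrative, not taken from the source):

#include <stdint.h>

// zip1: interleave the low halves of a and b -> a0 b0 a1 b1 a2 b2 a3 b3
static void zip1_i8(int8_t res[8], const int8_t a[8], const int8_t b[8]) {
    for (int i = 0; i < 4; ++i) {
        res[2*i + 0] = a[i];
        res[2*i + 1] = b[i];
    }
}

// zip2: interleave the high halves of a and b -> a4 b4 a5 b5 a6 b6 a7 b7
static void zip2_i8(int8_t res[8], const int8_t a[8], const int8_t b[8]) {
    for (int i = 0; i < 4; ++i) {
        res[2*i + 0] = a[4 + i];
        res[2*i + 1] = b[4 + i];
    }
}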
@@ -2753,13 +2665,9 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         const v128_t v0l = wasm_v128_and (v0, m4b);
         const v128_t v0h = wasm_u8x16_shr(v0, 4);
 
-        // interleave
-        const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h,  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23);
-        const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h,  8, 24,  9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
-
         // add high bit and sub 16
-        const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
-        const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
+        const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0l, qhl), s16b);
+        const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0h, qhh), s16b);
 
         // load y
         const v128_t v1l = wasm_v128_load(y0->qs);
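Note: with the interleave gone, each byte of the low/high nibble vectors presumably already lines up with the corresponding y element (the quantization layout no longer stores elements in interleaved order), so the remaining work per byte is just "OR in the fifth bit, subtract 16". A scalar sketch of that single step, assuming, as the surrounding code does, that qhl/qhh already carry the fifth bit positioned at bit 4:

#include <stdint.h>

// Restore one signed q5_0 value from its 4-bit nibble and its fifth bit.
// 'fifth_bit' must already be shifted to bit position 4 (0x00 or 0x10),
// which is what the qhl/qhh vectors in the path above are prepared to hold.
static inline int8_t q5_0_restore(uint8_t nibble, uint8_t fifth_bit) {
    return (int8_t)((nibble | fifth_bit) - 16);  // 5-bit value re-centered to [-16, 15]
}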
@@ -2944,13 +2852,9 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         const v128_t v0l = wasm_v128_and (v0, m4b);
         const v128_t v0h = wasm_u8x16_shr(v0, 4);
 
-        // interleave
-        const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h,  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23);
-        const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h,  8, 24,  9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
-
         // add high bit
-        const v128_t v0lf = wasm_v128_or(v0lz, qhl);
-        const v128_t v0hf = wasm_v128_or(v0hz, qhh);
+        const v128_t v0lf = wasm_v128_or(v0l, qhl);
+        const v128_t v0hf = wasm_v128_or(v0h, qhh);
 
         // load y
         const v128_t v1l = wasm_v128_load(y0->qs);
@@ -3033,11 +2937,11 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 }
 
 static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / QK8_0;
+    const int qk = QK8_0;
+    const int nb = n / qk;
 
-    assert(n % QK8_0 == 0);
+    assert(n % qk == 0);
     assert(nb % 2 == 0);
-    assert(QK8_0 == QK8_0);
 
     const block_q8_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
@@ -3117,16 +3021,10 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     float sumf = 0.0;
 
     for (int i = 0; i < nb; i++) {
-        const int8_t * restrict x0 = x[i].qs;
-        const int8_t * restrict y0 = y[i].qs;
-
         int sumi = 0;
 
-        for (int j = 0; j < QK8_0; j++) {
-            const int v0 = x0[j];
-            const int v1 = y0[j];
-
-            sumi += v0*v1;
+        for (int j = 0; j < qk; j++) {
+            sumi += x[i].qs[j]*y[i].qs[j];
         }
 
         sumf += (x[i].d*y[i].d)*sumi;
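Note: the simplified scalar path computes, per block pair, an integer dot product of the 32 quantized values scaled by the product of the two block scales. A self-contained sketch of that per-block computation (the struct mirrors how block_q8_0 is used above; the scale is shown as a plain float here, while the real struct may store it differently):

#include <stdint.h>

#define QK8_0 32

typedef struct {
    float  d;          // block scale (plain float in this sketch)
    int8_t qs[QK8_0];  // quantized values
} q8_0_block_sketch;

// Contribution of one block pair to the dot product: d_x * d_y * sum_j x_j * y_j
static float q8_0_block_dot(const q8_0_block_sketch * x, const q8_0_block_sketch * y) {
    int sumi = 0;
    for (int j = 0; j < QK8_0; ++j) {
        sumi += x->qs[j] * y->qs[j];
    }
    return x->d * y->d * (float) sumi;
}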