@@ -682,94 +682,6 @@ float vmaxvq_f32(float32x4_t v) {
682
682
MAX (vgetq_lane_f32 (v , 2 ), vgetq_lane_f32 (v , 3 )));
683
683
}
684
684
685
- int8x8_t vzip1_s8 (int8x8_t a , int8x8_t b ) {
686
- int8x8_t res ;
687
-
688
- res [0 ] = a [0 ]; res [1 ] = b [0 ];
689
- res [2 ] = a [1 ]; res [3 ] = b [1 ];
690
- res [4 ] = a [2 ]; res [5 ] = b [2 ];
691
- res [6 ] = a [3 ]; res [7 ] = b [3 ];
692
-
693
- return res ;
694
- }
695
-
696
- int8x8_t vzip2_s8 (int8x8_t a , int8x8_t b ) {
697
- int8x8_t res ;
698
-
699
- res [0 ] = a [4 ]; res [1 ] = b [4 ];
700
- res [2 ] = a [5 ]; res [3 ] = b [5 ];
701
- res [4 ] = a [6 ]; res [5 ] = b [6 ];
702
- res [6 ] = a [7 ]; res [7 ] = b [7 ];
703
-
704
- return res ;
705
- }
706
-
707
- uint8x8_t vzip1_u8 (uint8x8_t a , uint8x8_t b ) {
708
- uint8x8_t res ;
709
-
710
- res [0 ] = a [0 ]; res [1 ] = b [0 ];
711
- res [2 ] = a [1 ]; res [3 ] = b [1 ];
712
- res [4 ] = a [2 ]; res [5 ] = b [2 ];
713
- res [6 ] = a [3 ]; res [7 ] = b [3 ];
714
-
715
- return res ;
716
- }
717
-
718
- uint8x8_t vzip2_u8 (uint8x8_t a , uint8x8_t b ) {
719
- uint8x8_t res ;
720
-
721
- res [0 ] = a [4 ]; res [1 ] = b [4 ];
722
- res [2 ] = a [5 ]; res [3 ] = b [5 ];
723
- res [4 ] = a [6 ]; res [5 ] = b [6 ];
724
- res [6 ] = a [7 ]; res [7 ] = b [7 ];
725
-
726
- return res ;
727
- }
728
-
729
- int8x16_t vzip1q_s8 (int8x16_t a , int8x16_t b ) {
730
- int8x16_t res ;
731
-
732
- res [0 ] = a [0 ]; res [1 ] = b [0 ]; res [2 ] = a [1 ]; res [3 ] = b [1 ];
733
- res [4 ] = a [2 ]; res [5 ] = b [2 ]; res [6 ] = a [3 ]; res [7 ] = b [3 ];
734
- res [8 ] = a [4 ]; res [9 ] = b [4 ]; res [10 ] = a [5 ]; res [11 ] = b [5 ];
735
- res [12 ] = a [6 ]; res [13 ] = b [6 ]; res [14 ] = a [7 ]; res [15 ] = b [7 ];
736
-
737
- return res ;
738
- }
739
-
740
- int8x16_t vzip2q_s8 (int8x16_t a , int8x16_t b ) {
741
- int8x16_t res ;
742
-
743
- res [0 ] = a [8 ]; res [1 ] = b [8 ]; res [2 ] = a [9 ]; res [3 ] = b [9 ];
744
- res [4 ] = a [10 ]; res [5 ] = b [10 ]; res [6 ] = a [11 ]; res [7 ] = b [11 ];
745
- res [8 ] = a [12 ]; res [9 ] = b [12 ]; res [10 ] = a [13 ]; res [11 ] = b [13 ];
746
- res [12 ] = a [14 ]; res [13 ] = b [14 ]; res [14 ] = a [15 ]; res [15 ] = b [15 ];
747
-
748
- return res ;
749
- }
750
-
751
- uint8x16_t vzip1q_u8 (uint8x16_t a , uint8x16_t b ) {
752
- uint8x16_t res ;
753
-
754
- res [0 ] = a [0 ]; res [1 ] = b [0 ]; res [2 ] = a [1 ]; res [3 ] = b [1 ];
755
- res [4 ] = a [2 ]; res [5 ] = b [2 ]; res [6 ] = a [3 ]; res [7 ] = b [3 ];
756
- res [8 ] = a [4 ]; res [9 ] = b [4 ]; res [10 ] = a [5 ]; res [11 ] = b [5 ];
757
- res [12 ] = a [6 ]; res [13 ] = b [6 ]; res [14 ] = a [7 ]; res [15 ] = b [7 ];
758
-
759
- return res ;
760
- }
761
-
762
- uint8x16_t vzip2q_u8 (uint8x16_t a , uint8x16_t b ) {
763
- uint8x16_t res ;
764
-
765
- res [0 ] = a [8 ]; res [1 ] = b [8 ]; res [2 ] = a [9 ]; res [3 ] = b [9 ];
766
- res [4 ] = a [10 ]; res [5 ] = b [10 ]; res [6 ] = a [11 ]; res [7 ] = b [11 ];
767
- res [8 ] = a [12 ]; res [9 ] = b [12 ]; res [10 ] = a [13 ]; res [11 ] = b [13 ];
768
- res [12 ] = a [14 ]; res [13 ] = b [14 ]; res [14 ] = a [15 ]; res [15 ] = b [15 ];
769
-
770
- return res ;
771
- }
772
-
773
685
int32x4_t vcvtnq_s32_f32 (float32x4_t v ) {
774
686
int32x4_t res ;
775
687
@@ -2626,13 +2538,9 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2626
2538
const v128_t v0l = wasm_v128_and (v0 , m4b );
2627
2539
const v128_t v0h = wasm_u8x16_shr (v0 , 4 );
2628
2540
2629
- // interleave
2630
- const v128_t v0lz = wasm_v8x16_shuffle (v0l , v0h , 0 , 16 , 1 , 17 , 2 , 18 , 3 , 19 , 4 , 20 , 5 , 21 , 6 , 22 , 7 , 23 );
2631
- const v128_t v0hz = wasm_v8x16_shuffle (v0l , v0h , 8 , 24 , 9 , 25 , 10 , 26 , 11 , 27 , 12 , 28 , 13 , 29 , 14 , 30 , 15 , 31 );
2632
-
2633
2541
// add high bit and sub 16
2634
- const v128_t v0lf = wasm_i8x16_sub (wasm_v128_or (v0lz , qhl ), s16b );
2635
- const v128_t v0hf = wasm_i8x16_sub (wasm_v128_or (v0hz , qhh ), s16b );
2542
+ const v128_t v0lf = wasm_i8x16_sub (wasm_v128_or (v0l , qhl ), s16b );
2543
+ const v128_t v0hf = wasm_i8x16_sub (wasm_v128_or (v0h , qhh ), s16b );
2636
2544
2637
2545
// load y
2638
2546
const v128_t v1l = wasm_v128_load (y0 -> qs );
@@ -2817,13 +2725,9 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2817
2725
2818
2726
static bool x = true;
2819
2727
2820
- // interleave
2821
- const v128_t v0lz = wasm_v8x16_shuffle (v0l , v0h , 0 , 16 , 1 , 17 , 2 , 18 , 3 , 19 , 4 , 20 , 5 , 21 , 6 , 22 , 7 , 23 );
2822
- const v128_t v0hz = wasm_v8x16_shuffle (v0l , v0h , 8 , 24 , 9 , 25 , 10 , 26 , 11 , 27 , 12 , 28 , 13 , 29 , 14 , 30 , 15 , 31 );
2823
-
2824
2728
// add high bit
2825
- const v128_t v0lf = wasm_v128_or (v0lz , qhl );
2826
- const v128_t v0hf = wasm_v128_or (v0hz , qhh );
2729
+ const v128_t v0lf = wasm_v128_or (v0l , qhl );
2730
+ const v128_t v0hf = wasm_v128_or (v0h , qhh );
2827
2731
2828
2732
// load y
2829
2733
const v128_t v1l = wasm_v128_load (y0 -> qs );
@@ -2906,11 +2810,11 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2906
2810
}
2907
2811
2908
2812
static void ggml_vec_dot_q8_0_q8_0 (const int n , float * restrict s , const void * restrict vx , const void * restrict vy ) {
2909
- const int nb = n / QK8_0 ;
2813
+ const int qk = QK8_0 ;
2814
+ const int nb = n / qk ;
2910
2815
2911
- assert (n % QK8_0 == 0 );
2816
+ assert (n % qk == 0 );
2912
2817
assert (nb % 2 == 0 );
2913
- assert (QK8_0 == QK8_0 );
2914
2818
2915
2819
const block_q8_0 * restrict x = vx ;
2916
2820
const block_q8_0 * restrict y = vy ;
@@ -2990,16 +2894,10 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
2990
2894
float sumf = 0.0 ;
2991
2895
2992
2896
for (int i = 0 ; i < nb ; i ++ ) {
2993
- const int8_t * restrict x0 = x [i ].qs ;
2994
- const int8_t * restrict y0 = y [i ].qs ;
2995
-
2996
2897
int sumi = 0 ;
2997
2898
2998
- for (int j = 0 ; j < QK8_0 ; j ++ ) {
2999
- const int v0 = x0 [j ];
3000
- const int v1 = y0 [j ];
3001
-
3002
- sumi += v0 * v1 ;
2899
+ for (int j = 0 ; j < qk ; j ++ ) {
2900
+ sumi += x [i ].qs [j ]* y [i ].qs [j ];
3003
2901
}
3004
2902
3005
2903
sumf += (x [i ].d * y [i ].d )* sumi ;
0 commit comments