Skip to content

Commit 4991499

Browse files
committed
ggml : remove WASM SIMD bit shuffling + remove vzip for ARM 32-bit
1 parent ba953d6 commit 4991499

File tree

1 file changed: 9 additions (+9) and 111 deletions (-111).

ggml.c

Lines changed: 9 additions & 111 deletions
Original file line number | Diff line number | Diff line change
@@ -689,94 +689,6 @@ float vmaxvq_f32(float32x4_t v) {
689689
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
690690
}
691691

692-
int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
693-
int8x8_t res;
694-
695-
res[0] = a[0]; res[1] = b[0];
696-
res[2] = a[1]; res[3] = b[1];
697-
res[4] = a[2]; res[5] = b[2];
698-
res[6] = a[3]; res[7] = b[3];
699-
700-
return res;
701-
}
702-
703-
int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
704-
int8x8_t res;
705-
706-
res[0] = a[4]; res[1] = b[4];
707-
res[2] = a[5]; res[3] = b[5];
708-
res[4] = a[6]; res[5] = b[6];
709-
res[6] = a[7]; res[7] = b[7];
710-
711-
return res;
712-
}
713-
714-
uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
715-
uint8x8_t res;
716-
717-
res[0] = a[0]; res[1] = b[0];
718-
res[2] = a[1]; res[3] = b[1];
719-
res[4] = a[2]; res[5] = b[2];
720-
res[6] = a[3]; res[7] = b[3];
721-
722-
return res;
723-
}
724-
725-
uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
726-
uint8x8_t res;
727-
728-
res[0] = a[4]; res[1] = b[4];
729-
res[2] = a[5]; res[3] = b[5];
730-
res[4] = a[6]; res[5] = b[6];
731-
res[6] = a[7]; res[7] = b[7];
732-
733-
return res;
734-
}
735-
736-
int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
737-
int8x16_t res;
738-
739-
res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1];
740-
res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3];
741-
res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5];
742-
res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
743-
744-
return res;
745-
}
746-
747-
int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
748-
int8x16_t res;
749-
750-
res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9];
751-
res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11];
752-
res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13];
753-
res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
754-
755-
return res;
756-
}
757-
758-
uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
759-
uint8x16_t res;
760-
761-
res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1];
762-
res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3];
763-
res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5];
764-
res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
765-
766-
return res;
767-
}
768-
769-
uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
770-
uint8x16_t res;
771-
772-
res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9];
773-
res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11];
774-
res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13];
775-
res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
776-
777-
return res;
778-
}
779-
780692
int32x4_t vcvtnq_s32_f32(float32x4_t v) {
781693
int32x4_t res;
782694

@@ -2753,13 +2665,9 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
27532665
const v128_t v0l = wasm_v128_and (v0, m4b);
27542666
const v128_t v0h = wasm_u8x16_shr(v0, 4);
27552667

2756-
// interleave
2757-
const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
2758-
const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
2759-
27602668
// add high bit and sub 16
2761-
const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
2762-
const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
2669+
const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0l, qhl), s16b);
2670+
const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0h, qhh), s16b);
27632671

27642672
// load y
27652673
const v128_t v1l = wasm_v128_load(y0->qs);
@@ -2944,13 +2852,9 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
29442852

29452853
static bool x = true;
29462854

2947-
// interleave
2948-
const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
2949-
const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
2950-
29512855
// add high bit
2952-
const v128_t v0lf = wasm_v128_or(v0lz, qhl);
2953-
const v128_t v0hf = wasm_v128_or(v0hz, qhh);
2856+
const v128_t v0lf = wasm_v128_or(v0l, qhl);
2857+
const v128_t v0hf = wasm_v128_or(v0h, qhh);
29542858

29552859
// load y
29562860
const v128_t v1l = wasm_v128_load(y0->qs);
@@ -3033,11 +2937,11 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
30332937
}
30342938

30352939
static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
3036-
const int nb = n / QK8_0;
2940+
const int qk = QK8_0;
2941+
const int nb = n / qk;
30372942

3038-
assert(n % QK8_0 == 0);
2943+
assert(n % qk == 0);
30392944
assert(nb % 2 == 0);
3040-
assert(QK8_0 == QK8_0);
30412945

30422946
const block_q8_0 * restrict x = vx;
30432947
const block_q8_0 * restrict y = vy;
@@ -3117,16 +3021,10 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
31173021
float sumf = 0.0;
31183022

31193023
for (int i = 0; i < nb; i++) {
3120-
const int8_t * restrict x0 = x[i].qs;
3121-
const int8_t * restrict y0 = y[i].qs;
3122-
31233024
int sumi = 0;
31243025

3125-
for (int j = 0; j < QK8_0; j++) {
3126-
const int v0 = x0[j];
3127-
const int v1 = y0[j];
3128-
3129-
sumi += v0*v1;
3026+
for (int j = 0; j < qk; j++) {
3027+
sumi += x[i].qs[j]*y[i].qs[j];
31303028
}
31313029

31323030
sumf += (x[i].d*y[i].d)*sumi;

0 commit comments

Comments
 (0)