Skip to content

Commit c8412d4

Browse files
committed
ggml : remove WASM SIMD bit shuffling + remove vzip for ARM 32-bit
1 parent 79e49c9 commit c8412d4

File tree

1 file changed

+9
-111
lines changed

1 file changed

+9
-111
lines changed

ggml.c

Lines changed: 9 additions & 111 deletions
Original file line number | Diff line number | Diff line change
@@ -682,94 +682,6 @@ float vmaxvq_f32(float32x4_t v) {
682682
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
683683
}
684684

685-
int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
686-
int8x8_t res;
687-
688-
res[0] = a[0]; res[1] = b[0];
689-
res[2] = a[1]; res[3] = b[1];
690-
res[4] = a[2]; res[5] = b[2];
691-
res[6] = a[3]; res[7] = b[3];
692-
693-
return res;
694-
}
695-
696-
int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
697-
int8x8_t res;
698-
699-
res[0] = a[4]; res[1] = b[4];
700-
res[2] = a[5]; res[3] = b[5];
701-
res[4] = a[6]; res[5] = b[6];
702-
res[6] = a[7]; res[7] = b[7];
703-
704-
return res;
705-
}
706-
707-
uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
708-
uint8x8_t res;
709-
710-
res[0] = a[0]; res[1] = b[0];
711-
res[2] = a[1]; res[3] = b[1];
712-
res[4] = a[2]; res[5] = b[2];
713-
res[6] = a[3]; res[7] = b[3];
714-
715-
return res;
716-
}
717-
718-
uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
719-
uint8x8_t res;
720-
721-
res[0] = a[4]; res[1] = b[4];
722-
res[2] = a[5]; res[3] = b[5];
723-
res[4] = a[6]; res[5] = b[6];
724-
res[6] = a[7]; res[7] = b[7];
725-
726-
return res;
727-
}
728-
729-
int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
730-
int8x16_t res;
731-
732-
res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1];
733-
res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3];
734-
res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5];
735-
res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
736-
737-
return res;
738-
}
739-
740-
int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
741-
int8x16_t res;
742-
743-
res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9];
744-
res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11];
745-
res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13];
746-
res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
747-
748-
return res;
749-
}
750-
751-
uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
752-
uint8x16_t res;
753-
754-
res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1];
755-
res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3];
756-
res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5];
757-
res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
758-
759-
return res;
760-
}
761-
762-
uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
763-
uint8x16_t res;
764-
765-
res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9];
766-
res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11];
767-
res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13];
768-
res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
769-
770-
return res;
771-
}
772-
773685
int32x4_t vcvtnq_s32_f32(float32x4_t v) {
774686
int32x4_t res;
775687

@@ -2626,13 +2538,9 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
26262538
const v128_t v0l = wasm_v128_and (v0, m4b);
26272539
const v128_t v0h = wasm_u8x16_shr(v0, 4);
26282540

2629-
// interleave
2630-
const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
2631-
const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
2632-
26332541
// add high bit and sub 16
2634-
const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
2635-
const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
2542+
const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0l, qhl), s16b);
2543+
const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0h, qhh), s16b);
26362544

26372545
// load y
26382546
const v128_t v1l = wasm_v128_load(y0->qs);
@@ -2817,13 +2725,9 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
28172725

28182726
static bool x = true;
28192727

2820-
// interleave
2821-
const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
2822-
const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
2823-
28242728
// add high bit
2825-
const v128_t v0lf = wasm_v128_or(v0lz, qhl);
2826-
const v128_t v0hf = wasm_v128_or(v0hz, qhh);
2729+
const v128_t v0lf = wasm_v128_or(v0l, qhl);
2730+
const v128_t v0hf = wasm_v128_or(v0h, qhh);
28272731

28282732
// load y
28292733
const v128_t v1l = wasm_v128_load(y0->qs);
@@ -2906,11 +2810,11 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
29062810
}
29072811

29082812
static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
2909-
const int nb = n / QK8_0;
2813+
const int qk = QK8_0;
2814+
const int nb = n / qk;
29102815

2911-
assert(n % QK8_0 == 0);
2816+
assert(n % qk == 0);
29122817
assert(nb % 2 == 0);
2913-
assert(QK8_0 == QK8_0);
29142818

29152819
const block_q8_0 * restrict x = vx;
29162820
const block_q8_0 * restrict y = vy;
@@ -2990,16 +2894,10 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
29902894
float sumf = 0.0;
29912895

29922896
for (int i = 0; i < nb; i++) {
2993-
const int8_t * restrict x0 = x[i].qs;
2994-
const int8_t * restrict y0 = y[i].qs;
2995-
29962897
int sumi = 0;
29972898

2998-
for (int j = 0; j < QK8_0; j++) {
2999-
const int v0 = x0[j];
3000-
const int v1 = y0[j];
3001-
3002-
sumi += v0*v1;
2899+
for (int j = 0; j < qk; j++) {
2900+
sumi += x[i].qs[j]*y[i].qs[j];
30032901
}
30042902

30052903
sumf += (x[i].d*y[i].d)*sumi;

0 commit comments

Comments
 (0)