Skip to content

Commit 987629a

Browse files
committed
ggml : implement vaddvq when missing
1 parent 2ae3164 commit 987629a

File tree

1 file changed

+30
-14
lines changed

1 file changed

+30
-14
lines changed

ggml.c

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -492,26 +492,42 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
492492
#endif
493493

494494
#if __ARM_NEON
495+
// check if ARMv8 is not available
495496
#if !defined(__ARM_FEATURE_QRDMX)
496497

497-
inline static int16_t vaddvq_s16(int16x8_t v) {
498-
const int16x4_t v1 = vadd_s16(vget_low_s16(v), vget_high_s16(v));
499-
return vaddv_s16(v1);
498+
inline static uint16_t vaddvq_u8(uint8x16_t v) {
499+
return
500+
(uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
501+
(uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
502+
(uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
503+
(uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
504+
(uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
505+
(uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
506+
(uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
507+
(uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
500508
}
501509

502-
inline static uint16_t vaddvq_u16(uint16x8_t v) {
503-
const uint16x4_t v1 = vadd_u16(vget_low_u16(v), vget_high_u16(v));
504-
return vaddv_u16(v1);
510+
inline static int32_t vaddvq_s16(int16x8_t v) {
511+
return
512+
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
513+
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
514+
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
515+
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
516+
}
517+
inline static uint32_t vaddvq_u16(uint16x8_t v) {
518+
return
519+
(uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
520+
(uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
521+
(uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
522+
(uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
505523
}
506524

507525
inline static int32_t vaddvq_s32(int32x4_t v) {
508-
const int32x2_t v1 = vadd_s32(vget_low_s32(v), vget_high_s32(v));
509-
return vaddv_s32(v1);
526+
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
510527
}
511528

512529
inline static float vaddvq_f32(float32x4_t v) {
513-
const float32x2_t v1 = vadd_f32(vget_low_f32(v), vget_high_f32(v));
514-
return vaddv_f32(v1);
530+
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
515531
}
516532

517533
#endif
@@ -2313,10 +2329,10 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
23132329
const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
23142330
const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
23152331

2316-
const uint16x8_t pl1l = vmull_u8(vget_low_s8 (v0_1l), vget_low_u8 (v1_1l));
2317-
const uint16x8_t pl1h = vmull_u8(vget_high_s8(v0_1l), vget_high_u8(v1_1l));
2318-
const uint16x8_t ph1l = vmull_u8(vget_low_s8 (v0_1h), vget_low_u8 (v1_1h));
2319-
const uint16x8_t ph1h = vmull_u8(vget_high_s8(v0_1h), vget_high_u8(v1_1h));
2332+
const uint16x8_t pl1l = vmull_u8(vget_low_u8 (v0_1l), vget_low_u8 (v1_1l));
2333+
const uint16x8_t pl1h = vmull_u8(vget_high_u8(v0_1l), vget_high_u8(v1_1l));
2334+
const uint16x8_t ph1l = vmull_u8(vget_low_u8 (v0_1h), vget_low_u8 (v1_1h));
2335+
const uint16x8_t ph1h = vmull_u8(vget_high_u8(v0_1h), vget_high_u8(v1_1h));
23202336

23212337
const uint16x8_t pl_0 = vaddq_u16(pl0l, pl0h);
23222338
const uint16x8_t ph_0 = vaddq_u16(ph0l, ph0h);

0 commit comments

Comments
 (0)