@@ -59,6 +59,15 @@ struct ggml_compute_params {
 #endif
 #endif
 
+#if defined(__s390x__) && defined(__VEC__)
+#ifndef __VXE__
+#define __VXE__
+#endif
+#ifndef __VXE2__
+#define __VXE2__
+#endif
+#endif
+
 #if defined(__ARM_FEATURE_SVE)
 #include <arm_sve.h>
 #include <sys/prctl.h>
@@ -359,6 +368,148 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
 #endif
 #endif
 
+#if defined(__VXE__) || defined(__VXE2__)
+#include <vecintrin.h>
+
+#define vec_neg(a)    (-(a))                // Vector Negate
+#define vec_add(a, b) ((a) + (b))           // Vector Add
+#define vec_sub(a, b) ((a) - (b))           // Vector Subtract
+#define vec_mul(a, b) ((a) * (b))           // Vector Multiply
+#define vec_div(a, b) ((a) / (b))           // Vector Divide
+#define vec_sl(a, b)  ((a) << (b))          // Vector Shift Left
+#define vec_sra(a, b) ((a) >> (b))          // Vector Shift Right Algebraic
+#define vec_sr(a, b)  ((a) >> (b))          // Vector Shift Right
+#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
+#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet
+
+#ifndef vec_and
+#define vec_and(a, b) ((a) & (b))           // Vector AND
+#endif
+
+#ifndef vec_or
+#define vec_or(a, b)  ((a) | (b))           // Vector OR
+#endif
+
+#ifndef vec_xor
+#define vec_xor(a, b) ((a) ^ (b))           // Vector XOR
+#endif
+
+typedef signed char char8x16_t __attribute__((vector_size(16)));
+typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));
+
+typedef int8_t int8x16_t __attribute__((vector_size(16)));
+typedef int16_t int16x8_t __attribute__((vector_size(16)));
+typedef int32_t int32x4_t __attribute__((vector_size(16)));
+
+typedef uint8_t uint8x16_t __attribute__((vector_size(16)));
+typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
+typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
+
+typedef float float32x4_t __attribute__((vector_size(16)));
+typedef double double64x2_t __attribute__((vector_size(16)));
+
+typedef signed long long long64x2_t __attribute__((vector_size(16)));
+typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));
+
+typedef struct ggml_uint8x16x2_t {
+    uint8x16_t val[2];
+} ggml_uint8x16x2_t;
+
+inline static ggml_uint8x16x2_t ggml_vec_xl_u8x2(const uint8_t * ptr) {
+    ggml_uint8x16x2_t res;
+
+    res.val[0] = vec_xl( 0, ptr);
+    res.val[1] = vec_xl(16, ptr);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x4_t {
+    uint8x16_t val[4];
+} ggml_uint8x16x4_t;
+
+inline static ggml_uint8x16x4_t ggml_vec_xl_u8x4(const uint8_t * ptr) {
+    ggml_uint8x16x4_t res;
+
+    res.val[0] = vec_xl( 0, ptr);
+    res.val[1] = vec_xl(16, ptr);
+    res.val[2] = vec_xl(32, ptr);
+    res.val[3] = vec_xl(48, ptr);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x4_t {
+    int8x16_t val[4];
+} ggml_int8x16x4_t;
+
+inline static ggml_int8x16x4_t ggml_vec_xl_s8x4(const int8_t * ptr) {
+    ggml_int8x16x4_t res;
+
+    res.val[0] = vec_xl( 0, ptr);
+    res.val[1] = vec_xl(16, ptr);
+    res.val[2] = vec_xl(32, ptr);
+    res.val[3] = vec_xl(48, ptr);
+
+    return res;
+}
+
+typedef struct ggml_int16x8x2_t {
+    int16x8_t val[2];
+} ggml_int16x8x2_t;
+
+inline static ggml_int16x8x2_t ggml_vec_xl_s16x2(const int16_t * ptr) {
+    ggml_int16x8x2_t res;
+
+    res.val[0] = vec_xl( 0, ptr);
+    res.val[1] = vec_xl(16, ptr);
+
+    return res;
+}
+
+/*
+    ! WARNING: Very slow. Use vec_perm if possible. Refer to iq4_xs
+    ! or iq4_nl for example implementation.
+*/
+inline static int8x16_t ggml_vec_tbl(int8x16_t a, uint8x16_t b) {
+    int8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
+inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
+    const uchar8x16_t v_maske = { 0, 1, 4, 5, 8, 9, 12, 13,
+                                  16, 17, 20, 21, 24, 25, 28, 29 };
+
+    const int16x8_t v_abo = vec_pack((int32x4_t)a, (int32x4_t)b);
+    const int16x8_t v_abe = vec_perm(a, b, v_maske);
+    return v_abo + v_abe;
+}
+
+inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
+    const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
+    return acc + (vec_unpackh(p) + vec_unpackl(p));
+}
+
+#endif
+
 #if defined(__loongarch_asx)
 /* float type data load instructions */
 static __m128 __lsx_vreplfr2vr_s(const float val) {