Skip to content

ggml-cpu: Support s390x SIMD Instruction Set #12019

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 57 commits into from
Feb 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
17d6f54
ggml: add s390x ARCH_FLAGS for compilation
taronaeo Jan 1, 2025
891922f
ggml: add SIMD for s390x using vector intrinsics
taronaeo Jan 1, 2025
32c1e11
ggml: fix missing escape character in GGML_F32x4_REDUCE
taronaeo Jan 1, 2025
518faff
ggml: add temporary patch for GGML_F32_ARR and GGML_F16_ARR
taronaeo Jan 1, 2025
b377968
ggml: fix s390x GGML_F32x4_REDUCE
taronaeo Jan 1, 2025
2dd768e
ggml: full SIMD activation for F32,F16 s390x
taronaeo Jan 1, 2025
0fdbc72
ggml: add option to disable s390x VXE/VXE2
taronaeo Jan 1, 2025
a44fba2
ggml: change vecintrin.h include to ggml-cpu-impl
taronaeo Jan 1, 2025
77696c9
cmake: add s390x target detection for VX/VXE/VXE2
taronaeo Jan 2, 2025
47ca047
ggml: move s390x vector intrinsics to ggml-cpu-impl.h
taronaeo Jan 4, 2025
2d06192
ggml: s390x Q8_0 SIMD
taronaeo Jan 4, 2025
33ea1d0
ggml: correct documentation for Q8_0
taronaeo Jan 4, 2025
82e045d
ggml: s390x reduce code complexity Q8_0
taronaeo Jan 4, 2025
261689d
ggml: s390x bugfix typo Q8_0
taronaeo Jan 4, 2025
4212c46
ggml: s390x SIMD activated for Q4_1
taronaeo Jan 5, 2025
44402b7
ggml: s390x inline vec_reve
taronaeo Jan 5, 2025
68760a8
ggml: s390x SIMD activation for Q4_0
taronaeo Jan 5, 2025
ecdf6f0
ggml: add VXE backend feature
taronaeo Jan 7, 2025
fd993b2
ggml: remove test.py
taronaeo Jan 7, 2025
0f1e7a0
ggml: s390x SIMD activation for quantize_row_q8_0
taronaeo Jan 7, 2025
cd707a7
ggml: s390x SIMD activation for quantize_row_q8_1
taronaeo Jan 8, 2025
e1f939f
ggml: s390x SIMD activation for iq4_xs
taronaeo Jan 14, 2025
37a0a62
ggml: bugfix iq4_xs
taronaeo Jan 14, 2025
8df0269
ggml: s390x SIMD activation for iq4_nl
taronaeo Jan 14, 2025
ee750c9
ggml: add float, double, and long vector data type
taronaeo Jan 14, 2025
2073291
ggml: clean up iq4_xs SIMD
taronaeo Jan 14, 2025
0c6e6d6
ggml: fix improper use of restrict keyword
taronaeo Jan 14, 2025
109be7f
ggml: update warning message for ggml_vec_tbl
taronaeo Jan 14, 2025
ed6487c
ggml: untested implementation of ggml_vec_dot_iq2_xxs_q8_K
taronaeo Jan 17, 2025
eb3fa5d
ggml: update ggml_vec_dot_q4_1_q8_1 to use typedefs
taronaeo Jan 20, 2025
33f98bd
ggml: switch to restrict for iq4_nl
taronaeo Jan 20, 2025
948441c
ggml: slight dot product speed improvement for q4_1_q8_1
taronaeo Jan 20, 2025
9a39147
ggml: s390x SIMD activation for q6_K
taronaeo Jan 20, 2025
87087de
ggml: add missing `_t` to ggml_int8x16x4_t
taronaeo Jan 20, 2025
077a597
ggml: fix missing `_t` for ggml_vec_xl_s8x4
taronaeo Jan 20, 2025
9210d70
ggml: fix more missing `_t`
taronaeo Jan 20, 2025
59d2638
ggml: add unroll and prefetch to Q8_0
taronaeo Feb 5, 2025
5c5e0aa
ggml: patch Q8_0 to use proper vector sizes
taronaeo Feb 5, 2025
69d8695
ggml: optimise Q8_0 dot prod compute kernel further
taronaeo Feb 5, 2025
b11ffbd
ggml: add unroll and prefetch to Q4_1
taronaeo Feb 6, 2025
dac5d9e
ggml: refactor Q6_K variable naming for readability
taronaeo Feb 7, 2025
8fe0803
ggml: fix Q6_K typos
taronaeo Feb 7, 2025
333e1a2
ggml: s390x SIMD activation for Q5_K
taronaeo Feb 9, 2025
c2794e8
ggml: fix wrong char*x16_t naming
taronaeo Feb 9, 2025
2606ddc
ggml: Q5_K y0 wrong signness
taronaeo Feb 9, 2025
809dac1
ggml: fix Q5_K invalid uchar type
taronaeo Feb 9, 2025
c8f9538
ggml: fix Q5_K invalid uchar type
taronaeo Feb 9, 2025
3dd7144
ggml: s390x SIMD activation for Q4_K
taronaeo Feb 10, 2025
9b01b64
ggml: fix Q4_K invalid vector intrinsics
taronaeo Feb 10, 2025
84ee8b0
ggml: simplify ggml_padd_s16 compute kernel
taronaeo Feb 15, 2025
8ced2ab
ggml: correct ggml-cpu vxe wording
taronaeo Feb 22, 2025
5796caf
ggml: change ggml_aligned_malloc alignment to 256
taronaeo Feb 22, 2025
b4b2214
ggml: resolve pr merge via cherry-pick 225bbbf
MQ-mengqing Feb 7, 2025
cfc2603
ggml : fix LoongArch compile error with 128-bit SIMD (#11701)
junchao-loongson Feb 6, 2025
f263ec3
ggml: resolve pr merge via cherry-pick 4571953
MQ-mengqing Feb 14, 2025
751528d
Merge branch 'master' into master
taronaeo Feb 22, 2025
3a42a05
ggml: cmake remove fork when determining s390x machine type
taronaeo Feb 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ endif()
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_VXE "ggml: enable vxe" ON)

option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
Expand Down
1 change: 1 addition & 0 deletions ggml/include/ggml-cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ extern "C" {
// other
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);

Expand Down
21 changes: 21 additions & 0 deletions ggml/src/ggml-cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,27 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (GGML_RVV)
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
    message(STATUS "s390x detected")
    # NOTE(review): /proc/cpuinfo describes the build host, not the target —
    # this branch misdetects when cross-compiling; consider guarding with
    # CMAKE_CROSSCOMPILING.
    file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
    # Extract only the machine number. (REGEX REPLACE would leave the entire
    # cpuinfo text in S390X_M and only matched by accident via substring search.)
    string(REGEX MATCH "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" _S390X_MACHINE "${CPUINFO_CONTENTS}")
    set(S390X_M "${CMAKE_MATCH_1}")

    # TODO: Separation to determine activation of VX/VXE/VXE2
    if (${S390X_M} MATCHES "8561|8562")
        message(STATUS "z15 target")
        list(APPEND ARCH_FLAGS -march=z15 -mtune=z15)
    elseif (${S390X_M} MATCHES "3931")
        message(STATUS "z16 target")
        list(APPEND ARCH_FLAGS -march=z16 -mtune=z16)
    else()
        message(STATUS "Unknown target")
        message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
        list(APPEND ARCH_FLAGS -march=native -mtune=native)
    endif()

    # Vector-extension flags are optional so z14-and-earlier users can opt out.
    if (GGML_VXE)
        list(APPEND ARCH_FLAGS -mvx -mzvector)
    endif()
else()
message(STATUS "Unknown architecture")
endif()
Expand Down
151 changes: 151 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,15 @@ struct ggml_compute_params {
#endif
#endif

// On s390x the compiler defines __VEC__ when the vector facility is enabled
// (-mzvector).  Force the internal __VXE__/__VXE2__ feature macros on so the
// SIMD code paths below are compiled in.
// NOTE(review): this enables BOTH VXE and VXE2 whenever __VEC__ is defined —
// on machines that only have VXE (not VXE2) this over-enables; confirm the
// intended minimum target level (see the TODO in the CMake detection).
#if defined(__s390x__) && defined(__VEC__)
#ifndef __VXE__
#define __VXE__
#endif
#ifndef __VXE2__
#define __VXE2__
#endif
#endif

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <sys/prctl.h>
Expand Down Expand Up @@ -359,6 +368,148 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
#endif
#endif

#if defined(__VXE__) || defined(__VXE2__)
#include <vecintrin.h>

// Operator-based replacements for common vector intrinsics.  GCC/clang vector
// extensions apply these element-wise; for the shifts, whether the shift is
// arithmetic or logical follows the signedness of (a)'s element type.
#define vec_neg(a)    (-(a))                // Vector Negate
#define vec_add(a, b) ((a) + (b))           // Vector Add
#define vec_sub(a, b) ((a) - (b))           // Vector Subtract
#define vec_mul(a, b) ((a) * (b))           // Vector Multiply
#define vec_div(a, b) ((a) / (b))           // Vector Divide
#define vec_sl(a, b)  ((a) << (b))          // Vector Shift Left
#define vec_sra(a, b) ((a) >> (b))          // Vector Shift Right Algebraic (use with signed element types)
#define vec_sr(a, b)  ((a) >> (b))          // Vector Shift Right (logical only for unsigned element types)
#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet

// <vecintrin.h> may already provide these as macros; only fill the gaps.
#ifndef vec_and
#define vec_and(a, b) ((a) & (b)) // Vector AND
#endif

#ifndef vec_or
#define vec_or(a, b)  ((a) | (b)) // Vector OR
#endif

#ifndef vec_xor
#define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
#endif

// 128-bit vector typedefs, named after the NEON convention (lane_type x count).
typedef signed char char8x16_t __attribute__((vector_size(16)));
typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));

typedef int8_t  int8x16_t __attribute__((vector_size(16)));
typedef int16_t int16x8_t __attribute__((vector_size(16)));
typedef int32_t int32x4_t __attribute__((vector_size(16)));

typedef uint8_t  uint8x16_t __attribute__((vector_size(16)));
typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
typedef uint32_t uint32x4_t __attribute__((vector_size(16)));

typedef float  float32x4_t  __attribute__((vector_size(16)));
typedef double double64x2_t __attribute__((vector_size(16)));

typedef signed long long   long64x2_t  __attribute__((vector_size(16)));
typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));

// Pair of 16-byte unsigned vectors, mirroring NEON's uint8x16x2_t.
typedef struct ggml_uint8x16x2_t {
    uint8x16_t val[2];
} ggml_uint8x16x2_t;

// Load two consecutive 16-byte unsigned vectors starting at ptr.
inline static ggml_uint8x16x2_t ggml_vec_xl_u8x2(const uint8_t * ptr) {
    ggml_uint8x16x2_t v;

    for (int i = 0; i < 2; ++i) {
        v.val[i] = vec_xl(16 * i, ptr);
    }

    return v;
}

// Quad of 16-byte unsigned vectors, mirroring NEON's uint8x16x4_t.
typedef struct ggml_uint8x16x4_t {
    uint8x16_t val[4];
} ggml_uint8x16x4_t;

// Load four consecutive 16-byte unsigned vectors (64 bytes) starting at ptr.
inline static ggml_uint8x16x4_t ggml_vec_xl_u8x4(const uint8_t * ptr) {
    ggml_uint8x16x4_t v;

    for (int i = 0; i < 4; ++i) {
        v.val[i] = vec_xl(16 * i, ptr);
    }

    return v;
}

// Quad of 16-byte signed vectors, mirroring NEON's int8x16x4_t.
typedef struct ggml_int8x16x4_t {
    int8x16_t val[4];
} ggml_int8x16x4_t;

// Load four consecutive 16-byte signed vectors (64 bytes) starting at ptr.
inline static ggml_int8x16x4_t ggml_vec_xl_s8x4(const int8_t * ptr) {
    ggml_int8x16x4_t v;

    for (int i = 0; i < 4; ++i) {
        v.val[i] = vec_xl(16 * i, ptr);
    }

    return v;
}

// Pair of 8-lane signed 16-bit vectors, mirroring NEON's int16x8x2_t.
typedef struct ggml_int16x8x2_t {
    int16x8_t val[2];
} ggml_int16x8x2_t;

// Load two consecutive 16-byte vectors of int16 lanes starting at ptr
// (offsets are in bytes).
inline static ggml_int16x8x2_t ggml_vec_xl_s16x2(const int16_t * ptr) {
    ggml_int16x8x2_t v;

    for (int i = 0; i < 2; ++i) {
        v.val[i] = vec_xl(16 * i, ptr);
    }

    return v;
}

/*
! WARNING: Very slow. Use vec_perm if possible. Refer to iq4_xs
! or iq4_nl for example implementation.
*/
// Scalar emulation of a per-lane table lookup: out[i] = a[b[i]] for each of
// the 16 byte lanes (the s390x analogue of AArch64 vqtbl1q_s8, minus the
// out-of-range-index-yields-zero behavior — b lanes are used as raw indices).
inline static int8x16_t ggml_vec_tbl(int8x16_t a, uint8x16_t b) {
    int8x16_t out;

    for (int lane = 0; lane < 16; ++lane) {
        out[lane] = a[b[lane]];
    }

    return out;
}

// Pairwise horizontal add of signed 16-bit lanes across (a, b):
// result[i] = src[2i] + src[2i+1], where src is the concatenation of a and b
// — i.e. an s390x analogue of AArch64 vpaddq_s16.
inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
// vec_perm byte-index mask selecting one 16-bit element from each pair;
// indices 0-15 address bytes of a, 16-31 address bytes of b.
const uchar8x16_t v_maske = { 0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29 };

// vec_pack keeps the low 16 bits of each 32-bit lane — on big-endian s390x
// that is the other element of each pair (NOTE(review): endian-sensitive;
// verified only for s390x big-endian lane order).
const int16x8_t v_abo = vec_pack((int32x4_t)a, (int32x4_t)b);
// Gather the complementary element of each pair via the permute mask.
const int16x8_t v_abe = vec_perm(a, b, v_maske);
// Sum of the two selections = pairwise add of adjacent 16-bit elements.
return v_abo + v_abe;
}

// Dot product of 16 signed bytes accumulated into 4 x int32 partial sums:
// multiply even and odd lanes into 16-bit products (no overflow: the sum of
// two int8*int8 products fits in int16), then widen high/low halves to 32-bit
// and add to acc.  Analogue of ARM's vdotq_s32 used elsewhere in this header.
inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
    const int16x8_t prod_even = vec_mule(a, b);
    const int16x8_t prod_odd  = vec_mulo(a, b);
    const int16x8_t pairs     = prod_even + prod_odd;

    const int32x4_t widened = vec_unpackh(pairs) + vec_unpackl(pairs);

    return acc + widened;
}

#endif

#if defined(__loongarch_asx)
/* float type data load instructions */
static __m128 __lsx_vreplfr2vr_s(const float val) {
Expand Down
Loading
Loading