Skip to content

Commit 481044d

Browse files
authored
additional optimizations for POWER9 (#454)
1 parent 563cdc3 commit 481044d

File tree

2 files changed

+83
-3
lines changed

2 files changed

+83
-3
lines changed

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,8 @@ endif
156156
ifneq ($(filter ppc64%,$(UNAME_M)),)
157157
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
158158
ifneq (,$(findstring POWER9,$(POWER9_M)))
159-
CFLAGS += -mpower9-vector
159+
CFLAGS += -mcpu=power9
160+
CXXFLAGS += -mcpu=power9
160161
endif
161162
# Require c++23's std::byteswap for big-endian support.
162163
ifeq ($(UNAME_M),ppc64)

ggml.c

Lines changed: 81 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,39 @@ typedef double ggml_float;
175175
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
176176
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
177177

178+
#elif defined(__POWER9_VECTOR__)
179+
180+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
181+
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
182+
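/* On POWER9 the inline-asm conversions defined below are about 12% faster than the table-lookup method, so GGML_FP16_TO_FP32 / GGML_FP32_TO_FP16 map straight to them. */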
183+
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
184+
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
185+
186+
/*
 * Convert one half-precision value to single precision with POWER9 inline asm.
 *
 * h: the half-precision input; the "r" constraint hands it to the asm in a
 *    general-purpose register.
 * Returns the converted value as a float.
 *
 * Operand map: %0 = d (scratch floating-point register), %1 = f (result),
 * %2 = h (input). The three instructions run in order:
 *   mtfprd   %0,%2  - copy the raw bits of h from the GPR into the scratch FPR
 *   xscvhpdp %0,%0  - convert half precision to double precision, in place
 *   frsp     %1,%0  - round the double to single precision into the result
 * (Instruction meanings are per the Power ISA mnemonics.)
 */
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
187+
register float f;
188+
register double d;
189+
__asm__(
190+
"mtfprd %0,%2\n"
191+
"xscvhpdp %0,%0\n"
192+
"frsp %1,%0\n" :
193+
/* temp */ "=d"(d),
194+
/* out */ "=f"(f):
195+
/* in */ "r"(h));
196+
return f;
197+
}
198+
199+
/*
 * Convert one single-precision value to half precision with POWER9 inline asm.
 *
 * f: the float input; the "f" constraint hands it to the asm in a
 *    floating-point register.
 * Returns the converted value as a ggml_fp16_t.
 *
 * Operand map: %0 = d (scratch floating-point register), %1 = r (result, in a
 * general-purpose register), %2 = f (input). The two instructions run in order:
 *   xscvdphp %0,%2  - convert the input to half precision into the scratch FPR
 *   mffprd   %1,%0  - copy the scratch FPR's bits into the GPR holding r
 * (Instruction meanings are per the Power ISA mnemonics.)
 */
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
200+
register double d;
201+
register ggml_fp16_t r;
202+
__asm__( /* xscvdphp can work on double or single precision */
203+
"xscvdphp %0,%2\n"
204+
"mffprd %1,%0\n" :
205+
/* temp */ "=d"(d),
206+
/* out */ "=r"(r):
207+
/* in */ "f"(f));
208+
return r;
209+
}
210+
178211
#else
179212

180213
// FP16 <-> FP32
@@ -272,6 +305,7 @@ static float table_f32_f16[1 << 16];
272305

273306
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
274307
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
308+
// This is also true for POWER9.
275309
#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
276310

277311
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
@@ -462,7 +496,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, void * restric
462496
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
463497
assert(k % QK == 0);
464498

465-
#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__)
499+
#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
466500
const int nb = k / QK;
467501
const size_t bs = sizeof(float) + QK/2;
468502

@@ -472,7 +506,52 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
472506
uint8_t pp[QK/2];
473507
#endif
474508

475-
#if __ARM_NEON
509+
#if defined(__POWER9_VECTOR__)
510+
#if QK == 32
511+
const vector float v85 = vec_splats(8.5f);
512+
for (int i = 0; i < nb; i++) {
513+
float amax = 0.0f; // absolute max
514+
515+
vector float srcv [8];
516+
vector float asrcv[8];
517+
vector float amaxv[8];
518+
519+
for (int l = 0; l < 8; l++) srcv[l] = *(vector float *)(x + i*32 + 4*l);
520+
for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]);
521+
522+
for (int l = 0; l < 4; l++) amaxv[2*l] = vec_max(asrcv[2*l], asrcv[2*l+1]);
523+
//for (int l = 0; l < 2; l++) amaxv[4*l] = vec_max(amaxv[4*l], amaxv[4*l+2]);
524+
amaxv[0] = vec_max(amaxv[0], amaxv[2]);
525+
amaxv[4] = vec_max(amaxv[4], amaxv[6]);
526+
//for (int l = 0; l < 1; l++) amaxv[8*l] = vec_max(amaxv[8*l], amaxv[8*l+4]);
527+
amaxv[0] = vec_max(amaxv[0], amaxv[4]);
528+
529+
amax = MAX(
530+
MAX(vec_extract(amaxv[0], 0), vec_extract(amaxv[0], 1)),
531+
MAX(vec_extract(amaxv[0], 2), vec_extract(amaxv[0], 3)));
532+
533+
const float d = amax / ((1 << 3) - 1);
534+
const float id = d ? 1.0/d : 0.0;
535+
536+
*(float *)pd = d;
537+
pd += bs;
538+
539+
const vector float vid = vec_splats(id);
540+
for (int l = 0; l < 8; l++) {
541+
const vector float vf = vec_madd(srcv[l], vid, v85);
542+
const vector signed int vi = vec_signed(vf);
543+
544+
pb[2*l + 0] = vec_extract(vi, 0) | (vec_extract(vi, 1) << 4);
545+
pb[2*l + 1] = vec_extract(vi, 2) | (vec_extract(vi, 3) << 4);
546+
}
547+
548+
//memcpy(pb, pp, sizeof(pp));
549+
pb += bs;
550+
}
551+
#else
552+
#error "not implemented for QK"
553+
#endif
554+
#elif __ARM_NEON
476555
#if QK == 32
477556
for (int i = 0; i < nb; i++) {
478557
float amax = 0.0f; // absolute max

0 commit comments

Comments
 (0)