Skip to content

Commit e892134

Browse files
authored
ggml : optimize llamafile cpu matrix multiplication for ppc64le (#10156)
This change upstreams llamafile's cpu matrix multiplication kernels for ppc64le using MMA builtins for FP32 datatype. This change results in a consistent 90% improvement in input processing time, and 20% to 80% improvement in output processing time, across various batch sizes. The patch is tested with Meta-Llama-3-8B, Mistral-7B, Llama-2-7B-chat-hf models on an IBM POWER10 machine. Signed-off-by: Amrita H S <[email protected]>
1 parent 8fc393f commit e892134

File tree

2 files changed

+615
-2
lines changed

2 files changed

+615
-2
lines changed

ggml/src/CMakeLists.txt

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1265,8 +1265,13 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
12651265
endif()
12661266
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
12671267
message(STATUS "PowerPC detected")
1268-
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
1269-
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
1268+
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1"
1269+
OUTPUT_VARIABLE POWER10_M)
1270+
# Quote the expansion: POWER10_M is empty when grep finds no POWER10 entry, and an unquoted empty value would vanish from the argument list, making string(FIND) fail with too few parameters.
string(FIND "${POWER10_M}" "POWER10" substring_index)
1271+
if(${substring_index} GREATER_EQUAL 0)
1272+
list(APPEND ARCH_FLAGS -mcpu=power10)
1273+
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
1274+
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
12701275
else()
12711276
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
12721277
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)

0 commit comments

Comments
 (0)