
Commit 19d8762

Djip007 and ggerganov authored
ggml : refactor online repacking (#10446)
* rename ggml-cpu-aarch64.c to .cpp
* reformat extra cpu backend.
  - clean Q4_0_N_M and IQ4_0_N_M
  - remove from "file" tensor type
  - allow only with dynamic repack
  - extract cpu extra bufts and convert to C++
    - hbm
    - "aarch64"
  - more generic use of extra buffer
  - generalise extra_supports_op
  - new API for "cpu-accel":
    - amx
    - aarch64
* clang-format
* Clean Q4_0_N_M ref
  Enable restrict on C++
* add op GGML_OP_MUL_MAT_ID for Q4_0_N_M with runtime repack
* added/corrected control on tensor size for Q4 repacking.
* Update ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
  Co-authored-by: Georgi Gerganov <[email protected]>
* Update ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
  Co-authored-by: Georgi Gerganov <[email protected]>
* add debug logs on repacks.
---------
Co-authored-by: Georgi Gerganov <[email protected]>
1 parent c2a16c0 commit 19d8762

33 files changed: +1135 / -1048 lines

Makefile

Lines changed: 10 additions & 12 deletions
@@ -445,6 +445,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 MK_CFLAGS += -march=native -mtune=native
 HOST_CXXFLAGS += -march=native -mtune=native

+# Usage AMX build test
+#MK_CFLAGS += -march=graniterapids -mtune=graniterapids
+#HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids
+
 # Usage AVX-only
 #MK_CFLAGS += -mfma -mf16c -mavx
 #MK_CXXFLAGS += -mfma -mf16c -mavx
@@ -948,17 +952,18 @@ DIR_COMMON = common

 OBJ_GGML = \
 	$(DIR_GGML)/src/ggml.o \
-	$(DIR_GGML)/src/ggml-aarch64.o \
 	$(DIR_GGML)/src/ggml-alloc.o \
 	$(DIR_GGML)/src/ggml-backend.o \
 	$(DIR_GGML)/src/ggml-backend-reg.o \
 	$(DIR_GGML)/src/ggml-opt.o \
 	$(DIR_GGML)/src/ggml-quants.o \
 	$(DIR_GGML)/src/ggml-threading.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
-	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
 	$(OBJ_GGML_EXT)

 OBJ_LLAMA = \
@@ -1098,17 +1103,10 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
 # Default target
 all: $(BUILD_TARGETS)

+# force c++ build for source file that have same name as c file
 # Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
-# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
-$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
-	ggml/src/ggml-cpu/ggml-cpu.cpp \
-	ggml/include/ggml-backend.h \
-	ggml/include/ggml.h \
-	ggml/include/ggml-alloc.h \
-	ggml/src/ggml-backend-impl.h \
-	ggml/include/ggml-cpu.h \
-	ggml/src/ggml-impl.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp
+	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@

 # Rules for building object files
 $(DIR_GGML)/%.o: $(DIR_GGML)/%.c

Package.swift

Lines changed: 3 additions & 2 deletions
@@ -10,14 +10,15 @@ var sources = [
 "src/unicode.cpp",
 "src/unicode-data.cpp",
 "ggml/src/ggml.c",
-"ggml/src/ggml-aarch64.c",
 "ggml/src/ggml-alloc.c",
 "ggml/src/ggml-backend.cpp",
 "ggml/src/ggml-backend-reg.cpp",
 "ggml/src/ggml-cpu/ggml-cpu.c",
 "ggml/src/ggml-cpu/ggml-cpu.cpp",
-"ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
+"ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp",
+"ggml/src/ggml-cpu/ggml-cpu-hbm.cpp",
 "ggml/src/ggml-cpu/ggml-cpu-quants.c",
+"ggml/src/ggml-cpu/ggml-cpu-traits.cpp",
 "ggml/src/ggml-threading.cpp",
 "ggml/src/ggml-quants.c",
 ]

docs/build.md

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ cmake --build build --config Release
 cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
 cmake --build build-arm64-windows-llvm-release
 ```
-Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
+Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels.

 ## BLAS Build

examples/quantize/README.md

Lines changed: 0 additions & 2 deletions
@@ -54,8 +54,6 @@ As the models are currently fully loaded into memory, you will need adequate dis

 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.

-The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
-
 *(outdated)*

 | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
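The removed paragraph above described the old file-level `Q4_0_4_4`/`Q4_0_4_8`/`Q4_0_8_8` formats as block-interleaved variants of `Q4_0`; after this commit the same interleaved layout is produced by repacking at load time instead of being stored in the GGUF file. Below is a minimal conceptual sketch of what block interleaving means for Q4_0. It is illustrative only and not code from this commit: `block_q4_0x4`, its field layout, and `repack_4rows` are hypothetical names, and the real kernels interleave at a finer granularity.

#include <cstdint>
#include <cstring>

constexpr int QK4_0 = 32;          // values per Q4_0 block

struct block_q4_0 {                // standard Q4_0 block layout
    uint16_t d;                    // scale (fp16 bit pattern)
    uint8_t  qs[QK4_0 / 2];        // 32 4-bit quants, packed two per byte
};

struct block_q4_0x4 {              // hypothetical interleaved group of 4 blocks
    uint16_t d[4];                 // one scale per source row
    uint8_t  qs[4 * QK4_0 / 2];    // quants of the 4 rows stored contiguously
};

// Repack 4 rows of `nblk` Q4_0 blocks each into `nblk` interleaved groups, so a
// matmul kernel can read 4 rows' worth of quantized data with one contiguous load.
static void repack_4rows(const block_q4_0 * rows[4], block_q4_0x4 * out, int nblk) {
    for (int b = 0; b < nblk; ++b) {
        for (int r = 0; r < 4; ++r) {
            out[b].d[r] = rows[r][b].d;
            std::memcpy(out[b].qs + r * (QK4_0 / 2), rows[r][b].qs, QK4_0 / 2);
        }
    }
}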

examples/quantize/quantize.cpp

Lines changed: 0 additions & 3 deletions
@@ -48,9 +48,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
 { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
 { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
 { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
-{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
 { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
 { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
 { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },

ggml/include/ggml-cpu.h

Lines changed: 0 additions & 17 deletions
@@ -103,24 +103,14 @@ extern "C" {

 // Internal types and functions exposed for tests and benchmarks

-typedef void (*ggml_from_float_to_mat_t)
-    (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
 typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
                                 const void * GGML_RESTRICT y, size_t by, int nrc);
-typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
-                             const void * GGML_RESTRICT y, int nr, int nc);
-typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
-                             const void * GGML_RESTRICT y, int nr, int nc);

 struct ggml_type_traits_cpu {
     ggml_from_float_t from_float;
-    ggml_from_float_to_mat_t from_float_to_mat;
     ggml_vec_dot_t vec_dot;
     enum ggml_type vec_dot_type;
     int64_t nrows; // number of rows to process simultaneously
-    int64_t ncols; // number of columns to process simultaneously
-    ggml_gemv_t gemv;
-    ggml_gemm_t gemm;
 };

 GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
@@ -140,13 +130,6 @@ extern "C" {

 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

-#ifdef GGML_USE_CPU_HBM
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-#endif
-
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
-GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);
-
 #ifdef __cplusplus
 }
 #endif
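For reference, after this change the per-type CPU traits keep only the four members left by the diff above; the gemv/gemm/ncols and from_float_to_mat hooks are gone, with repacked kernels now reached through the CPU backend's extra buffer types instead. A sketch of the resulting declaration, with GGML_RESTRICT and ggml_type stubbed out so the snippet stands alone (the real definitions live in ggml.h and ggml-cpu.h):

#include <cstddef>
#include <cstdint>

#define GGML_RESTRICT
enum ggml_type { GGML_TYPE_F32 = 0 /* ... */ };

typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
                                  const void * GGML_RESTRICT y, size_t by, int nrc);

struct ggml_type_traits_cpu {
    ggml_from_float_t from_float;   // quantize one row of k floats into this type
    ggml_vec_dot_t    vec_dot;      // dot-product kernel used by matrix multiplication
    enum ggml_type    vec_dot_type; // type the activations are converted to for vec_dot
    int64_t           nrows;        // number of rows to process simultaneously
};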

ggml/include/ggml.h

Lines changed: 17 additions & 12 deletions
@@ -384,15 +384,15 @@ extern "C" {
 GGML_TYPE_F64 = 28,
 GGML_TYPE_IQ1_M = 29,
 GGML_TYPE_BF16 = 30,
-GGML_TYPE_Q4_0_4_4 = 31,
-GGML_TYPE_Q4_0_4_8 = 32,
-GGML_TYPE_Q4_0_8_8 = 33,
+// GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+// GGML_TYPE_Q4_0_4_8 = 32,
+// GGML_TYPE_Q4_0_8_8 = 33,
 GGML_TYPE_TQ1_0 = 34,
 GGML_TYPE_TQ2_0 = 35,
-GGML_TYPE_IQ4_NL_4_4 = 36,
+// GGML_TYPE_IQ4_NL_4_4 = 36,
 // GGML_TYPE_IQ4_NL_4_8 = 37,
 // GGML_TYPE_IQ4_NL_8_8 = 38,
-GGML_TYPE_COUNT,
+GGML_TYPE_COUNT = 39,
 };

 // precision
@@ -433,9 +433,6 @@ extern "C" {
 GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
 GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
 GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
-GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
-GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
-GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
 };

 // available tensor operations:
@@ -2205,11 +2202,19 @@ extern "C" {
 GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
 GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);

-#ifdef __cplusplus
-// restrict not standard in C++
-#define GGML_RESTRICT
+#ifdef __cplusplus
+// restrict not standard in C++
+#    if defined(__GNUC__)
+#        define GGML_RESTRICT __restrict__
+#    elif defined(__clang__)
+#        define GGML_RESTRICT __restrict
+#    elif defined(_MSC_VER)
+#        define GGML_RESTRICT __restrict
+#    else
+#        define GGML_RESTRICT
+#    endif
 #else
-#define GGML_RESTRICT restrict
+#    define GGML_RESTRICT restrict
 #endif
 typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
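With this change GGML_RESTRICT expands to the compiler-specific restrict keyword in C++ instead of expanding to nothing, so C++ translation units such as the repacking code get the same no-aliasing hints the C sources already had via `restrict`. A small self-contained sketch of the effect; `axpy` is a hypothetical example function, and the macro here is a condensed version of the one in the diff above:

#include <cstdint>

#ifdef __cplusplus
#    if defined(__GNUC__) || defined(__clang__)
#        define GGML_RESTRICT __restrict__
#    elif defined(_MSC_VER)
#        define GGML_RESTRICT __restrict
#    else
#        define GGML_RESTRICT
#    endif
#else
#    define GGML_RESTRICT restrict
#endif

// With GGML_RESTRICT the compiler may assume dst and src never alias, which
// typically lets it vectorize this loop without runtime overlap checks.
static void axpy(int64_t n, float * GGML_RESTRICT dst, const float * GGML_RESTRICT src, float a) {
    for (int64_t i = 0; i < n; ++i) {
        dst[i] += a * src[i];
    }
}

int main() {
    float x[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    float y[8] = {};
    axpy(8, y, x, 0.5f);
    return 0;
}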

ggml/src/CMakeLists.txt

Lines changed: 1 addition & 3 deletions
@@ -220,9 +220,7 @@ add_library(ggml-base
 ggml-threading.cpp
 ggml-threading.h
 ggml-quants.c
-ggml-quants.h
-ggml-aarch64.c
-ggml-aarch64.h)
+ggml-quants.h)

 target_include_directories(ggml-base PRIVATE .)

ggml/src/ggml-aarch64.c

Lines changed: 0 additions & 129 deletions
This file was deleted.

ggml/src/ggml-aarch64.h

Lines changed: 0 additions & 19 deletions
This file was deleted.

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 1 addition & 1 deletion
@@ -2089,7 +2089,7 @@ static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, con
 static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
     /* .get_name         = */ ggml_backend_cann_reg_get_name,
     /* .get_device_count = */ ggml_backend_cann_reg_get_device_count,
-    /* .get_device_get   = */ ggml_backend_cann_reg_get_device,
+    /* .get_device       = */ ggml_backend_cann_reg_get_device,
     /* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address,
 };
