Skip to content

Commit 4de2282

Browse files
committed
Merge branch 'master' into build-metal-default
2 parents bcf62ba + 6a31a3b commit 4de2282

File tree

13 files changed

+288
-160
lines changed

13 files changed

+288
-160
lines changed

Makefile

Lines changed: 69 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -71,21 +71,21 @@ OPT = -Ofast
7171
else
7272
OPT = -O3
7373
endif
74-
CFLAGS = -I. $(OPT) -std=c11 -fPIC
75-
CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC
76-
LDFLAGS =
74+
MK_CPPFLAGS = -I. -Icommon
75+
MK_CFLAGS = $(CPPFLAGS) $(OPT) -std=c11 -fPIC
76+
MK_CXXFLAGS = $(CPPFLAGS) $(OPT) -std=c++11 -fPIC
77+
MK_LDFLAGS =
7778

7879
ifdef LLAMA_DEBUG
79-
CFLAGS += -O0 -g
80-
CXXFLAGS += -O0 -g
81-
LDFLAGS += -g
80+
MK_CFLAGS += -O0 -g
81+
MK_CXXFLAGS += -O0 -g
82+
MK_LDFLAGS += -g
8283
else
83-
CFLAGS += -DNDEBUG
84-
CXXFLAGS += -DNDEBUG
84+
MK_CPPFLAGS += -DNDEBUG
8585
endif
8686

8787
ifdef LLAMA_SERVER_VERBOSE
88-
CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
88+
MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
8989
endif
9090

9191
ifdef LLAMA_DISABLE_LOGS
@@ -94,9 +94,9 @@ ifdef LLAMA_DISABLE_LOGS
9494
endif # LLAMA_DISABLE_LOGS
9595

9696
# warnings
97-
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
98-
-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
99-
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
97+
MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
98+
-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
99+
MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
100100

101101
ifeq '' '$(findstring clang++,$(CXX))'
102102
# g++ only
@@ -105,29 +105,9 @@ endif
105105

106106
# OS specific
107107
# TODO: support Windows
108-
ifeq ($(UNAME_S),Linux)
109-
CFLAGS += -pthread
110-
CXXFLAGS += -pthread
111-
endif
112-
ifeq ($(UNAME_S),Darwin)
113-
CFLAGS += -pthread
114-
CXXFLAGS += -pthread
115-
endif
116-
ifeq ($(UNAME_S),FreeBSD)
117-
CFLAGS += -pthread
118-
CXXFLAGS += -pthread
119-
endif
120-
ifeq ($(UNAME_S),NetBSD)
121-
CFLAGS += -pthread
122-
CXXFLAGS += -pthread
123-
endif
124-
ifeq ($(UNAME_S),OpenBSD)
125-
CFLAGS += -pthread
126-
CXXFLAGS += -pthread
127-
endif
128-
ifeq ($(UNAME_S),Haiku)
129-
CFLAGS += -pthread
130-
CXXFLAGS += -pthread
108+
ifneq '' '$(filter $(UNAME_S),Linux Darwin FreeBSD NetBSD OpenBSD Haiku)'
109+
MK_CFLAGS += -pthread
110+
MK_CXXFLAGS += -pthread
131111
endif
132112

133113
# detect Windows
@@ -153,12 +133,11 @@ ifeq ($(_WIN32),1)
153133
endif
154134

155135
ifdef LLAMA_GPROF
156-
CFLAGS += -pg
157-
CXXFLAGS += -pg
136+
MK_CFLAGS += -pg
137+
MK_CXXFLAGS += -pg
158138
endif
159139
ifdef LLAMA_PERF
160-
CFLAGS += -DGGML_PERF
161-
CXXFLAGS += -DGGML_PERF
140+
MK_CPPFLAGS += -DGGML_PERF
162141
endif
163142

164143
# Architecture specific
@@ -169,16 +148,16 @@ ifndef RISCV
169148

170149
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
171150
# Use all CPU extensions that are available:
172-
CFLAGS += -march=native -mtune=native
173-
CXXFLAGS += -march=native -mtune=native
151+
MK_CFLAGS += -march=native -mtune=native
152+
MK_CXXFLAGS += -march=native -mtune=native
174153

175154
# Use AVX only
176-
#CFLAGS += -mfma -mf16c -mavx
177-
#CXXFLAGS += -mfma -mf16c -mavx
155+
#MK_CFLAGS += -mfma -mf16c -mavx
156+
#MK_CXXFLAGS += -mfma -mf16c -mavx
178157

179158
# Use SSSE3 only (not SSE3!)
180-
#CFLAGS += -mssse3
181-
#CXXFLAGS += -mssse3
159+
#MK_CFLAGS += -mssse3
160+
#MK_CXXFLAGS += -mssse3
182161
endif
183162

184163
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
@@ -192,34 +171,33 @@ endif
192171
ifneq ($(filter aarch64%,$(UNAME_M)),)
193172
# Apple M1, M2, etc.
194173
# Raspberry Pi 3, 4, Zero 2 (64-bit)
195-
CFLAGS += -mcpu=native
196-
CXXFLAGS += -mcpu=native
174+
MK_CFLAGS += -mcpu=native
175+
MK_CXXFLAGS += -mcpu=native
197176
endif
198177

199178
ifneq ($(filter armv6%,$(UNAME_M)),)
200179
# Raspberry Pi 1, Zero
201-
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
180+
MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
181+
MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
202182
endif
203183

204184
ifneq ($(filter armv7%,$(UNAME_M)),)
205185
# Raspberry Pi 2
206-
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
186+
MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
187+
MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
207188
endif
208189

209190
ifneq ($(filter armv8%,$(UNAME_M)),)
210191
# Raspberry Pi 3, 4, Zero 2 (32-bit)
211-
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
192+
MK_CFLAGS += -mfp16-format=ieee -mno-unaligned-access
193+
MK_CXXFLAGS += -mfp16-format=ieee -mno-unaligned-access
212194
endif
213195

214196
ifneq ($(filter ppc64%,$(UNAME_M)),)
215197
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
216198
ifneq (,$(findstring POWER9,$(POWER9_M)))
217-
CFLAGS += -mcpu=power9
218-
CXXFLAGS += -mcpu=power9
219-
endif
220-
# Require c++23's std::byteswap for big-endian support.
221-
ifeq ($(UNAME_M),ppc64)
222-
CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
199+
MK_CFLAGS += -mcpu=power9
200+
MK_CXXFLAGS += -mcpu=power9
223201
endif
224202
endif
225203

@@ -229,21 +207,19 @@ else
229207
endif
230208

231209
ifndef LLAMA_NO_K_QUANTS
232-
CFLAGS += -DGGML_USE_K_QUANTS
233-
CXXFLAGS += -DGGML_USE_K_QUANTS
210+
MK_CPPFLAGS += -DGGML_USE_K_QUANTS
234211
OBJS += k_quants.o
235212
ifdef LLAMA_QKK_64
236-
CFLAGS += -DGGML_QKK_64
237-
CXXFLAGS += -DGGML_QKK_64
213+
MK_CPPFLAGS += -DGGML_QKK_64
238214
endif
239215
endif
240216

241217
ifndef LLAMA_NO_ACCELERATE
242218
# Mac OS - include Accelerate framework.
243219
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
244220
ifeq ($(UNAME_S),Darwin)
245-
CFLAGS += -DGGML_USE_ACCELERATE
246-
LDFLAGS += -framework Accelerate
221+
MK_CPPFLAGS += -DGGML_USE_ACCELERATE
222+
MK_LDFLAGS += -framework Accelerate
247223
endif
248224
endif # LLAMA_NO_ACCELERATE
249225

@@ -258,25 +234,26 @@ ifndef LLAMA_NO_METAL
258234
endif # LLAMA_NO_METAL
259235

260236
ifdef LLAMA_MPI
261-
CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
262-
CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
237+
MK_CPPFLAGS += -DGGML_USE_MPI
238+
MK_CFLAGS += -Wno-cast-qual
239+
MK_CXXFLAGS += -Wno-cast-qual
263240
OBJS += ggml-mpi.o
264241
endif # LLAMA_MPI
265242

266243
ifdef LLAMA_OPENBLAS
267-
CFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags openblas)
268-
LDFLAGS += $(shell pkg-config --libs openblas)
244+
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
245+
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
246+
MK_LDFLAGS += $(shell pkg-config --libs openblas)
269247
endif # LLAMA_OPENBLAS
270248

271249
ifdef LLAMA_BLIS
272-
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
273-
LDFLAGS += -lblis -L/usr/local/lib
250+
MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
251+
MK_LDFLAGS += -lblis -L/usr/local/lib
274252
endif # LLAMA_BLIS
275253

276254
ifdef LLAMA_CUBLAS
277-
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
278-
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
279-
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
255+
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
256+
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
280257
OBJS += ggml-cuda.o
281258
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
282259
ifdef LLAMA_CUDA_NVCC
@@ -327,14 +304,15 @@ endif # LLAMA_CUBLAS
327304

328305
ifdef LLAMA_CLBLAST
329306

330-
CFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
331-
CXXFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
307+
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
308+
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
309+
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
332310

333311
# Mac provides OpenCL as a framework
334312
ifeq ($(UNAME_S),Darwin)
335-
LDFLAGS += -lclblast -framework OpenCL
313+
MK_LDFLAGS += -lclblast -framework OpenCL
336314
else
337-
LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
315+
MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
338316
endif
339317
OBJS += ggml-opencl.o
340318

@@ -349,10 +327,9 @@ ifdef LLAMA_HIPBLAS
349327
LLAMA_CUDA_DMMV_X ?= 32
350328
LLAMA_CUDA_MMV_Y ?= 1
351329
LLAMA_CUDA_KQUANTS_ITER ?= 2
352-
CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
353-
CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
354-
LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
355-
LDFLAGS += -lhipblas -lamdhip64 -lrocblas
330+
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
331+
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
332+
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
356333
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
357334
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
358335
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
@@ -366,6 +343,12 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
366343
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
367344
endif # LLAMA_HIPBLAS
368345

346+
ifndef LLAMA_NO_METAL
347+
MK_CPPFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
348+
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
349+
OBJS += ggml-metal.o
350+
endif # LLAMA_NO_METAL
351+
369352
ifndef LLAMA_NO_METAL
370353
ggml-metal.o: ggml-metal.m ggml-metal.h
371354
$(CC) $(CFLAGS) -c $< -o $@
@@ -376,11 +359,17 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
376359
$(CC) $(CFLAGS) -c $< -o $@
377360
endif # LLAMA_MPI
378361

379-
ifdef LLAMA_NO_K_QUANTS
362+
ifndef LLAMA_NO_K_QUANTS
380363
k_quants.o: k_quants.c k_quants.h
381364
$(CC) $(CFLAGS) -c $< -o $@
382365
endif # LLAMA_NO_K_QUANTS
383366

367+
# combine build flags with cmdline overrides
368+
override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
369+
override CFLAGS := $(MK_CFLAGS) $(CFLAGS)
370+
override CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
371+
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
372+
384373
#
385374
# Print build information
386375
#

Package.swift

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,18 @@ let package = Package(
1212
name: "llama",
1313
path: ".",
1414
exclude: ["ggml-metal.metal"],
15-
sources: ["ggml.c", "llama.cpp"],
15+
sources: [
16+
"ggml.c",
17+
"llama.cpp",
18+
"ggml-alloc.c",
19+
"k_quants.c"
20+
],
1621
publicHeadersPath: "spm-headers",
17-
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
22+
cSettings: [
23+
.unsafeFlags(["-Wno-shorten-64-to-32"]),
24+
.define("GGML_USE_K_QUANTS"),
25+
.define("GGML_USE_ACCELERATE")
26+
],
1827
linkerSettings: [
1928
.linkedFramework("Accelerate")
2029
]

README.md

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ as the main playground for developing new features for the [ggml](https://github
120120

121121
- [nat/openplayground](https://github.com/nat/openplayground)
122122
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
123+
- [withcatai/catai](https://github.com/withcatai/catai)
123124

124125
---
125126

@@ -464,6 +465,8 @@ Building the program with BLAS support may lead to some performance improvements
464465
You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
465466
- For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
466467
468+
- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
469+
467470
- <details>
468471
<summary>Installing the OpenCL SDK from source</summary>
469472
@@ -481,10 +484,27 @@ Building the program with BLAS support may lead to some performance improvements
481484
```
482485
</details>
483486
484-
Installing CLBlast: it may be found in your operating system's packages.
487+
##### Installing CLBlast
488+
489+
Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, CLBlast may also be found in your operating system's packages.
490+
491+
Alternatively, they may be built from source.
485492

486493
- <details>
487-
<summary>If not, then installing from source:</summary>
494+
<summary>Windows:</summary>
495+
496+
```cmd
497+
set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
498+
git clone https://github.com/CNugteren/CLBlast.git
499+
mkdir CLBlast\build
500+
cd CLBlast\build
501+
cmake .. -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
502+
cmake --build . --config Release
503+
cmake --install . --prefix C:/CLBlast
504+
```
</details>
505+
506+
- <details>
507+
<summary>Unix:</summary>
488508

489509
```sh
490510
git clone https://github.com/CNugteren/CLBlast.git
@@ -498,21 +518,32 @@ Building the program with BLAS support may lead to some performance improvements
498518
Where `/some/path` is where the built library will be installed (default is `/usr/local`).
499519
</details>
500520

501-
Building:
521+
##### Building Llama with CLBlast
502522

503523
- Build with make:
504524
```sh
505525
make LLAMA_CLBLAST=1
506526
```
507-
- CMake:
527+
- CMake (Unix):
508528
```sh
509529
mkdir build
510530
cd build
511531
cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
512532
cmake --build . --config Release
513533
```
534+
- CMake (Windows):
535+
```cmd
536+
set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
537+
git clone https://github.com/ggerganov/llama.cpp
538+
cd llama.cpp
539+
mkdir build
540+
cd build
541+
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
542+
cmake --build . --config Release
543+
cmake --install . --prefix C:/LlamaCPP
544+
```
514545

515-
Running:
546+
##### Running Llama with CLBlast
516547

517548
The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
518549

0 commit comments

Comments
 (0)