Skip to content

Commit 3aebd56

Browse files
committed
Merge branch 'master' of https://github.com/ggerganov/llama.cpp into vczf
2 parents 210a71c + 8183159 commit 3aebd56

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

71 files changed

+9689
-3609
lines changed

.github/workflows/build.yml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -197,6 +197,8 @@ jobs:
197197
strategy:
198198
matrix:
199199
include:
200+
- build: 'noavx'
201+
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
200202
- build: 'avx2'
201203
defines: '-DLLAMA_BUILD_SERVER=ON'
202204
- build: 'avx'

.gitignore

Lines changed: 19 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,8 @@ build/
1616
build-em/
1717
build-debug/
1818
build-release/
19+
build-ci-debug/
20+
build-ci-release/
1921
build-static/
2022
build-cublas/
2123
build-opencl/
@@ -25,9 +27,10 @@ build-no-accel/
2527
build-sanitize-addr/
2628
build-sanitize-thread/
2729
out/
30+
tmp/
2831

2932
models/*
30-
*.bin
33+
models-mnt
3134

3235
/main
3336
/quantize
@@ -58,3 +61,18 @@ qnt-*.txt
5861
perf-*.txt
5962

6063
examples/jeopardy/results.txt
64+
65+
66+
pyproject.toml
67+
poetry.lock
68+
poetry.toml
69+
70+
# Test binaries
71+
tests/test-double-float
72+
tests/test-grad0
73+
tests/test-opt
74+
tests/test-quantize-fns
75+
tests/test-quantize-perf
76+
tests/test-sampling
77+
tests/test-tokenizer-0
78+

CMakeLists.txt

Lines changed: 45 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -67,11 +67,13 @@ endif()
6767
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
6868
option(LLAMA_BLAS "llama: use BLAS" OFF)
6969
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
70-
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
70+
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
71+
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
72+
set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
7173
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
7274
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
7375
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
74-
option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
76+
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
7577
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
7678
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
7779
option(LLAMA_METAL "llama: use Metal" OFF)
@@ -251,6 +253,10 @@ if (LLAMA_CUBLAS)
251253
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
252254

253255
add_compile_definitions(GGML_USE_CUBLAS)
256+
# if (LLAMA_CUDA_CUBLAS)
257+
# add_compile_definitions(GGML_CUDA_CUBLAS)
258+
# endif()
259+
add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
254260
if (LLAMA_CUDA_FORCE_DMMV)
255261
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
256262
endif()
@@ -259,8 +265,8 @@ if (LLAMA_CUBLAS)
259265
if (DEFINED LLAMA_CUDA_DMMV_Y)
260266
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
261267
endif()
262-
if (LLAMA_CUDA_DMMV_F16)
263-
add_compile_definitions(GGML_CUDA_DMMV_F16)
268+
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
269+
add_compile_definitions(GGML_CUDA_F16)
264270
endif()
265271
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
266272

@@ -271,10 +277,14 @@ if (LLAMA_CUBLAS)
271277
endif()
272278

273279
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
274-
if (LLAMA_CUDA_DMMV_F16)
275-
set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
280+
# 52 == lowest CUDA 12 standard
281+
# 60 == f16 CUDA intrinsics
282+
# 61 == integer CUDA intrinsics
283+
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
284+
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
285+
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
276286
else()
277-
set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
287+
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
278288
endif()
279289
endif()
280290
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@@ -357,6 +367,7 @@ if (LLAMA_ALL_WARNINGS)
357367
-Wshadow
358368
-Wstrict-prototypes
359369
-Wpointer-arith
370+
-Wmissing-prototypes
360371
)
361372
set(cxx_flags
362373
-Wall
@@ -496,6 +507,8 @@ endif()
496507
add_library(ggml OBJECT
497508
ggml.c
498509
ggml.h
510+
ggml-alloc.c
511+
ggml-alloc.h
499512
${GGML_SOURCES_CUDA}
500513
${GGML_SOURCES_OPENCL}
501514
${GGML_SOURCES_METAL}
@@ -512,6 +525,7 @@ if (BUILD_SHARED_LIBS)
512525
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
513526
add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
514527
target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
528+
install(TARGETS ggml_shared LIBRARY)
515529
endif()
516530

517531
add_library(llama
@@ -533,8 +547,32 @@ if (BUILD_SHARED_LIBS)
533547
if (LLAMA_METAL)
534548
set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
535549
endif()
550+
install(TARGETS llama LIBRARY)
536551
endif()
537552

553+
include(GNUInstallDirs)
554+
install(
555+
FILES convert.py
556+
PERMISSIONS
557+
OWNER_READ
558+
OWNER_WRITE
559+
OWNER_EXECUTE
560+
GROUP_READ
561+
GROUP_EXECUTE
562+
WORLD_READ
563+
WORLD_EXECUTE
564+
DESTINATION ${CMAKE_INSTALL_BINDIR})
565+
install(
566+
FILES convert-lora-to-ggml.py
567+
PERMISSIONS
568+
OWNER_READ
569+
OWNER_WRITE
570+
OWNER_EXECUTE
571+
GROUP_READ
572+
GROUP_EXECUTE
573+
WORLD_READ
574+
WORLD_EXECUTE
575+
DESTINATION ${CMAKE_INSTALL_BINDIR})
538576

539577
#
540578
# programs, examples and tests

Makefile

Lines changed: 94 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,8 @@
11
# Define the default target now so that it is always the first target
2-
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server libembdinput.so embd-input-test
2+
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test
3+
4+
# Binaries only useful for tests
5+
TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
36

47
default: $(BUILD_TARGETS)
58

@@ -60,7 +63,8 @@ ifdef LLAMA_SERVER_VERBOSE
6063
endif
6164

6265
# warnings
63-
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
66+
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
67+
-Wmissing-prototypes
6468
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
6569

6670
# OS specific
@@ -90,6 +94,28 @@ ifeq ($(UNAME_S),Haiku)
9094
CXXFLAGS += -pthread
9195
endif
9296

97+
# detect Windows
98+
ifneq ($(findstring _NT,$(UNAME_S)),)
99+
_WIN32 := 1
100+
endif
101+
102+
# library name prefix
103+
ifneq ($(_WIN32),1)
104+
LIB_PRE := lib
105+
endif
106+
107+
# Dynamic Shared Object extension
108+
ifneq ($(_WIN32),1)
109+
DSO_EXT := .so
110+
else
111+
DSO_EXT := .dll
112+
endif
113+
114+
# Windows Sockets 2 (Winsock) for network-capable apps
115+
ifeq ($(_WIN32),1)
116+
LWINSOCK2 := -lws2_32
117+
endif
118+
93119
ifdef LLAMA_GPROF
94120
CFLAGS += -pg
95121
CXXFLAGS += -pg
@@ -102,7 +128,7 @@ endif
102128
# Architecture specific
103129
# TODO: probably these flags need to be tweaked on some architectures
104130
# feel free to update the Makefile for your architecture and send a pull request or issue
105-
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
131+
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
106132
# Use all CPU extensions that are available:
107133
CFLAGS += -march=native -mtune=native
108134
CXXFLAGS += -march=native -mtune=native
@@ -168,8 +194,12 @@ ifdef LLAMA_CUBLAS
168194
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
169195
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
170196
OBJS += ggml-cuda.o
171-
NVCC = nvcc
172-
NVCCFLAGS = --forward-unknown-to-host-compiler
197+
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
198+
ifdef LLAMA_CUDA_NVCC
199+
NVCC = $(LLAMA_CUDA_NVCC)
200+
else
201+
NVCC = nvcc
202+
endif #LLAMA_CUDA_NVCC
173203
ifdef CUDA_DOCKER_ARCH
174204
NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
175205
else
@@ -190,27 +220,42 @@ else ifdef LLAMA_CUDA_DMMV_Y
190220
else
191221
NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
192222
endif # LLAMA_CUDA_MMV_Y
223+
ifdef LLAMA_CUDA_F16
224+
NVCCFLAGS += -DGGML_CUDA_F16
225+
endif # LLAMA_CUDA_F16
193226
ifdef LLAMA_CUDA_DMMV_F16
194-
NVCCFLAGS += -DGGML_CUDA_DMMV_F16
227+
NVCCFLAGS += -DGGML_CUDA_F16
195228
endif # LLAMA_CUDA_DMMV_F16
196229
ifdef LLAMA_CUDA_KQUANTS_ITER
197230
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
198231
else
199232
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
200233
endif
201-
234+
ifdef LLAMA_CUDA_MMQ_Y
235+
NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
236+
else
237+
NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64
238+
endif # LLAMA_CUDA_MMQ_Y
239+
#ifdef LLAMA_CUDA_CUBLAS
240+
# NVCCFLAGS += -DGGML_CUDA_CUBLAS
241+
#endif # LLAMA_CUDA_CUBLAS
242+
ifdef LLAMA_CUDA_CCBIN
243+
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
244+
endif
202245
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
203-
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
246+
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@
204247
endif # LLAMA_CUBLAS
205248

206249
ifdef LLAMA_CLBLAST
207-
CFLAGS += -DGGML_USE_CLBLAST
208-
CXXFLAGS += -DGGML_USE_CLBLAST
250+
251+
CFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
252+
CXXFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
253+
209254
# Mac provides OpenCL as a framework
210255
ifeq ($(UNAME_S),Darwin)
211256
LDFLAGS += -lclblast -framework OpenCL
212257
else
213-
LDFLAGS += -lclblast -lOpenCL
258+
LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
214259
endif
215260
OBJS += ggml-opencl.o
216261

@@ -284,23 +329,31 @@ $(info )
284329
ggml.o: ggml.c ggml.h ggml-cuda.h
285330
$(CC) $(CFLAGS) -c $< -o $@
286331

287-
llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
332+
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
333+
$(CC) $(CFLAGS) -c $< -o $@
334+
335+
OBJS += ggml-alloc.o
336+
337+
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
288338
$(CXX) $(CXXFLAGS) -c $< -o $@
289339

290340
common.o: examples/common.cpp examples/common.h
291341
$(CXX) $(CXXFLAGS) -c $< -o $@
292342

343+
grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
344+
$(CXX) $(CXXFLAGS) -c $< -o $@
345+
293346
libllama.so: llama.o ggml.o $(OBJS)
294347
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
295348

296349
clean:
297-
rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h
350+
rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS)
298351

299352
#
300353
# Examples
301354
#
302355

303-
main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
356+
main: examples/main/main.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
304357
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
305358
@echo
306359
@echo '==== Run ./main -h for help. ===='
@@ -324,15 +377,15 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.
324377
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
325378
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
326379

327-
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
328-
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
380+
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o $(OBJS)
381+
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
329382

330-
libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
383+
$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
331384
$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
332385

333386

334-
embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
335-
$(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
387+
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
388+
$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
336389

337390
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
338391
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -349,13 +402,32 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
349402
# Tests
350403
#
351404

405+
tests: $(TEST_TARGETS)
406+
352407
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
353408
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
354409
./$@
355410

356411
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
357412
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
358413

359-
.PHONY: tests clean
360-
tests:
361-
bash ./tests/run-tests.sh
414+
tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
415+
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
416+
417+
tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
418+
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
419+
420+
tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
421+
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
422+
423+
tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
424+
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
425+
426+
tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
427+
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
428+
429+
tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
430+
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
431+
432+
tests/test-tokenizer-0: tests/test-tokenizer-0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
433+
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

0 commit comments

Comments (0)