
Commit 0fc775e

Author: Joan Martinez
Merge branch 'master' of https://github.com/JoanFM/llama.cpp into feat-jina-v2-base-code

2 parents: 96a6f55 + adc9ff3

File tree

156 files changed: +108,711 −42,027 lines. Most of this large merge is collapsed by default; only four files are rendered below.
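
The visible diffs cover the CI workflow, .gitignore, CMakeLists.txt, and the Makefile. Together they bring in: a thread-sanitizer CI build that disables OpenMP, new LLAMA_OPENMP and LLAMA_CUDA_FA_ALL_QUANTS build options, pre-split CUDA FlashAttention template instances, more robust ROCm path detection, an installable pkg-config file (llama.pc), an optional RPC backend build (LLAMA_RPC) with an rpc-server target, and new server web UI assets.
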

.github/workflows/build.yml

Lines changed: 10 additions & 0 deletions

@@ -294,12 +294,22 @@ jobs:
 
       - name: Build
         id: cmake_build
+        if: ${{ matrix.sanitizer != 'THREAD' }}
         run: |
           mkdir build
           cd build
           cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
           cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
 
+      - name: Build (no OpenMP)
+        id: cmake_build_no_openmp
+        if: ${{ matrix.sanitizer == 'THREAD' }}
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
+          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+
       - name: Test
         id: cmake_test
         run: |
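
Note: the sanitizer matrix now routes THREAD through a separate step that configures with -DLLAMA_OPENMP=OFF, while the other sanitizers keep the original step. A plausible motivation (not stated in the commit) is that ThreadSanitizer tends to report false positives against an OpenMP runtime that was not itself built with TSAN instrumentation. To reproduce the TSAN build locally, a minimal sketch (the workflow takes build_type from the matrix; Debug below is an assumption):

    mkdir build && cd build
    cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_THREAD=ON -DCMAKE_BUILD_TYPE=Debug -DLLAMA_OPENMP=OFF
    cmake --build . --config Debug -j $(nproc)
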

.gitignore

Lines changed: 3 additions & 0 deletions

@@ -34,9 +34,11 @@ ggml-metal-embed.metal
 lcov-report/
 gcovr-report/
 
+tags
 build*
 !build.zig
 cmake-build-*
+android-ndk-*
 out/
 tmp/
 
@@ -105,6 +107,7 @@ examples/jeopardy/results.txt
 examples/server/*.html.hpp
 examples/server/*.js.hpp
 examples/server/*.mjs.hpp
+examples/server/*.css.hpp
 
 poetry.lock
 poetry.toml

CMakeLists.txt

Lines changed: 59 additions & 3 deletions

@@ -106,6 +106,7 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                         "llama: max. batch size for using peer access")
 option(LLAMA_CUDA_NO_PEER_COPY          "llama: do not use peer to peer copies"        OFF)
 option(LLAMA_CUDA_NO_VMM                "llama: do not try to use CUDA VMM"            OFF)
+option(LLAMA_CUDA_FA_ALL_QUANTS         "llama: compile all quants for FlashAttention" OFF)
 
 option(LLAMA_CURL    "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS "llama: use hipBLAS"                               OFF)
@@ -125,6 +126,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE  "llama: use Kompute" OFF)
 option(LLAMA_RPC      "llama: use RPC"     OFF)
+option(LLAMA_OPENMP   "llama: use OpenMP"  ON)
 option(LLAMA_SYCL     "llama: use SYCL"    OFF)
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
 set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
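
Note: both new options are ordinary CMake cache options and can be toggled at configure time; an illustrative invocation (LLAMA_CUDA is the existing CUDA switch):

    cmake .. -DLLAMA_CUDA=ON -DLLAMA_CUDA_FA_ALL_QUANTS=ON   # compile every FlashAttention quant combination
    cmake .. -DLLAMA_OPENMP=OFF                              # opt out of the new OpenMP default
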
@@ -295,6 +297,17 @@ if (LLAMA_METAL)
         )
 endif()
 
+if (LLAMA_OPENMP)
+    find_package(OpenMP)
+    if (OpenMP_FOUND)
+        message(STATUS "OpenMP found")
+        add_compile_definitions(GGML_USE_OPENMP)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+    else()
+        message(WARNING "OpenMP not found")
+    endif()
+endif()
+
 if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
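
Note: OpenMP stays a soft dependency. When LLAMA_OPENMP is ON (the default) and find_package(OpenMP) succeeds, GGML_USE_OPENMP is defined and the OpenMP::OpenMP_C/OpenMP::OpenMP_CXX imported targets are linked; if the toolchain lacks OpenMP, the build only warns and continues without it.
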
@@ -402,6 +415,8 @@ if (LLAMA_CUDA)
 
     file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
     list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
+    file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
+    list(APPEND GGML_SOURCES_CUDA ${SRCS})
 
     add_compile_definitions(GGML_USE_CUDA)
     add_compile_definitions(GGML_CUDA_USE_GRAPHS)
@@ -427,6 +442,18 @@ if (LLAMA_CUDA)
     if (LLAMA_CUDA_NO_PEER_COPY)
         add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
     endif()
+    if (LLAMA_CUDA_FA_ALL_QUANTS)
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+    else()
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+    endif()
 
     if (LLAMA_STATIC)
         if (WIN32)
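
Note: the FlashAttention kernels now live as template instances, one .cu file per K/V cache type combination under ggml-cuda/template-instances/, which lets them compile in parallel. By default only the common q4_0/q4_0, q8_0/q8_0, and f16/f16 combinations are compiled; LLAMA_CUDA_FA_ALL_QUANTS=ON pulls in every fattn-vec*.cu instance, at the cost of longer build times and a larger binary.
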
@@ -530,12 +557,17 @@ if (LLAMA_VULKAN)
 endif()
 
 if (LLAMA_HIPBLAS)
-    if ($ENV{ROCM_PATH})
-        set(ROCM_PATH $ENV{ROCM_PATH})
+    if (NOT EXISTS $ENV{ROCM_PATH})
+        if (NOT EXISTS /opt/rocm)
+            set(ROCM_PATH /usr)
+        else()
+            set(ROCM_PATH /opt/rocm)
+        endif()
     else()
-        set(ROCM_PATH /opt/rocm)
+        set(ROCM_PATH $ENV{ROCM_PATH})
     endif()
     list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
+    list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
 
     # CMake on Windows doesn't support the HIP language yet
     if(WIN32)
@@ -571,6 +603,8 @@ if (LLAMA_HIPBLAS)
 
     file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
     list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
+    file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
 
     add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
 
@@ -590,6 +624,19 @@ if (LLAMA_HIPBLAS)
         add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
     endif()
 
+    if (LLAMA_CUDA_FA_ALL_QUANTS)
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+    else()
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+        list(APPEND GGML_SOURCES_ROCM ${SRCS})
+    endif()
+
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
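
Note: ROCM_PATH is now resolved in order: the ROCM_PATH environment variable if it names an existing directory, else /opt/rocm if present, else /usr; ${ROCM_PATH}/lib64/cmake is also added to CMAKE_PREFIX_PATH so HIP packages installed under a lib64 layout are found. A versioned install can be selected like so (the path is illustrative):

    ROCM_PATH=/opt/rocm-6.0 cmake .. -DLLAMA_HIPBLAS=ON

The HIP path mirrors the CUDA FlashAttention selection above, since it compiles the same ggml-cuda sources through the HIP toolchain.
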
@@ -747,6 +794,7 @@ if (LLAMA_KOMPUTE)
         kompute-shaders/op_mul_mat_q4_0.comp
         kompute-shaders/op_mul_mat_q4_1.comp
         kompute-shaders/op_mul_mat_q6_k.comp
+        kompute-shaders/op_getrows_f32.comp
         kompute-shaders/op_getrows_f16.comp
         kompute-shaders/op_getrows_q4_0.comp
         kompute-shaders/op_getrows_q4_1.comp
@@ -779,6 +827,7 @@ if (LLAMA_KOMPUTE)
         shaderop_mul_mat_q4_0.h
         shaderop_mul_mat_q4_1.h
         shaderop_mul_mat_q6_k.h
+        shaderop_getrows_f32.h
         shaderop_getrows_f16.h
         shaderop_getrows_q4_0.h
         shaderop_getrows_q4_1.h
@@ -1341,6 +1390,13 @@ if (LLAMA_METAL)
     endif()
 endif()
 
+configure_file(cmake/llama.pc.in
+        "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
+        @ONLY)
+
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
+        DESTINATION lib/pkgconfig)
+
 #
 # programs, examples and tests
 #
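
Note: with llama.pc installed under lib/pkgconfig, a downstream C program can be built against an installed llama.cpp via pkg-config; a minimal sketch, assuming llama.pc exposes the usual Cflags/Libs fields and the install prefix is on PKG_CONFIG_PATH:

    cmake --build build && cmake --install build
    cc -o my_app my_app.c $(pkg-config --cflags --libs llama)
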

Makefile

Lines changed: 61 additions & 13 deletions

@@ -57,6 +57,8 @@ ifeq ($(UNAME_S),Darwin)
        LLAMA_METAL := 1
    endif
 
+   LLAMA_NO_OPENMP := 1
+
    ifneq ($(UNAME_P),arm)
        SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
        ifeq ($(SYSCTL_M),1)
@@ -67,6 +69,10 @@ ifeq ($(UNAME_S),Darwin)
    endif
 endif
 
+ifdef LLAMA_RPC
+   BUILD_TARGETS += rpc-server
+endif
+
 default: $(BUILD_TARGETS)
 
 test: $(TEST_TARGETS)
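
Note: with LLAMA_RPC=1, rpc-server joins BUILD_TARGETS, so a plain `make LLAMA_RPC=1` builds it alongside the default binaries; the target's rules appear further down in this diff.
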
@@ -135,12 +141,16 @@ MK_NVCCFLAGS = -std=c++11
 ifdef LLAMA_FAST
 MK_CFLAGS     += -Ofast
 HOST_CXXFLAGS += -Ofast
+ifndef LLAMA_DEBUG
 MK_NVCCFLAGS  += -O3
+endif # LLAMA_DEBUG
 else
 MK_CFLAGS     += -O3
 MK_CXXFLAGS   += -O3
+ifndef LLAMA_DEBUG
 MK_NVCCFLAGS  += -O3
-endif
+endif # LLAMA_DEBUG
+endif # LLAMA_FAST
 
 ifndef LLAMA_NO_CCACHE
 CCACHE := $(shell which ccache)
@@ -201,9 +211,10 @@ ifdef LLAMA_SCHED_MAX_COPIES
 endif
 
 ifdef LLAMA_DEBUG
-   MK_CFLAGS   += -O0 -g
-   MK_CXXFLAGS += -O0 -g
-   MK_LDFLAGS  += -g
+   MK_CFLAGS    += -O0 -g
+   MK_CXXFLAGS  += -O0 -g
+   MK_LDFLAGS   += -g
+   MK_NVCCFLAGS += -O0 -g
 
    ifeq ($(UNAME_S),Linux)
        MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
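
Note: LLAMA_DEBUG now also applies -O0 -g to nvcc, and the default -O3 for device code is suppressed under debug (the `ifndef LLAMA_DEBUG` guards above), so something like `make LLAMA_CUDA=1 LLAMA_DEBUG=1` yields debuggable GPU objects as well.
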
@@ -400,6 +411,12 @@ ifndef LLAMA_NO_ACCELERATE
    endif
 endif # LLAMA_NO_ACCELERATE
 
+ifndef LLAMA_NO_OPENMP
+   MK_CPPFLAGS += -DGGML_USE_OPENMP
+   MK_CFLAGS   += -fopenmp
+   MK_CXXFLAGS += -fopenmp
+endif # LLAMA_NO_OPENMP
+
 ifdef LLAMA_OPENBLAS
    MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
    MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
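
Note: in the Makefile build OpenMP is likewise on by default: unless LLAMA_NO_OPENMP is set (as the Darwin block above does), -fopenmp is added to the C and C++ flags and GGML_USE_OPENMP selects the OpenMP code path in ggml. To opt out explicitly:

    make LLAMA_NO_OPENMP=1
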
@@ -416,11 +433,25 @@ ifdef LLAMA_BLIS
    MK_LDFLAGS  += -lblis -L/usr/local/lib
 endif # LLAMA_BLIS
 
+ifdef LLAMA_RPC
+   MK_CPPFLAGS += -DGGML_USE_RPC
+   OBJS        += ggml-rpc.o
+endif # LLAMA_RPC
+
 ifdef LLAMA_CUBLAS
 # LLAMA_CUBLAS is deprecated and will be removed in the future
    LLAMA_CUDA := 1
 endif
 
+OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
+ifdef LLAMA_CUDA_FA_ALL_QUANTS
+   OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
+else
+   OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
+   OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
+   OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
+endif # LLAMA_CUDA_FA_ALL_QUANTS
+
 ifdef LLAMA_CUDA
    ifneq ('', '$(wildcard /opt/cuda)')
        CUDA_PATH ?= /opt/cuda
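
Note: OBJS_CUDA_TEMP_INST mirrors the CMake source selection, wmma instances always plus either every fattn-vec*.cu or just the three default combinations. The full set is opt-in here too, e.g. (illustrative):

    make LLAMA_CUDA=1 LLAMA_CUDA_FA_ALL_QUANTS=1
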
@@ -431,6 +462,7 @@ ifdef LLAMA_CUDA
    MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
    OBJS         += ggml-cuda.o
    OBJS         += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+   OBJS         += $(OBJS_CUDA_TEMP_INST)
    MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
    MK_NVCCFLAGS += -Werror all-warnings
@@ -493,7 +525,10 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 ifdef LLAMA_CUDA_CCBIN
    MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
-endif
+endif # LLAMA_CUDA_CCBIN
+ifdef LLAMA_CUDA_FA_ALL_QUANTS
+   MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
+endif # LLAMA_CUDA_FA_ALL_QUANTS
 
 ifdef JETSON_EOL_MODULE_DETECT
 define NVCC_COMPILE
@@ -505,7 +540,7 @@ define NVCC_COMPILE
 endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT
 
-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
    $(NVCC_COMPILE)
 
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
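
Note: dropping ggml-cuda/%.cuh from the pattern rule's prerequisites is presumably what lets the template-instance objects build: the generated template-instances/*.cu files have no matching per-file .cuh, so the old rule would never have matched them. Header changes are still tracked through the explicit ggml.h, ggml-common.h, and ggml-cuda/common.cuh prerequisites.
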
@@ -571,6 +606,7 @@ ifdef LLAMA_HIP_UMA
    MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
    MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+   MK_LDFLAGS  += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
    MK_LDFLAGS  += -lhipblas -lamdhip64 -lrocblas
    HIPFLAGS    += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
    HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
@@ -584,11 +620,12 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
    OBJS += ggml-cuda.o
    OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+   OBJS += $(OBJS_CUDA_TEMP_INST)
 
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
    $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 
-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
    $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 
 endif # LLAMA_HIPBLAS
@@ -626,11 +663,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 endif
 endif # LLAMA_METAL
 
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+
 ifndef LLAMA_NO_LLAMAFILE
 sgemm.o: sgemm.cpp sgemm.h ggml.h
    $(CXX) $(CXXFLAGS) -c $< -o $@
 endif
 
+ifdef LLAMA_RPC
+ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
+   $(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
+   $(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+   $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+endif # LLAMA_RPC
+
 GF_CC := $(CC)
 include scripts/get-flags.mk
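
Note: the OBJS/COMMON_H_DEPS/COMMON_DEPS definitions were hoisted up here from their old location (removed in the next hunk) because make expands prerequisite lists as it parses, so $(COMMON_DEPS) and $(OBJS) must already be populated when the rpc-server rule is read. A possible usage sketch (the flags are illustrative; check ./rpc-server --help):

    make LLAMA_RPC=1 rpc-server
    ./rpc-server --host 0.0.0.0 --port 50052
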

@@ -710,14 +762,9 @@ unicode.o: unicode.cpp unicode.h
 unicode-data.o: unicode-data.cpp unicode-data.h
    $(CXX) $(CXXFLAGS) -c $< -o $@
 
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
-
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
    $(CXX) $(CXXFLAGS) -c $< -o $@
 
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
-
 common.o: common/common.cpp $(COMMON_H_DEPS)
    $(CXX) $(CXXFLAGS) -c $< -o $@
 
@@ -748,6 +795,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
 clean:
    rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
    rm -vrf ggml-cuda/*.o
+   rm -vrf ggml-cuda/template-instances/*.o
    find examples pocs -type f -name "*.o" -delete
 
 #
@@ -816,7 +864,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
    $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
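
Note: the server target's prerequisite list now names the generated .hpp headers for the new web UI assets (theme stylesheets, index-new.html, system-prompts.js, prompt-formats.js), so the binary is rebuilt whenever those embedded assets are regenerated.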
