
Commit afa6119

Merge branch 'master' into compilade/fix-mpt-pretok

2 parents: 98edea6 + dd07a12


48 files changed: +3,072 −238 lines

.github/labeler.yml

Lines changed: 3 additions & 1 deletion

```diff
@@ -16,7 +16,9 @@ SYCL:
       - any-glob-to-any-file:
           - ggml/include/ggml-sycl.h
           - ggml/src/ggml-sycl.cpp
-          - README-sycl.md
+          - ggml/src/ggml-sycl/**
+          - docs/backend/SYCL.md
+          - examples/sycl/**
 Nvidia GPU:
   - changed-files:
       - any-glob-to-any-file:
```

CMakeLists.txt

Lines changed: 4 additions & 5 deletions

```diff
@@ -50,9 +50,6 @@ endif()
 # option list
 #
 
-# general
-option(LLAMA_CCACHE "llama: use ccache if available" ON)
-
 # debug
 option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
 option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
@@ -77,7 +74,6 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 
 # override ggml options
-set(GGML_CCACHE ${LLAMA_CCACHE})
 set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
 set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
 set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
@@ -115,7 +111,10 @@ llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
 # build the library
 #
 
-add_subdirectory(ggml)
+if (NOT TARGET ggml)
+    add_subdirectory(ggml)
+    # ... otherwise assume ggml is added by a parent CMakeLists.txt
+endif()
 add_subdirectory(src)
 
 #
```

Makefile

Lines changed: 88 additions & 12 deletions

```diff
@@ -64,10 +64,14 @@ TEST_TARGETS = \
 	tests/test-tokenizer-1-spm
 
 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
-LEGACY_TARGETS = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
 	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm
 
+# Legacy build targets that were renamed in #7809, but for which we want to build binaries that output a deprecation warning if people try to use them.
+# We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
+LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune
+
 # Deprecation aliases
 ifdef LLAMA_CUBLAS
 $(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
@@ -193,7 +197,7 @@ ifdef GGML_RPC
 	BUILD_TARGETS += rpc-server
 endif
 
-default: $(BUILD_TARGETS)
+default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)
 
 test: $(TEST_TARGETS)
 	@failures=0; \
@@ -228,7 +232,7 @@ test: $(TEST_TARGETS)
 	fi
 	@echo 'All tests passed.'
 
-all: $(BUILD_TARGETS) $(TEST_TARGETS)
+all: $(BUILD_TARGETS) $(TEST_TARGETS) $(LEGACY_TARGETS_BUILD)
 
 ifdef RISCV_CROSS_COMPILE
 CC := riscv64-unknown-linux-gnu-gcc
@@ -245,17 +249,22 @@ MK_CFLAGS = -std=c11 -fPIC
 MK_CXXFLAGS = -std=c++11 -fPIC
 MK_NVCCFLAGS = -std=c++11
 
-ifndef LLAMA_NO_CCACHE
+ifdef LLAMA_NO_CCACHE
+GGML_NO_CCACHE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifndef GGML_NO_CCACHE
 CCACHE := $(shell which ccache)
 ifdef CCACHE
 export CCACHE_SLOPPINESS = time_macros
-$(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.)
+$(info I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.)
 CC := $(CCACHE) $(CC)
 CXX := $(CCACHE) $(CXX)
 else
 $(info I ccache not found. Consider installing it for faster compilation.)
 endif # CCACHE
-endif # LLAMA_NO_CCACHE
+endif # GGML_NO_CCACHE
 
 # clock_gettime came in POSIX.1b (1993)
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
@@ -545,7 +554,7 @@ endif # GGML_BLIS
 
 ifndef GGML_NO_LLAMAFILE
 	MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
-	OBJ_GGML += ggml/src/sgemm.o
+	OBJ_GGML += ggml/src/llamafile/sgemm.o
 endif
 
 ifdef GGML_RPC
@@ -826,7 +835,8 @@ OBJ_GGML += \
 	ggml/src/ggml.o \
 	ggml/src/ggml-alloc.o \
 	ggml/src/ggml-backend.o \
-	ggml/src/ggml-quants.o
+	ggml/src/ggml-quants.o \
+	ggml/src/ggml-aarch64.o
 
 OBJ_LLAMA = \
 	src/llama.o \
@@ -926,6 +936,7 @@ $(info - LLAMA_NO_LLAMAFILE)
 $(info - LLAMA_NO_ACCELERATE)
 $(info - LLAMA_NO_OPENMP)
 $(info - LLAMA_NO_METAL)
+$(info - LLAMA_NO_CCACHE)
 $(info )
 endif
 
@@ -959,15 +970,22 @@ ggml/src/ggml-quants.o: \
 	ggml/src/ggml-common.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
+ggml/src/ggml-aarch64.o: \
+	ggml/src/ggml-aarch64.c \
+	ggml/include/ggml.h \
+	ggml/src/ggml-aarch64.h \
+	ggml/src/ggml-common.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
 ggml/src/ggml-blas.o: \
 	ggml/src/ggml-blas.cpp \
 	ggml/include/ggml-blas.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 ifndef GGML_NO_LLAMAFILE
-ggml/src/sgemm.o: \
-	ggml/src/sgemm.cpp \
-	ggml/src/sgemm.h \
+ggml/src/llamafile/sgemm.o: \
+	ggml/src/llamafile/sgemm.cpp \
+	ggml/src/llamafile/sgemm.h \
 	ggml/include/ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # GGML_NO_LLAMAFILE
@@ -1092,7 +1110,7 @@ clean:
 	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
 	rm -rvf $(BUILD_TARGETS)
 	rm -rvf $(TEST_TARGETS)
-	rm -rvf $(LEGACY_TARGETS)
+	rm -rvf $(LEGACY_TARGETS_CLEAN)
 	find examples pocs -type f -name "*.o" -delete
 
 #
@@ -1488,3 +1506,61 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+#
+# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
+#
+# Mark legacy binary targets as .PHONY so that they are always checked.
+.PHONY: main quantize perplexity embedding server finetune
+
+# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
+# Eventually we will want to remove these targets from building all the time.
+main: examples/deprecation-warning/deprecation-warning.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
+
+server: examples/deprecation-warning/deprecation-warning.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
+
+quantize: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard quantize))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
+	@echo "         Remove the 'quantize' binary to remove this warning."
+	@echo "#########"
+endif
+
+perplexity: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard perplexity))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
+	@echo "         Remove the 'perplexity' binary to remove this warning."
+	@echo "#########"
+endif
+
+embedding: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard embedding))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
+	@echo "         Remove the 'embedding' binary to remove this warning."
+	@echo "#########"
+endif
+
+finetune: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard finetune))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead."
+	@echo "         Remove the 'finetune' binary to remove this warning."
+	@echo "#########"
+endif
```
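
All of the deprecation stubs above compile the same `examples/deprecation-warning/deprecation-warning.cpp`, whose contents are not shown in this commit. A minimal sketch of how such a stub might work, assuming it derives the old binary name from `argv[0]` and maps it to the new `llama-` prefixed name (hypothetical code, not the file from the repository):

```cpp
// Hypothetical deprecation stub; the real
// examples/deprecation-warning/deprecation-warning.cpp may differ.
#include <cstdio>
#include <cstdlib>
#include <string>

int main(int argc, char ** argv) {
    // Strip the directory part of argv[0], e.g. "./main" -> "main".
    std::string name = (argc > 0) ? argv[0] : "main";
    const size_t slash = name.find_last_of("/\\");
    if (slash != std::string::npos) {
        name = name.substr(slash + 1);
    }

    // Renamed binaries got a "llama-" prefix, except main, which became llama-cli.
    const std::string replacement = (name == "main") ? "llama-cli" : "llama-" + name;

    fprintf(stderr, "WARNING: The binary '%s' is deprecated.\n", name.c_str());
    fprintf(stderr, "         Please use '%s' instead.\n", replacement.c_str());
    return EXIT_FAILURE;
}
```

Keeping a single source file and letting the Makefile emit differently named copies keeps the migration path cheap: each stub only exists to print the redirect and exit with a failure status.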

Package.swift

Lines changed: 1 addition & 0 deletions

```diff
@@ -10,6 +10,7 @@ var sources = [
     "ggml/src/ggml-alloc.c",
     "ggml/src/ggml-backend.c",
     "ggml/src/ggml-quants.c",
+    "ggml/src/ggml-aarch64.c",
 ]
 
 var resources: [Resource] = []
```

README.md

Lines changed: 3 additions & 2 deletions

```diff
@@ -96,8 +96,9 @@ Typically finetunes of the base models below are supported as well.
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
+- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
 
-(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
+(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
 
 **Multimodal models:**
 
@@ -452,7 +453,7 @@ To learn more how to measure perplexity using llama.cpp, read this documentation
 - [How to build](./docs/build.md)
 - [Running on Docker](./docs/docker.md)
 - [Build on Android](./docs/android.md)
-- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
+- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
 
 **Seminal papers and background on the models**
```

common/common.cpp

Lines changed: 4 additions & 0 deletions

```diff
@@ -1,3 +1,7 @@
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
 #include "common.h"
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
```
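
The new `#if defined(_MSC_VER)` block sits above every include because `_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING` only takes effect if it is defined before MSVC's `<codecvt>` header is first pulled in, which here presumably happens transitively through one of the headers below it. A self-contained sketch of the same pattern (illustrative only):

```cpp
// The macro must be defined before the first (possibly transitive)
// include of <codecvt>, or MSVC still flags the deprecated facilities.
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif

#include <codecvt>
#include <locale>
#include <string>

int main() {
    // std::wstring_convert and std::codecvt_utf8 are deprecated in C++17;
    // the macro above silences the warning instead of disabling it globally.
    std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
    const std::string utf8 = conv.to_bytes(L"llama");
    return utf8 == "llama" ? 0 : 1;
}
```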

common/log.h

Lines changed: 1 addition & 1 deletion

```diff
@@ -630,7 +630,7 @@ inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
     buf << "[ ";
 
     bool first = true;
-    for (const auto &token : tokens)
+    for (const auto & token : tokens)
     {
         if (!first) {
             buf << ", ";
```

common/sampling.cpp

Lines changed: 6 additions & 5 deletions

```diff
@@ -282,8 +282,6 @@ static llama_token llama_sampling_sample_impl(
         GGML_ASSERT(!original_logits.empty());
     }
     llama_token id = 0;
-    // Get a pointer to the logits
-    float * logits = llama_get_logits_ith(ctx_main, idx);
 
     if (temp < 0.0) {
         // greedy sampling, with probs
@@ -324,6 +322,9 @@ static llama_token llama_sampling_sample_impl(
     }
 
     if (ctx_sampling->grammar != NULL && !is_resampling) {
+        // Get a pointer to the logits
+        float * logits = llama_get_logits_ith(ctx_main, idx);
+
         // Create an array with a single token data element for the sampled id
         llama_token_data single_token_data = {id, logits[id], 0.0f};
         llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
@@ -377,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
     if (ctx_sampling->grammar != NULL && !apply_grammar) {
         GGML_ASSERT(original_logits != NULL);
         // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
-        *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
+        *original_logits = {logits, logits + n_vocab};
     }
 
     // apply params.logit_bias map
@@ -390,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(
         llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
     }
 
-    cur.clear();
+    cur.resize(n_vocab);
 
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
     }
 
     llama_token_data_array cur_p = { cur.data(), cur.size(), false };
```
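
The last hunk replaces `cur.clear()` plus a per-token `emplace_back` with a single `cur.resize(n_vocab)` followed by indexed assignment. Both fill `cur` with the same `n_vocab` entries, but the resize-then-assign form does the size bookkeeping once instead of on every push. A standalone sketch of the two patterns, using a stand-in struct rather than the llama.cpp types:

```cpp
#include <cstdint>
#include <vector>

// Stand-in for llama_token_data: id, raw logit, probability.
struct token_data {
    int32_t id;
    float   logit;
    float   p;
};

int main() {
    const int32_t n_vocab = 32000;
    std::vector<float> logits(n_vocab, 0.0f);
    std::vector<token_data> cur;

    // Before: every emplace_back updates the size and checks capacity.
    cur.clear();
    for (int32_t id = 0; id < n_vocab; id++) {
        cur.emplace_back(token_data{id, logits[id], 0.0f});
    }

    // After: the final size is known up front, so resize once and assign by index.
    cur.resize(n_vocab);
    for (int32_t id = 0; id < n_vocab; id++) {
        cur[id] = token_data{id, logits[id], 0.0f};
    }
    return 0;
}
```

Since `cur` is reused across sampling calls and `std::vector` keeps its capacity after `clear()`, the practical win is avoiding the per-element size/capacity update in a hot loop rather than avoiding reallocations.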
