Commit 201b5a5

Merge commit 'e16b9fa4baa8a09c6619b116159830e898050942' into nomic-vulkan
2 parents 922115c + e16b9fa


48 files changed: +7738 -7293 lines

.github/ISSUE_TEMPLATE/bug.md

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 ---
 name: Bug template
 about: Used to report bugs in llama.cpp
-labels: ["bug"]
+labels: ["bug-unconfirmed"]
 assignees: ''

 ---

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@
 .DS_Store
 .build/
 .cache/
+.ccls-cache/
 .direnv/
 .envrc
 .swiftpm

CMakeLists.txt

Lines changed: 12 additions & 8 deletions
@@ -82,6 +82,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use CUDA" OFF)
 #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
+option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
@@ -94,7 +95,6 @@ option(LLAMA_METAL "llama: use Metal"
 option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
-option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
@@ -278,13 +278,8 @@ if (LLAMA_BLAS)
     endif()
 endif()

-if (LLAMA_K_QUANTS)
-    set(GGML_HEADERS_EXTRA k_quants.h)
-    set(GGML_SOURCES_EXTRA k_quants.c)
-    add_compile_definitions(GGML_USE_K_QUANTS)
-    if (LLAMA_QKK_64)
-        add_compile_definitions(GGML_QKK_64)
-    endif()
+if (LLAMA_QKK_64)
+    add_compile_definitions(GGML_QKK_64)
 endif()

 if (LLAMA_CUBLAS)
@@ -306,6 +301,9 @@ if (LLAMA_CUBLAS)
     if (LLAMA_CUDA_FORCE_DMMV)
         add_compile_definitions(GGML_CUDA_FORCE_DMMV)
     endif()
+    if (LLAMA_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+    endif()
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -332,6 +330,7 @@ if (LLAMA_CUBLAS)
         set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
     else()
         set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+        #set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
     endif()
 endif()
 message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@@ -405,6 +404,9 @@ if (LLAMA_HIPBLAS)
     if (LLAMA_CUDA_FORCE_DMMV)
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
     endif()
+    if (LLAMA_CUDA_FORCE_MMQ)
+        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
+    endif()
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@@ -804,6 +806,8 @@ add_library(ggml OBJECT
             ggml-alloc.h
             ggml-backend.c
             ggml-backend.h
+            ggml-quants.c
+            ggml-quants.h
             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}

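The new LLAMA_CUDA_FORCE_MMQ switch (like LLAMA_QKK_64) only injects a compile definition; nothing in this commit shows how the CUDA code consumes it. As a rough illustration only, a definition added via add_compile_definitions() becomes an ordinary preprocessor symbol that C++ code can test with #ifdef. The helper below is a hypothetical sketch of that pattern, not code from ggml-cuda.cu:

    // Hypothetical sketch: GGML_CUDA_FORCE_MMQ arrives as a preprocessor symbol.
    #include <cstdio>

    // Illustrative helper (not from the repository): pick the quantized
    // mat-mul (mmq) path, forcing it whenever the definition is present.
    static bool use_mmq_path(bool cublas_available) {
    #ifdef GGML_CUDA_FORCE_MMQ
        (void) cublas_available;
        return true;
    #else
        return !cublas_available;
    #endif
    }

    int main() {
        std::printf("mmq path: %s\n", use_mmq_path(true) ? "yes" : "no");
        return 0;
    }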
Makefile

Lines changed: 12 additions & 15 deletions
@@ -342,13 +342,9 @@ else
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif

-ifndef LLAMA_NO_K_QUANTS
-	MK_CPPFLAGS += -DGGML_USE_K_QUANTS
-	OBJS += k_quants.o
 ifdef LLAMA_QKK_64
 	MK_CPPFLAGS += -DGGML_QKK_64
 endif
-endif

 ifndef LLAMA_NO_ACCELERATE
 	# Mac OS - include Accelerate framework.
@@ -365,7 +361,7 @@ ifdef LLAMA_MPI
 	MK_CPPFLAGS += -DGGML_USE_MPI
 	MK_CFLAGS += -Wno-cast-qual
 	MK_CXXFLAGS += -Wno-cast-qual
-	OBJS += ggml-mpi.o
+	OBJS += ggml-mpi.o
 endif # LLAMA_MPI

 ifdef LLAMA_OPENBLAS
@@ -382,7 +378,7 @@ endif # LLAMA_BLIS
 ifdef LLAMA_CUBLAS
 	MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
-	OBJS += ggml-cuda.o
+	OBJS += ggml-cuda.o
 	NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
 ifdef LLAMA_CUDA_NVCC
 	NVCC = $(LLAMA_CUDA_NVCC)
@@ -397,6 +393,9 @@ endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
 	NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
+ifdef LLAMA_CUDA_FORCE_MMQ
+	NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # LLAMA_CUDA_FORCE_MMQ
 ifdef LLAMA_CUDA_DMMV_X
 	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
@@ -494,11 +493,6 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI

-ifndef LLAMA_NO_K_QUANTS
-k_quants.o: k_quants.c k_quants.h
-	$(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_NO_K_QUANTS
-
 # combine build flags with cmdline overrides
 override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
 override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
@@ -539,15 +533,18 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 	$(CC) $(CFLAGS) -c $< -o $@

-OBJS += ggml-alloc.o ggml-backend.o
+ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o

 llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
-COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o grammar-parser.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+COMMON_DEPS = common.o sampling.o grammar-parser.o

-common.o: common/common.cpp $(COMMON_H_DEPS)
+common.o: common/common.cpp build-info.h $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 sampling.o: common/sampling.cpp $(COMMON_H_DEPS)

Package.swift

Lines changed: 1 addition & 2 deletions
@@ -42,13 +42,12 @@ let package = Package(
                 "llama.cpp",
                 "ggml-alloc.c",
                 "ggml-backend.c",
-                "k_quants.c",
+                "ggml-quants.c",
             ] + additionalSources,
             resources: resources,
             publicHeadersPath: "spm-headers",
             cSettings: [
                 .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_K_QUANTS"),
                 .define("GGML_USE_ACCELERATE")
                 // NOTE: NEW_LAPACK will required iOS version 16.4+
                 // We should consider add this in the future when we drop support for iOS 14

build.zig

Lines changed: 8 additions & 13 deletions
@@ -116,15 +116,10 @@ pub fn build(b: *std.build.Builder) !void {
     var make = try Maker.init(b);
     make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;

-    if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
-        try make.addFlag("-DGGML_USE_K_QUANTS");
-        const k_quants = make.obj("k_quants", "k_quants.c");
-        try make.objs.append(k_quants);
-    }
-
     const ggml = make.obj("ggml", "ggml.c");
     const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
     const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
+    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
     const llama = make.obj("llama", "llama.cpp");
     const common = make.obj("common", "common/common.cpp");
     const console = make.obj("console", "common/console.cpp");
@@ -133,14 +128,14 @@ pub fn build(b: *std.build.Builder) !void {
     const train = make.obj("train", "common/train.cpp");
     const clip = make.obj("clip", "examples/llava/clip.cpp");

-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });

-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser, clip });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, grammar_parser, clip });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }

common/common.cpp

Lines changed: 37 additions & 17 deletions
@@ -103,9 +103,24 @@ void process_escapes(std::string& input) {
 }

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    bool result = true;
+    try {
+        if (!gpt_params_parse_ex(argc, argv, params)) {
+            gpt_print_usage(argc, argv, gpt_params());
+            exit(0);
+        }
+    }
+    catch (const std::invalid_argument & ex) {
+        fprintf(stderr, "%s\n", ex.what());
+        gpt_print_usage(argc, argv, gpt_params());
+        exit(1);
+    }
+    return result;
+}
+
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     bool invalid_param = false;
     std::string arg;
-    gpt_params default_params;
     const std::string arg_prefix = "--";
     llama_sampling_params & sparams = params.sparams;

@@ -218,12 +233,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             sparams.top_p = std::stof(argv[i]);
+        } else if (arg == "--min-p") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.min_p = std::stof(argv[i]);
         } else if (arg == "--temp") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.temp = std::stof(argv[i]);
+            sparams.temp = std::max(sparams.temp, 0.0f);
         } else if (arg == "--tfs") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -547,11 +569,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
         } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(argc, argv, default_params);
-#ifndef LOG_DISABLE_LOGS
-            log_print_usage();
-#endif // LOG_DISABLE_LOGS
-            exit(0);
+            return false;
+
         } else if (arg == "--random-prompt") {
             params.random_prompt = true;
         } else if (arg == "--in-prefix-bos") {
@@ -610,22 +629,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             // End of Parse args for logging parameters
 #endif // LOG_DISABLE_LOGS
         } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(argc, argv, default_params);
-            exit(1);
+            throw std::invalid_argument("error: unknown argument: " + arg);
         }
     }
     if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(argc, argv, default_params);
-        exit(1);
+        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
     }
     if (params.prompt_cache_all &&
             (params.interactive || params.interactive_first ||
             params.instruct)) {
-        fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
-        gpt_print_usage(argc, argv, default_params);
-        exit(1);
+
+        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }

     if (params.escape) {
@@ -644,6 +658,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     const llama_sampling_params & sparams = params.sparams;

+    printf("\n");
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
     printf("options:\n");
@@ -678,6 +693,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
     printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
     printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+    printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
     printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
     printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
     printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
@@ -743,7 +759,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif // GGML_USE_CUBLAS
 #endif
     printf(" --verbose-prompt print prompt before generation\n");
-    fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
+    printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
     printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
     printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
     printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
@@ -754,6 +770,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -ld LOGDIR, --logdir LOGDIR\n");
     printf(" path under which to save YAML logs (no logging if unset)\n");
     printf("\n");
+#ifndef LOG_DISABLE_LOGS
+    log_print_usage();
+#endif // LOG_DISABLE_LOGS
 }

 std::string get_system_info(const gpt_params & params) {
@@ -888,7 +907,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par

         std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
         llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
-        llama_kv_cache_tokens_rm(lctx, -1, -1);
+        llama_kv_cache_clear(lctx);
         llama_reset_timings(lctx);
     }

@@ -1274,6 +1293,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
     fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }

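Callers are unaffected by the split into gpt_params_parse() and gpt_params_parse_ex(): the public entry point keeps its signature, prints the error and usage itself when gpt_params_parse_ex() throws std::invalid_argument, and exits. A minimal caller sketch, assuming this repository's common/common.h is on the include path (printing min_p is purely illustrative):

    #include "common.h"   // gpt_params, gpt_params_parse
    #include <cstdio>

    int main(int argc, char ** argv) {
        gpt_params params;

        // Unknown arguments or bad values are reported inside the parser,
        // which prints usage and exits; -h/--help also prints usage and exits.
        if (!gpt_params_parse(argc, argv, params)) {
            return 1;
        }

        // The new --min-p value lands in the sampling parameters.
        std::printf("min_p = %f\n", (double) params.sparams.min_p);
        return 0;
    }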
common/common.h

Lines changed: 2 additions & 0 deletions
@@ -110,6 +110,8 @@ struct gpt_params {
     std::string image = ""; // path to an image file
 };

+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
+
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

 void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

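Because gpt_params_parse_ex() is now declared here, a program that prefers to handle argument errors itself (instead of letting gpt_params_parse() print usage and call exit()) can catch the std::invalid_argument it throws. A hedged sketch, relying only on the contract visible in the diff above:

    #include "common.h"    // gpt_params, gpt_params_parse_ex, gpt_print_usage
    #include <cstdio>
    #include <stdexcept>

    int main(int argc, char ** argv) {
        gpt_params params;
        try {
            // Returns false when -h/--help was given; throws std::invalid_argument
            // for unknown arguments or invalid parameter values.
            if (!gpt_params_parse_ex(argc, argv, params)) {
                gpt_print_usage(argc, argv, gpt_params());
                return 0;
            }
        } catch (const std::invalid_argument & ex) {
            std::fprintf(stderr, "%s\n", ex.what());
            return 1;
        }
        return 0;
    }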