
Commit a129a31

Merge branch 'ggerganov:master' into master

2 parents: 400dcce + 2d8b76a

37 files changed: +7216 additions, -3101 deletions

.gitignore

Lines changed: 4 additions & 0 deletions

@@ -1,6 +1,7 @@
 *.o
 *.a
 *.so
+*.bin
 .DS_Store
 .build/
 .cache/
@@ -39,13 +40,15 @@ models-mnt
 /perplexity
 /embedding
 /train-text-from-scratch
+/convert-llama2c-to-ggml
 /simple
 /benchmark-matmult
 /vdot
 /server
 /Pipfile
 /embd-input-test
 /libllama.so
+/llama-bench
 build-info.h
 arm_neon.h
 compile_commands.json
@@ -68,6 +71,7 @@ poetry.lock
 poetry.toml
 
 # Test binaries
+tests/test-grammar-parser
 tests/test-double-float
 tests/test-grad0
 tests/test-opt

CMakeLists.txt

Lines changed: 10 additions & 4 deletions

@@ -69,7 +69,6 @@ option(LLAMA_BLAS "llama: use BLAS"
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use CUDA" OFF)
 #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
-set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
@@ -256,7 +255,6 @@ if (LLAMA_CUBLAS)
 #    if (LLAMA_CUDA_CUBLAS)
 #        add_compile_definitions(GGML_CUDA_CUBLAS)
 #    endif()
-    add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
     if (LLAMA_CUDA_FORCE_DMMV)
         add_compile_definitions(GGML_CUDA_FORCE_DMMV)
     endif()
@@ -298,7 +296,6 @@ if (LLAMA_METAL)
     find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
     find_library(METAL_FRAMEWORK Metal REQUIRED)
     find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
 
     set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
 
@@ -315,7 +312,6 @@ if (LLAMA_METAL)
         ${FOUNDATION_LIBRARY}
         ${METAL_FRAMEWORK}
         ${METALKIT_FRAMEWORK}
-        ${METALPERFORMANCE_FRAMEWORK}
         )
 endif()
 
@@ -573,6 +569,16 @@ install(
         WORLD_READ
         WORLD_EXECUTE
     DESTINATION ${CMAKE_INSTALL_BINDIR})
+if (LLAMA_METAL)
+    install(
+        FILES ggml-metal.metal
+        PERMISSIONS
+            OWNER_READ
+            OWNER_WRITE
+            GROUP_READ
+            WORLD_READ
+        DESTINATION ${CMAKE_INSTALL_BINDIR})
+endif()
 
 #
 # programs, examples and tests
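With this change, Metal builds also install the `ggml-metal.metal` shader alongside the binaries. A minimal sketch of the corresponding configure/build/install flow, assuming a macOS host and an arbitrary example prefix:

```bash
# Configure with the LLAMA_METAL option shown in the diff above.
cmake -B build -DLLAMA_METAL=ON
cmake --build build --config Release
# Installs the executables and, because of the new install() rule,
# ggml-metal.metal into the same bin directory.
cmake --install build --prefix /tmp/llama.cpp
```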

Makefile

Lines changed: 16 additions & 4 deletions

@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test llama-bench
 
 # Binaries only useful for tests
-TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
+TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
 
 default: $(BUILD_TARGETS)
 
@@ -283,7 +283,7 @@ endif # LLAMA_CLBLAST
 ifdef LLAMA_METAL
 	CFLAGS   += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
 	CXXFLAGS += -DGGML_USE_METAL
-	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
+	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
 	OBJS     += ggml-metal.o
 endif # LLAMA_METAL
 
@@ -345,7 +345,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS)
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test llama-bench build-info.h $(TEST_TARGETS)
 
 #
 # Examples
@@ -388,6 +388,12 @@ embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-te
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
@@ -409,6 +415,12 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
+tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-grammar-parser: tests/test-grammar-parser.cpp examples/grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
 tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
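Because the new binaries were added to `BUILD_TARGETS`, a bare `make` now builds them too, and the new test rules hang off `TEST_TARGETS`. A short usage sketch of the rules above:

```bash
# Build all default targets, including convert-llama2c-to-ggml and llama-bench.
make

# Or build only the new targets / the new grammar test binaries.
make convert-llama2c-to-ggml llama-bench
make tests/test-llama-grammar tests/test-grammar-parser
```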

README.md

Lines changed: 16 additions & 9 deletions

@@ -9,13 +9,13 @@
 
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
-**Hot topics:**
+### 🚧 Incoming breaking change + refactoring:
 
-- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998
-- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
-- New roadmap: https://github.com/users/ggerganov/projects/7
-- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
-- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
+See PR https://github.com/ggerganov/llama.cpp/pull/2398 for more info.
+
+To devs: avoid making big changes to `llama.h` / `llama.cpp` until merged
+
+----
 
 <details>
 <summary>Table of Contents</summary>
@@ -96,8 +96,10 @@ as the main playground for developing new features for the [ggml](https://github
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
+- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
+- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 
 **UI:**
 
@@ -238,12 +240,17 @@ In order to build llama.cpp you have three different options.
     cmake --build . --config Release
     ```
 
-- Using `Zig`:
+- Using `Zig` (version 0.11 or later):
+
+    Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
+    it's also possible to cross compile for other operating systems and architectures:
 
     ```bash
-    zig build -Doptimize=ReleaseFast
+    zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
    ```
+
+    The `zig targets` command will give you valid options to use.
+
 - Using `gmake` (FreeBSD):
 
     1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
@@ -408,7 +415,7 @@ Building the program with BLAS support may lead to some performance improvements
 |-------------------------|------------------------|---------|-------------|
 | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
 | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
 | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
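For context, the `LLAMA_CUDA_*` entries in the table above are build options set at configure time. A hedged example using the CMake form (the specific values are illustrative, not recommendations from this commit):

```bash
# Enable cuBLAS and tune two of the documented options.
cmake -B build -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_MMV_Y=2 -DLLAMA_CUDA_F16=ON
cmake --build build --config Release
```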

build.zig

Lines changed: 54 additions & 20 deletions

@@ -1,5 +1,6 @@
 // Compatible with Zig Version 0.11.0
 const std = @import("std");
+const ArrayList = std.ArrayList;
 const Compile = std.Build.Step.Compile;
 const ConfigHeader = std.Build.Step.ConfigHeader;
 const Mode = std.builtin.Mode;
@@ -10,11 +11,31 @@ const Maker = struct {
     target: CrossTarget,
     optimize: Mode,
     config_header: *ConfigHeader,
+    enable_lto: bool,
 
-    const cflags = .{"-std=c11"};
-    const cxxflags = .{"-std=c++11"};
+    include_dirs: ArrayList([]const u8),
+    cflags: ArrayList([]const u8),
+    cxxflags: ArrayList([]const u8),
+    objs: ArrayList(*Compile),
 
-    fn init(builder: *std.build.Builder) Maker {
+    fn addInclude(m: *Maker, dir: []const u8) !void {
+        try m.include_dirs.append(dir);
+    }
+    fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
+        try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
+    }
+    fn addCFlag(m: *Maker, flag: []const u8) !void {
+        try m.cflags.append(flag);
+    }
+    fn addCxxFlag(m: *Maker, flag: []const u8) !void {
+        try m.cxxflags.append(flag);
+    }
+    fn addFlag(m: *Maker, flag: []const u8) !void {
+        try m.addCFlag(flag);
+        try m.addCxxFlag(flag);
+    }
+
+    fn init(builder: *std.build.Builder) !Maker {
         const commit_hash = @embedFile(".git/refs/heads/master");
         const config_header = builder.addConfigHeader(
             .{ .style = .blank, .include_path = "build-info.h" },
@@ -23,58 +44,71 @@ const Maker = struct {
                 .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
             },
         );
-        return Maker{
+        var m = Maker{
            .builder = builder,
            .target = builder.standardTargetOptions(.{}),
            .optimize = builder.standardOptimizeOption(.{}),
            .config_header = config_header,
+           .enable_lto = false,
+           .include_dirs = ArrayList([]const u8).init(builder.allocator),
+           .cflags = ArrayList([]const u8).init(builder.allocator),
+           .cxxflags = ArrayList([]const u8).init(builder.allocator),
+           .objs = ArrayList(*Compile).init(builder.allocator),
        };
+        try m.addCFlag("-std=c11");
+        try m.addCxxFlag("-std=c++11");
+        try m.addProjectInclude(&.{});
+        try m.addProjectInclude(&.{"examples"});
+        return m;
     }
 
     fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
         const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
         if (std.mem.endsWith(u8, src, ".c")) {
-            o.addCSourceFiles(&.{src}, &cflags);
+            o.addCSourceFiles(&.{src}, m.cflags.items);
             o.linkLibC();
         } else {
-            o.addCSourceFiles(&.{src}, &cxxflags);
+            o.addCSourceFiles(&.{src}, m.cxxflags.items);
             o.linkLibCpp();
         }
-        o.addIncludePath(.{ .path = "." });
-        o.addIncludePath(.{ .path = "./examples" });
+        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
+        o.want_lto = m.enable_lto;
         return o;
     }
 
     fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
         const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
-        e.addIncludePath(.{ .path = "." });
-        e.addIncludePath(.{ .path = "./examples" });
-        e.addCSourceFiles(&.{src}, &cxxflags);
+        e.addCSourceFiles(&.{src}, m.cxxflags.items);
         for (deps) |d| e.addObject(d);
+        for (m.objs.items) |o| e.addObject(o);
+        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
         e.linkLibC();
         e.linkLibCpp();
         e.addConfigHeader(m.config_header);
         m.builder.installArtifact(e);
-
-        // Currently a bug is preventing correct linking for optimized builds for Windows:
-        // https://github.com/ziglang/zig/issues/15958
-        if (e.target.isWindows()) {
-            e.want_lto = false;
-        }
+        e.want_lto = m.enable_lto;
         return e;
     }
 };
 
-pub fn build(b: *std.build.Builder) void {
-    const make = Maker.init(b);
+pub fn build(b: *std.build.Builder) !void {
+    var make = try Maker.init(b);
+    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
+
+    if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
+        try make.addFlag("-DGGML_USE_K_QUANTS");
+        const k_quants = make.obj("k_quants", "k_quants.c");
+        try make.objs.append(k_quants);
+    }
 
     const ggml = make.obj("ggml", "ggml.c");
     const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
     const llama = make.obj("llama", "llama.cpp");
     const common = make.obj("common", "examples/common.cpp");
+    const console = make.obj("common", "examples/console.cpp");
     const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");
 
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
     _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
     _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
     _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });

examples/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -42,8 +42,10 @@ else()
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
     add_subdirectory(train-text-from-scratch)
+    add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(simple)
     add_subdirectory(embd-input)
+    add_subdirectory(llama-bench)
     if (LLAMA_METAL)
         add_subdirectory(metal)
     endif()

examples/common.cpp

Lines changed: 18 additions & 1 deletion

@@ -283,6 +283,21 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.cfg_negative_prompt = argv[i];
+        } else if (arg == "--cfg-negative-prompt-file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
+            if (params.cfg_negative_prompt.back() == '\n') {
+                params.cfg_negative_prompt.pop_back();
+            }
         } else if (arg == "--cfg-scale") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -578,8 +593,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
     fprintf(stdout, "  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
     fprintf(stdout, "  --grammar-file FNAME  file to read grammar from\n");
-    fprintf(stdout, "  --cfg-negative-prompt PROMPT \n");
+    fprintf(stdout, "  --cfg-negative-prompt PROMPT\n");
     fprintf(stdout, "                        negative prompt to use for guidance. (default: empty)\n");
+    fprintf(stdout, "  --cfg-negative-prompt-file FNAME\n");
+    fprintf(stdout, "                        negative prompt file to use for guidance. (default: empty)\n");
     fprintf(stdout, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
     fprintf(stdout, "  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
     fprintf(stdout, "  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);

examples/convert-llama2c-to-ggml/CMakeLists.txt

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+set(TARGET convert-llama2c-to-ggml)
+add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
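With the subdirectory registered in examples/CMakeLists.txt (see above), the new executable builds like any other CMake target; a minimal sketch:

```bash
cmake -B build
cmake --build build --target convert-llama2c-to-ggml
```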
