
Commit a129a31

Merge branch 'ggerganov:master' into master

2 parents: 400dcce + 2d8b76a

37 files changed: +7216 additions, -3101 deletions

.gitignore

Lines changed: 4 additions & 0 deletions

@@ -1,6 +1,7 @@
 *.o
 *.a
 *.so
+*.bin
 .DS_Store
 .build/
 .cache/
@@ -39,13 +40,15 @@ models-mnt
 /perplexity
 /embedding
 /train-text-from-scratch
+/convert-llama2c-to-ggml
 /simple
 /benchmark-matmult
 /vdot
 /server
 /Pipfile
 /embd-input-test
 /libllama.so
+/llama-bench
 build-info.h
 arm_neon.h
 compile_commands.json
@@ -68,6 +71,7 @@ poetry.lock
 poetry.toml
 
 # Test binaries
+tests/test-grammar-parser
 tests/test-double-float
 tests/test-grad0
 tests/test-opt

CMakeLists.txt

Lines changed: 10 additions & 4 deletions

@@ -69,7 +69,6 @@ option(LLAMA_BLAS "llama: use BLAS"
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use CUDA" OFF)
 #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
-set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
@@ -256,7 +255,6 @@ if (LLAMA_CUBLAS)
 #    if (LLAMA_CUDA_CUBLAS)
 #        add_compile_definitions(GGML_CUDA_CUBLAS)
 #    endif()
-    add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
     if (LLAMA_CUDA_FORCE_DMMV)
         add_compile_definitions(GGML_CUDA_FORCE_DMMV)
     endif()
@@ -298,7 +296,6 @@ if (LLAMA_METAL)
     find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
     find_library(METAL_FRAMEWORK Metal REQUIRED)
     find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
 
     set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
 
@@ -315,7 +312,6 @@ if (LLAMA_METAL)
         ${FOUNDATION_LIBRARY}
         ${METAL_FRAMEWORK}
         ${METALKIT_FRAMEWORK}
-        ${METALPERFORMANCE_FRAMEWORK}
         )
 endif()
 
@@ -573,6 +569,16 @@ install(
         WORLD_READ
         WORLD_EXECUTE
     DESTINATION ${CMAKE_INSTALL_BINDIR})
+if (LLAMA_METAL)
+    install(
+        FILES ggml-metal.metal
+        PERMISSIONS
+            OWNER_READ
+            OWNER_WRITE
+            GROUP_READ
+            WORLD_READ
+        DESTINATION ${CMAKE_INSTALL_BINDIR})
+endif()
 
 #
 # programs, examples and tests
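With this change, Metal builds also install the `ggml-metal.metal` shader alongside the binaries. A minimal sketch of the corresponding configure/build/install flow, assuming a macOS host and an arbitrary example prefix:

```bash
# Configure with the LLAMA_METAL option shown in the diff above.
cmake -B build -DLLAMA_METAL=ON
cmake --build build --config Release
# Installs the executables and, because of the new install() rule,
# ggml-metal.metal into the same bin directory.
cmake --install build --prefix /tmp/llama.cpp
```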

Makefile

Lines changed: 16 additions & 4 deletions

@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test llama-bench
 
 # Binaries only useful for tests
-TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
+TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
 
 default: $(BUILD_TARGETS)
 
@@ -283,7 +283,7 @@ endif # LLAMA_CLBLAST
 ifdef LLAMA_METAL
 	CFLAGS   += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
 	CXXFLAGS += -DGGML_USE_METAL
-	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
+	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
 	OBJS     += ggml-metal.o
 endif # LLAMA_METAL
 
@@ -345,7 +345,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS)
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test llama-bench build-info.h $(TEST_TARGETS)
 
 #
 # Examples
@@ -388,6 +388,12 @@ embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-te
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
@@ -409,6 +415,12 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
+tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-grammar-parser: tests/test-grammar-parser.cpp examples/grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
 tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
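Because the new binaries were added to `BUILD_TARGETS`, a bare `make` now builds them too, and the new test rules hang off `TEST_TARGETS`. A short usage sketch of the rules above:

```bash
# Build all default targets, including convert-llama2c-to-ggml and llama-bench.
make

# Or build only the new targets / the new grammar test binaries.
make convert-llama2c-to-ggml llama-bench
make tests/test-llama-grammar tests/test-grammar-parser
```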

README.md

Lines changed: 16 additions & 9 deletions

@@ -9,13 +9,13 @@
 
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
-**Hot topics:**
+### 🚧 Incoming breaking change + refactoring:
 
-- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998
-- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
-- New roadmap: https://github.com/users/ggerganov/projects/7
-- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
-- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
+See PR https://github.com/ggerganov/llama.cpp/pull/2398 for more info.
+
+To devs: avoid making big changes to `llama.h` / `llama.cpp` until merged
+
+----
 
 <details>
 <summary>Table of Contents</summary>
@@ -96,8 +96,10 @@ as the main playground for developing new features for the [ggml](https://github
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
+- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
+- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 
 **UI:**
 
@@ -238,12 +240,17 @@ In order to build llama.cpp you have three different options.
     cmake --build . --config Release
     ```
 
-- Using `Zig`:
+- Using `Zig` (version 0.11 or later):
+
+    Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
+    it's also possible to cross compile for other operating systems and architectures:
 
     ```bash
-    zig build -Doptimize=ReleaseFast
+    zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
    ```
+
+    The `zig targets` command will give you valid options to use.
+
 - Using `gmake` (FreeBSD):
 
     1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
@@ -408,7 +415,7 @@ Building the program with BLAS support may lead to some performance improvements
 |-------------------------|------------------------|---------|-------------|
 | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
 | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
 | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
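For context, the `LLAMA_CUDA_*` entries in the table above are build options set at configure time. A hedged example using the CMake form (the specific values are illustrative, not recommendations from this commit):

```bash
# Enable cuBLAS and tune two of the documented options.
cmake -B build -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_MMV_Y=2 -DLLAMA_CUDA_F16=ON
cmake --build build --config Release
```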

build.zig

Lines changed: 54 additions & 20 deletions

@@ -1,5 +1,6 @@
 // Compatible with Zig Version 0.11.0
 const std = @import("std");
+const ArrayList = std.ArrayList;
 const Compile = std.Build.Step.Compile;
 const ConfigHeader = std.Build.Step.ConfigHeader;
 const Mode = std.builtin.Mode;
@@ -10,11 +11,31 @@ const Maker = struct {
     target: CrossTarget,
     optimize: Mode,
     config_header: *ConfigHeader,
+    enable_lto: bool,
 
-    const cflags = .{"-std=c11"};
-    const cxxflags = .{"-std=c++11"};
+    include_dirs: ArrayList([]const u8),
+    cflags: ArrayList([]const u8),
+    cxxflags: ArrayList([]const u8),
+    objs: ArrayList(*Compile),
 
-    fn init(builder: *std.build.Builder) Maker {
+    fn addInclude(m: *Maker, dir: []const u8) !void {
+        try m.include_dirs.append(dir);
+    }
+    fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
+        try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
+    }
+    fn addCFlag(m: *Maker, flag: []const u8) !void {
+        try m.cflags.append(flag);
+    }
+    fn addCxxFlag(m: *Maker, flag: []const u8) !void {
+        try m.cxxflags.append(flag);
+    }
+    fn addFlag(m: *Maker, flag: []const u8) !void {
+        try m.addCFlag(flag);
+        try m.addCxxFlag(flag);
+    }
+
+    fn init(builder: *std.build.Builder) !Maker {
         const commit_hash = @embedFile(".git/refs/heads/master");
         const config_header = builder.addConfigHeader(
             .{ .style = .blank, .include_path = "build-info.h" },
@@ -23,58 +44,71 @@ const Maker = struct {
                 .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
             },
         );
-        return Maker{
+        var m = Maker{
            .builder = builder,
            .target = builder.standardTargetOptions(.{}),
            .optimize = builder.standardOptimizeOption(.{}),
            .config_header = config_header,
+           .enable_lto = false,
+           .include_dirs = ArrayList([]const u8).init(builder.allocator),
+           .cflags = ArrayList([]const u8).init(builder.allocator),
+           .cxxflags = ArrayList([]const u8).init(builder.allocator),
+           .objs = ArrayList(*Compile).init(builder.allocator),
        };
+        try m.addCFlag("-std=c11");
+        try m.addCxxFlag("-std=c++11");
+        try m.addProjectInclude(&.{});
+        try m.addProjectInclude(&.{"examples"});
+        return m;
     }
 
     fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
         const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
         if (std.mem.endsWith(u8, src, ".c")) {
-            o.addCSourceFiles(&.{src}, &cflags);
+            o.addCSourceFiles(&.{src}, m.cflags.items);
             o.linkLibC();
         } else {
-            o.addCSourceFiles(&.{src}, &cxxflags);
+            o.addCSourceFiles(&.{src}, m.cxxflags.items);
             o.linkLibCpp();
         }
-        o.addIncludePath(.{ .path = "." });
-        o.addIncludePath(.{ .path = "./examples" });
+        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
+        o.want_lto = m.enable_lto;
         return o;
     }
 
     fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
         const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
-        e.addIncludePath(.{ .path = "." });
-        e.addIncludePath(.{ .path = "./examples" });
-        e.addCSourceFiles(&.{src}, &cxxflags);
+        e.addCSourceFiles(&.{src}, m.cxxflags.items);
         for (deps) |d| e.addObject(d);
+        for (m.objs.items) |o| e.addObject(o);
+        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
         e.linkLibC();
         e.linkLibCpp();
         e.addConfigHeader(m.config_header);
         m.builder.installArtifact(e);
-
-        // Currently a bug is preventing correct linking for optimized builds for Windows:
-        // https://github.com/ziglang/zig/issues/15958
-        if (e.target.isWindows()) {
-            e.want_lto = false;
-        }
+        e.want_lto = m.enable_lto;
         return e;
     }
 };
 
-pub fn build(b: *std.build.Builder) void {
-    const make = Maker.init(b);
+pub fn build(b: *std.build.Builder) !void {
+    var make = try Maker.init(b);
+    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
+
+    if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
+        try make.addFlag("-DGGML_USE_K_QUANTS");
+        const k_quants = make.obj("k_quants", "k_quants.c");
+        try make.objs.append(k_quants);
+    }
 
     const ggml = make.obj("ggml", "ggml.c");
     const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
     const llama = make.obj("llama", "llama.cpp");
     const common = make.obj("common", "examples/common.cpp");
+    const console = make.obj("common", "examples/console.cpp");
     const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");
 
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
     _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
     _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
     _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });

examples/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -42,8 +42,10 @@ else()
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
     add_subdirectory(train-text-from-scratch)
+    add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(simple)
     add_subdirectory(embd-input)
+    add_subdirectory(llama-bench)
     if (LLAMA_METAL)
         add_subdirectory(metal)
     endif()

examples/common.cpp

Lines changed: 18 additions & 1 deletion

@@ -283,6 +283,21 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.cfg_negative_prompt = argv[i];
+        } else if (arg == "--cfg-negative-prompt-file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
+            if (params.cfg_negative_prompt.back() == '\n') {
+                params.cfg_negative_prompt.pop_back();
+            }
         } else if (arg == "--cfg-scale") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -578,8 +593,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
     fprintf(stdout, "  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
     fprintf(stdout, "  --grammar-file FNAME  file to read grammar from\n");
-    fprintf(stdout, "  --cfg-negative-prompt PROMPT \n");
+    fprintf(stdout, "  --cfg-negative-prompt PROMPT\n");
     fprintf(stdout, "                        negative prompt to use for guidance. (default: empty)\n");
+    fprintf(stdout, "  --cfg-negative-prompt-file FNAME\n");
+    fprintf(stdout, "                        negative prompt file to use for guidance. (default: empty)\n");
     fprintf(stdout, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
     fprintf(stdout, "  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
     fprintf(stdout, "  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);

examples/convert-llama2c-to-ggml/CMakeLists.txt

Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+set(TARGET convert-llama2c-to-ggml)
+add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
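With the subdirectory registered in examples/CMakeLists.txt (see above), the new executable builds like any other CMake target; a minimal sketch:

```bash
cmake -B build
cmake --build build --target convert-llama2c-to-ggml
```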
