
Commit 02025a7
Merge branch 'master' into pizza
2 parents: cb443e7 + 698f7b5

5 files changed: 18 additions, 20 deletions


Makefile

Lines changed: 2 additions & 0 deletions
@@ -166,6 +166,8 @@ perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
 embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
 
+libllama.so: llama.o ggml.o
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
 #
 # Tests
 #
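The new `libllama.so` target exposes the core model and inference code as a shared library that other programs can link against. A minimal sketch of such a client, assuming the `llama.h` C API as it stands at this revision (the file name `client.cpp` and the model path are illustrative):

```cpp
// Hypothetical client of libllama.so; build with something like:
//   g++ -I. client.cpp -L. -lllama -o client
#include "llama.h"

#include <cstdio>

int main() {
    llama_context_params params = llama_context_default_params();

    // Illustrative path; point this at a real ggml model file.
    llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (ctx == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // ... tokenize a prompt, run llama_eval(), sample tokens ...

    llama_free(ctx);
    return 0;
}
```

At runtime the dynamic loader also has to find the library, e.g. by pointing `LD_LIBRARY_PATH` at the build directory.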

README.md

Lines changed: 5 additions & 3 deletions
@@ -350,20 +350,22 @@ We have two Docker images available for this project:
 
 The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image.
 
+Replace `/path/to/models` below with the actual path where you downloaded the models.
+
 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
 ```
 
 On complete, you are ready to play!
 
 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 
 or with light image:
 
 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 
 ### Contributing
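For context, Docker's `-v` flag bind-mounts the host directory on the left of the colon to `/models` inside the container, so only that host-side path varies per machine; the `/models/...` paths in the commands stay the same everywhere.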

build.zig

Lines changed: 7 additions & 2 deletions
@@ -3,12 +3,14 @@ const std = @import("std");
 pub fn build(b: *std.Build) void {
     const target = b.standardTargetOptions(.{});
     const optimize = b.standardOptimizeOption(.{});
+    const want_lto = b.option(bool, "lto", "Want -fLTO");
 
     const lib = b.addStaticLibrary(.{
         .name = "llama",
         .target = target,
         .optimize = optimize,
     });
+    lib.want_lto = want_lto;
     lib.linkLibCpp();
     lib.addIncludePath(".");
     lib.addIncludePath("examples");
@@ -17,11 +19,11 @@ pub fn build(b: *std.Build) void {
     }, &.{"-std=c11"});
     lib.addCSourceFiles(&.{
         "llama.cpp",
-        "examples/common.cpp",
     }, &.{"-std=c++11"});
     lib.install();
 
-    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize };
+    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
+
     const exe = build_example("main", build_args);
     _ = build_example("quantize", build_args);
     _ = build_example("perplexity", build_args);
@@ -44,16 +46,19 @@ fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjS
     const lib = args.lib;
     const target = args.target;
     const optimize = args.optimize;
+    const want_lto = args.want_lto;
 
     const exe = b.addExecutable(.{
         .name = name,
         .target = target,
         .optimize = optimize,
     });
+    exe.want_lto = want_lto;
    exe.addIncludePath(".");
     exe.addIncludePath("examples");
     exe.addCSourceFiles(&.{
         std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
+        "examples/common.cpp",
     }, &.{"-std=c++11"});
     exe.linkLibrary(lib);
     exe.install();
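Threading `want_lto` through `build_args` means the single option declared by `b.option(bool, "lto", "Want -fLTO")` now reaches both the static library and every example executable; by Zig's build-option convention it should be settable on the command line as `zig build -Dlto=true`. The same hunks also move `examples/common.cpp` out of the library and into each example binary, so the library itself stays free of example-only code.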

examples/main/main.cpp

Lines changed: 1 addition & 1 deletion
@@ -432,7 +432,7 @@ int main(int argc, char ** argv) {
         }
 
         // end of text token
-        if (embd.back() == llama_token_eos()) {
+        if (!embd.empty() && embd.back() == llama_token_eos()) {
             if (params.instruct) {
                 is_interacting = true;
             } else {
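The extra `!embd.empty()` guard is the whole fix: calling `back()` on an empty `std::vector` is undefined behavior, so the old check could crash or read garbage on any iteration where the token buffer happened to be empty. Short-circuit evaluation ensures `back()` is only reached once the vector is known to be non-empty.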

llama.cpp

Lines changed: 3 additions & 14 deletions
@@ -1454,19 +1454,13 @@ static llama_vocab::id llama_sample_top_p_top_k(
         }
     }
 
-    if (top_k > 0 && top_k < n_logits) {
-        sample_top_k(logits_id, top_k);
-    }
-
-    float maxl = -std::numeric_limits<float>::infinity();
-    for (const auto & kv : logits_id) {
-        maxl = std::max(maxl, kv.first);
-    }
+    sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);
 
     // compute probs for the top k tokens
     std::vector<float> probs;
     probs.reserve(logits_id.size());
 
+    float maxl = logits_id[0].first;
     double sum = 0.0;
     for (const auto & kv : logits_id) {
         const float p = expf(kv.first - maxl);
@@ -1489,16 +1483,11 @@ static llama_vocab::id llama_sample_top_p_top_k(
                 break;
             }
         }
-
-        cumsum = 1.0/cumsum;
-        for (int i = 0; i < (int) probs.size(); i++) {
-            probs[i] *= cumsum;
-        }
     }
 
     //printf("\n");
     //for (int i = 0; i < (int) 10; i++) {
-    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+    //    printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
     //}
     //printf("\n\n");
     //exit(0);
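Two observations make the shorter version work. First, `sample_top_k` leaves the candidates sorted in descending logit order, so the maximum logit is simply `logits_id[0].first` and the old max-scan loop is redundant; folding the `top_k > 0 && top_k < n_logits` branch into a single call with `std::min` preserves that invariant in every case. Second, the renormalization after the top-p cutoff can be dropped if, as the surrounding code suggests, the resulting `probs` are fed to something like `std::discrete_distribution`, which normalizes its weights internally. A standalone sketch of the resulting top-k/softmax flow, with a hypothetical `sample_top_k` written to match the behavior the diff relies on:

```cpp
#include <algorithm>
#include <cmath>
#include <utility>
#include <vector>

typedef std::pair<float, int> logit_id; // (logit, token id)

// Keep the k highest-logit entries, sorted descending by logit.
static void sample_top_k(std::vector<logit_id> & logits_id, int top_k) {
    std::partial_sort(
            logits_id.begin(), logits_id.begin() + top_k, logits_id.end(),
            [](const logit_id & a, const logit_id & b) { return a.first > b.first; });
    logits_id.resize(top_k);
}

// Softmax over the surviving candidates; assumes logits_id is non-empty.
static std::vector<float> softmax_top_k(std::vector<logit_id> & logits_id, int top_k) {
    const int n_logits = (int) logits_id.size();

    // top_k <= 0 means "keep everything"; std::min guards against top_k > n_logits.
    sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);

    std::vector<float> probs;
    probs.reserve(logits_id.size());

    // After the sort the first entry holds the maximum logit; subtracting
    // it before expf() avoids overflow without a separate max-scan pass.
    const float maxl = logits_id[0].first;
    double sum = 0.0;
    for (const auto & kv : logits_id) {
        const float p = expf(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }
    for (auto & p : probs) {
        p = (float) (p / sum);
    }
    return probs;
}
```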
