Commit ed6b909

Merge branch 'master' into dry-sampler
2 parents d1676a1 + c21a896 commit ed6b909

98 files changed: +2452 additions, -1052 deletions


.devops/llama-server.Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION AS build
 
 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev curl
+    apt-get install -y build-essential git libcurl4-openssl-dev
 
 WORKDIR /app
 
@@ -16,7 +16,7 @@ RUN make -j$(nproc) llama-server
 FROM ubuntu:$UBUNTU_VERSION AS runtime
 
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
 
 COPY --from=build /app/llama-server /llama-server
 

.devops/nix/package.nix

Lines changed: 3 additions & 10 deletions
@@ -126,16 +126,9 @@ let
       ++ optionals useMetalKit [ MetalKit ];
 
     cudaBuildInputs = with cudaPackages; [
-      cuda_cccl.dev # <nv/target>
-
-      # A temporary hack for reducing the closure size, remove once cudaPackages
-      # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
-      cuda_cudart.dev
-      cuda_cudart.lib
-      cuda_cudart.static
-      libcublas.dev
-      libcublas.lib
-      libcublas.static
+      cuda_cudart
+      cuda_cccl # <nv/target>
+      libcublas
     ];
 
     rocmBuildInputs = with rocmPackages; [

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -139,7 +139,8 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
 # determining _precisely_ which defines are necessary for the llama-config
 # package.
 #
-get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS)
+get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
+get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
 get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
 set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)

Makefile

Lines changed: 14 additions & 15 deletions
@@ -1605,42 +1605,41 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 # Mark legacy binary targets as .PHONY so that they are always checked.
 .PHONY: main quantize perplexity embedding server
 
+# Define the object file target
+examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
+    $(CXX) $(CXXFLAGS) -c $< -o $@
+
 # NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
 # Eventually we will want to remove these target from building all the time.
-main: examples/deprecation-warning/deprecation-warning.cpp
-    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-    $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+main: examples/deprecation-warning/deprecation-warning.o
+    $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
     @echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
 
-server: examples/deprecation-warning/deprecation-warning.cpp
-    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+server: examples/deprecation-warning/deprecation-warning.o
+    $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
     @echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
 
-quantize: examples/deprecation-warning/deprecation-warning.cpp
+quantize: examples/deprecation-warning/deprecation-warning.o
 ifneq (,$(wildcard quantize))
-    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+    $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
     @echo "#########"
     @echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
    @echo " Remove the 'quantize' binary to remove this warning."
     @echo "#########"
 endif
 
-perplexity: examples/deprecation-warning/deprecation-warning.cpp
+perplexity: examples/deprecation-warning/deprecation-warning.o
 ifneq (,$(wildcard perplexity))
-    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+    $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
     @echo "#########"
     @echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
     @echo " Remove the 'perplexity' binary to remove this warning."
     @echo "#########"
 endif
 
-embedding: examples/deprecation-warning/deprecation-warning.cpp
+embedding: examples/deprecation-warning/deprecation-warning.o
 ifneq (,$(wildcard embedding))
-    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+    $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
     @echo "#########"
     @echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
     @echo " Remove the 'embedding' binary to remove this warning."

README.md

Lines changed: 9 additions & 0 deletions
@@ -95,8 +95,16 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
+- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
+- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
+- [x] [Smaug](https://huggingface.co/models?search=Smaug)
+- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B)
+- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
+- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
+- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
 - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
+- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
 
 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
 
@@ -145,6 +153,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
 - [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
+- [ramalama](https://github.com/containers/ramalama) (MIT)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)

common/common.cpp

Lines changed: 11 additions & 9 deletions
@@ -1659,7 +1659,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
     options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
     options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
-    options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
+    options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
     options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
     options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
     options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
@@ -2064,8 +2064,8 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+    llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);
 
     llama_model * model = nullptr;
@@ -2080,7 +2080,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }
 
     auto cparams = llama_context_params_from_gpt_params(params);
@@ -2089,7 +2089,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (lctx == NULL) {
         fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }
 
     if (!params.control_vectors.empty()) {
@@ -2100,7 +2100,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
 
         int err = llama_control_vector_apply(lctx,
@@ -2112,7 +2112,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (err) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
     }
 
@@ -2124,7 +2124,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
         llama_lora_adapter_set(lctx, adapter, lora_scale);
     }
@@ -2160,7 +2160,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         llama_reset_timings(lctx);
     }
 
-    return std::make_tuple(model, lctx);
+    iparams.model = model;
+    iparams.context = lctx;
+    return iparams;
 }
 
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {

common/common.h

Lines changed: 6 additions & 2 deletions
@@ -308,8 +308,12 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
-// TODO: avoid tuplue, use struct
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+struct llama_init_result {
+    struct llama_model * model = nullptr;
+    struct llama_context * context = nullptr;
+};
+
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
 
 struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
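
For reference, a minimal sketch (not part of this commit) of how a caller consumes the new struct-returning API in place of the old std::tie-on-a-tuple pattern; it assumes `common.h`/`llama.h` are available and that `gpt_params` has been populated, mirroring the updated examples further down:

```cpp
// Hypothetical caller (the function name `run` is illustrative), following the pattern
// the updated examples in this commit use.
#include "common.h"
#include "llama.h"

int run(gpt_params & params) {
    llama_init_result llama_init = llama_init_from_gpt_params(params);

    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;

    if (model == nullptr || ctx == nullptr) {
        // on failure both members stay nullptr, mirroring the old {nullptr, nullptr} tuple
        return 1;
    }

    // ... run inference with model/ctx ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```

The struct keeps the old failure contract (both pointers null on error) while allowing future fields to be added without touching every call site.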

convert_hf_to_gguf.py

Lines changed: 1 addition & 1 deletion
@@ -316,7 +316,7 @@ def prepare_tensors(self):
         if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
             if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                 data = gguf.quantize_bf16(data)
-                assert data.dtype == np.int16
+                assert data.dtype == np.uint16
                 data_qtype = gguf.GGMLQuantizationType.BF16
 
             elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
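
The corrected assert reflects how bf16 data is represented: bfloat16 keeps the top 16 bits of an IEEE-754 float and is stored as raw unsigned 16-bit words, so there is no signed-integer interpretation. A standalone C++ sketch of that conversion (illustrative only; the round-to-nearest-even step is an assumption about typical converters, not a quote of `gguf.quantize_bf16`):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Convert an fp32 value to its bf16 bit pattern: round, then keep the high 16 bits.
static uint16_t fp32_to_bf16(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    bits += 0x7fff + ((bits >> 16) & 1);   // round to nearest, ties to even
    return static_cast<uint16_t>(bits >> 16);
}

int main() {
    std::printf("bf16(1.0f)  = 0x%04x\n", fp32_to_bf16(1.0f));   // 0x3f80
    std::printf("bf16(-2.5f) = 0x%04x\n", fp32_to_bf16(-2.5f));  // 0xc020
    return 0;
}
```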

docs/build.md

Lines changed: 5 additions & 1 deletion
@@ -178,7 +178,11 @@ For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](ht
   cmake --build build --config Release
   ```
 
-The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
+The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
+
+The following compilation options are also available to tweak performance:
 
 | Option | Legal values | Default | Description |
 |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|

examples/baby-llama/baby-llama.cpp

Lines changed: 0 additions & 1 deletion
@@ -1,7 +1,6 @@
 #include "ggml.h"
 #include "train.h"
 
-#include <vector>
 #include <cassert>
 #include <cstdlib>
 #include <cstring>

examples/batched-bench/batched-bench.cpp

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ int main(int argc, char ** argv) {
     llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
 
     // ensure enough sequences are available
-    ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
+    ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
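
The added guard matters because `std::max_element` on an empty range returns the end iterator, and dereferencing it is undefined behavior; a small standalone sketch of the pattern (illustrative only, not from the commit):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> n_pl;  // e.g. no -npl values were supplied on the command line

    // Without the empty() check, *std::max_element(...) would dereference end() here.
    int n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());

    std::printf("n_seq_max = %d\n", n_seq_max);
    return 0;
}
```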

examples/cvector-generator/cvector-generator.cpp

Lines changed: 4 additions & 3 deletions
@@ -414,9 +414,10 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
 
     // load the model to get hparams
-    llama_model * model;
-    llama_context * ctx;
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
 
     // int n_ctx = llama_n_ctx(ctx);
     int n_layers = llama_n_layer(model);

examples/embedding/embedding.cpp

Lines changed: 4 additions & 4 deletions
@@ -79,11 +79,11 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;

examples/eval-callback/eval-callback.cpp

Lines changed: 4 additions & 3 deletions
@@ -163,9 +163,10 @@ int main(int argc, char ** argv) {
     params.warmup = false;
 
     // init
-    llama_model * model;
-    llama_context * ctx;
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == nullptr || ctx == nullptr) {
         fprintf(stderr, "%s : failed to init\n", __func__);
         return 1;

examples/imatrix/imatrix.cpp

Lines changed: 3 additions & 3 deletions
@@ -611,10 +611,10 @@ int main(int argc, char ** argv) {
     params.warmup = false;
 
     // init
-    llama_model * model;
-    llama_context * ctx;
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
 
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == nullptr || ctx == nullptr) {
         fprintf(stderr, "%s : failed to init\n", __func__);
         return 1;

examples/infill/infill.cpp

Lines changed: 4 additions & 1 deletion
@@ -179,7 +179,10 @@ int main(int argc, char ** argv) {
 
     // load the model and apply lora adapter, if any
     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    model = llama_init.model;
+    ctx = llama_init.context;
 
     if (model == NULL) {
         LOG_TEE("%s: error: unable to load model\n", __func__);

examples/lookahead/lookahead.cpp

Lines changed: 4 additions & 4 deletions
@@ -58,11 +58,11 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the target model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
 
     // Tokenize the prompt
     std::vector<llama_token> inp;

examples/lookup/lookup-create.cpp

Lines changed: 4 additions & 4 deletions
@@ -22,11 +22,11 @@ int main(int argc, char ** argv){
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     GGML_ASSERT(model != nullptr);
 
     // tokenize the prompt

examples/lookup/lookup-stats.cpp

Lines changed: 4 additions & 4 deletions
@@ -26,11 +26,11 @@ int main(int argc, char ** argv){
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
 
     // tokenize the prompt
     std::vector<llama_token> inp;

examples/lookup/lookup.cpp

Lines changed: 4 additions & 4 deletions
@@ -34,11 +34,11 @@ int main(int argc, char ** argv){
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
 
     // tokenize the prompt
     std::vector<llama_token> inp;
