
Commit b016f4c

Merge branch 'ggerganov:master' into master
2 parents: c0ad8bb + ebef1e8

File tree

9 files changed: +3026 −2091 lines


bindings/javascript/whisper.js

Lines changed: 1 addition & 1 deletion
Generated file; diff not rendered.

examples/talk-llama/llama.cpp

Lines changed: 907 additions & 843 deletions
Large diffs are not rendered by default.

examples/talk-llama/llama.h

Lines changed: 29 additions & 9 deletions
@@ -6,7 +6,7 @@
 #include <stdbool.h>
 
 #ifdef LLAMA_SHARED
-#    ifdef _WIN32
+#    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef LLAMA_BUILD
 #            define LLAMA_API __declspec(dllexport)
 #        else
@@ -20,7 +20,7 @@
 #endif
 
 #define LLAMA_FILE_VERSION 1
-#define LLAMA_FILE_MAGIC 0x67676d66 // 'ggmf' in hex
+#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
 #define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
 
 #ifdef __cplusplus
@@ -45,7 +45,7 @@ extern "C" {
 
     } llama_token_data;
 
-    typedef void (*llama_progress_callback)(double progress, void *ctx);
+    typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
         int n_ctx; // text context
@@ -55,6 +55,7 @@ extern "C" {
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
+        bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
 
@@ -66,6 +67,9 @@ extern "C" {
 
     LLAMA_API struct llama_context_params llama_context_default_params();
 
+    LLAMA_API bool llama_mmap_supported();
+    LLAMA_API bool llama_mlock_supported();
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
@@ -81,8 +85,24 @@ extern "C" {
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            int itype,
-            int qk);
+            int itype);
+
+    // Returns the KV cache that will contain the context for the
+    // ongoing prediction with the model.
+    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
+
+    // Returns the size of the KV cache
+    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+
+    // Sets the KV cache containing the current context for the model
+    LLAMA_API void llama_set_kv_cache(
+            struct llama_context * ctx,
+            const uint8_t * kv_cache,
+            size_t n_size,
+            int n_token_count);
 
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
@@ -135,9 +155,9 @@ extern "C" {
             const llama_token * last_n_tokens_data,
             int last_n_tokens_size,
             int top_k,
-            double top_p,
-            double temp,
-            double repeat_penalty);
+            float top_p,
+            float temp,
+            float repeat_penalty);
 
     // Performance information
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
@@ -150,4 +170,4 @@ extern "C" {
 }
 #endif
 
-#endif
+#endif // LLAMA_H
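Taken together, the new declarations let a caller toggle memory mapping and save or restore the model's context. Below is a minimal sketch of how they might be combined; the loader (llama_init_from_file) and the llama_eval() calls that would fill the cache are assumed from the rest of the header rather than shown in this hunk.

    // Sketch: configure the new mmap-related options and snapshot/restore the KV cache.
    // Assumes `ctx` was created with a loader such as llama_init_from_file() and has
    // already processed some tokens via llama_eval(); neither call appears in this diff.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    #include "llama.h"

    // Fill the context parameters, enabling the new use_mmap flag only when supported.
    static struct llama_context_params make_params() {
        struct llama_context_params params = llama_context_default_params();
        params.use_mmap  = llama_mmap_supported();
        params.use_mlock = false;
        return params;
    }

    // A saved copy of the KV cache plus the number of tokens it represents.
    struct kv_snapshot {
        std::vector<uint8_t> data;
        int                  n_tokens = 0;
    };

    // Copy the current KV cache out of the context.
    static kv_snapshot save_kv(struct llama_context * ctx) {
        kv_snapshot snap;
        const uint8_t * kv   = llama_get_kv_cache(ctx);
        const size_t    size = llama_get_kv_cache_size(ctx);
        snap.data.assign(kv, kv + size);
        snap.n_tokens = llama_get_kv_cache_token_count(ctx);
        return snap;
    }

    // Write a previously saved KV cache back into the context.
    static void restore_kv(struct llama_context * ctx, const kv_snapshot & snap) {
        llama_set_kv_cache(ctx, snap.data.data(), snap.data.size(), snap.n_tokens);
    }

A caller could take a snapshot after the prompt has been evaluated and restore it later to roll the context back without re-processing those tokens.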

examples/talk-llama/llama_internal.h

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+// Internal header to be included by llama.cpp and tests/benchmarks only.
+
+#ifndef LLAMA_INTERNAL_H
+#define LLAMA_INTERNAL_H
+
+#include <vector>
+#include <string>
+struct ggml_tensor;
+
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
+#endif // LLAMA_INTERNAL_H
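The tensor-map accessor is intended for llama.cpp's own tests and benchmarks; one hypothetical use is enumerating a loaded model's tensors by name. A small sketch, assuming the context has been loaded elsewhere:

    // Sketch: list every tensor registered for a loaded model, as a test or
    // benchmark might do through the internal header. Loading the model is assumed.
    #include <cstdio>

    #include "llama.h"
    #include "llama_internal.h"

    static void dump_tensor_names(struct llama_context * ctx) {
        // Each entry pairs a tensor name with a pointer to its ggml_tensor.
        for (const auto & entry : llama_internal_get_tensor_map(ctx)) {
            std::printf("%s\n", entry.first.c_str());
        }
    }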
