Commit cfa0750

llama : support input embeddings directly (#1910)

* add interface for float input
* fixed inpL shape and type
* add examples of input floats
* add test example for embd input
* fixed sampling
* add free for context
* fixed add end condition for generating
* add examples for llava.py
* add README for llava.py
* add README for llava.py
* add example of PandaGPT
* refactor the interface and fixed the styles
* add cmake build for embd-input
* add cmake build for embd-input
* Add MiniGPT-4 example
* change the order of the args of llama_eval_internal
* fix ci error
1 parent 9d23589 commit cfa0750

16 files changed: +811 −22 lines

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,6 @@
 *.o
 *.a
+*.so
 .DS_Store
 .build/
 .cache/
@@ -39,8 +40,8 @@ models/*
 /vdot
 /server
 /Pipfile
+/embd-input-test
 /libllama.so
-
 build-info.h
 arm_neon.h
 compile_commands.json

Makefile

Lines changed: 9 additions & 2 deletions
@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple libembdinput.so embd-input-test

 ifdef LLAMA_BUILD_SERVER
 BUILD_TARGETS += server
@@ -272,7 +272,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch build-info.h
+	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch embd-input-test build-info.h

 #
 # Examples
@@ -305,6 +305,13 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)

+libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+
+
+embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
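
The two new targets build a shared library (`libembdinput.so`) plus a small C++ test driver that links against it with `-L. -lembdinput`. The Python example scripts added under `examples/embd-input/` presumably load the same shared library through `ctypes`; the sketch below is an editorial illustration, not part of the commit — only the exported function names and signatures are taken from `embd-input-lib.cpp` shown later in this diff, while the module layout and argument handling are assumptions.

```python
# embd_input_binding.py -- hypothetical helper, not part of the commit.
# Signatures mirror the extern "C" functions in examples/embd-input/embd-input-lib.cpp.
import ctypes

lib = ctypes.CDLL("./libembdinput.so")  # built by `make libembdinput.so`

# struct MyModel * create_mymodel(int argc, char ** argv);
lib.create_mymodel.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_char_p)]
lib.create_mymodel.restype = ctypes.c_void_p

# bool eval_string(struct MyModel *, const char *);
lib.eval_string.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
lib.eval_string.restype = ctypes.c_bool

# bool eval_float(void *, float *, int N);
lib.eval_float.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_float), ctypes.c_int]
lib.eval_float.restype = ctypes.c_bool

# const char * sampling(struct MyModel *);
lib.sampling.argtypes = [ctypes.c_void_p]
lib.sampling.restype = ctypes.c_char_p

# void free_mymodel(struct MyModel *);
lib.free_mymodel.argtypes = [ctypes.c_void_p]
lib.free_mymodel.restype = None

def create_model(args):
    # Arguments are forwarded to gpt_params_parse(), so args[0] is a dummy program name,
    # e.g. create_model(["", "-m", "ggml-model.bin"]).
    argv = (ctypes.c_char_p * len(args))(*[a.encode() for a in args])
    return lib.create_mymodel(len(args), argv)
```

Declaring `argtypes`/`restype` up front matters here: without `restype = c_void_p`, ctypes would treat the returned `MyModel *` handle as a C `int` and could truncate it on 64-bit systems.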

convert-lora-to-ggml.py

Lines changed: 5 additions & 1 deletion
@@ -113,14 +113,18 @@ def write_tensor_header(

 write_file_header(fout, params)
 for k, v in model.items():
+    if k.endswith(".default.weight"):
+        k = k.replace(".default.weight", ".weight")
+    if k in ["llama_proj.weight", "llama_proj.bias"]:
+        continue
     if k.endswith("lora_A.weight"):
         if v.dtype != torch.float16 and v.dtype != torch.float32:
             v = v.float()
         v = v.T
     else:
         v = v.float()

-    t = v.numpy()
+    t = v.detach().numpy()
     tname = translate_tensor_name(k)
     print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
     write_tensor_header(fout, tname, t.shape, t.dtype)
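
For context on the new checks: the PandaGPT checkpoint converted by this script apparently stores its LoRA tensors under PEFT's `.default` adapter suffix and also carries a non-LoRA `llama_proj` projection, which the converter must skip. A standalone toy illustration of that key handling (hypothetical tensor names, no real weights):

```python
# Toy illustration of the key normalization added above.
state_dict_keys = [
    "base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight",
    "base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight",
    "llama_proj.weight",
    "llama_proj.bias",
]

for k in state_dict_keys:
    if k.endswith(".default.weight"):
        k = k.replace(".default.weight", ".weight")   # strip PEFT's adapter name
    if k in ["llama_proj.weight", "llama_proj.bias"]:
        continue                                      # not a LoRA tensor; handled separately
    print(k)
# -> ...q_proj.lora_A.weight
# -> ...q_proj.lora_B.weight
```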

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ else()
     add_subdirectory(baby-llama)
     add_subdirectory(train-text-from-scratch)
     add_subdirectory(simple)
+    add_subdirectory(embd-input)
     if (LLAMA_METAL)
         add_subdirectory(metal)
     endif()

examples/embd-input/.gitignore

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+PandaGPT
+MiniGPT-4
+*.pth
+

examples/embd-input/CMakeLists.txt

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+set(TARGET embdinput)
+add_library(${TARGET} embd-input-lib.cpp embd-input.h)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
+
+set(TARGET embd-input-test)
+add_executable(${TARGET} embd-input-test.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()

examples/embd-input/README.md

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+### Examples for input embedding directly
+
+## Requirements
+Build `libembdinput.so`.
+Run the following command in the main dir (../../):
+```
+make
+```
+
+## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py)
+
+1. Obtain the LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
+2. Convert it to ggml format.
+3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin).
+
+```
+import torch
+
+bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
+pth_path = "./examples/embd_input/llava_projection.pth"
+
+dic = torch.load(bin_path)
+used_key = ["model.mm_projector.weight", "model.mm_projector.bias"]
+torch.save({k: dic[k] for k in used_key}, pth_path)
+```
+4. Check the paths of the LLaVA model and `llava_projection.pth` in `llava.py`.
+
+
+## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
+
+1. Obtain the PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
+The `adapter_config.json` is
+```
+{
+  "peft_type": "LORA",
+  "fan_in_fan_out": false,
+  "bias": null,
+  "modules_to_save": null,
+  "r": 32,
+  "lora_alpha": 32,
+  "lora_dropout": 0.1,
+  "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
+}
+```
+2. Prepare the `vicuna` v0 model.
+3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
+4. Clone the PandaGPT source.
+```
+git clone https://github.com/yxuansu/PandaGPT
+```
+5. Install the requirements of PandaGPT.
+6. Check the paths of the PandaGPT source, ImageBind model, lora model, and vicuna model in panda_gpt.py.
+
+## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
+
+1. Obtain the MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
+2. Clone the MiniGPT-4 source.
+```
+git clone https://github.com/Vision-CAIR/MiniGPT-4/
+```
+3. Install the requirements of MiniGPT-4.
+4. Prepare the `vicuna` v0 model.
+5. Check the paths of the MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.
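
All three example scripts follow the same pattern: an external encoder turns the image into features, a projection maps them into the LLaMA embedding space (`n_embd` floats per virtual token), and the result is fed through `eval_float` in between ordinary `eval_string` calls. The sketch below is an editorial illustration reusing the hypothetical ctypes binding from the Makefile section above; the model path, prompt format, and embedding shape are assumptions.

```python
# Sketch only: pretend `embd` is the output of a vision encoder, already projected
# (e.g. with the llava_projection.pth weights) to shape (n_virtual_tokens, n_embd).
import ctypes
import numpy as np

n_virtual_tokens, n_embd = 256, 5120   # 5120 = n_embd of a 13B model (assumption)
embd = np.random.rand(n_virtual_tokens, n_embd).astype(np.float32)  # placeholder data

model = create_model(["", "-m", "./models/ggml-vicuna-13b-q4_0.bin", "-c", "2048"])

lib.eval_string(model, b"### Human: ")                    # text before the image
lib.eval_float(model,                                     # inject the image embeddings
               embd.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
               n_virtual_tokens)
lib.eval_string(model, b" Describe the image.\n### Assistant:")
```

The important invariant is that `eval_float` receives `n_virtual_tokens * n_embd` contiguous floats, matching how the library indexes `input + i*n_emb` in `embd-input-lib.cpp` below.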

examples/embd-input/embd-input-lib.cpp

Lines changed: 220 additions & 0 deletions
@@ -0,0 +1,220 @@
+// Defines sigaction on msys:
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include "embd-input.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+static llama_context ** g_ctx;
+
+extern "C" {
+
+struct MyModel* create_mymodel(int argc, char ** argv) {
+    gpt_params params;
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return nullptr;
+    }
+
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+
+    if (params.seed < 0) {
+        params.seed = time(NULL);
+    }
+    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+
+    llama_init_backend(params.numa);
+
+    llama_model * model;
+    llama_context * ctx;
+
+    g_ctx = &ctx;
+
+    // load the model and apply lora adapter, if any
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        return nullptr;
+    }
+
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+    struct MyModel * ret = new MyModel();
+    ret->ctx = ctx;
+    ret->params = params;
+    ret->n_past = 0;
+    // printf("ctx: %d\n", ret->ctx);
+    return ret;
+}
+
+void free_mymodel(struct MyModel * mymodel) {
+    llama_context * ctx = mymodel->ctx;
+    llama_print_timings(ctx);
+    llama_free(ctx);
+    delete mymodel;
+}
+
+
+bool eval_float(void * model, float * input, int N){
+    MyModel * mymodel = (MyModel*)model;
+    llama_context * ctx = mymodel->ctx;
+    gpt_params params = mymodel->params;
+    int n_emb = llama_n_embd(ctx);
+    int n_past = mymodel->n_past;
+    int n_batch = N; // params.n_batch;
+
+    for (int i = 0; i < (int) N; i += n_batch) {
+        int n_eval = (int) N - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        if (llama_eval_embd(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return false;
+        }
+        n_past += n_eval;
+    }
+    mymodel->n_past = n_past;
+    return true;
+}
+
+bool eval_tokens(void * model, std::vector<llama_token> tokens) {
+    MyModel * mymodel = (MyModel* )model;
+    llama_context * ctx;
+    ctx = mymodel->ctx;
+    gpt_params params = mymodel->params;
+    int n_past = mymodel->n_past;
+    for (int i = 0; i < (int) tokens.size(); i += params.n_batch) {
+        int n_eval = (int) tokens.size() - i;
+        if (n_eval > params.n_batch) {
+            n_eval = params.n_batch;
+        }
+        if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return false;
+        }
+        n_past += n_eval;
+    }
+    mymodel->n_past = n_past;
+    return true;
+}
+
+bool eval_id(struct MyModel* mymodel, int id) {
+    std::vector<llama_token> tokens;
+    tokens.push_back(id);
+    return eval_tokens(mymodel, tokens);
+}
+
+bool eval_string(struct MyModel * mymodel,const char* str){
+    llama_context * ctx = mymodel->ctx;
+    std::string str2 = str;
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, str2, true);
+    eval_tokens(mymodel, embd_inp);
+    return true;
+}
+
+llama_token sampling_id(struct MyModel* mymodel) {
+    llama_context* ctx = mymodel->ctx;
+    gpt_params params = mymodel->params;
+    // int n_ctx = llama_n_ctx(ctx);
+
+    // out of user input, sample next token
+    const float temp = params.temp;
+    const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+    const float top_p = params.top_p;
+    const float tfs_z = params.tfs_z;
+    const float typical_p = params.typical_p;
+    // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
+    // const float repeat_penalty = params.repeat_penalty;
+    // const float alpha_presence = params.presence_penalty;
+    // const float alpha_frequency = params.frequency_penalty;
+    const int mirostat = params.mirostat;
+    const float mirostat_tau = params.mirostat_tau;
+    const float mirostat_eta = params.mirostat_eta;
+    // const bool penalize_nl = params.penalize_nl;
+
+    llama_token id = 0;
+    {
+        auto logits = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);
+
+        // Apply params.logit_bias map
+        for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+            logits[it->first] += it->second;
+        }
+
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+        // TODO: Apply penalties
+        // float nl_logit = logits[llama_token_nl()];
+        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
+        // llama_sample_repetition_penalty(ctx, &candidates_p,
+        //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+        //     last_n_repeat, repeat_penalty);
+        // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
+        //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+        //     last_n_repeat, alpha_frequency, alpha_presence);
+        // if (!penalize_nl) {
+        //     logits[llama_token_nl()] = nl_logit;
+        // }
+
+        if (temp <= 0) {
+            // Greedy sampling
+            id = llama_sample_token_greedy(ctx, &candidates_p);
+        } else {
+            if (mirostat == 1) {
+                static float mirostat_mu = 2.0f * mirostat_tau;
+                const int mirostat_m = 100;
+                llama_sample_temperature(ctx, &candidates_p, temp);
+                id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+            } else if (mirostat == 2) {
+                static float mirostat_mu = 2.0f * mirostat_tau;
+                llama_sample_temperature(ctx, &candidates_p, temp);
+                id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+            } else {
+                // Temperature sampling
+                llama_sample_top_k(ctx, &candidates_p, top_k, 1);
+                llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
+                llama_sample_typical(ctx, &candidates_p, typical_p, 1);
+                llama_sample_top_p(ctx, &candidates_p, top_p, 1);
+                llama_sample_temperature(ctx, &candidates_p, temp);
+                id = llama_sample_token(ctx, &candidates_p);
+            }
+        }
+    }
+
+    return id;
+}
+
+const char * sampling(struct MyModel * mymodel) {
+    llama_context * ctx = mymodel->ctx;
+    int id = sampling_id(mymodel);
+    std::string ret;
+    if (id == llama_token_eos()) ret = "</s>";
+    else ret = llama_token_to_str(ctx, id);
+    eval_id(mymodel, id);
+    return ret.c_str();
+}
+
+}
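
Putting the exported functions together, a caller alternates `sampling()` — which samples one token, converts it to text, and feeds it back through `eval_id()` — until the end-of-sequence marker `</s>` is returned. Below is a minimal generation-loop sketch through the same hypothetical ctypes binding introduced after the Makefile section (the prompt, model path, and token cap are assumptions; `embd-input-test.cpp` presumably does the equivalent in C++):

```python
# Generation loop sketch (hypothetical driver, not part of the commit).
model = create_model(["", "-m", "./models/ggml-vicuna-13b-q4_0.bin"])

lib.eval_string(model, b"Hello, my name is")
for _ in range(128):                      # cap the number of generated tokens
    piece = lib.sampling(model)           # samples one token and evals it internally
    if piece == b"</s>":                  # end-of-sequence marker returned by sampling()
        break
    print(piece.decode("utf-8", errors="ignore"), end="", flush=True)

lib.free_mymodel(model)
```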
