Commit 0c19ae7
simple : minor style changes
Parent: 5c5a95b

4 files changed: +91 -201 lines changed


convert-llama-h5-to-gguf.py

Lines changed: 5 additions & 5 deletions
@@ -2,17 +2,18 @@
 
 import gguf
 import gguf_namemap as tmap
+
 import os
 import sys
 import struct
 import json
 import numpy as np
+import torch
+
 from typing import Any, List
 from pathlib import Path
-import torch
 from sentencepiece import SentencePieceProcessor
 
-
 #NDArray = np.ndarray[Any, Any]
 # compatible with python < 3.9
 NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
@@ -225,7 +226,7 @@ def count_model_parts(dir_model: str) -> int:
             sys.exit()
 
         n_dims = len(data.shape)
-        data_dtype = data.dtype
+        data_dtype = data.dtype
 
         # if f32 desired, convert any float16 to float32
         if ftype == 0 and data.dtype == np.float16:
@@ -268,7 +269,6 @@ def count_model_parts(dir_model: str) -> int:
     for name in model_part.keys():
         data = model_part[name]
 
-
         old_dtype = data.dtype
 
         # we don't need these
@@ -295,7 +295,7 @@ def count_model_parts(dir_model: str) -> int:
            sys.exit()
 
        n_dims = len(data.shape)
-        data_dtype = data.dtype
+        data_dtype = data.dtype
 
        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data.dtype == np.float16:
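
Note on the two data_dtype hunks above: the before and after lines appear to differ only in whitespace (the commit is a style pass), and each hunk ends just before the conversion itself, which is presumably an astype to float32. As orientation only, here is a minimal sketch of the dtype-handling pattern those hunks sit in, using the names visible in the diff (data, data_dtype, ftype) in a hypothetical standalone helper rather than the script's inline loop:

import numpy as np

# Hypothetical standalone sketch; the real convert-llama-h5-to-gguf.py does this
# inline while iterating over the loaded model tensors.
def convert_tensor_dtype(data: np.ndarray, ftype: int) -> np.ndarray:
    n_dims     = len(data.shape)
    data_dtype = data.dtype

    # if f32 desired (ftype == 0), convert any float16 tensor to float32
    if ftype == 0 and data_dtype == np.float16:
        data = data.astype(np.float32)

    return data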

examples/gguf/gguf-llama-simple.cpp

Lines changed: 41 additions & 97 deletions
@@ -6,177 +6,121 @@
 #include "gguf-llama.h"
 #include "build-info.h"
 
-#include <cassert>
-#include <cinttypes>
 #include <cmath>
 #include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
 #include <string>
 #include <vector>
 
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <windows.h>
-#include <signal.h>
-#endif
-
-
-
-int main(int argc, char ** argv)
-{
+int main(int argc, char ** argv) {
     gpt_params params;
 
-    //---------------------------------
-    // Print help :
-    //---------------------------------
-
-    if ( argc == 1 || argv[1][0] == '-' )
-    {
-        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
         return 1 ;
     }
 
-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-
-    if ( argc >= 2 )
-    {
+    if (argc >= 2) {
         params.model = argv[1];
     }
 
-    if ( argc >= 3 )
-    {
+    if (argc >= 3) {
         params.prompt = argv[2];
     }
 
-    if ( params.prompt.empty() )
-    {
+    if (params.prompt.empty()) {
         params.prompt = "Hello my name is";
     }
 
-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
+    // init LLM
 
     llama_backend_init(params.numa);
 
     llama_context_params ctx_params = llama_context_default_params();
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
-
-    if ( model == NULL )
-    {
-        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
+    // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
 
-    const int max_context_size = llama_n_ctx( ctx );
-    const int max_tokens_list_size = max_context_size - 4 ;
+    const int max_context_size = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;
 
-    if ( (int)tokens_list.size() > max_tokens_list_size )
-    {
-        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
-             __func__ , (int)tokens_list.size() , max_tokens_list_size );
+    if ((int)tokens_list.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
         return 1;
     }
 
-    fprintf( stderr, "\n\n" );
-
-    // Print the tokens from the prompt :
+    fprintf(stderr, "\n\n");
 
-    for( auto id : tokens_list )
-    {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
     }
 
-    fflush(stdout);
-
+    fflush(stderr);
 
-    //---------------------------------
-    // Main prediction loop :
-    //---------------------------------
+    // main loop
 
     // The LLM keeps a contextual cache memory of previous token evaluation.
     // Usually, once this cache is full, it is required to recompute a compressed context based on previous
     // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
     // example, we will just stop the loop once this cache is full or once an end of stream is detected.
 
-    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
-    {
-        //---------------------------------
-        // Evaluate the tokens :
-        //---------------------------------
+    while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
+        // evaluate the transformer
 
-        if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
-        {
-            fprintf( stderr, "%s : failed to eval\n" , __func__ );
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
             return 1;
         }
 
         tokens_list.clear();
 
-        //---------------------------------
-        // Select the best prediction :
-        //---------------------------------
+        // sample the next token
 
         llama_token new_token_id = 0;
 
-        auto logits = llama_get_logits( ctx );
-        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
+        auto logits = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);
 
         std::vector<llama_token_data> candidates;
-        candidates.reserve( n_vocab );
+        candidates.reserve(n_vocab);
 
-        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
-        {
-            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
         }
 
         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
-        // Select it using the "Greedy sampling" method :
-        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
-
+        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
 
         // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
-        {
+        if (new_token_id == llama_token_eos()) {
             fprintf(stderr, " [end of text]\n");
             break;
         }
 
-        // Print the new token :
-        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
-        fflush( stdout );
+        // print the new token :
+        printf("%s", llama_token_to_str(ctx, new_token_id));
+        fflush(stdout);
 
-        // Push this new token for next evaluation :
-        tokens_list.push_back( new_token_id );
+        // push this new token for next evaluation
+        tokens_list.push_back(new_token_id);
 
-    } // wend of main loop
+    }
 
-    llama_free( ctx );
-    llama_free_model( model );
+    llama_free(ctx);
+    llama_free_model(model);
 
     llama_backend_free();
 
     return 0;
 }
-
-// EOF
