Skip to content

Commit 7e8640a

Browse files
committed
gguf-hash: add cpp and python implementation of layer + model wide hashing
1 parent b1ef562 commit 7e8640a

File tree

10 files changed

+7789
-0
lines changed

10 files changed

+7789
-0
lines changed

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ BUILD_TARGETS = \
1414
llama-finetune \
1515
llama-gbnf-validator \
1616
llama-gguf \
17+
llama-gguf-hash \
1718
llama-gguf-split \
1819
llama-gritlm \
1920
llama-imatrix \
@@ -919,6 +920,10 @@ llama-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
919920
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
920921
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
921922

923+
llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
924+
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
925+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
926+
922927
llama-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
923928
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
924929
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ else()
2323
add_subdirectory(export-lora)
2424
add_subdirectory(finetune)
2525
add_subdirectory(gbnf-validator)
26+
add_subdirectory(gguf-hash)
2627
add_subdirectory(gguf-split)
2728
add_subdirectory(gguf)
2829
add_subdirectory(gritlm)

examples/gguf-hash/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
set(TARGET llama-gguf-hash)
2+
add_library(sha1 OBJECT sha1.c sha1.h)
3+
add_library(xxhash OBJECT xxhash.c xxhash.h)
4+
add_executable(${TARGET} gguf-hash.cpp)
5+
install(TARGETS ${TARGET} RUNTIME)
6+
target_link_libraries(${TARGET} PRIVATE ggml sha1 xxhash ${CMAKE_THREAD_LIBS_INIT})
7+
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/gguf-hash/README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
## GGUF hash Example
2+
3+
CLI to hash GGUF files.
4+
5+
**Command line options:**
6+
7+
- `--xxhash`: use xxhash (default)
8+
- `--sha1`: use sha1
9+
10+
### Compile Example
11+
12+
```
13+
cmake -B build
14+
make -C build llama-gguf-hash
15+
./build/bin/llama-gguf-hash test.gguf
16+
```
17+
18+
### Crypto/Hash Libraries Used
19+
20+
- https://github.com/clibs/sha1/
21+
- https://github.com/Cyan4973/xxHash

examples/gguf-hash/gguf-hash.cpp

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
#include "ggml.h"
2+
3+
#include "stdlib.h" /* abort() */
4+
#include <cstddef>
5+
#include "sha1.h"
6+
#include "xxhash.h"
7+
8+
#include <cstdio>
9+
#include <string>
10+
#include <stdexcept>
11+
#include <algorithm>
12+
13+
#include <string.h>
14+
15+
16+
// Command-line options for the GGUF hashing tool.
struct hash_params {
    // Path of the GGUF file to hash.
    std::string input;
    // Emit xxhash (XXH64) digests; the argument parser turns this on when no
    // hash switch is given.
    bool xxhash = false;
    // Emit SHA-1 digests.
    bool sha1 = false;
};
21+
22+
// Prints the command-line help text to stdout.
//
// executable: program name shown in the usage line (normally argv[0]).
static void hash_print_usage(const char * executable) {
    printf("\n");
    printf("usage: %s [options] GGUF_IN\n", executable);
    printf("\n");
    printf("Hash a GGUF file");
    printf("\n");
    printf("options:\n");
    printf(" -h, --help show this help message and exit\n");
    printf(" --xxhash use xxhash\n");
    printf(" --sha1 use sha1\n");
    printf("\n");
}
35+
36+
static void hash_params_parse_ex(int argc, const char ** argv, hash_params & params) {
37+
std::string arg;
38+
const std::string arg_prefix = "--";
39+
40+
int arg_idx = 1;
41+
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
42+
arg = argv[arg_idx];
43+
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
44+
std::replace(arg.begin(), arg.end(), '_', '-');
45+
}
46+
47+
bool arg_found = false;
48+
if (arg == "-h" || arg == "--help") {
49+
hash_print_usage(argv[0]);
50+
exit(0);
51+
}
52+
53+
if (arg == "--xxhash") {
54+
arg_found = true;
55+
params.xxhash = true;
56+
}
57+
58+
if (arg == "--sha1") {
59+
arg_found = true;
60+
params.sha1 = true;
61+
}
62+
63+
if (!arg_found) {
64+
throw std::invalid_argument("error: unknown argument: " + arg);
65+
}
66+
}
67+
68+
if (!params.xxhash && !params.sha1) {
69+
// By default if no swich argument provided, assume xxhash
70+
params.xxhash = true;
71+
}
72+
73+
if (argc - arg_idx < 1) {
74+
throw std::invalid_argument("error: bad arguments");
75+
}
76+
77+
params.input = argv[arg_idx++];
78+
}
79+
80+
static bool hash_params_parse(int argc, const char ** argv, hash_params & params) {
81+
bool result = true;
82+
try {
83+
hash_params_parse_ex(argc, argv, params);
84+
}
85+
catch (const std::invalid_argument & ex) {
86+
fprintf(stderr, "%s\n", ex.what());
87+
hash_print_usage(argv[0]);
88+
exit(EXIT_FAILURE);
89+
}
90+
return result;
91+
}
92+
93+
static bool gguf_hash(const hash_params & hash_params) {
94+
const std::string & fname = hash_params.input;
95+
struct ggml_context * ctx_data = NULL;
96+
97+
struct gguf_init_params params = {
98+
/*.no_alloc = */ false,
99+
/*.ctx = */ &ctx_data,
100+
};
101+
102+
// xxhash init
103+
XXH64_state_t* xxhash_model_hash_state = NULL;
104+
if (hash_params.xxhash) {
105+
xxhash_model_hash_state = XXH64_createState();
106+
if (xxhash_model_hash_state==NULL) {
107+
abort();
108+
}
109+
110+
XXH64_hash_t const seed = 0;
111+
if (XXH64_reset(xxhash_model_hash_state, seed) == XXH_ERROR) {
112+
abort();
113+
}
114+
}
115+
116+
// sha1 init
117+
SHA1_CTX sha1_model_hash_ctx;
118+
if (hash_params.sha1) {
119+
SHA1Init(&sha1_model_hash_ctx);
120+
}
121+
122+
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
123+
const int n_tensors = gguf_get_n_tensors(ctx);
124+
for (int i = 0; i < n_tensors; ++i) {
125+
const char * name = gguf_get_tensor_name(ctx, i);
126+
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
127+
auto n_bytes = ggml_nbytes(cur);
128+
auto *raw_data = cur->data;
129+
130+
if (hash_params.xxhash) {
131+
132+
// Per Layer Hash
133+
XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0);
134+
printf("xxhash %016lx %s:%s\n", hash, fname.c_str(), name);
135+
136+
// Overall Model Hash
137+
if (XXH64_update(xxhash_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
138+
}
139+
140+
if (hash_params.sha1) {
141+
142+
// Per Layer Hash
143+
char result[21];
144+
SHA1( result, (const char *)raw_data, n_bytes);
145+
146+
char hex_result[41];
147+
for (int offset = 0; offset < 20; offset++) {
148+
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
149+
}
150+
151+
printf("sha1 %s %s:%s\n", hex_result, fname.c_str(), name);
152+
153+
// Overall Model Hash
154+
SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
155+
}
156+
}
157+
158+
if (hash_params.xxhash) {
159+
XXH64_hash_t const hash = XXH64_digest(xxhash_model_hash_state);
160+
printf("xxhash %016lx %s\n", hash, fname.c_str());
161+
}
162+
163+
if (hash_params.sha1) {
164+
unsigned char result[21];
165+
SHA1Final(result, &sha1_model_hash_ctx);
166+
167+
char hex_result[41];
168+
for (int offset = 0; offset < 20; offset++) {
169+
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
170+
}
171+
172+
printf("sha1 %s %s\n", hex_result, fname.c_str());
173+
}
174+
175+
ggml_free(ctx_data);
176+
gguf_free(ctx);
177+
178+
return true;
179+
}
180+
181+
// Entry point: parses the command line, hashes the requested GGUF file, and
// reports failure through the process exit status.
int main(int argc, const char ** argv) {
    hash_params params;
    if (!hash_params_parse(argc, argv, params)) {
        return EXIT_FAILURE;
    }

    // Propagate hashing failures instead of always exiting with 0.
    return gguf_hash(params) ? EXIT_SUCCESS : EXIT_FAILURE;
}

0 commit comments

Comments
 (0)