Skip to content

Commit f7cab35

Browse files
mofosyneggerganov
andauthored
gguf-hash: model wide and per tensor hashing using xxhash and sha1 (#8048)
CLI to hash GGUF files to detect difference on a per model and per tensor level The hash type we support is: - `--xxh64`: use xhash 64bit hash mode (default) - `--sha1`: use sha1 - `--uuid`: use uuid - `--sha256`: use sha256 While most POSIX systems already have hash checking programs like sha256sum, it is designed to check entire files. This is not ideal for our purpose if we want to check for consistency of the tensor data even if the metadata content of the gguf KV store has been updated. This program is designed to hash a gguf tensor payload on a 'per tensor layer' in addition to a 'entire tensor model' hash. The intent is that the entire tensor layer can be checked first but if there is any detected inconsistencies, then the per tensor hash can be used to narrow down the specific tensor layer that has inconsistencies. Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 905942a commit f7cab35

File tree

17 files changed

+8846
-0
lines changed

17 files changed

+8846
-0
lines changed

Makefile

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ BUILD_TARGETS = \
1414
llama-finetune \
1515
llama-gbnf-validator \
1616
llama-gguf \
17+
llama-gguf-hash \
1718
llama-gguf-split \
1819
llama-gritlm \
1920
llama-imatrix \
@@ -1178,6 +1179,23 @@ llama-gguf: examples/gguf/gguf.cpp \
11781179
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
11791180
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
11801181

1182+
examples/gguf-hash/deps/sha1/sha1.o: \
1183+
examples/gguf-hash/deps/sha1/sha1.c
1184+
$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
1185+
1186+
examples/gguf-hash/deps/xxhash/xxhash.o: \
1187+
examples/gguf-hash/deps/xxhash/xxhash.c
1188+
$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
1189+
1190+
examples/gguf-hash/deps/sha256/sha256.o: \
1191+
examples/gguf-hash/deps/sha256/sha256.c
1192+
$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
1193+
1194+
llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\
1195+
$(OBJ_ALL)
1196+
$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
1197+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
1198+
11811199
llama-gguf-split: examples/gguf-split/gguf-split.cpp \
11821200
$(OBJ_ALL)
11831201
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ else()
2323
add_subdirectory(export-lora)
2424
add_subdirectory(finetune)
2525
add_subdirectory(gbnf-validator)
26+
add_subdirectory(gguf-hash)
2627
add_subdirectory(gguf-split)
2728
add_subdirectory(gguf)
2829
add_subdirectory(gritlm)

examples/gguf-hash/CMakeLists.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
set(TARGET llama-gguf-hash)
2+
add_executable(${TARGET} gguf-hash.cpp)
3+
install(TARGETS ${TARGET} RUNTIME)
4+
5+
# clibs dependencies
6+
include_directories(deps/)
7+
add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
8+
target_link_libraries(${TARGET} PRIVATE xxhash)
9+
add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
10+
target_link_libraries(${TARGET} PRIVATE sha1)
11+
add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
12+
target_link_libraries(${TARGET} PRIVATE sha256)
13+
14+
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
15+
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/gguf-hash/README.md

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
2+
# llama-gguf-hash
3+
4+
CLI to hash GGUF files to detect differences on a per-model and per-tensor level.
5+
6+
**Command line options:**
7+
8+
- `--help`: display help message
9+
- `--xxh64`: use xxhash 64bit hash mode (default)
10+
- `--sha1`: use sha1
11+
- `--uuid`: use uuid
12+
- `--sha256`: use sha256
13+
- `--all`: use all hash
14+
- `--no-layer`: exclude per layer hash
15+
- `--uuid`: generate UUIDv5 ID
16+
- `-c`, `--check <manifest>`: verify against a manifest
17+
18+
## About
19+
20+
While most POSIX systems already have hash checking programs like sha256sum, it
21+
is designed to check entire files. This is not ideal for our purpose if we want
22+
to check for consistency of the tensor data even if the metadata content of the
23+
gguf KV store has been updated.
24+
25+
This program is designed to hash a gguf tensor payload on a 'per tensor layer'
26+
in addition to an 'entire tensor model' hash. The intent is that the entire
27+
tensor model can be checked first but if there are any detected inconsistencies,
28+
then the per tensor hash can be used to narrow down the specific tensor layer
29+
that has inconsistencies.
30+
31+
For Maintainers:
32+
- Detection of tensor inconsistency during development and automated tests
33+
- This is served by xxh64 which is fast
34+
- This is also served by having per tensor layer to assist in narrowing down
35+
the location of the faulty tensor layer
36+
- This is also served by sha1 which is much slower but more widely supported
37+
38+
For Model Creators:
39+
- Optional consistent UUID generation based on model tensor content
40+
- This is served by UUIDv5 which is useful for database keys
41+
- llama.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5`
42+
- Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Llama.cpp`
43+
44+
For Model Users:
45+
- Assurance of tensor layer integrity even if metadata was updated
46+
- This is served by sha256 which is still considered very secure as of 2024
47+
48+
### Design Note
49+
50+
- The default behavior of this program if no arguments are provided is to hash
51+
using xxhash's xxh64 mode because it is very fast and is primarily targeted
52+
towards maintainers who may want to use this in automated tests.
53+
- xxhash supports xxh32 and xxh128 for 32bit hash and 128bit hash respectively
54+
however we picked 64bit xxhash as most computers are 64bit as of 2024 and thus
55+
would have a better affinity to calculating hash that is 64bit in size.
56+
57+
## Compile Example
58+
59+
```bash
60+
cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON
61+
make -C build clean
62+
make -C build llama-gguf-hash VERBOSE=1
63+
./build/bin/llama-gguf-hash test.gguf
64+
./build/bin/llama-gguf-hash --xxh64 test.gguf
65+
./build/bin/llama-gguf-hash --sha1 test.gguf
66+
./build/bin/llama-gguf-hash --uuid test.gguf
67+
./build/bin/llama-gguf-hash --sha256 test.gguf
68+
```
69+
70+
## Generation and Verification Example
71+
72+
To generate we may use this command
73+
74+
```bash
75+
./llama-gguf-hash --all test.gguf > test.gguf.manifest
76+
```
77+
78+
Which would generate a manifest that looks like below, which contains multiple hash types and per tensor layer hashes as well
79+
(This excludes UUID as that is an ID not a hash)
80+
81+
```bash
82+
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0
83+
sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0
84+
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0
85+
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1
86+
sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1
87+
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1
88+
xxh64 a0af5d700049693b test.gguf:tensor_2
89+
sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2
90+
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2
91+
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3
92+
sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3
93+
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3
94+
xxh64 1257733306b7992d test.gguf:tensor_4
95+
sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4
96+
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4
97+
xxh64 d238d16ba4711e58 test.gguf:tensor_5
98+
sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5
99+
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5
100+
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6
101+
sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6
102+
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6
103+
xxh64 c22021c29854f093 test.gguf:tensor_7
104+
sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7
105+
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7
106+
xxh64 936df61f5d64261f test.gguf:tensor_8
107+
sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8
108+
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8
109+
xxh64 93fd20c64421c081 test.gguf:tensor_9
110+
sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9
111+
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9
112+
xxh64 5a54d3aad816f302 test.gguf
113+
sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf
114+
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf
115+
```
116+
117+
We can then use the normal check command which will by default check for the highest security strength hash and verify against that:
118+
119+
```bash
120+
$ ./llama-gguf-hash --check test.gguf.manifest test.gguf
121+
manifest test.gguf.manifest sha256 sha1 xxh64
122+
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok
123+
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok
124+
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok
125+
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok
126+
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok
127+
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok
128+
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok
129+
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok
130+
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok
131+
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok
132+
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok
133+
134+
Verification results for test.gguf.manifest - Success
135+
```
136+
137+
Or we may explicitly ask for a faster hash like:
138+
139+
```bash
140+
$ ./llama-gguf-hash --check test.gguf.manifest --xxh64 test.gguf
141+
manifest test.gguf.manifest sha256 sha1 xxh64
142+
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok
143+
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok
144+
xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok
145+
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok
146+
xxh64 1257733306b7992d test.gguf:tensor_4 - Ok
147+
xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok
148+
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok
149+
xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok
150+
xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok
151+
xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok
152+
xxh64 5a54d3aad816f302 test.gguf - Ok
153+
154+
Verification results for test.gguf.manifest - Success
155+
```
156+
157+
Or maybe we want to just check that all the hashes are valid:
158+
159+
```bash
160+
$ ./llama-gguf-hash --check test.gguf.manifest --all test.gguf
161+
manifest test.gguf.manifest sha256 sha1 xxh64
162+
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok
163+
sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0 - Ok
164+
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok
165+
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok
166+
sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1 - Ok
167+
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok
168+
xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok
169+
sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2 - Ok
170+
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok
171+
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok
172+
sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3 - Ok
173+
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok
174+
xxh64 1257733306b7992d test.gguf:tensor_4 - Ok
175+
sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4 - Ok
176+
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok
177+
xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok
178+
sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5 - Ok
179+
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok
180+
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok
181+
sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6 - Ok
182+
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok
183+
xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok
184+
sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7 - Ok
185+
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok
186+
xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok
187+
sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8 - Ok
188+
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok
189+
xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok
190+
sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9 - Ok
191+
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok
192+
xxh64 5a54d3aad816f302 test.gguf - Ok
193+
sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf - Ok
194+
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok
195+
196+
Verification results for test.gguf.manifest - Success
197+
```
198+
199+
200+
## Crypto/Hash Libraries Used
201+
202+
These micro C library dependencies were installed via the [clib c package manager](https://github.com/clibs)
203+
204+
- https://github.com/mofosyne/xxHash (From: https://github.com/Cyan4973/xxHash)
205+
- https://github.com/clibs/sha1/
206+
- https://github.com/jb55/sha256.c
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"name": "rotate-bits",
3+
"version": "0.1.1",
4+
"repo": "jb55/rotate-bits.h",
5+
"description": "rotate bits",
6+
"keywords": ["rotl", "rotr"],
7+
"src": ["rotate-bits.h"],
8+
"license": "Public Domain",
9+
"development": {
10+
"thlorenz/tap.c": "*"
11+
}
12+
}
13+
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
2+
3+
#ifndef __ROTATE_DEFS_H
4+
#define __ROTATE_DEFS_H
5+
6+
#ifdef _MSC_VER
7+
8+
#include <stdlib.h>
9+
10+
#define ROTL32(v, n) _rotl((v), (n))
11+
#define ROTL64(v, n) _rotl64((v), (n))
12+
13+
#define ROTR32(v, n) _rotr((v), (n))
14+
#define ROTR64(v, n) _rotr64((v), (n))
15+
16+
#else
17+
18+
#include <stdint.h>
19+
20+
#define U8V(v) ((uint8_t)(v) & 0xFFU)
21+
#define U16V(v) ((uint16_t)(v) & 0xFFFFU)
22+
#define U32V(v) ((uint32_t)(v) & 0xFFFFFFFFU)
23+
#define U64V(v) ((uint64_t)(v) & 0xFFFFFFFFFFFFFFFFU)
24+
25+
#define ROTL32(v, n) \
26+
(U32V((uint32_t)(v) << (n)) | ((uint32_t)(v) >> (32 - (n))))
27+
28+
// tests fail if we don't have this cast...
29+
#define ROTL64(v, n) \
30+
(U64V((uint64_t)(v) << (n)) | ((uint64_t)(v) >> (64 - (n))))
31+
32+
#define ROTR32(v, n) ROTL32(v, 32 - (n))
33+
#define ROTR64(v, n) ROTL64(v, 64 - (n))
34+
35+
#endif
36+
37+
#define ROTL8(v, n) \
38+
(U8V((uint8_t)(v) << (n)) | ((uint8_t)(v) >> (8 - (n))))
39+
40+
#define ROTL16(v, n) \
41+
(U16V((uint16_t)(v) << (n)) | ((uint16_t)(v) >> (16 - (n))))
42+
43+
#define ROTR8(v, n) ROTL8(v, 8 - (n))
44+
#define ROTR16(v, n) ROTL16(v, 16 - (n))
45+
46+
#endif
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"name": "sha1",
3+
"version": "0.0.1",
4+
"repo": "clibs/sha1",
5+
"description": "sha1 hash algorithm",
6+
"keywords": ["sha1", "hash"],
7+
"license": "public domain",
8+
"src": ["sha1.c", "sha1.h"]
9+
}

0 commit comments

Comments
 (0)