#!/usr/bin/env node

- import { GGMLQuantizationType, gguf } from ".";
+ import { GGMLQuantizationType, gguf, ggufAllShards, GGUFParseOutput } from ".";
+ import { GGML_QUANT_SIZES } from "./quant-descriptions";

interface PrintColumnHeader {
	name: string;
@@ -10,11 +11,44 @@ interface PrintColumnHeader {
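// map numeric dtype values back to their GGMLQuantizationType names for display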
const mapDtypeToName = Object.fromEntries(Object.entries(GGMLQuantizationType).map(([name, value]) => [value, name]));

+ function showHelp(exitCode: number) {
+ 	console.error("Usage: gguf-view [--help|-h] [--show-tensor] [--context|-c N] <path/to/gguf>");
+ 	console.error("  --help, -h       Show this help message");
+ 	console.error("  --show-tensor    Show tensor information");
+ 	console.error("  --context, -c N  Number of tokens in context (default: 4096)");
+ 	process.exit(exitCode);
+ }
+
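+ // Example: gguf-view --show-tensor -c 8192 path/to/model.gguf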
async function main() {
- 	const ggufPath = process.argv[2];
- 	const { metadata, tensorInfos } = await gguf(ggufPath, {
+ 	let ggufPath = "";
+ 	let showTensors = false;
+ 	let nCtx = 4096;
+ 	for (let i = 2; i < process.argv.length; i++) {
+ 		if (process.argv[i] === "--help" || process.argv[i] === "-h") {
+ 			showHelp(0);
+ 		} else if (process.argv[i] === "--show-tensor") {
+ 			showTensors = true;
+ 		} else if (process.argv[i] === "--context" || process.argv[i] === "-c") {
+ 			nCtx = Number(process.argv[++i]);
+ 		} else {
+ 			ggufPath = process.argv[i];
+ 		}
+ 	}
+
+ 	if (!ggufPath.length) {
+ 		console.error("Error: Missing path to gguf file");
+ 		showHelp(1);
+ 	}
+
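+ 	// parse every shard of a (possibly split) GGUF file; a single-file model yields one shard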
+ 	const { shards } = await ggufAllShards(ggufPath, {
		allowLocalFile: true,
	});
+ 	const { metadata, tensorInfos } = shards[0];
+
+ 	// merge tensor infos from all shards (metadata is read from the first shard)
+ 	for (let i = 1; i < shards.length; i++) {
+ 		tensorInfos.push(...shards[i].tensorInfos);
+ 	}

	// TODO: print info about endianness
	console.log(`* Dumping ${Object.keys(metadata).length} key/value pair(s)`);
@@ -43,29 +77,110 @@ async function main() {
	);

	console.log();
- 	console.log(`* Dumping ${tensorInfos.length} tensor(s)`);
- 	printTable(
- 		[
- 			{ name: "Idx", alignRight: true },
- 			{ name: "Num Elements", alignRight: true },
- 			{ name: "Shape" },
- 			{ name: "Data Type" },
- 			{ name: "Name" },
- 		],
- 		tensorInfos.map((tensorInfo, i) => {
- 			const shape = [1n, 1n, 1n, 1n];
- 			tensorInfo.shape.forEach((dim, i) => {
- 				shape[i] = dim;
- 			});
- 			return [
- 				(i + 1).toString(),
- 				shape.reduce((acc, n) => acc * n, 1n).toString(),
- 				shape.map((n) => n.toString().padStart(6)).join(", "),
- 				mapDtypeToName[tensorInfo.dtype],
- 				tensorInfo.name,
- 			];
- 		})
- 	);
+ 	console.log(`* Memory usage estimation (with context length of ${nCtx} tokens)`);
+ 	try {
+ 		const kvUsage = calcMemoryUsage(metadata as GGUFParseOutput<{ strict: false }>["metadata"], nCtx);
+ 		let modelWeightInBytes = 0;
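+ 		// tensor size in bytes = element count × (GGML_QUANT_SIZES bits per element / 8)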
+ 		for (const tensorInfo of tensorInfos) {
+ 			const nElem = Number(tensorInfo.shape.reduce((a, b) => a * b, 1n));
+ 			const tensorSizeInBytes = nElem * (GGML_QUANT_SIZES[tensorInfo.dtype] / 8);
+ 			modelWeightInBytes += tensorSizeInBytes;
+ 		}
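+ 		// rough overhead estimate: KV usage at a 256-token context plus 5% of the model weights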
+ 		const overhead =
+ 			calcMemoryUsage(metadata as GGUFParseOutput<{ strict: false }>["metadata"], 256).totalBytes +
+ 			modelWeightInBytes * 0.05;
+ 		const totalMemoryUsage = kvUsage.totalBytes + overhead + modelWeightInBytes;
+ 		printTable(
+ 			[{ name: "Item" }, { name: "Memory usage", alignRight: true }],
+ 			[
+ 				["K cache", (kvUsage.totalBytesK / 1e9).toFixed(2) + " GB"],
+ 				["V cache", (kvUsage.totalBytesV / 1e9).toFixed(2) + " GB"],
+ 				["Weight", (modelWeightInBytes / 1e9).toFixed(2) + " GB"],
+ 				["Overhead", (overhead / 1e9).toFixed(2) + " GB"],
+ 				["", "---"],
+ 				["TOTAL", (totalMemoryUsage / 1e9).toFixed(2) + " GB"],
+ 			]
+ 		);
+ 	} catch (e) {
+ 		console.error(`Error: ${(e as Error).message}`);
+ 	}
+
+ 	if (showTensors) {
+ 		console.log();
+ 		console.log(`* Dumping ${tensorInfos.length} tensor(s)`);
+ 		printTable(
+ 			[
+ 				{ name: "Idx", alignRight: true },
+ 				{ name: "Num Elements", alignRight: true },
+ 				{ name: "Shape" },
+ 				{ name: "Data Type" },
+ 				{ name: "Name" },
+ 			],
+ 			tensorInfos.map((tensorInfo, i) => {
+ 				const shape = [1n, 1n, 1n, 1n];
+ 				tensorInfo.shape.forEach((dim, i) => {
+ 					shape[i] = dim;
+ 				});
+ 				return [
+ 					(i + 1).toString(),
+ 					shape.reduce((acc, n) => acc * n, 1n).toString(),
+ 					shape.map((n) => n.toString().padStart(6)).join(", "),
+ 					mapDtypeToName[tensorInfo.dtype],
+ 					tensorInfo.name,
+ 				];
+ 			})
+ 		);
+ 	} else {
+ 		console.log();
+ 		console.log(`* Use --show-tensor to display tensor information`);
+ 	}
+ }
+
+ function calcMemoryUsage(
+ 	metadata: GGUFParseOutput<{ strict: false }>["metadata"],
+ 	kvSize: number,
+ 	kvTypeK: GGMLQuantizationType = GGMLQuantizationType.F16,
+ 	kvTypeV: GGMLQuantizationType = GGMLQuantizationType.F16
+ ) {
+ 	const arch = metadata["general.architecture"] ?? "unknown";
+ 	const n_embd = (metadata[`${arch}.embedding_length`] as number) ?? 0;
+ 	const n_head = (metadata[`${arch}.attention.head_count`] as number) ?? 0;
+ 	const n_embd_head_k = (metadata[`${arch}.attention.key_length`] as number) ?? n_embd / n_head;
+ 	const n_embd_head_v = (metadata[`${arch}.attention.value_length`] as number) ?? n_embd / n_head;
+ 	const n_head_kv = (metadata[`${arch}.attention.head_count_kv`] as number[] | number) ?? [];
+ 	const n_layer = (metadata[`${arch}.block_count`] as number) ?? 0;
+
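+ 	// mamba/rwkv keep a fixed-size recurrent state rather than a per-token KV cache, so this estimate does not apply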
+ 	if (arch.startsWith("mamba") || arch.startsWith("rwkv")) {
+ 		throw new Error(`Memory usage estimation for arch "${arch}" is not supported`);
+ 	}
+
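+ 	// head_count_kv may be a scalar or a per-layer array; normalize to one value per layer, defaulting to n_head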
+ 	const n_head_kv_arr = Array(n_layer).fill(n_head);
+ 	if (Array.isArray(n_head_kv)) {
+ 		for (let i = 0; i < n_layer; i++) {
+ 			if (n_head_kv[i]) {
+ 				n_head_kv_arr[i] = n_head_kv[i];
+ 			}
+ 		}
+ 	} else {
+ 		for (let i = 0; i < n_layer; i++) {
+ 			n_head_kv_arr[i] = n_head_kv;
+ 		}
+ 	}
+
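+ 	// per layer, the K cache holds n_embd_head_k × n_head_kv[i] elements per token of context (likewise for V)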
+ 	let totalElemsK = 0;
+ 	let totalElemsV = 0;
+ 	for (let i = 0; i < n_layer; i++) {
+ 		const n_embd_k_gqa = n_embd_head_k * n_head_kv_arr[i];
+ 		const n_embd_v_gqa = n_embd_head_v * n_head_kv_arr[i];
+ 		totalElemsK += n_embd_k_gqa * kvSize;
+ 		totalElemsV += n_embd_v_gqa * kvSize;
+ 	}
+
+ 	return {
+ 		totalBytesK: totalElemsK * (GGML_QUANT_SIZES[kvTypeK] / 8),
+ 		totalBytesV: totalElemsV * (GGML_QUANT_SIZES[kvTypeV] / 8),
+ 		// total uses each cache's own dtype, since kvTypeK and kvTypeV may differ
+ 		totalBytes: totalElemsK * (GGML_QUANT_SIZES[kvTypeK] / 8) + totalElemsV * (GGML_QUANT_SIZES[kvTypeV] / 8),
+ 	};
}

function printTable(header: PrintColumnHeader[], rows: string[][], leftPad = 2) {