#include "imatrix.hpp"

/* Stolen from llama.cpp (credits: Kawrakow)*/

#include "ggml-backend.h"
#include "ggml.h"
#include "util.h"

#include <cstring>
#include <fstream>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
14
// Strip the backend prefix and split suffix from a tensor name,
// e.g. "CUDA0#blk.0.attn_k.weight#0" -> "blk.0.attn_k.weight".
// A name without '#' markers is returned unchanged; with a single '#'
// everything after it is returned.
static std::string filter_tensor_name(const char* name) {
    const std::string full(name);
    const size_t first_hash = full.find('#');
    if (first_hash == std::string::npos) {
        return full;  // no prefix marker: name is already clean
    }
    const size_t second_hash = full.find('#', first_hash + 1);
    if (second_hash == std::string::npos) {
        return full.substr(first_hash + 1);  // prefix only, no suffix marker
    }
    return full.substr(first_hash + 1, second_hash - first_hash - 1);
}
32
+
33
// Scheduler callback for importance-matrix collection. The scheduler calls it
// twice per op: once with ask=true ("is this tensor interesting?") and, if we
// answered true, again with ask=false to collect the data. Accumulates the sum
// of squared activations (src1) per weight column into m_stats[wname].
// Always returns true on the collection pass so callbacks keep coming.
bool IMatrixCollector::collect_imatrix(struct ggml_tensor* t, bool ask, void* user_data) {
    GGML_UNUSED(user_data);
    const struct ggml_tensor* src0 = t->src[0];  // weight tensor (gives the stats key)
    const struct ggml_tensor* src1 = t->src[1];  // activations (what we accumulate)
    std::string wname = filter_tensor_name(src0->name);

    // when ask is true, the scheduler wants to know if we are interested in data from this tensor
    // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
    if (ask) {
        if (t->op == GGML_OP_MUL_MAT_ID)
            return true;  // collect all indirect matrix multiplications
        if (t->op != GGML_OP_MUL_MAT)
            return false;
        // why are small batches ignored (<16 tokens)?
        // if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
        // only collect for the diffusion/text-encoder weight namespaces
        if (!(wname.substr(0, 6) == "model." || wname.substr(0, 17) == "cond_stage_model." || wname.substr(0, 14) == "text_encoders."))
            return false;
        return true;
    }
    // LOG_DEBUG("%s", wname.c_str());

    // collection pass: serialize access to m_stats and the scratch buffers
    std::lock_guard<std::mutex> lock(m_mutex);

    // copy the data from the GPU memory if needed
    const bool is_host = ggml_backend_buffer_is_host(src1->buffer);

    if (!is_host) {
        m_src1_data.resize(ggml_nelements(src1));
        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
    }

    // NOTE(review): assumes src1 holds F32 data — the F32 type check above is
    // commented out, so a non-F32 src1 would be misread here. TODO confirm.
    const float* data = is_host ? (const float*)src1->data : m_src1_data.data();

    // this has been adapted to the new format of storing merged experts in a single 3d tensor
    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
    if (t->op == GGML_OP_MUL_MAT_ID) {
        // ids -> [n_experts_used, n_tokens]
        // src1 -> [cols, n_expert_used, n_tokens]
        const ggml_tensor* ids = t->src[2];
        const int n_as = src0->ne[2];   // number of experts
        const int n_ids = ids->ne[0];   // experts used per token

        // the top-k selected expert ids are stored in the ids tensor
        // for simplicity, always copy ids to host, because it is small
        // take into account that ids is not contiguous!

        GGML_ASSERT(ids->ne[1] == src1->ne[2]);

        // m_ids is a raw byte buffer; byte strides nb[] are applied manually below
        m_ids.resize(ggml_nbytes(ids));
        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));

        auto& e = m_stats[wname];

        ++e.ncall;

        if (e.values.empty()) {
            // one slot per column, for every expert (merged 3d layout)
            e.values.resize(src1->ne[0] * n_as, 0);
            e.counts.resize(src1->ne[0] * n_as, 0);
        } else if (e.values.size() != (size_t)src1->ne[0] * n_as) {
            LOG_ERROR("inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0] * n_as);
            exit(1);  // GGML_ABORT("fatal error");
        }
        // LOG_DEBUG("%s[%d]: %32s, %s, %5d x %5d, %d\n", m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
        // loop over all possible experts, regardless if they are used or not in the batch
        for (int ex = 0; ex < n_as; ++ex) {
            size_t e_start = ex * src1->ne[0];  // this expert's slice of values/counts

            for (int idx = 0; idx < n_ids; ++idx) {
                for (int row = 0; row < (int)src1->ne[2]; ++row) {
                    // expert id selected for this (slot, token) pair
                    const int excur = *(const int32_t*)(m_ids.data() + row * ids->nb[1] + idx * ids->nb[0]);

                    GGML_ASSERT(excur >= 0 && excur < n_as);  // sanity check

                    if (excur != ex)
                        continue;

                    const int64_t i11 = idx % src1->ne[1];
                    const int64_t i12 = row;
                    // row of activations routed to this expert (byte strides again)
                    const float* x = (const float*)((const char*)data + i11 * src1->nb[1] + i12 * src1->nb[2]);

                    for (int j = 0; j < (int)src1->ne[0]; ++j) {
                        // accumulate sum of squares per column
                        e.values[e_start + j] += x[j] * x[j];
                        e.counts[e_start + j]++;
                        if (!std::isfinite(e.values[e_start + j])) {
                            printf("\n");
                            LOG_ERROR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
                            exit(1);
                        }
                    }
                }
            }
        }
    } else {
        // plain GGML_OP_MUL_MAT: src1 is [cols, rows]
        auto& e = m_stats[wname];
        if (e.values.empty()) {
            e.values.resize(src1->ne[0], 0);
            e.counts.resize(src1->ne[0], 0);
        } else if (e.values.size() != (size_t)src1->ne[0]) {
            LOG_WARN("inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
            exit(1);  // GGML_ABORT("fatal error");
        }

        ++e.ncall;
        // LOG_DEBUG("%s[%d]: %32s, %s, %5d x %5d, %d\n", m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
        for (int row = 0; row < (int)src1->ne[1]; ++row) {
            const float* x = data + row * src1->ne[0];
            for (int j = 0; j < (int)src1->ne[0]; ++j) {
                // accumulate sum of squares per column
                e.values[j] += x[j] * x[j];
                e.counts[j]++;
                if (!std::isfinite(e.values[j])) {
                    LOG_WARN("%f detected in %s\n", e.values[j], wname.c_str());
                    exit(1);
                }
            }
        }
    }
    return true;
}
152
+
153
+ void IMatrixCollector::save_imatrix (std::string fname,int ncall) const {
154
+ LOG_INFO (" SAVING_IMATRIX to %s\n " , fname.c_str ());
155
+
156
+ if (ncall > 0 ) {
157
+ fname += " .at_" ;
158
+ fname += std::to_string (ncall);
159
+ }
160
+ // avoid writing imatrix entries that do not have full data
161
+ // this can happen with MoE models where some of the experts end up not being exercised by the provided training data
162
+
163
+ int n_entries = 0 ;
164
+ std::vector<std::string> to_store;
165
+
166
+ bool is_first = true ; // for printing
167
+ for (const auto & kv : m_stats) {
168
+ const int n_all = kv.second .counts .size ();
169
+
170
+ if (n_all == 0 ) {
171
+ continue ;
172
+ }
173
+
174
+ int n_zeros = 0 ;
175
+ for (const int c : kv.second .counts ) {
176
+ if (c == 0 ) {
177
+ n_zeros++;
178
+ }
179
+ }
180
+
181
+ if (n_zeros != 0 && is_first) {
182
+ printf (" \n " );
183
+ is_first = false ;
184
+ }
185
+
186
+ if (n_zeros == n_all) {
187
+ LOG_WARN (" entry '%40s' has no data - skipping\n " , kv.first .c_str ());
188
+ continue ;
189
+ }
190
+
191
+ if (n_zeros > 0 ) {
192
+ LOG_WARN (" entry '%40s' has partial data (%.2f%%) - skipping\n " , kv.first .c_str (), 100 .0f * (n_all - n_zeros) / n_all);
193
+ continue ;
194
+ }
195
+
196
+ n_entries++;
197
+ to_store.push_back (kv.first );
198
+ }
199
+
200
+ if (to_store.size () < m_stats.size ()) {
201
+ LOG_WARN (" storing only %zu out of %zu entries\n " , to_store.size (), m_stats.size ());
202
+ }
203
+
204
+ std::ofstream out (fname, std::ios::binary);
205
+ out.write ((const char *)&n_entries, sizeof (n_entries));
206
+ for (const auto & name : to_store) {
207
+ const auto & stat = m_stats.at (name);
208
+ int len = name.size ();
209
+ out.write ((const char *)&len, sizeof (len));
210
+ out.write (name.c_str (), len);
211
+ out.write ((const char *)&stat.ncall , sizeof (stat.ncall ));
212
+ int nval = stat.values .size ();
213
+ out.write ((const char *)&nval, sizeof (nval));
214
+ if (nval > 0 ) {
215
+ std::vector<float > tmp (nval);
216
+ for (int i = 0 ; i < nval; i++) {
217
+ tmp[i] = (stat.values [i] / static_cast <float >(stat.counts [i])) * static_cast <float >(stat.ncall );
218
+ }
219
+ out.write ((const char *)tmp.data (), nval * sizeof (float ));
220
+ }
221
+ }
222
+
223
+ // Write the number of call the matrix was computed with
224
+ out.write ((const char *)&m_last_call, sizeof (m_last_call));
225
+
226
+ // LOG_DEBUG("\n");
227
+ // LOG_DEBUG("stored collected data after %d chunks in %s\n", m_last_call, fname.c_str());
228
+ }
229
+
230
+ bool IMatrixCollector::load_imatrix (const char * fname) {
231
+ std::ifstream in (fname, std::ios::binary);
232
+ if (!in) {
233
+ LOG_ERROR (" failed to open %s\n " , fname);
234
+ return false ;
235
+ }
236
+ int n_entries;
237
+ in.read ((char *)&n_entries, sizeof (n_entries));
238
+ if (in.fail () || n_entries < 1 ) {
239
+ LOG_ERROR (" no data in file %s\n " , fname);
240
+ return false ;
241
+ }
242
+ for (int i = 0 ; i < n_entries; ++i) {
243
+ int len;
244
+ in.read ((char *)&len, sizeof (len));
245
+ std::vector<char > name_as_vec (len + 1 );
246
+ in.read ((char *)name_as_vec.data (), len);
247
+ if (in.fail ()) {
248
+ LOG_ERROR (" failed reading name for entry %d from %s\n " , i + 1 , fname);
249
+ return false ;
250
+ }
251
+ name_as_vec[len] = 0 ;
252
+ std::string name{name_as_vec.data ()};
253
+ auto & e = m_stats[std::move (name)];
254
+ int ncall;
255
+ in.read ((char *)&ncall, sizeof (ncall));
256
+ int nval;
257
+ in.read ((char *)&nval, sizeof (nval));
258
+ if (in.fail () || nval < 1 ) {
259
+ LOG_ERROR (" failed reading number of values for entry %d\n " , i);
260
+ m_stats = {};
261
+ return false ;
262
+ }
263
+
264
+ if (e.values .empty ()) {
265
+ e.values .resize (nval, 0 );
266
+ e.counts .resize (nval, 0 );
267
+ }
268
+
269
+ std::vector<float > tmp (nval);
270
+ in.read ((char *)tmp.data (), nval * sizeof (float ));
271
+ if (in.fail ()) {
272
+ LOG_ERROR (" failed reading data for entry %d\n " , i);
273
+ m_stats = {};
274
+ return false ;
275
+ }
276
+
277
+ // Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
278
+ for (int i = 0 ; i < nval; i++) {
279
+ e.values [i] += tmp[i];
280
+ e.counts [i] += ncall;
281
+ }
282
+ e.ncall += ncall;
283
+ }
284
+ return true ;
285
+ }