Skip to content

Commit ea6a543

Browse files
committed
Make imatrix not a header-only lib
1 parent f719cb9 commit ea6a543

File tree

2 files changed

+285
-281
lines changed

2 files changed

+285
-281
lines changed

imatrix.cpp

Lines changed: 285 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
#include "imatrix.hpp"
2+
3+
/*Stolen from llama.cpp (credits: Kawrakow)*/
4+
5+
#include "ggml-backend.h"
6+
#include "ggml.h"
7+
#include "util.h"
8+
9+
#include <fstream>
10+
#include <mutex>
11+
#include <unordered_map>
12+
#include <string>
13+
14+
// Strip any backend prefix and split-index suffix from a tensor name,
// e.g. "CUDA0#blk.0.attn_k.weight#0" => "blk.0.attn_k.weight".
// Names without a '#' are returned unchanged; a name with a single '#'
// keeps everything after it.
static std::string filter_tensor_name(const char* name) {
    const std::string full(name);

    const size_t first = full.find('#');
    if (first == std::string::npos) {
        // no decoration at all
        return full;
    }

    const size_t second = full.find('#', first + 1);
    if (second == std::string::npos) {
        // only a prefix marker: keep the remainder
        return full.substr(first + 1);
    }

    // prefix and suffix markers: keep the middle part
    return full.substr(first + 1, second - first - 1);
}
// ggml scheduler eval callback. Called twice per op: first with ask=true to
// decide whether we want this tensor's data, then with ask=false to collect it.
// For every accepted MUL_MAT / MUL_MAT_ID, accumulates per-column sums of
// squared activations (src1) into m_stats[weight-name], along with per-column
// sample counts. Returns true to keep the scheduler running.
// Thread-safety: the collection phase is serialized via m_mutex.
bool IMatrixCollector::collect_imatrix(struct ggml_tensor* t, bool ask, void* user_data) {
    GGML_UNUSED(user_data);
    const struct ggml_tensor* src0 = t->src[0];  // weight matrix
    const struct ggml_tensor* src1 = t->src[1];  // activations multiplying the weights
    std::string wname = filter_tensor_name(src0->name);

    // when ask is true, the scheduler wants to know if we are interested in data from this tensor
    // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
    if (ask) {
        if (t->op == GGML_OP_MUL_MAT_ID)
            return true; // collect all indirect matrix multiplications
        if (t->op != GGML_OP_MUL_MAT)
            return false;
        // why are small batches ignored (<16 tokens)?
        // if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
        // only collect for weights belonging to the diffusion model or the text encoders
        if (!(wname.substr(0, 6) == "model." || wname.substr(0, 17) == "cond_stage_model." || wname.substr(0,14) == "text_encoders."))
            return false;
        return true;
    }
    // LOG_DEBUG("%s", wname.c_str());

    std::lock_guard<std::mutex> lock(m_mutex);

    // copy the data from the GPU memory if needed
    const bool is_host = ggml_backend_buffer_is_host(src1->buffer);

    if (!is_host) {
        m_src1_data.resize(ggml_nelements(src1));
        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
    }

    // NOTE(review): data is read as tightly packed floats below; assumes src1
    // is F32 (see the commented-out type check above) — TODO confirm.
    const float* data = is_host ? (const float*)src1->data : m_src1_data.data();

    // this has been adapted to the new format of storing merged experts in a single 3d tensor
    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
    if (t->op == GGML_OP_MUL_MAT_ID) {
        // ids -> [n_experts_used, n_tokens]
        // src1 -> [cols, n_expert_used, n_tokens]
        const ggml_tensor* ids = t->src[2];
        const int n_as = src0->ne[2];   // total number of experts
        const int n_ids = ids->ne[0];   // experts used per token

        // the top-k selected expert ids are stored in the ids tensor
        // for simplicity, always copy ids to host, because it is small
        // take into account that ids is not contiguous!

        GGML_ASSERT(ids->ne[1] == src1->ne[2]);

        // m_ids is sized in bytes and indexed via nb[] byte strides below —
        // presumably declared as a byte buffer in imatrix.hpp; verify there.
        m_ids.resize(ggml_nbytes(ids));
        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));

        auto& e = m_stats[wname];

        ++e.ncall;

        // one accumulator slot per (expert, column); lazily sized on first use
        if (e.values.empty()) {
            e.values.resize(src1->ne[0] * n_as, 0);
            e.counts.resize(src1->ne[0] * n_as, 0);
        } else if (e.values.size() != (size_t)src1->ne[0] * n_as) {
            LOG_ERROR("inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0] * n_as);
            exit(1); // GGML_ABORT("fatal error");
        }
        // LOG_DEBUG("%s[%d]: %32s, %s, %5d x %5d, %d\n", m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
        // loop over all possible experts, regardless if they are used or not in the batch
        for (int ex = 0; ex < n_as; ++ex) {
            size_t e_start = ex * src1->ne[0];  // this expert's slice of the accumulators

            for (int idx = 0; idx < n_ids; ++idx) {
                for (int row = 0; row < (int)src1->ne[2]; ++row) {
                    // expert chosen for this (token row, selection slot); ids is
                    // not contiguous, hence the byte-stride arithmetic
                    const int excur = *(const int32_t*)(m_ids.data() + row * ids->nb[1] + idx * ids->nb[0]);

                    GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check

                    if (excur != ex)
                        continue;

                    const int64_t i11 = idx % src1->ne[1];
                    const int64_t i12 = row;
                    const float* x = (const float*)((const char*)data + i11 * src1->nb[1] + i12 * src1->nb[2]);

                    // accumulate sum of squares per column for this expert
                    for (int j = 0; j < (int)src1->ne[0]; ++j) {
                        e.values[e_start + j] += x[j] * x[j];
                        e.counts[e_start + j]++;
                        if (!std::isfinite(e.values[e_start + j])) {
                            // NaN/Inf would poison the whole matrix — abort early
                            printf("\n");
                            LOG_ERROR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
                            exit(1);
                        }
                    }
                }
            }
        }
    } else {
        // plain MUL_MAT: one accumulator slot per column
        auto& e = m_stats[wname];
        if (e.values.empty()) {
            e.values.resize(src1->ne[0], 0);
            e.counts.resize(src1->ne[0], 0);
        } else if (e.values.size() != (size_t)src1->ne[0]) {
            LOG_WARN("inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
            exit(1); // GGML_ABORT("fatal error");
        }

        ++e.ncall;
        // LOG_DEBUG("%s[%d]: %32s, %s, %5d x %5d, %d\n", m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
        for (int row = 0; row < (int)src1->ne[1]; ++row) {
            // NOTE(review): row indexing via ne[0] assumes src1 rows are
            // contiguous (unlike the strided MUL_MAT_ID branch) — TODO confirm
            const float* x = data + row * src1->ne[0];
            for (int j = 0; j < (int)src1->ne[0]; ++j) {
                e.values[j] += x[j] * x[j];
                e.counts[j]++;
                if (!std::isfinite(e.values[j])) {
                    LOG_WARN("%f detected in %s\n", e.values[j], wname.c_str());
                    exit(1);
                }
            }
        }
    }
    return true;

}
void IMatrixCollector::save_imatrix(std::string fname,int ncall) const {
154+
LOG_INFO("SAVING_IMATRIX to %s\n", fname.c_str());
155+
156+
if (ncall > 0) {
157+
fname += ".at_";
158+
fname += std::to_string(ncall);
159+
}
160+
// avoid writing imatrix entries that do not have full data
161+
// this can happen with MoE models where some of the experts end up not being exercised by the provided training data
162+
163+
int n_entries = 0;
164+
std::vector<std::string> to_store;
165+
166+
bool is_first = true; // for printing
167+
for (const auto& kv : m_stats) {
168+
const int n_all = kv.second.counts.size();
169+
170+
if (n_all == 0) {
171+
continue;
172+
}
173+
174+
int n_zeros = 0;
175+
for (const int c : kv.second.counts) {
176+
if (c == 0) {
177+
n_zeros++;
178+
}
179+
}
180+
181+
if (n_zeros != 0 && is_first) {
182+
printf("\n");
183+
is_first = false;
184+
}
185+
186+
if (n_zeros == n_all) {
187+
LOG_WARN("entry '%40s' has no data - skipping\n", kv.first.c_str());
188+
continue;
189+
}
190+
191+
if (n_zeros > 0) {
192+
LOG_WARN("entry '%40s' has partial data (%.2f%%) - skipping\n", kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
193+
continue;
194+
}
195+
196+
n_entries++;
197+
to_store.push_back(kv.first);
198+
}
199+
200+
if (to_store.size() < m_stats.size()) {
201+
LOG_WARN("storing only %zu out of %zu entries\n", to_store.size(), m_stats.size());
202+
}
203+
204+
std::ofstream out(fname, std::ios::binary);
205+
out.write((const char*)&n_entries, sizeof(n_entries));
206+
for (const auto& name : to_store) {
207+
const auto& stat = m_stats.at(name);
208+
int len = name.size();
209+
out.write((const char*)&len, sizeof(len));
210+
out.write(name.c_str(), len);
211+
out.write((const char*)&stat.ncall, sizeof(stat.ncall));
212+
int nval = stat.values.size();
213+
out.write((const char*)&nval, sizeof(nval));
214+
if (nval > 0) {
215+
std::vector<float> tmp(nval);
216+
for (int i = 0; i < nval; i++) {
217+
tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
218+
}
219+
out.write((const char*)tmp.data(), nval * sizeof(float));
220+
}
221+
}
222+
223+
// Write the number of call the matrix was computed with
224+
out.write((const char*)&m_last_call, sizeof(m_last_call));
225+
226+
// LOG_DEBUG("\n");
227+
// LOG_DEBUG("stored collected data after %d chunks in %s\n", m_last_call, fname.c_str());
228+
}
229+
230+
bool IMatrixCollector::load_imatrix(const char* fname) {
231+
std::ifstream in(fname, std::ios::binary);
232+
if (!in) {
233+
LOG_ERROR("failed to open %s\n", fname);
234+
return false;
235+
}
236+
int n_entries;
237+
in.read((char*)&n_entries, sizeof(n_entries));
238+
if (in.fail() || n_entries < 1) {
239+
LOG_ERROR("no data in file %s\n", fname);
240+
return false;
241+
}
242+
for (int i = 0; i < n_entries; ++i) {
243+
int len;
244+
in.read((char*)&len, sizeof(len));
245+
std::vector<char> name_as_vec(len + 1);
246+
in.read((char*)name_as_vec.data(), len);
247+
if (in.fail()) {
248+
LOG_ERROR("failed reading name for entry %d from %s\n", i + 1, fname);
249+
return false;
250+
}
251+
name_as_vec[len] = 0;
252+
std::string name{name_as_vec.data()};
253+
auto& e = m_stats[std::move(name)];
254+
int ncall;
255+
in.read((char*)&ncall, sizeof(ncall));
256+
int nval;
257+
in.read((char*)&nval, sizeof(nval));
258+
if (in.fail() || nval < 1) {
259+
LOG_ERROR("failed reading number of values for entry %d\n", i);
260+
m_stats = {};
261+
return false;
262+
}
263+
264+
if (e.values.empty()) {
265+
e.values.resize(nval, 0);
266+
e.counts.resize(nval, 0);
267+
}
268+
269+
std::vector<float> tmp(nval);
270+
in.read((char*)tmp.data(), nval * sizeof(float));
271+
if (in.fail()) {
272+
LOG_ERROR("failed reading data for entry %d\n", i);
273+
m_stats = {};
274+
return false;
275+
}
276+
277+
// Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
278+
for (int i = 0; i < nval; i++) {
279+
e.values[i] += tmp[i];
280+
e.counts[i] += ncall;
281+
}
282+
e.ncall += ncall;
283+
}
284+
return true;
285+
}

0 commit comments

Comments
 (0)