Commit b16c085

examples : fix benchmark-matmult

The precision for Q4_0 has degraded since #1508

1 parent 265db98
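
Note: the first hunk below widens the checksum accumulator in tensor_sum_elements from float to double. A minimal standalone sketch (illustrative only, independent of ggml) of why a float accumulator degrades when summing many elements:

#include <cstdio>

int main() {
    // A float accumulator stalls once the running total reaches 2^24: there
    // the spacing between adjacent floats is 2, so sum + 1.0f lands exactly
    // halfway between representable values and rounds back to the even sum.
    const int n = 100000000;

    float  sum_f32 = 0;
    double sum_f64 = 0;
    for (int i = 0; i < n; i++) {
        sum_f32 += 1.0f;
        sum_f64 += 1.0f;
    }

    printf("float  accumulator: %.1f\n", sum_f32); // 16777216.0 (stuck at 2^24)
    printf("double accumulator: %.1f\n", sum_f64); // 100000000.0 (exact)
}

The benchmark compares tensor_sum_elements of the quantized product against the F32 reference, so accumulator error on large matrices would show up as a spurious precision failure.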

1 file changed (+15, -13 lines)


examples/benchmark/benchmark-matmult.cpp

Lines changed: 15 additions & 13 deletions
@@ -17,11 +17,11 @@
 #include <algorithm>
 
 float tensor_sum_elements(const ggml_tensor * tensor) {
-    float sum = 0;
-    if (tensor->type==GGML_TYPE_F32) {
+    double sum = 0;
+    if (tensor->type == GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {
             for (int k = 0; k < tensor->ne[0]; k++) {
-                sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
+                sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
             }
         }
     }
@@ -110,12 +110,15 @@ int main(int argc, char ** argv) {
 
     //printf("Memsize required = %i\n", sizex*sizex);
 
+    // TODO: perform the bench for all types or for a user specified type
+    const ggml_type qtype = GGML_TYPE_Q4_1;
+
     size_t ctx_size = 0;
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += 1024*1024*16;
@@ -148,7 +151,7 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
     ggml_set_f32(m2, 2.0f);
 
-    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 1 - Matrix Mult via F32 code\n");
     // printf("Creating new tensor m11xm2\n");
     struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
 
@@ -165,17 +168,16 @@ int main(int argc, char ** argv) {
 
     TENSOR_DUMP(gf.nodes[0]);
 
-    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
 
     int32_t nelements = sizex*sizey;
-    int32_t ne[2] = { sizex, sizey };
 
     std::vector<int64_t> hist_cur(1 << 4, 0);
 
     // Set up a the benchmark matrices
     // printf("Creating new tensor q11 & Running quantize\n");
-    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
 
     // Set up a the compute graph
     // printf("Creating new tensor q31\n");
@@ -187,8 +189,8 @@ int main(int argc, char ** argv) {
 
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
-    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
 
     // printf("Creating new tensor q32\n");
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
@@ -206,7 +208,7 @@ int main(int argc, char ** argv) {
     printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
 
 
-    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    // Let's use the F32 result from above as a reference for the quantized multiplication
     float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
 
     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
