@@ -17,11 +17,11 @@
#include <algorithm>

float tensor_sum_elements(const ggml_tensor * tensor) {
-    float sum = 0;
-    if (tensor->type == GGML_TYPE_F32) {
+    double sum = 0;
+    if (tensor->type == GGML_TYPE_F32) {
        for (int j = 0; j < tensor->ne[1]; j++) {
            for (int k = 0; k < tensor->ne[0]; k++) {
-                sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
+                sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
            }
        }
    }
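Not part of the diff: the hunk above swaps the checksum accumulator from float to double. With matrices of benchmark size the running sum grows past the point where a 24-bit float significand can absorb further small addends, so the F32 reference checksum can drift for purely numeric reasons. A minimal standalone C++ illustration of the effect (all names here are ours, not the benchmark's):

// Standalone sketch: a float accumulator stalls once the running total
// reaches 2^24, because 16777216.0f + 1.0f rounds back to 16777216.0f;
// a double accumulator keeps counting exactly.
#include <cstdio>

int main() {
    const int n = 1 << 25;   // 33,554,432 addends
    float  sum_f = 0.0f;
    double sum_d = 0.0;
    for (int i = 0; i < n; i++) {
        sum_f += 1.0f;       // saturates at 16777216 (2^24)
        sum_d += 1.0;        // reaches 33554432 exactly
    }
    printf("float : %.1f\ndouble: %.1f\n", sum_f, sum_d);
    return 0;
}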
@@ -110,12 +110,15 @@ int main(int argc, char ** argv) {

    // printf("Memsize required = %i\n", sizex*sizex);

+    // TODO: perform the bench for all types or for a user specified type
+    const ggml_type qtype = GGML_TYPE_Q4_1;
+
    size_t ctx_size = 0;
    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
    ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
    ctx_size += 1024*1024*16;
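An aside on the sizing above, not from the commit: ggml_type_sizef() reports effective bytes per element as a float, so block-quantized types contribute a fractional per-element cost and the qtype buffers are budgeted at a fraction of their F32 equivalents. A back-of-envelope sketch, assuming the Q4_0 layout of this era (one fp32 scale plus 16 packed bytes per 32-element block; the layout constants below are our assumption):

// Rough check of the ctx_size accounting for a quantized buffer.
#include <cstdio>

int main() {
    // Assumed Q4_0 block: 4-byte fp32 scale + 32 four-bit quants (16 bytes).
    const double bytes_per_block = 4 + 32/2;                      // 20 bytes
    const double q4_0_sizef = bytes_per_block / 32;               // 0.625 B/elem
    const double f32_sizef  = 4.0;

    const long long sizex = 4096, sizey = 4096;                   // example dims
    printf("F32  buffer: %6.1f MiB\n", sizex*sizey*f32_sizef  / (1024.0*1024));
    printf("Q4_0 buffer: %6.1f MiB\n", sizex*sizey*q4_0_sizef / (1024.0*1024));
    return 0;
}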
@@ -148,7 +151,7 @@ int main(int argc, char ** argv) {
    struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
    ggml_set_f32(m2, 2.0f);

-    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 1 - Matrix Mult via F32 code\n");
    // printf("Creating new tensor m11xm2\n");
    struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);

@@ -165,17 +168,16 @@ int main(int argc, char ** argv) {

    TENSOR_DUMP(gf.nodes[0]);

-    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));

    int32_t nelements = sizex*sizey;
-    int32_t ne[2] = { sizex, sizey };

    std::vector<int64_t> hist_cur(1 << 4, 0);

    // Set up a the benchmark matrices
    // printf("Creating new tensor q11 & Running quantize\n");
-    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());

    // Set up a the compute graph
    // printf("Creating new tensor q31\n");
@@ -187,8 +189,8 @@ int main(int argc, char ** argv) {

    // Set up a second graph computation to make sure we override the CPU cache lines
    // printf("Creating new tensor q12 & Running quantize\n");
-    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());

    // printf("Creating new tensor q32\n");
    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
@@ -206,7 +208,7 @@ int main(int argc, char ** argv) {
    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);


-    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    // Let's use the F32 result from above as a reference for the quantized multiplication
    float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);

    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");