24
24
25
25
float tensor_sum_elements (struct ggml_tensor * tensor ) {
26
26
float sum = 0 ;
27
- if (tensor -> type == 6 ) {
28
- for (int j = 0 ; j < tensor -> ne [1 ]; j ++ ) {
29
- for (int k = 0 ; k < tensor -> ne [0 ]; k ++ ) {
30
- sum += ((float * ) tensor -> data )[j * tensor -> ne [0 ]+ k ];
31
- }
32
- }
27
+ if (tensor -> type == 6 ) {
28
+ for (int j = 0 ; j < tensor -> ne [1 ]; j ++ ) {
29
+ for (int k = 0 ; k < tensor -> ne [0 ]; k ++ ) {
30
+ sum += ((float * ) tensor -> data )[j * tensor -> ne [0 ]+ k ];
31
+ }
32
+ }
33
33
}
34
34
return sum ;
35
35
}
@@ -39,7 +39,7 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
39
39
These are mapping to unknown
40
40
GGML_TYPE_I8,
41
41
GGML_TYPE_I16,
42
- GGML_TYPE_I32,
42
+ GGML_TYPE_I32,
43
43
GGML_TYPE_COUNT,
44
44
*/
45
45
@@ -50,7 +50,7 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
50
50
TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
51
51
{ float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
52
52
53
// Settings for one benchmark run; the initializers below are the defaults.
struct benchmark_params_struct {
    // Thread count copied into the n_threads field of each compute graph.
    int32_t n_threads    = 1;
    // Number of passes made by the timed benchmark loop in main().
    int32_t n_iterations = 10;
};
@@ -67,7 +67,7 @@ void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct para
67
67
68
68
int main (int argc , char * * argv ) {
69
69
70
-
70
+
71
71
struct benchmark_params_struct benchmark_params ;
72
72
73
73
bool invalid_param = false;
@@ -90,7 +90,7 @@ int main(int argc, char ** argv) {
90
90
} else if (arg == "-h" || arg == "--help" ) {
91
91
print_usage (argc , argv , benchmark_params );
92
92
exit (0 );
93
- }
93
+ }
94
94
if (invalid_param ) {
95
95
fprintf (stderr , "error: invalid parameter for argument: %s\n" , arg .c_str ());
96
96
print_usage (argc , argv , benchmark_params );
@@ -101,41 +101,41 @@ int main(int argc, char ** argv) {
101
101
102
102
// create the ggml context
103
103
printf ("Starting Test\n" );
104
-
105
104
106
-
105
+
106
+
107
107
struct ggml_context * ctx ;
108
108
//const int sizex = 4096;
109
109
//const int sizey = 11008;
110
110
111
111
#undef VERBOSE_DEBUGGING
112
112
#ifndef VERBOSE_DEBUGGING
113
113
const int sizey = 4096 ;
114
- const int sizex = 11008 ;
114
+ const int sizex = 11008 ;
115
115
const int sizez = 128 ;
116
116
#else
117
117
/* Working - let's increase size */
118
118
const int sizey = 1 ;
119
- const int sizex = (8 * 32 );
119
+ const int sizex = (8 * 32 );
120
120
const int sizez = 1 ;
121
121
122
122
/*const int sizey = 1;
123
- const int sizex = 3*(8*32);
123
+ const int sizex = 3*(8*32);
124
124
const int sizez = 1;*/
125
125
#endif
126
126
127
127
//printf("Memsize required = %i\n", sizex*sizex);
128
- ggml_type wtype = GGML_TYPE_F32 ;
129
-
128
+ ggml_type wtype = GGML_TYPE_F32 ;
129
+
130
130
size_t ctx_size = 0 ;
131
131
ctx_size += sizex * sizey * ggml_type_sizef (wtype );
132
132
ctx_size += sizex * sizey * ggml_type_sizef (wtype );
133
133
ctx_size += sizex * sizey * ggml_type_sizef (GGML_TYPE_F32 );
134
134
ctx_size += sizex * sizeof (float );
135
- ctx_size += 1024 * 1024 * 100 ;
136
-
135
+ ctx_size += 1024 * 1024 * 100 ;
136
+
137
137
printf ("Allocating Memory of size %li byes, %li MB\n" ,ctx_size , (ctx_size /1024 /1024 ));
138
-
138
+
139
139
struct ggml_init_params params = {
140
140
/*.mem_size =*/ ctx_size ,
141
141
/*.mem_buffer =*/ NULL ,
@@ -147,88 +147,88 @@ int main(int argc, char ** argv) {
147
147
fprintf (stderr , "%s: ggml_init() failed\n" , __func__ );
148
148
return false;
149
149
}
150
-
151
-
150
+
151
+
152
152
printf ("Creating new tensors\n" );
153
153
// printf("Creating new tensor m1\n");
154
154
struct ggml_tensor * m11 = ggml_new_tensor_2d (ctx , GGML_TYPE_F32 , sizex , sizey );
155
155
ggml_set_f32 (m11 , 1.0f );
156
-
156
+
157
157
// printf("Creating new tensor m1\n");
158
158
struct ggml_tensor * m12 = ggml_new_tensor_2d (ctx , GGML_TYPE_F32 , sizex , sizey );
159
159
ggml_set_f32 (m12 , 1.5f );
160
-
160
+
161
161
// printf("Creating new tensor m2\n");
162
162
struct ggml_tensor * m2 = ggml_new_tensor_2d (ctx , GGML_TYPE_F32 , sizex , sizez );
163
163
ggml_set_f32 (m2 , 2.0f );
164
-
164
+
165
165
printf ("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n" );
166
166
// printf("Creating new tensor m11xm2\n");
167
167
struct ggml_tensor * m11xm2 = ggml_mul_mat (ctx , m11 , m2 );
168
-
168
+
169
169
// printf("Creating compute graph\n");
170
170
struct ggml_cgraph gf = ggml_build_forward (m11xm2 );
171
-
171
+
172
172
gf .n_threads = benchmark_params .n_threads ;
173
- printf ("cgraph->n_threads=%i\n" ,gf .n_threads );
174
-
173
+ printf ("cgraph->n_threads=%i\n" ,gf .n_threads );
174
+
175
175
TENSOR_DUMP (m11 );
176
176
TENSOR_DUMP (m2 );
177
-
177
+
178
178
ggml_graph_compute (ctx , & gf );
179
179
180
180
TENSOR_DUMP (gf .nodes [0 ]);
181
-
181
+
182
182
printf ("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n" );
183
-
183
+
184
184
int32_t nelements = sizex * sizey ;
185
185
int32_t ne [2 ] = { sizex , sizey };
186
-
187
- std ::vector < int64_t > hist_cur (1 << 4 , 0 );
186
+
187
+ std ::vector < int64_t > hist_cur (1 << 4 , 0 );
188
188
189
189
// Set up the benchmark matrices
190
190
// printf("Creating new tensor q11 & Running quantize\n");
191
191
struct ggml_tensor * q11 = ggml_new_tensor_2d (ctx , GGML_TYPE_Q4_0 , sizex , sizey );
192
192
ggml_quantize_q4_0 ((const float * ) m11 -> data , q11 -> data , nelements , ne [0 ], hist_cur .data ());
193
-
193
+
194
194
// Set up the compute graph
195
195
// printf("Creating new tensor q31\n");
196
196
struct ggml_tensor * q31 = ggml_mul_mat (ctx , q11 , m2 );
197
-
197
+
198
198
// printf("Creating compute graph\n");
199
199
struct ggml_cgraph gf31 = ggml_build_forward (q31 );
200
200
gf31 .n_threads = benchmark_params .n_threads ;
201
-
202
- // Set up a second graph computation to make sure we override the CPU cache lines
201
+
202
+ // Set up a second graph computation to make sure we override the CPU cache lines
203
203
// printf("Creating new tensor q12 & Running quantize\n");
204
204
struct ggml_tensor * q12 = ggml_new_tensor_2d (ctx , GGML_TYPE_Q4_0 , sizex , sizey );
205
205
ggml_quantize_q4_0 ((const float * ) m12 -> data , q12 -> data , nelements , ne [0 ], hist_cur .data ());
206
206
207
207
// printf("Creating new tensor q32\n");
208
208
struct ggml_tensor * q32 = ggml_mul_mat (ctx , q12 , m2 );
209
-
209
+
210
210
//printf("Creating compute graph\n");
211
211
struct ggml_cgraph gf32 = ggml_build_forward (q32 );
212
212
gf32 .n_threads = benchmark_params .n_threads ;
213
- printf ("cgraph->n_threads=%i\n" ,gf31 .n_threads );
214
-
213
+ printf ("cgraph->n_threads=%i\n" ,gf31 .n_threads );
214
+
215
215
const int dimx = sizex ;
216
216
const int dimy = sizey ;
217
217
const int dimz = sizez ;
218
218
long long int flops_per_dot_product = dimy + dimy ;
219
219
long long int flops_per_matrix = flops_per_dot_product * dimx * dimz ; ;
220
220
printf ("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n" , sizex , sizey , 1 , sizex , sizez , 1 , 1.0f * flops_per_matrix / 1000 / 1000 / 1000 );
221
-
221
+
222
222
223
223
// Let's use the F32 result from above as a reference for the q4_0 multiplication
224
224
float sum_of_F32_reference = tensor_sum_elements (gf .nodes [0 ]);
225
-
225
+
226
226
227
227
printf ("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n" );
228
228
printf ("==============================================================================================\n" );
229
-
229
+
230
230
for (int i = 0 ;i < benchmark_params .n_iterations ;i ++ ) {
231
-
231
+
232
232
long long int start = ggml_time_us ();
233
233
//printf("Running ggml_graph_compute\n");
234
234
ggml_graph_compute (ctx , & gf31 );
@@ -238,33 +238,33 @@ int main(int argc, char ** argv) {
238
238
float flops_per_usec = (1.0f * flops_per_matrix )/usec ;
239
239
printf ("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n" ,
240
240
i ,
241
- gf31 .n_threads ,
242
- sizex , sizey , sizez , flops_per_matrix ,
241
+ gf31 .n_threads ,
242
+ sizex , sizey , sizez , flops_per_matrix ,
243
243
usec ,flops_per_usec );
244
244
245
245
#ifdef VERBOSE_DEBUGGING
246
246
TENSOR_DUMP ("res" ,gf31 .nodes [0 ])
247
247
#endif
248
248
249
- // Check that the matrix multiplication result is in the right ballpark
249
+ // Check that the matrix multiplication result is in the right ballpark
250
250
// We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
251
251
float sum_of_Q4_result = tensor_sum_elements (gf31 .nodes [0 ]);
252
252
float delta = abs (sum_of_Q4_result - sum_of_F32_reference );
253
253
float allowed_delta = (sum_of_F32_reference ) / 1000 / 1000 ; // Let's accept an epsilon of 10^-6
254
254
255
255
if (delta > allowed_delta ) {
256
256
printf ("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n" ,
257
- sum_of_F32_reference ,
257
+ sum_of_F32_reference ,
258
258
sum_of_Q4_result ,
259
259
delta ,
260
260
allowed_delta
261
261
);
262
262
exit (0 );
263
263
}
264
-
265
- // Running a different graph computation to make sure we override the CPU cache lines
264
+
265
+ // Running a different graph computation to make sure we override the CPU cache lines
266
266
ggml_graph_compute (ctx , & gf32 );
267
-
267
+
268
268
}
269
-
269
+
270
270
}
0 commit comments