@@ -171,7 +171,8 @@ int main(int argc, char ** argv) {
171
171
struct ggml_tensor * m11xm2 = ggml_mul_mat (ctx, m11, m2);
172
172
173
173
// printf("Creating compute graph\n");
174
- struct ggml_cgraph gf = ggml_build_forward (m11xm2);
174
+ struct ggml_cgraph * gf = ggml_new_graph (ctx);
175
+ ggml_build_forward_expand (gf, m11xm2);
175
176
176
177
printf (" n_threads=%i\n " , benchmark_params.n_threads );
177
178
@@ -180,9 +181,9 @@ int main(int argc, char ** argv) {
180
181
181
182
std::vector<uint8_t > work_buffer;
182
183
183
- ggml_graph_compute_helper (work_buffer, & gf, benchmark_params.n_threads );
184
+ ggml_graph_compute_helper (work_buffer, gf, benchmark_params.n_threads );
184
185
185
- TENSOR_DUMP (gf. nodes [0 ]);
186
+ TENSOR_DUMP (gf-> nodes [0 ]);
186
187
187
188
printf (" \n ------ Test 2 - Matrix Mult via %s code\n " , ggml_type_name (qtype));
188
189
@@ -200,7 +201,8 @@ int main(int argc, char ** argv) {
200
201
struct ggml_tensor * q31 = ggml_mul_mat (ctx, q11, m2);
201
202
202
203
// printf("Creating compute graph\n");
203
- struct ggml_cgraph gf31 = ggml_build_forward (q31);
204
+ struct ggml_cgraph * gf31 = ggml_new_graph (ctx);
205
+ ggml_build_forward_expand (gf31, q31);
204
206
205
207
// Set up a second graph computation to make sure we override the CPU cache lines
206
208
// printf("Creating new tensor q12 & Running quantize\n");
@@ -211,7 +213,8 @@ int main(int argc, char ** argv) {
211
213
struct ggml_tensor * q32 = ggml_mul_mat (ctx, q12, m2);
212
214
213
215
// printf("Creating compute graph\n");
214
- struct ggml_cgraph gf32 = ggml_build_forward (q32);
216
+ struct ggml_cgraph * gf32 = ggml_new_graph (ctx);
217
+ ggml_build_forward_expand (gf32, q32);
215
218
printf (" n_threads=%i\n " , benchmark_params.n_threads );
216
219
217
220
const int dimx = sizex;
@@ -223,7 +226,7 @@ int main(int argc, char ** argv) {
223
226
224
227
225
228
// Let's use the F32 result from above as a reference for the quantized multiplication
226
- float sum_of_F32_reference = tensor_sum_elements (gf. nodes [0 ]);
229
+ float sum_of_F32_reference = tensor_sum_elements (gf-> nodes [0 ]);
227
230
228
231
printf (" Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n " );
229
232
printf (" =====================================================================================\n " );
@@ -233,7 +236,7 @@ int main(int argc, char ** argv) {
233
236
234
237
long long int start = ggml_time_us ();
235
238
// printf("Running ggml_graph_compute\n");
236
- ggml_graph_compute_helper (work_buffer, & gf31, benchmark_params.n_threads );
239
+ ggml_graph_compute_helper (work_buffer, gf31, benchmark_params.n_threads );
237
240
238
241
long long int stop = ggml_time_us ();
239
242
long long int usec = stop-start;
@@ -251,7 +254,7 @@ int main(int argc, char ** argv) {
251
254
252
255
// Check that the matrix multiplication result is in the right ballpark
253
256
// We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
254
- float sum_of_Q4_result = tensor_sum_elements (gf31. nodes [0 ]);
257
+ float sum_of_Q4_result = tensor_sum_elements (gf31-> nodes [0 ]);
255
258
float delta = std::abs (sum_of_Q4_result - sum_of_F32_reference);
256
259
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000 ; // Let's accept an epsilon of 10^-6
257
260
@@ -266,7 +269,7 @@ int main(int argc, char ** argv) {
266
269
}
267
270
268
271
// Running a different graph computation to make sure we override the CPU cache lines
269
- ggml_graph_compute_helper (work_buffer, & gf32, benchmark_params.n_threads );
272
+ ggml_graph_compute_helper (work_buffer, gf32, benchmark_params.n_threads );
270
273
}
271
274
printf (" \n " );
272
275
printf (" Average%78.2f\n " ,gflops_sum/((double )benchmark_params.n_iterations ));
0 commit comments