Skip to content

Commit 9510af7

Browse files
committed
fixed shape not found error
1 parent 022c370 commit 9510af7

File tree

5 files changed

+54
-56
lines changed

5 files changed

+54
-56
lines changed

examples/mulmat-tune/README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,6 @@ $ ./mulmat-tune bench --m_num 2
269269
32000 4096
270270
16 26 113438 0 0 0 0 100848 66797 0
271271
32 37 207217 0 0 0 0 87275 74103 0
272-
...
273272
```
274273

275274
See example files in dir [bench-out](bench-out) for details.

examples/mulmat-tune/mulmat-tune-tool.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -832,7 +832,8 @@ static void test_ggml_mulmat_tune_estimate_time(void) {
832832

833833
const struct test_data_t *e = &test_data[i];
834834

835-
ggml_mulmat_tune_estimate_time(&tune, e->M, N, K, e->nth, &time_stats);
835+
int rc = ggml_mulmat_tune_estimate_time(&tune, e->M, N, K, e->nth, &time_stats);
836+
GGML_ASSERT(rc == 0);
836837

837838
bool pass = true;
838839
for (int i_profile = 0; i_profile < 2; i_profile++) {

examples/mulmat-tune/mulmat-tune.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ int ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp) {
213213
return 0;
214214
}
215215

216-
void ggml_mulmat_tune_estimate_time(
216+
int ggml_mulmat_tune_estimate_time(
217217
const struct ggml_mulmat_tune *tune, const int M, const int N, const int K,
218218
const int nth, struct ggml_mulmat_tune_time_stats *time_stats) {
219219
int shape_index = -1;
@@ -225,8 +225,7 @@ void ggml_mulmat_tune_estimate_time(
225225
}
226226

227227
if (shape_index < 0) {
228-
fprintf(stderr, "%s: shape not found, N: %d, K: %d\n", __func__, N, K);
229-
abort();
228+
return -1;
230229
}
231230

232231
time_stats->n_profiles = tune->n_profiles;
@@ -310,6 +309,8 @@ void ggml_mulmat_tune_estimate_time(
310309
time_stats->profile_time[ip].total_time += (int)t;
311310
}
312311
}
312+
313+
return 0;
313314
}
314315

315316
static const char *ggml_backend_names[] = {

examples/mulmat-tune/mulmat-tune.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,8 @@ int ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp);
8383

8484
int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *file);
8585

86-
// return 0: ok, -1: M out of range or no data.
87-
void ggml_mulmat_tune_estimate_time(
86+
// return 0: ok, -1: shape not found
87+
int ggml_mulmat_tune_estimate_time(
8888
const struct ggml_mulmat_tune *tune, int M, int N, int K, int nth,
8989
struct ggml_mulmat_tune_time_stats *time_stats);
9090

ggml.c

Lines changed: 46 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -14374,67 +14374,64 @@ void ggml_graph_compute_mul_mat_set_task_profile(struct ggml_cgraph *cgraph) {
1437414374
int N = (int)node->ne[0];
1437514375
int K = (int)node->src1->ne[0];
1437614376

14377+
bool continuous = ggml_is_contiguous(node->src0) && ggml_is_contiguous(node->src1);
14378+
1437714379
struct ggml_task_profile *profiles;
1437814380
int n_profiles = ggml_mulmat_get_task_profiles(&profiles, node->src0->type, node->src1->type);
1437914381
GGML_ASSERT(n_profiles >= 2);
1438014382

14381-
const struct ggml_task_profile *profile = &profiles[0]; // [0] is always the cpu only profile.
14382-
14383-
if (ggml_is_contiguous(node->src0) && ggml_is_contiguous(node->src1)) {
14384-
if (cgraph->mm_tune == NULL) {
14385-
// NOTE: assume all M are same within this compute graph.
14386-
if (M >= 32 && N >= 32 && K >= 32) {
14387-
// NOTE: copied from llama.cpp to here. But I assume that CUDA
14388-
// or CL run in 1 OS-thread as well.
14389-
if (ggml_cpu_has_blas()) {
14390-
//if (!ggml_cpu_has_gpublas()) {
14391-
cgraph->n_threads = 1;
14392-
GGML_PRINT_THREAD_DEBUG(">>>> n_threads was set to 1");
14393-
//}
14394-
profile = &profiles[1]; // [1] is always the 1 thread gpu profile.
14395-
}
14396-
}
14397-
} else {
14398-
int slot = ggml_mulmat_tune_cache_hash(M, N, K) % mm_cache_len;
14399-
struct mm_cache_element *e = &mm_cache[slot];
14383+
const struct ggml_task_profile *profile = NULL;
1440014384

14401-
if (e->M == M && e->N == N && e->K == K) {
14402-
profile = e->profile;
14403-
} else {
14404-
struct ggml_mulmat_tune_time_stats t_stats;
14405-
size_t sz = sizeof(struct ggml_mulmat_tune_profile_time) * n_profiles;
14406-
t_stats.profile_time = malloc(sz);
14407-
GGML_ASSERT(t_stats.profile_time);
14408-
memset(t_stats.profile_time, 0, sz);
14409-
14410-
ggml_mulmat_tune_estimate_time(
14411-
cgraph->mm_tune, M, N, K, cgraph->n_threads, &t_stats);
14412-
int min = INT32_MAX;
14413-
profile = NULL;
14414-
14415-
int i_profile;
14416-
for (i_profile = 0; i_profile < t_stats.n_profiles; i_profile++) {
14417-
int total = t_stats.profile_time[i_profile].total_time;
14418-
if (total < min) {
14419-
min = total;
14420-
profile = &cgraph->mm_tune->profiles[i_profile];
14421-
}
14385+
if (cgraph->mm_tune != NULL && continuous) {
14386+
int slot = ggml_mulmat_tune_cache_hash(M, N, K) % mm_cache_len;
14387+
struct mm_cache_element *e = &mm_cache[slot];
14388+
14389+
if (e->M == M && e->N == N && e->K == K) {
14390+
profile = e->profile;
14391+
} else {
14392+
struct ggml_mulmat_tune_time_stats t_stats;
14393+
size_t sz = sizeof(struct ggml_mulmat_tune_profile_time) * n_profiles;
14394+
t_stats.profile_time = malloc(sz);
14395+
GGML_ASSERT(t_stats.profile_time);
14396+
memset(t_stats.profile_time, 0, sz);
14397+
14398+
int rc = ggml_mulmat_tune_estimate_time(
14399+
cgraph->mm_tune, M, N, K, cgraph->n_threads, &t_stats);
14400+
int min = INT32_MAX;
14401+
profile = NULL;
14402+
14403+
if (rc == 0) {
14404+
int i_profile;
14405+
for (i_profile = 0; i_profile < t_stats.n_profiles; i_profile++) {
14406+
int total = t_stats.profile_time[i_profile].total_time;
14407+
if (total < min) {
14408+
min = total;
14409+
profile = &cgraph->mm_tune->profiles[i_profile];
1442214410
}
14411+
}
1442314412

14424-
e->M = M;
14425-
e->N = N;
14426-
e->K = K;
14427-
e->profile = profile;
14413+
e->M = M;
14414+
e->N = N;
14415+
e->K = K;
14416+
e->profile = profile;
1442814417

14429-
GGML_PRINT_THREAD_DEBUG("(1) M: %d, N: %d, K: %d, backends: %2d, %2d %2d\n",
14430-
M, N, K,
14431-
profile->stages[0].backend,
14432-
profile->stages[1].backend,
14433-
profile->stages[2].backend);
14418+
GGML_PRINT_THREAD_DEBUG("(1) M: %d, N: %d, K: %d, backends: %2d, %2d %2d\n",
14419+
M, N, K,
14420+
profile->stages[0].backend,
14421+
profile->stages[1].backend,
14422+
profile->stages[2].backend);
1443414423
}
1443514424
}
1443614425
}
1443714426

14427+
if (profile == NULL) {
14428+
profile = &profiles[0]; // [0] is always the cpu only profile.
14429+
//if (continuous && M >= 32 && N >= 4096 && K >= 4096) {
14430+
if (continuous && M >= 32 && N >= 32 && K >= 32) {
14431+
profile = &profiles[1]; // [1] is always the 1 thread gpu profile.
14432+
}
14433+
}
14434+
1443814435
memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile));
1443914436

1444014437
GGML_PRINT_THREAD_DEBUG("(2) M: %d, N: %d, K: %d, backends: %2d, %2d %2d\n",

0 commit comments

Comments
 (0)