@@ -14374,67 +14374,64 @@ void ggml_graph_compute_mul_mat_set_task_profile(struct ggml_cgraph *cgraph) {
     int N = (int)node->ne[0];
     int K = (int)node->src1->ne[0];
 
+    bool continuous = ggml_is_contiguous(node->src0) && ggml_is_contiguous(node->src1);
+
     struct ggml_task_profile *profiles;
     int n_profiles = ggml_mulmat_get_task_profiles(&profiles, node->src0->type, node->src1->type);
     GGML_ASSERT(n_profiles >= 2);
 
-    const struct ggml_task_profile *profile = &profiles[0]; // [0] is always the cpu only profile.
-
-    if (ggml_is_contiguous(node->src0) && ggml_is_contiguous(node->src1)) {
-        if (cgraph->mm_tune == NULL) {
-            // NOTE: assume all M are same within this comput egraph.
-            if (M >= 32 && N >= 32 && K >= 32) {
-                // NOTE: copied from llama.cpp to here. But I assue that CUDA
-                // or CL run in 1 OS-thread as well.
-                if (ggml_cpu_has_blas()) {
-                    //if (!ggml_cpu_has_gpublas()) {
-                    cgraph->n_threads = 1;
-                    GGML_PRINT_THREAD_DEBUG(">>>> n_threads was set to 1");
-                    //}
-                    profile = &profiles[1]; // [1] is always the 1 thread gpu profile.
-                }
-            }
-        } else {
-            int slot = ggml_mulmat_tune_cache_hash(M, N, K) % mm_cache_len;
-            struct mm_cache_element *e = &mm_cache[slot];
+    const struct ggml_task_profile *profile = NULL;
 
-            if (e->M == M && e->N == N && e->K == K) {
-                profile = e->profile;
-            } else {
-                struct ggml_mulmat_tune_time_stats t_stats;
-                size_t sz = sizeof(struct ggml_mulmat_tune_profile_time) * n_profiles;
-                t_stats.profile_time = malloc(sz);
-                GGML_ASSERT(t_stats.profile_time);
-                memset(t_stats.profile_time, 0, sz);
-
-                ggml_mulmat_tune_estimate_time(
-                    cgraph->mm_tune, M, N, K, cgraph->n_threads, &t_stats);
-                int min = INT32_MAX;
-                profile = NULL;
-
-                int i_profile;
-                for (i_profile = 0; i_profile < t_stats.n_profiles; i_profile++) {
-                    int total = t_stats.profile_time[i_profile].total_time;
-                    if (total < min) {
-                        min = total;
-                        profile = &cgraph->mm_tune->profiles[i_profile];
-                    }
-                }
+    if (cgraph->mm_tune != NULL && continuous) {
+        int slot = ggml_mulmat_tune_cache_hash(M, N, K) % mm_cache_len;
+        struct mm_cache_element *e = &mm_cache[slot];
+
+        if (e->M == M && e->N == N && e->K == K) {
+            profile = e->profile;
+        } else {
+            struct ggml_mulmat_tune_time_stats t_stats;
+            size_t sz = sizeof(struct ggml_mulmat_tune_profile_time) * n_profiles;
+            t_stats.profile_time = malloc(sz);
+            GGML_ASSERT(t_stats.profile_time);
+            memset(t_stats.profile_time, 0, sz);
+
+            int rc = ggml_mulmat_tune_estimate_time(
+                cgraph->mm_tune, M, N, K, cgraph->n_threads, &t_stats);
+            int min = INT32_MAX;
+            profile = NULL;
+
+            if (rc == 0) {
+                int i_profile;
+                for (i_profile = 0; i_profile < t_stats.n_profiles; i_profile++) {
+                    int total = t_stats.profile_time[i_profile].total_time;
+                    if (total < min) {
+                        min = total;
+                        profile = &cgraph->mm_tune->profiles[i_profile];
+                    }
+                }
+            }
 
-                e->M = M;
-                e->N = N;
-                e->K = K;
-                e->profile = profile;
+            e->M = M;
+            e->N = N;
+            e->K = K;
+            e->profile = profile;
 
-                GGML_PRINT_THREAD_DEBUG("(1) M: %d, N: %d, K: %d, backends: %2d, %2d %2d\n",
-                    M, N, K,
-                    profile->stages[0].backend,
-                    profile->stages[1].backend,
-                    profile->stages[2].backend);
+            GGML_PRINT_THREAD_DEBUG("(1) M: %d, N: %d, K: %d, backends: %2d, %2d %2d\n",
+                M, N, K,
+                profile->stages[0].backend,
+                profile->stages[1].backend,
+                profile->stages[2].backend);
-            }
         }
     }
 
+    if (profile == NULL) {
+        profile = &profiles[0]; // [0] is always the cpu only profile.
+        //if (continuous && M >= 32 && N >= 4096 && K >= 4096) {
+        if (continuous && M >= 32 && N >= 32 && K >= 32) {
+            profile = &profiles[1]; // [1] is always the 1 thread gpu profile.
+        }
+    }
+
     memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile));
 
     GGML_PRINT_THREAD_DEBUG("(2) M: %d, N: %d, K: %d, backends: %2d, %2d %2d\n",
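On a cache miss, the new code estimates per-profile execution time and takes the argmin, and only when no tuned choice exists does it fall back to the fixed defaults: `profiles[0]`, the CPU-only profile, promoted to `profiles[1]`, the single-thread GPU profile, when the operands are contiguous and M, N, K are all at least 32. A hedged sketch of that argmin shape, with a hypothetical `profile_time` type standing in for `ggml_mulmat_tune_profile_time`:

```c
#include <limits.h>

// Illustrative stand-in for ggml_mulmat_tune_profile_time.
struct profile_time { int total_time; };

// Return the index of the profile with the smallest estimated total time,
// or -1 when there are no estimates. Returning -1 mirrors `profile == NULL`
// in the hunk above, which makes the caller apply the size-threshold fallback.
static int pick_min_time_profile(const struct profile_time *times, int n) {
    int best = -1;
    int min = INT_MAX;
    for (int i = 0; i < n; i++) {
        if (times[i].total_time < min) {
            min = times[i].total_time;
            best = i;
        }
    }
    return best;
}
```

Restructuring from "defaults first, tune as override" to "tune first, defaults last" is what lets the hunk collapse the old nested `if (cgraph->mm_tune == NULL)` chain into one flat `if (profile == NULL)` fallback.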