@@ -1160,10 +1160,12 @@ struct llama_kv_cache {
             ggml_free(ctx);
         }
 
+        if (ggml_cpu_has_cublas()) {
 #ifdef GGML_USE_CUBLAS
-        ggml_cuda_free_data(k);
-        ggml_cuda_free_data(v);
-#endif // GGML_USE_CUBLAS
+            ggml_cuda_free_data(k);
+            ggml_cuda_free_data(v);
+#endif
+        }
     }
 };
 
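
This hunk (and the llama_model destructor hunk below) wraps the compile-time `#ifdef GGML_USE_CUBLAS` cleanup in a runtime `ggml_cpu_has_cublas()` check. A minimal, self-contained sketch of that guard pattern follows; `HAS_GPU_BACKEND`, `gpu_has_backend()` and `gpu_free()` are illustrative stand-ins, not the real ggml/CUDA symbols:

    // Sketch only: stand-in names, not the ggml API from the diff.
    #include <cstdio>

    #define HAS_GPU_BACKEND 1              // compile-time switch (cf. GGML_USE_CUBLAS)

    static bool gpu_has_backend() {        // runtime probe (cf. ggml_cpu_has_cublas)
    #if HAS_GPU_BACKEND
        return true;
    #else
        return false;
    #endif
    }

    static void gpu_free(void * buf) {     // backend-specific free (cf. ggml_cuda_free_data)
        std::printf("freeing GPU buffer %p\n", buf);
    }

    struct kv_cache_like {
        void * k = nullptr;
        void * v = nullptr;

        ~kv_cache_like() {
            if (gpu_has_backend()) {       // runtime guard: skipped when the backend is absent
    #if HAS_GPU_BACKEND                    // compile-time guard: code only exists in GPU builds
                gpu_free(k);
                gpu_free(v);
    #endif
            }
        }
    };

    int main() {
        kv_cache_like cache;               // destructor runs the guarded cleanup on exit
        return 0;
    }
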
@@ -1262,12 +1264,16 @@ struct llama_model {
             ggml_free(ctx);
         }
 
+        if (ggml_cpu_has_cublas()) {
 #ifdef GGML_USE_CUBLAS
-        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
-            ggml_cuda_free_data(tensors_by_name[i].second);
+            for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+                ggml_cuda_free_data(tensors_by_name[i].second);
+            }
+            ggml_cuda_free_scratch();
+#endif
         }
-        ggml_cuda_free_scratch();
-#elif defined(GGML_USE_CLBLAST)
+
+#if defined(GGML_USE_CLBLAST)
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cl_free_data(tensors_by_name[i].second);
         }
@@ -1379,23 +1385,26 @@ static bool llama_kv_cache_init(
     ggml_set_name(cache.v, "cache_v");
 
     (void) n_gpu_layers;
+
+    if (ggml_cpu_has_cublas()) {
 #ifdef GGML_USE_CUBLAS
-    size_t vram_kv_cache = 0;
+        size_t vram_kv_cache = 0;
 
-    if (n_gpu_layers > (int)n_layer + 1) {
-        ggml_cuda_assign_buffers_no_scratch(cache.v);
-        LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
-        vram_kv_cache += ggml_nbytes(cache.v);
-    }
-    if (n_gpu_layers > (int)n_layer + 2) {
-        ggml_cuda_assign_buffers_no_scratch(cache.k);
-        LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
-        vram_kv_cache += ggml_nbytes(cache.k);
-    }
-    if (vram_kv_cache > 0) {
-        LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+        if (n_gpu_layers > (int)n_layer + 1) {
+            ggml_cuda_assign_buffers_no_scratch(cache.v);
+            LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.v);
+        }
+        if (n_gpu_layers > (int)n_layer + 2) {
+            ggml_cuda_assign_buffers_no_scratch(cache.k);
+            LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.k);
+        }
+        if (vram_kv_cache > 0) {
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+        }
+#endif
     }
-#endif // GGML_USE_CUBLAS
 
     return true;
 }
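
In the hunk above, the v cache is offloaded once n_gpu_layers exceeds n_layer + 1 and the k cache once it exceeds n_layer + 2. A small self-contained sketch of that threshold logic; the layer counts and byte sizes below are made up for illustration:

    // Sketch of the kv-cache offload thresholds; values are illustrative only.
    #include <cstdio>

    int main() {
        const int n_layer      = 32;
        const int n_gpu_layers = 35;                  // hypothetical request (> n_layer + 2)
        const size_t k_bytes   = 256u * 1024 * 1024;  // stand-in for ggml_nbytes(cache.k)
        const size_t v_bytes   = 256u * 1024 * 1024;  // stand-in for ggml_nbytes(cache.v)

        size_t vram_kv_cache = 0;
        if (n_gpu_layers > n_layer + 1) {             // one extra layer requested -> offload v
            vram_kv_cache += v_bytes;
        }
        if (n_gpu_layers > n_layer + 2) {             // two extra layers requested -> offload k too
            vram_kv_cache += k_bytes;
        }
        if (vram_kv_cache > 0) {
            std::printf("VRAM kv self = %.2f MB\n", vram_kv_cache / 1024.0 / 1024.0);
        }
        return 0;
    }
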
@@ -2455,19 +2464,23 @@ static void llm_load_tensors(
     }
 
     (void) main_gpu;
+
+    enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
+    enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
+
+    if (ggml_cpu_has_cublas()) {
 #ifdef GGML_USE_CUBLAS
-    LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
-    ggml_cuda_set_main_device(main_gpu);
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
-#elif defined(GGML_USE_CLBLAST)
-    LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
-#else
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
+        LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
+        ggml_cuda_set_main_device(main_gpu);
+
+        llama_backend_offload = GGML_BACKEND_GPU;
+        llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
 #endif
+    } else if (ggml_cpu_has_clblast()) {
+        LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
+        llama_backend_offload = GGML_BACKEND_GPU;
+        llama_backend_offload_split = GGML_BACKEND_GPU;
+    }
 
     // prepare memory for the weights
     size_t vram_weights = 0;
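
The hunk above replaces the LLAMA_BACKEND_OFFLOAD / LLAMA_BACKEND_OFFLOAD_SPLIT macros with runtime variables. A compact standalone sketch of that selection; the enum and probe functions are stand-ins for ggml_backend_type, ggml_cpu_has_cublas() and ggml_cpu_has_clblast():

    // Sketch of the macro-to-variable switch; stand-in names, not the ggml API.
    #include <cstdio>

    enum backend_type { BACKEND_CPU, BACKEND_GPU, BACKEND_GPU_SPLIT };

    static bool has_cublas()  { return true;  }   // pretend this is a CUDA build
    static bool has_clblast() { return false; }

    int main() {
        backend_type offload       = BACKEND_CPU;   // defaults, as in the diff
        backend_type offload_split = BACKEND_CPU;

        if (has_cublas()) {                // CUDA: output weights can be split across devices
            offload       = BACKEND_GPU;
            offload_split = BACKEND_GPU_SPLIT;
        } else if (has_clblast()) {        // OpenCL: GPU offload, but no split backend
            offload       = BACKEND_GPU;
            offload_split = BACKEND_GPU;
        }

        std::printf("offload=%d offload_split=%d\n", (int) offload, (int) offload_split);
        return 0;
    }
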
@@ -2493,12 +2506,12 @@ static void llm_load_tensors(
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                     // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                    backend_norm = LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = llama_backend_offload;
 #else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-                    backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm = GGML_BACKEND_CPU;
                     backend_output = GGML_BACKEND_CPU;
@@ -2522,8 +2535,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
                     auto & layer = model.layers[i];
 
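
In the per-layer hunks above and below, layers with index below i_gpu_start = n_layer - n_gpu_layers stay on the CPU and the rest use the offload backend selected earlier. A toy sketch of that placement rule, with made-up layer counts:

    // Sketch of the per-layer placement rule; values are illustrative only.
    #include <cstdio>

    enum backend_type { BACKEND_CPU, BACKEND_GPU };

    int main() {
        const int n_layer      = 8;
        const int n_gpu_layers = 3;                        // hypothetical request
        const int i_gpu_start  = n_layer - n_gpu_layers;   // first offloaded layer index

        for (int i = 0; i < n_layer; ++i) {
            const backend_type backend = i < i_gpu_start ? BACKEND_CPU : BACKEND_GPU;
            std::printf("layer %d -> %s\n", i, backend == BACKEND_CPU ? "CPU" : "GPU");
        }
        return 0;
    }
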
@@ -2559,12 +2572,12 @@ static void llm_load_tensors(
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                     // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                    backend_norm = LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = llama_backend_offload;
 #else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-                    backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm = GGML_BACKEND_CPU;
                     backend_output = GGML_BACKEND_CPU;
@@ -2588,8 +2601,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -2629,12 +2642,12 @@ static void llm_load_tensors(
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                     // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                    backend_norm = LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = llama_backend_offload;
 #else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-                    backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm = GGML_BACKEND_CPU;
                     backend_output = GGML_BACKEND_CPU;
@@ -2660,8 +2673,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -2706,12 +2719,12 @@ static void llm_load_tensors(
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                     // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                    backend_norm = LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = llama_backend_offload;
 #else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-                    backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm = GGML_BACKEND_CPU;
                     backend_output = GGML_BACKEND_CPU;
@@ -2737,8 +2750,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -2783,12 +2796,12 @@ static void llm_load_tensors(
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                     // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                    backend_norm = LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = llama_backend_offload;
 #else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-                    backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm = GGML_BACKEND_CPU;
                     backend_output = GGML_BACKEND_CPU;
@@ -2811,8 +2824,8 @@ static void llm_load_tensors(
                 const int i_gpu_start = n_layer - n_gpu_layers;
                 model.layers.resize(n_layer);
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
-                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
                     auto & layer = model.layers[i];
                     layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
                     layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
@@ -2849,12 +2862,12 @@ static void llm_load_tensors(
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                     // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                    backend_norm = LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = llama_backend_offload;
 #else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-                    backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm = GGML_BACKEND_CPU;
                     backend_output = GGML_BACKEND_CPU;
@@ -2880,8 +2893,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -2927,12 +2940,12 @@ static void llm_load_tensors(
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                     // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                    backend_norm = LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = llama_backend_offload;
 #else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-                    backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm = GGML_BACKEND_CPU;
                     backend_output = GGML_BACKEND_CPU;
@@ -2956,8 +2969,8 @@ static void llm_load_tensors(
                 model.layers.resize(n_layer);
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
-                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
                     auto & layer = model.layers[i];
 