
Commit ad5856d
update
1 parent 731d688

1 file changed (+80, -67 lines)


llama.cpp: 80 additions, 67 deletions
@@ -1160,10 +1160,12 @@ struct llama_kv_cache {
             ggml_free(ctx);
         }
 
+        if (ggml_cpu_has_cublas()) {
 #ifdef GGML_USE_CUBLAS
-        ggml_cuda_free_data(k);
-        ggml_cuda_free_data(v);
-#endif // GGML_USE_CUBLAS
+            ggml_cuda_free_data(k);
+            ggml_cuda_free_data(v);
+#endif
+        }
     }
 };
 
@@ -1262,12 +1264,16 @@ struct llama_model {
             ggml_free(ctx);
         }
 
+        if (ggml_cpu_has_cublas()) {
 #ifdef GGML_USE_CUBLAS
-        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
-            ggml_cuda_free_data(tensors_by_name[i].second);
+            for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+                ggml_cuda_free_data(tensors_by_name[i].second);
+            }
+            ggml_cuda_free_scratch();
+#endif
         }
-        ggml_cuda_free_scratch();
-#elif defined(GGML_USE_CLBLAST)
+
+#if defined(GGML_USE_CLBLAST)
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cl_free_data(tensors_by_name[i].second);
         }
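Note: both destructor hunks above wrap the existing compile-time CUDA cleanup in a runtime ggml_cpu_has_cublas() check. The sketch below is not code from this commit; it uses stand-in names (has_cublas, cuda_free_data, kv_cache_like) so it compiles on its own, and only illustrates the shape of the resulting pattern: a runtime branch on the outside, the preprocessor guard on the inside.

    #include <cstdio>

    // Stand-in for ggml_cpu_has_cublas(): here it simply mirrors the build flag.
    static bool has_cublas() {
    #ifdef GGML_USE_CUBLAS
        return true;
    #else
        return false;
    #endif
    }

    // Hypothetical stand-in for ggml_cuda_free_data().
    static void cuda_free_data(const char * name) {
        std::printf("freeing device buffer: %s\n", name);
    }

    struct kv_cache_like {
        ~kv_cache_like() {
            // Runtime branch outside, compile-time guard inside: builds without
            // cuBLAS compile an empty branch, builds with it still skip the
            // calls unless the runtime check passes.
            if (has_cublas()) {
    #ifdef GGML_USE_CUBLAS
                cuda_free_data("k");
                cuda_free_data("v");
    #endif
            }
        }
    };

    int main() {
        kv_cache_like cache; // cleanup runs when cache goes out of scope
        return 0;
    }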
@@ -1379,23 +1385,26 @@ static bool llama_kv_cache_init(
     ggml_set_name(cache.v, "cache_v");
 
     (void) n_gpu_layers;
+
+    if (ggml_cpu_has_cublas()) {
 #ifdef GGML_USE_CUBLAS
-    size_t vram_kv_cache = 0;
+        size_t vram_kv_cache = 0;
 
-    if (n_gpu_layers > (int)n_layer + 1) {
-        ggml_cuda_assign_buffers_no_scratch(cache.v);
-        LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
-        vram_kv_cache += ggml_nbytes(cache.v);
-    }
-    if (n_gpu_layers > (int)n_layer + 2) {
-        ggml_cuda_assign_buffers_no_scratch(cache.k);
-        LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
-        vram_kv_cache += ggml_nbytes(cache.k);
-    }
-    if (vram_kv_cache > 0) {
-        LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+        if (n_gpu_layers > (int)n_layer + 1) {
+            ggml_cuda_assign_buffers_no_scratch(cache.v);
+            LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.v);
+        }
+        if (n_gpu_layers > (int)n_layer + 2) {
+            ggml_cuda_assign_buffers_no_scratch(cache.k);
+            LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.k);
+        }
+        if (vram_kv_cache > 0) {
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+        }
+#endif
     }
-#endif // GGML_USE_CUBLAS
 
     return true;
 }
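Note: the re-indented block above keeps the original offload rule intact: the V cache is assigned to the GPU once n_gpu_layers exceeds n_layer + 1, the K cache once it exceeds n_layer + 2, and the total is logged in MB. A stand-alone sketch of just that threshold arithmetic, with made-up sizes and no ggml calls:

    #include <cstdio>
    #include <cstddef>

    int main() {
        // Hypothetical numbers, for illustration only.
        const int    n_layer      = 32;
        const int    n_gpu_layers = 35;                 // > n_layer + 2, so both caches offload
        const size_t v_bytes      = 256u * 1024 * 1024; // pretend V cache size
        const size_t k_bytes      = 256u * 1024 * 1024; // pretend K cache size

        size_t vram_kv_cache = 0;

        if (n_gpu_layers > n_layer + 1) {               // V cache threshold
            vram_kv_cache += v_bytes;
        }
        if (n_gpu_layers > n_layer + 2) {               // K cache threshold
            vram_kv_cache += k_bytes;
        }
        if (vram_kv_cache > 0) {
            std::printf("VRAM kv self = %.2f MB\n", vram_kv_cache / 1024.0 / 1024.0);
        }
        return 0;
    }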
@@ -2455,19 +2464,23 @@ static void llm_load_tensors(
     }
 
     (void) main_gpu;
+
+    enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
+    enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
+
+    if (ggml_cpu_has_cublas()) {
 #ifdef GGML_USE_CUBLAS
-    LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
-    ggml_cuda_set_main_device(main_gpu);
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
-#elif defined(GGML_USE_CLBLAST)
-    LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
-#else
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
+        LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
+        ggml_cuda_set_main_device(main_gpu);
+
+        llama_backend_offload = GGML_BACKEND_GPU;
+        llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
 #endif
+    } else if (ggml_cpu_has_clblast()) {
+        LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
+        llama_backend_offload = GGML_BACKEND_GPU;
+        llama_backend_offload_split = GGML_BACKEND_GPU;
+    }
 
     // prepare memory for the weights
     size_t vram_weights = 0;
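Note: this hunk is the heart of the change. LLAMA_BACKEND_OFFLOAD and LLAMA_BACKEND_OFFLOAD_SPLIT stop being preprocessor macros fixed at build time and become ordinary variables picked at runtime, defaulting to CPU placement and upgraded when a GPU backend is detected. A minimal sketch of that selection, assuming a simplified three-value backend enum and stand-in detection functions:

    #include <cstdio>

    // Simplified stand-in for ggml_backend_type.
    enum backend_type { BACKEND_CPU, BACKEND_GPU, BACKEND_GPU_SPLIT };

    // Stand-ins for ggml_cpu_has_cublas() / ggml_cpu_has_clblast().
    static bool has_cublas()  { return false; }
    static bool has_clblast() { return true;  }

    int main() {
        // Default to CPU placement, then upgrade if a GPU backend is available.
        backend_type offload       = BACKEND_CPU;
        backend_type offload_split = BACKEND_CPU;

        if (has_cublas()) {
            offload       = BACKEND_GPU;
            offload_split = BACKEND_GPU_SPLIT; // the CUDA path can split tensors across devices
        } else if (has_clblast()) {
            offload       = BACKEND_GPU;
            offload_split = BACKEND_GPU;       // the OpenCL path has no split mode
        }

        std::printf("offload=%d offload_split=%d\n", (int) offload, (int) offload_split);
        return 0;
    }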
@@ -2493,12 +2506,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
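Note: with the macros gone, the norm/output placement above reads the runtime variables instead. Outside Windows the output norm follows the offload backend; on Windows it stays on the CPU unless the requested n_gpu_layers covers every layer plus the extra slots; the output matrix uses the split backend. A compressed sketch of that decision with hypothetical values (the surrounding offload condition is assumed, since the hunk only shows its body):

    #include <cstdio>

    enum backend_type { BACKEND_CPU, BACKEND_GPU, BACKEND_GPU_SPLIT };

    int main() {
        // Hypothetical inputs.
        const int n_layer      = 32;
        const int n_gpu_layers = 33; // offload requested past the last layer
        const backend_type offload       = BACKEND_GPU;
        const backend_type offload_split = BACKEND_GPU_SPLIT;

        backend_type backend_norm;
        backend_type backend_output;

        if (n_gpu_layers > n_layer) { // assumed guard, not shown in the hunk
    #ifndef _WIN32
            backend_norm = offload;
    #else
            // On Windows, keep the norm on the CPU unless everything is offloaded.
            backend_norm = n_gpu_layers <= n_layer + 2 ? BACKEND_CPU : offload;
    #endif
            backend_output = offload_split;
        } else {
            backend_norm   = BACKEND_CPU;
            backend_output = BACKEND_CPU;
        }

        std::printf("norm=%d output=%d\n", (int) backend_norm, (int) backend_output);
        return 0;
    }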
@@ -2522,8 +2535,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
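Note: every per-layer hunk in this file applies the same substitution inside the weight-loading loop: layers with index below i_gpu_start stay on the CPU, the rest use the runtime-selected backends. A stand-alone sketch of that split (just the index arithmetic, no model structs):

    #include <cstdio>
    #include <cstdint>

    enum backend_type { BACKEND_CPU, BACKEND_GPU, BACKEND_GPU_SPLIT };

    int main() {
        // Hypothetical configuration.
        const uint32_t n_layer      = 8;
        const int      n_gpu_layers = 3;
        const int      i_gpu_start  = (int) n_layer - n_gpu_layers; // first GPU-resident layer

        const backend_type offload       = BACKEND_GPU;
        const backend_type offload_split = BACKEND_GPU_SPLIT;

        for (uint32_t i = 0; i < n_layer; ++i) {
            const backend_type backend       = (int) i < i_gpu_start ? BACKEND_CPU : offload;
            const backend_type backend_split = (int) i < i_gpu_start ? BACKEND_CPU : offload_split;
            std::printf("layer %u -> backend=%d split=%d\n", (unsigned) i, (int) backend, (int) backend_split);
        }
        return 0;
    }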
@@ -2559,12 +2572,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2588,8 +2601,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2629,12 +2642,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2660,8 +2673,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2706,12 +2719,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2737,8 +2750,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2783,12 +2796,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2811,8 +2824,8 @@ static void llm_load_tensors(
         const int i_gpu_start = n_layer - n_gpu_layers;
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
             auto & layer = model.layers[i];
             layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
             layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
@@ -2849,12 +2862,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2880,8 +2893,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 
@@ -2927,12 +2940,12 @@ static void llm_load_tensors(
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-            backend_norm = LLAMA_BACKEND_OFFLOAD;
+            backend_norm = llama_backend_offload;
 #else
-            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32
 
-            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+            backend_output = llama_backend_offload_split;
         } else {
             backend_norm = GGML_BACKEND_CPU;
             backend_output = GGML_BACKEND_CPU;
@@ -2956,8 +2969,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);
 
        for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
 
             auto & layer = model.layers[i];
 