@@ -1494,6 +1494,7 @@ static bool llama_kv_cache_init(
             ggml_type   wtype,
              uint32_t   n_ctx,
                   int   n_gpu_layers) {
+    fprintf(stderr, "GPULAYERS '%d'\n", n_gpu_layers);
    const uint32_t n_embd  = hparams.n_embd_gqa();
    const uint32_t n_layer = hparams.n_layer;
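The new trace writes straight to stderr and bypasses llama.cpp's logging. A minimal alternative sketch, assuming nothing beyond the LLAMA_LOG_INFO macro already used elsewhere in this function, would keep the trace consistent with the rest of the log output:

    // Sketch: route the same value through the project's log macro instead
    // of printing to stderr unconditionally.
    LLAMA_LOG_INFO("%s: n_gpu_layers = %d\n", __func__, n_gpu_layers);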
@@ -1531,6 +1532,7 @@ static bool llama_kv_cache_init(
    (void) n_gpu_layers;

#ifdef GGML_USE_CUBLAS
+    fprintf(stderr, "USE CUBLAS\n");
    if (ggml_cublas_loaded()) {
        size_t vram_kv_cache = 0;
@@ -1548,6 +1550,8 @@ static bool llama_kv_cache_init(
            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
        }
    }
+#else
+    fprintf(stderr, "NO USE CUBLAS\n");
#endif

    return true;
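Because the new message sits behind the preprocessor, exactly one of the two strings is compiled into the binary, so the output reveals how the build was configured rather than any run-time state. A standalone sketch of the same pattern, assuming only that the build system defines GGML_USE_CUBLAS for cuBLAS builds:

    #include <cstdio>

    int main() {
    #ifdef GGML_USE_CUBLAS
        fprintf(stderr, "USE CUBLAS\n");    // present only in cuBLAS builds
    #else
        fprintf(stderr, "NO USE CUBLAS\n"); // present in all other builds
    #endif
        return 0;
    }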
@@ -2065,6 +2069,7 @@ struct llama_model_loader {
                    break;
#ifdef GGML_USE_CUBLAS
                case GGML_BACKEND_GPU:
+
                case GGML_BACKEND_GPU_SPLIT:
                    // old code:
                    // ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
@@ -2741,9 +2746,11 @@ static void llm_load_tensors(
            model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);

            if (backend_norm == GGML_BACKEND_GPU) {
+                fprintf(stderr, "vram_weights00 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output_norm);
            }
            if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                fprintf(stderr, "vram_weights01 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output);
            }
        }
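The remaining hunks repeat the same pattern: print the running vram_weights total, then add the next tensor's size from ggml_nbytes. A hypothetical helper (log_vram_add is not part of llama.cpp; it is only a sketch) could collapse each fprintf/`+=` pair into a single call:

    #include <cstdio>  // fprintf
    #include <cstddef> // size_t

    // Hypothetical helper: log the running total and the increment,
    // then accumulate, replacing each fprintf/`vram_weights +=` pair.
    static void log_vram_add(const char * tag, size_t & vram_weights, size_t nbytes) {
        fprintf(stderr, "%s: vram_weights '%zu' (+%zu)\n", tag, vram_weights, nbytes);
        vram_weights += nbytes;
    }

    // Usage sketch:
    //   log_vram_add("vram_weights00", vram_weights, ggml_nbytes(model.output_norm));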
@@ -2774,6 +2781,7 @@ static void llm_load_tensors(
            layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);

            if (backend == GGML_BACKEND_GPU) {
+                fprintf(stderr, "vram_weights03 '%zu'\n", vram_weights);
                vram_weights +=
                    ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                    ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
@@ -2807,9 +2815,11 @@ static void llm_load_tensors(
            model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);

            if (backend_norm == GGML_BACKEND_GPU) {
+                fprintf(stderr, "vram_weights04 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output_norm);
            }
            if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                fprintf(stderr, "vram_weights05 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output);
            }
        }
@@ -2840,6 +2850,7 @@ static void llm_load_tensors(
            layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);

            if (backend == GGML_BACKEND_GPU) {
+                fprintf(stderr, "vram_weights06 '%zu'\n", vram_weights);
                vram_weights +=
                    ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                    ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
@@ -2878,10 +2889,13 @@ static void llm_load_tensors(
            model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);

            if (backend_norm == GGML_BACKEND_GPU) {
+                fprintf(stderr, "vram_weights07 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output_norm);
+                fprintf(stderr, "vram_weights08 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output_norm_b);
            }
            if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                fprintf(stderr, "vram_weights09 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output);
            }
        }
@@ -2906,7 +2920,9 @@ static void llm_load_tensors(
                layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);

                if (backend == GGML_BACKEND_GPU) {
+                    fprintf(stderr, "vram_weights10 '%zu'\n", vram_weights);
                    vram_weights += ggml_nbytes(layer.attn_norm_2);
+                    fprintf(stderr, "vram_weights11 '%zu'\n", vram_weights);
                    vram_weights += ggml_nbytes(layer.attn_norm_2_b);
                }
            }
@@ -2918,6 +2934,7 @@ static void llm_load_tensors(
            layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);

            if (backend == GGML_BACKEND_GPU) {
+                fprintf(stderr, "vram_weights12 '%zu'\n", vram_weights);
                vram_weights +=
                    ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
                    ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.wo) +
@@ -2955,10 +2972,12 @@ static void llm_load_tensors(
            model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);

            if (backend_norm == GGML_BACKEND_GPU) {
+                fprintf(stderr, "vram_weights13 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output_norm);
                vram_weights += ggml_nbytes(model.output_norm_b);
            }
            if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                fprintf(stderr, "vram_weights14 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output);
            }
        }
@@ -2994,6 +3013,7 @@ static void llm_load_tensors(
            layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);

            if (backend == GGML_BACKEND_GPU) {
+                fprintf(stderr, "vram_weights15 '%zu'\n", vram_weights);
                vram_weights +=
                    ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
                    ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.bqkv) +
@@ -3039,10 +3059,13 @@ static void llm_load_tensors(
            model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);

            if (backend_norm == GGML_BACKEND_GPU) {
+                fprintf(stderr, "vram_weights16 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output_norm);
+                fprintf(stderr, "vram_weights17 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output_norm_b);
            }
            if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                fprintf(stderr, "vram_weights18 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output);
            }
        }
@@ -3105,10 +3128,13 @@ static void llm_load_tensors(
            model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);

            if (backend_norm == GGML_BACKEND_GPU) {
+                fprintf(stderr, "vram_weights19 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output_norm);
+                fprintf(stderr, "vram_weights20 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output_norm_b);
            }
            if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                fprintf(stderr, "vram_weights21 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output);
            }
        }
@@ -3144,6 +3170,7 @@ static void llm_load_tensors(
            layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);

            if (backend == GGML_BACKEND_GPU) {
+                fprintf(stderr, "vram_weights22 '%zu'\n", vram_weights);
                vram_weights +=
                    ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
                    ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.bqkv) +
@@ -3182,9 +3209,11 @@ static void llm_load_tensors(
            model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);

            if (backend_norm == GGML_BACKEND_GPU) {
+                fprintf(stderr, "vram_weights23 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output_norm);
            }
            if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                fprintf(stderr, "vram_weights24 '%zu'\n", vram_weights);
                vram_weights += ggml_nbytes(model.output);
            }
        }
@@ -3211,6 +3240,7 @@ static void llm_load_tensors(
            layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);

            if (backend == GGML_BACKEND_GPU) {
+                fprintf(stderr, "vram_weights25 '%zu'\n", vram_weights);
                vram_weights +=
                    ggml_nbytes(layer.attn_norm) +
                    ggml_nbytes(layer.wqkv) +
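The totals printed by these traces are raw byte counts. To compare them against the MiB figure reported by the existing VRAM log line, divide by 1024 twice, as that line does; a small sketch with a purely illustrative byte count:

    #include <cstdio>
    #include <cstddef>

    int main() {
        size_t vram_weights = 3825205248; // illustrative value, not a measurement
        fprintf(stderr, "VRAM weights = %.2f MiB\n", vram_weights / 1024.0 / 1024.0);
        return 0;
    }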