@@ -3118,14 +3118,6 @@ static void llm_load_tensors(
     ggml_backend_type backend_norm;
     ggml_backend_type backend_output;
 
-    // Don't allow for offloading of more than 33 layers.
-    // Offloading 34 layers causes model to respond with letter 'E'
-    // Offloading 35 layers doesn't work because of missing cuda implementation for rope:
-    // GGML_ASSERT: ggml-cuda.cu:6402: ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet"
-    if (n_gpu_layers > 33) {
-        n_gpu_layers = 33;
-    }
-
     if (n_gpu_layers > int(n_layer)) {
         // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
         // on Windows however this is detrimental unless everything is on the GPU
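For context, the deleted block silently capped offloading rather than failing loudly. A minimal standalone sketch of its user-visible effect (hypothetical code, not part of llama.cpp; it only mirrors the deleted lines):

    // Mirrors the removed workaround: any request above 33 layers was
    // silently reduced, e.g. --n-gpu-layers 35 effectively became 33.
    #include <algorithm>
    #include <cstdio>

    int main() {
        int requested = 35;
        int effective = std::min(requested, 33); // the removed cap
        std::printf("requested=%d effective=%d\n", requested, effective);
        return 0;
    }

Removing the cap assumes the problems the old comment cites (the 'E'-only output at 34 layers and the missing CUDA rope path for ne00 != n_dims) no longer apply on this branch.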
@@ -4323,7 +4315,7 @@ struct llm_build_context {
         struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
         cb(Kcur, "Kcur", il);
 
-        struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
+        struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
         cb(Q, "Q", il);
 
         Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
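The one-line fix above only changes the axis order handed to ggml_permute. In ggml, ggml_permute(ctx, a, p0, p1, p2, p3) places source dimension i at destination dimension p_i, i.e. result->ne[p_i] == a->ne[i]. A self-contained sketch of the difference, with toy sizes assumed purely for illustration (n_embd_head = 64, n_head = 8, n_tokens = 4):

    #include <cstdio>
    #include "ggml.h"

    int main() {
        ggml_init_params params = {
            /*.mem_size   =*/ 16u*1024*1024,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ true, // only shapes are inspected, no data needed
        };
        ggml_context * ctx = ggml_init(params);

        // stand-in for Qcur: ne = [n_embd_head, n_head, n_tokens]
        ggml_tensor * Qcur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 8, 4);

        ggml_tensor * q_old = ggml_permute(ctx, Qcur, 2, 1, 0, 3); // [n_tokens, n_head, n_embd_head]
        ggml_tensor * q_new = ggml_permute(ctx, Qcur, 1, 2, 0, 3); // [n_tokens, n_embd_head, n_head]

        printf("old: %d %d %d\n", (int) q_old->ne[0], (int) q_old->ne[1], (int) q_old->ne[2]);
        printf("new: %d %d %d\n", (int) q_new->ne[0], (int) q_new->ne[1], (int) q_new->ne[2]);

        ggml_free(ctx);
        return 0;
    }

So the patch swaps the head and head-dimension axes of Q, presumably matching the layout the downstream attention matmul in this graph expects.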
@@ -4791,20 +4783,6 @@ struct llm_build_context {
         Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
         cb(Kcur, "Kcur", il);
 
-        // Qcur = ggml_rope_custom(
-        //     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-        //     hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
-        //     ext_factor, attn_factor, beta_fast, beta_slow
-        // );
-        // cb(Qcur, "Qcur", il);
-
-        // Kcur = ggml_rope_custom(
-        //     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-        //     hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
-        //     ext_factor, attn_factor, beta_fast, beta_slow
-        // );
-        // cb(Kcur, "Kcur", il);
-
         llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
         cur = llm_build_kqv(ctx0, hparams, kv_self,
@@ -5026,8 +5004,6 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
 
 static llm_offload_trie k_offload_func_trie(k_offload_map);
 
-
-
 static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
      const llama_batch & batch) {