
Commit e87d709

Cleanup for review
1 parent a371a8b commit e87d709

File tree

1 file changed (+1, -25)


llama.cpp

Lines changed: 1 addition & 25 deletions
@@ -3118,14 +3118,6 @@ static void llm_load_tensors(
         ggml_backend_type backend_norm;
         ggml_backend_type backend_output;
 
-        // Don't allow for offloading of more than 33 layers.
-        // Offloading 34 layers causes model to respond with letter 'E'
-        // Offloading 35 layers doesn't work because of missing cuda implementation for rope:
-        // GGML_ASSERT: ggml-cuda.cu:6402: ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet"
-        if (n_gpu_layers > 33) {
-            n_gpu_layers = 33;
-        }
-
         if (n_gpu_layers > int(n_layer)) {
             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
             // on Windows however this is detrimental unless everything is on the GPU
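A note on the deleted guard: the assert quoted in its comment comes from the CUDA rope kernel, which at the time only handled rotations spanning a tensor's whole first dimension, so calling rope with n_dims smaller than ne00 (partial rotary embeddings) aborted on the GPU. One workaround pattern, and the one suggested by the krotated/kpass concat visible in the next hunk, is to rope only a view of the slice that actually gets rotated, so the kernel sees ne00 == n_dims. A minimal sketch of such a view follows; the helper name is hypothetical and not part of llama.cpp:

    #include "ggml.h"

    // Hypothetical helper (illustration only, not from this commit): take the
    // first n_rot elements of each head of k as a strided view. Roping this
    // view gives the kernel ne00 == n_dims; the remaining elements pass
    // through unrotated and are concatenated back afterwards.
    static struct ggml_tensor * rotated_slice(
            struct ggml_context * ctx, struct ggml_tensor * k, int64_t n_rot) {
        return ggml_view_3d(ctx, k, n_rot, k->ne[1], k->ne[2],
                            k->nb[1], k->nb[2], 0);
    }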
@@ -4323,7 +4315,7 @@ struct llm_build_context {
         struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
         cb(Kcur, "Kcur", il);
 
-        struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
+        struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
         cb(Q, "Q", il);
 
         Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
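For the one-line permute fix above: ggml_permute(ctx0, t, axis0, axis1, axis2, axis3) moves source dimension i to destination dimension axis_i, so dst->ne[axis_i] == t->ne[i]. A minimal sketch of the difference between the old and new axis orders, assuming a Q tensor laid out [n_embd_head, n_head, n_tokens] with made-up sizes; this is an illustration, not code from the commit:

    #include "ggml.h"

    // Illustration only: n_embd_head = 64, n_head = 8, n_tokens = 16.
    static void permute_demo(struct ggml_context * ctx) {
        struct ggml_tensor * q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 8, 16);

        // old order (2, 1, 0, 3): dim0 -> 2, dim1 -> 1, dim2 -> 0
        struct ggml_tensor * a = ggml_permute(ctx, q, 2, 1, 0, 3); // ne = [16, 8, 64, 1]

        // new order (1, 2, 0, 3): dim0 -> 1, dim1 -> 2, dim2 -> 0
        struct ggml_tensor * b = ggml_permute(ctx, q, 1, 2, 0, 3); // ne = [16, 64, 8, 1]

        (void) a; (void) b;
    }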
@@ -4791,20 +4783,6 @@ struct llm_build_context {
         Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
         cb(Kcur, "Kcur", il);
 
-        // Qcur = ggml_rope_custom(
-        //     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-        //     hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
-        //     ext_factor, attn_factor, beta_fast, beta_slow
-        // );
-        // cb(Qcur, "Qcur", il);
-
-        // Kcur = ggml_rope_custom(
-        //     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-        //     hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
-        //     ext_factor, attn_factor, beta_fast, beta_slow
-        // );
-        // cb(Kcur, "Kcur", il);
-
         llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
         cur = llm_build_kqv(ctx0, hparams, kv_self,
@@ -5026,8 +5004,6 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
 
 static llm_offload_trie k_offload_func_trie(k_offload_map);
 
-
-
 static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
      const llama_batch & batch) {
