Skip to content

Commit c4d4d5f

Browse files
committed
Removed unnecessary reshapes when retrieving K/V from the cache (2-3% faster when generating 1000 tokens with the 7B model)
1 parent 74c53e0 commit c4d4d5f

File tree

1 file changed

+12
-14
lines changed

1 file changed

+12
-14
lines changed

libfalcon.cpp

Lines changed: 12 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1695,14 +1695,13 @@ static bool falcon_eval_internal(
16951695

16961696
struct ggml_tensor * K = ggml_permute(
16971697
ctx0,
1698-
ggml_reshape_3d(
1698+
ggml_view_3d(
16991699
ctx0,
1700-
ggml_view_1d(ctx0, kv_self.k, (n_past + N) * n_head_kv * head_dim,
1701-
il * n_ctx *
1702-
ggml_element_size(kv_self.k) *
1703-
n_head_kv *
1704-
head_dim),
1705-
head_dim, n_head_kv, n_past + N),
1700+
kv_self.k,
1701+
head_dim, n_head_kv, n_past + N,
1702+
head_dim * sizeof_wtype,
1703+
head_dim * n_head_kv * sizeof_wtype,
1704+
il * n_ctx * ggml_element_size(kv_self.k) * n_head_kv * head_dim),
17061705
0, 2, 1, 3);
17071706

17081707
// K * Q
@@ -1741,14 +1740,13 @@ static bool falcon_eval_internal(
17411740
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
17421741
struct ggml_tensor* V = ggml_permute(
17431742
ctx0,
1744-
ggml_reshape_3d(
1743+
ggml_view_3d(
17451744
ctx0,
1746-
ggml_view_1d(ctx0, kv_self.v, (n_past + N) * n_head_kv * head_dim,
1747-
il * n_ctx *
1748-
ggml_element_size(model.kv_self.v) *
1749-
n_head_kv *
1750-
head_dim),
1751-
head_dim, n_head_kv, n_past + N),
1745+
kv_self.v,
1746+
head_dim, n_head_kv, n_past + N,
1747+
head_dim * sizeof_wtype,
1748+
head_dim * n_head_kv * sizeof_wtype,
1749+
il * n_ctx * ggml_element_size(kv_self.v) * n_head_kv * head_dim),
17521750
0, 2, 1, 3);
17531751

17541752
if(0)

0 commit comments

Comments
 (0)