@@ -1090,7 +1090,6 @@ static bool llama_eval_internal(
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
-    // inpL shape [n_embd,N,1,1]
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
 
     for (int il = 0; il < n_layer; ++il) {
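The shape comments being removed follow ggml's convention that ne[0] is the innermost, fastest-varying dimension, so "[n_embd,N,1,1]" means N token vectors of n_embd contiguous floats. As a rough sketch of what the ggml_get_rows lookup computes for F32 data (a hypothetical standalone helper, not the ggml implementation):

#include <string.h>

/* tok_embeddings: [n_embd, n_vocab], one embedding row per vocab entry,
   with ne[0] (n_embd) contiguous in memory */
static void get_rows_f32(const float *tok_embeddings,
                         const int   *tokens, int N,
                         float       *inpL, /* out: [n_embd, N] */
                         int n_embd) {
    for (int i = 0; i < N; ++i) {
        /* the embedding row of token i becomes slot i of inpL */
        memcpy(inpL + (size_t)i*n_embd,
               tok_embeddings + (size_t)tokens[i]*n_embd,
               sizeof(float)*(size_t)n_embd);
    }
}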
@@ -1102,7 +1101,6 @@ static bool llama_eval_internal(
 
         // norm
         {
-            // cur shape [n_embd,N,1,1]
             cur = ggml_rms_norm(ctx0, inpL);
 
             // cur = attention_norm*cur
@@ -1114,10 +1112,6 @@ static bool llama_eval_internal(
         // self-attention
         {
             // compute Q and K and RoPE them
-            // wq shape [n_embd, n_embd, 1, 1]
-            // wk shape [n_embd, n_embd, 1, 1]
-            // Qcur shape [n_embd/n_head, n_head, N, 1]
-            // Kcur shape [n_embd/n_head, n_head, N, 1]
             struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
             struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
             ggml_set_name(Qcur, "Qcur");
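The deleted wq/Qcur annotations recorded that the [n_embd, N] projection is reinterpreted as [n_embd/n_head, n_head, N] without moving any data: head h of token i is simply a contiguous head_dim slice. A minimal index helper, assuming that contiguous layout (hypothetical, for illustration only):

#include <stddef.h>

/* qcur: [n_embd, N] viewed as [head_dim, n_head, N]; no copy involved */
static const float *q_head(const float *qcur,
                           int head_dim, int n_head,
                           int i /* token */, int h /* head */) {
    return qcur + ((size_t)i*n_head + h)*(size_t)head_dim;
}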
@@ -1126,14 +1120,8 @@ static bool llama_eval_internal(
             // store key and value to memory
             {
                 // compute the transposed [N, n_embd] V matrix
-                // wv shape [n_embd, n_embd, 1, 1]
-                // Vcur shape [n_embd, N, 1, 1]
                 struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
 
-                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
-                // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
-                // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                         (   n_ctx)*ggml_element_size(kv_self.v),
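The four deleted cache comments encoded the offset arithmetic that survives in the two view calls above: kv_self.k stores [n_embd, n_ctx] per layer, while kv_self.v stores the transpose, [n_ctx, n_embd] per layer, so a whole column of new V values can later be read with a simple row stride. In elements rather than bytes, the slot offsets work out as follows (hypothetical helpers, assuming those layouts):

#include <stddef.h>

/* start of the K slot for layer il at token position n_past */
static size_t k_cache_offset(size_t n_embd, size_t n_ctx, int il, int n_past) {
    return n_embd * ((size_t)il*n_ctx + (size_t)n_past);
}

/* start of the V slot: same layer/position, but V is stored transposed,
   so rows of length n_ctx are indexed by embedding dimension and the
   token position moves along ne[0] */
static size_t v_cache_offset(size_t n_embd, size_t n_ctx, int il, int n_past) {
    return (size_t)il*n_ctx*n_embd + (size_t)n_past;
}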
@@ -1144,16 +1132,12 @@ static bool llama_eval_internal(
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
             }
 
-            // Qcur shape [n_embd/n_head, n_head, N, 1]
-            // Q shape [n_embd/n_head, N, n_head, 1]
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
                         Qcur,
                         0, 2, 1, 3);
             ggml_set_name(Q, "Q");
 
-            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-            // K shape [n_embd/n_head, n_past + N, n_head, 1]
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
                         ggml_reshape_3d(ctx0,
@@ -1163,7 +1147,6 @@ static bool llama_eval_internal(
             ggml_set_name(K, "K");
 
             // K * Q
-            // KQ shape [n_past + N, N, n_head, 1]
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
             ggml_set_name(KQ, "KQ");
 
@@ -1176,19 +1159,15 @@ static bool llama_eval_internal(
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
-            // KQ_masked shape [n_past + N, N, n_head, 1]
             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
             ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
-            // KQ_soft_max shape [n_past + N, N, n_head, 1]
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
 
             // split cached V into n_head heads
-            // // V shape [n_past + N, n_embd/n_head, n_head, 1]
-            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
                         n_past + N, n_embd/n_head, n_head,
@@ -1198,7 +1177,6 @@ static bool llama_eval_internal(
             ggml_set_name(V, "V");
 
 #if 1
-            // KQV shape [n_embd/n_head, N, n_head, 1]
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
             ggml_set_name(KQV, "KQV");
 #else
@@ -1210,12 +1188,10 @@ static bool llama_eval_internal(
 #endif
 
             // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
             ggml_set_name(KQV_merged, "KQV_merged");
 
             // cur = KQV_merged.contiguous().view(n_embd, N)
-            // cur shape [n_embd,N,1,1]
             cur = ggml_cpy(ctx0,
                     KQV_merged,
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
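The last two deleted comments described the head merge: KQV is [n_embd/n_head, N, n_head], the permute reorders it to [n_embd/n_head, n_head, N], and the copy materializes a contiguous [n_embd, N] activation. Sketched as a naive loop (hypothetical helper, assuming those shapes):

#include <stddef.h>

/* kqv: [head_dim, N, n_head]; cur: out, contiguous [n_embd, N]
   with n_embd = head_dim*n_head */
static void merge_heads(const float *kqv, float *cur,
                        int head_dim, int n_head, int N) {
    for (int h = 0; h < n_head; ++h) {
        for (int i = 0; i < N; ++i) {
            for (int d = 0; d < head_dim; ++d) {
                /* token i's merged vector concatenates all heads */
                cur[(size_t)i*head_dim*n_head + (size_t)h*head_dim + d] =
                    kqv[(size_t)h*head_dim*N + (size_t)i*head_dim + d];
            }
        }
    }
}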