Skip to content

Commit 2cae279

Browse files
authored
Speedup of up to 300% on large context (ggml-org#58)
KV cache is now cyclic split into permuted V variant The ggml_tensor_print function has been completely reworked to output proper 1-4dim tensors with data. Example: ``` +======================+======================+======================+======================+ | :0 | V [f32 type] +----------------------+----------------------+----------------------+----------------------+ | Dimensions | Strides | Layer id | Backend | | 3 | 4x16x1024 | 0 | CPU | +----------------------+----------------------+----------------------+----------------------+ | Elements | Src0 | Src1 | Operation | | 4 x 64 x 2 | 4 x 64 x 2 | N/A | CONT | +----------------------+----------------------+----------------------+----------------------+ | Transposed: No | Permuted: No | Contiguous: Yes | Size: 0.00 MB | | Src0 name: | cache_v (view) (permuted) | +----------------------+----------------------+----------------------+----------------------+ +-------------------------------------------------------------------------------------------+ | Content of src0 "cache_v (view) (permuted)" (3 dim) +-------------------------------------------------------------------------------------------+ | Content of src0 "cache_v (view) (permuted)" (3 dim) | Total Elements : [ Row:4 Col:64 Layer:2 ] +-------------------------------------------------------------------------------------------+ | Row 1: [0.302 , 0.010 ] [-0.238 , 0.680 ] [0.305 , 0.206 ] [-0.013 , 0.436 ] [-0.074 , -0.698 ] [-0.153 , -0.067 ] | Row 2: [0.091 , 0.199 ] [0.253 , 0.151 ] [-0.557 , 0.089 ] [0.298 , -0.272 ] [-0.149 , 0.232 ] [-0.217 , 0.193 ] | Row 3: [-0.085 , -0.014 ] [0.225 , 0.089 ] [-0.338 , 0.072 ] [0.416 , -0.186 ] [-0.071 , 0.110 ] [0.467 , 0.497 ] | Row 4: [-0.336 , 0.471 ] [-0.144 , 0.070 ] [-0.062 , 0.520 ] [0.093 , 0.217 ] [-0.332 , -0.205 ] [0.012 , 0.335 ] +-------------------------------------------------------------------------------------------+ 
+-------------------------------------------------------------------------------------------+ | Content of dst "V" (3 dim) +-------------------------------------------------------------------------------------------+ | Content of dst "V" (3 dim) | Total Elements : [ Row:4 Col:64 Layer:2 ] +-------------------------------------------------------------------------------------------+ | Row 1: [0.302 , 0.010 ] [-0.238 , 0.680 ] [0.305 , 0.206 ] [-0.013 , 0.436 ] [-0.074 , -0.698 ] [-0.153 , -0.067 ] | Row 2: [0.091 , 0.199 ] [0.253 , 0.151 ] [-0.557 , 0.089 ] [0.298 , -0.272 ] [-0.149 , 0.232 ] [-0.217 , 0.193 ] | Row 3: [-0.085 , -0.014 ] [0.225 , 0.089 ] [-0.338 , 0.072 ] [0.416 , -0.186 ] [-0.071 , 0.110 ] [0.467 , 0.497 ] | Row 4: [-0.336 , 0.471 ] [-0.144 , 0.070 ] [-0.062 , 0.520 ] [0.093 , 0.217 ] [-0.332 , -0.205 ] [0.012 , 0.335 ] +-------------------------------------------------------------------------------------------+ +======================+======================+======================+======================+ ```
1 parent 09f2184 commit 2cae279

File tree

4 files changed

+166
-64
lines changed

4 files changed

+166
-64
lines changed

examples/falcon/falcon_main.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -659,7 +659,7 @@ fprintf(stderr, "+------------+-------+-------+-------+-------+---------------+-
659659
{
660660
n_regen = 4;
661661
}
662-
if (n_regen > all_generation.size()-embd.size()) n_regen = (int)all_generation.size()-embd.size();
662+
if (n_regen > all_generation.size()-embd.size()) n_regen = (int)all_generation.size()-(int)embd.size();
663663

664664
// add right sided part of all_generation storage if we still have room remaining
665665
if (n_regen)

ggml.c

Lines changed: 68 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -19540,60 +19540,80 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
1954019540

1954119541
////////////////////////////////////////////////////////////////////////////////
1954219542

19543-
void ggml_printTensorSample(char *prefix,const struct ggml_tensor * tensor) {
19543+
void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tensor) {
1954419544
const char *sep = "+-------------------------------------------------------------------------------------------+\n";
19545-
printf("%s", sep);
19546-
printf("| Content of %s \"%s\" (%d dim)",prefix,tensor->name,tensor->n_dims);
19547-
printf("\n");
19548-
const int max_elements = 40000;
19545+
printf("%s| Content of %s \"%s\" (%d dim)\n", sep, prefix, tensor->name, tensor->n_dims);
1954919546

19550-
if (tensor->n_dims == 1) {
19551-
printf("| ");
19552-
for(int i = 0; i < tensor->ne[0] && i < max_elements; i++){
19553-
printf("%-20f ", (double) *(float *)((char *) tensor->data + i*tensor->nb[0]));
19554-
}
19555-
printf("|");
19556-
printf("\n");
19557-
printf("%s", sep);
19547+
const int MAX_ELEMENTS_ROW = 10;
19548+
const int MAX_ELEMENTS_COL = 6;
19549+
const int MAX_ELEMENTS_LAYER = 3; // layered
19550+
const int MAX_ELEMENTS_BATCH = 2; // repeated display
19551+
const char *dimensionLabels[] = {"Row", "Col", "Layer", "Batch"};
19552+
19553+
printf("\n%s| Content of %s \"%s\" (%d dim)\n", sep, prefix, tensor->name, tensor->n_dims);
19554+
printf("| Total Elements : [ ");
19555+
for (int i = 0; i < tensor->n_dims; i++)
19556+
printf("%s:%-3" PRId64 " ", dimensionLabels[i], tensor->ne[i]);
19557+
printf("]\n%s", sep);
19558+
19559+
if (tensor->n_dims == 1) {
19560+
printf("| 1: ");
19561+
for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
19562+
printf("%-7.3f, ", *(float *)((char *) tensor->data + i*tensor->nb[0]));
19563+
}
19564+
if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(", ..");
19565+
printf("\n%s", sep);
1955819566
}
1955919567
else if (tensor->n_dims == 2) {
19560-
for(int i = 0; i < tensor->ne[0] && i < max_elements; i++){
19561-
printf("| ");
19562-
for(int j = 0; j < tensor->ne[1] && j < max_elements; j++){
19563-
printf("%-20f ", (double) *(float *)((char *) tensor->data + i*tensor->nb[0] + j*tensor->nb[1]));
19568+
for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
19569+
printf("| %d: ", i+1);
19570+
for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
19571+
printf("%-7.3f ", *(float *)((char *) tensor->data + i*tensor->nb[0] + j*tensor->nb[1]));
19572+
if(j == MAX_ELEMENTS_COL - 1 && tensor->ne[1] > MAX_ELEMENTS_COL) printf(", ..");
1956419573
}
19565-
printf("|");
1956619574
printf("\n");
1956719575
}
19568-
printf("%s", sep);
19569-
}
19570-
else if(tensor->n_dims == 3) {
19571-
for(int i = 0; i < tensor->ne[0] && i < 3; i++){
19572-
printf("Layer %d\n", i);
19573-
for(int j = 0; j < tensor->ne[1] && j < max_elements; j++){
19574-
printf("| ");
19575-
for(int k = 0; k < tensor->ne[2] && k < max_elements; k++){
19576-
printf("%-20f ", (double) *(float *)((char *) tensor->data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2]));
19576+
if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(" .. additional rows\n");
19577+
printf("%s", sep);
19578+
}else if(tensor->n_dims == 3) {
19579+
for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
19580+
printf("| Row %d: ", i+1);
19581+
for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
19582+
printf("[");
19583+
for(int k = 0; k < tensor->ne[2] && k < MAX_ELEMENTS_LAYER; k++){
19584+
printf("%-7.3f", *(float *)((char *) tensor->data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2]));
19585+
if(k < tensor->ne[2] - 1 && k < MAX_ELEMENTS_LAYER - 1)
19586+
printf(", ");
1957719587
}
19578-
printf("|\n");
19588+
if(MAX_ELEMENTS_LAYER < tensor->ne[2]) printf(", ..");
19589+
printf("] ");
1957919590
}
19580-
printf("%s\n", sep);
19581-
}
19582-
}
19583-
else if(tensor->n_dims == 4){
19584-
for(int i = 0; i < tensor->ne[0] && i < 3; i++){
19585-
printf("Batch %d\n", i);
19586-
for(int j = 0; j < tensor->ne[1] && j < 3; j++){
19587-
printf("Layer %d\n", j);
19588-
for(int k = 0; k < tensor->ne[2] && k < max_elements; k++){
19589-
printf("| ");
19590-
for(int l = 0; l < tensor->ne[3] && l < 3; l++){
19591-
printf("%-20f ", (double) *(float *)((char *) tensor->data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2] + l*tensor->nb[3]));
19591+
printf("\n");
19592+
}
19593+
if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(" ... additional layers\n");
19594+
printf("%s", sep);
19595+
}
19596+
19597+
// For 4D tensor
19598+
else if(tensor->n_dims == 4) {
19599+
for(int batch = 0; batch < tensor->ne[0] && batch < MAX_ELEMENTS_BATCH; batch++){
19600+
printf("Batch %d\n", batch+1);
19601+
for(int i = 0; i < tensor->ne[1] && i < MAX_ELEMENTS_ROW; i++){
19602+
printf("| Row %d: ", i+1);
19603+
for(int j = 0; j < tensor->ne[2] && j < MAX_ELEMENTS_COL; j++){
19604+
printf("[");
19605+
for(int k = 0; k < tensor->ne[3] && k < MAX_ELEMENTS_LAYER; k++){
19606+
printf("%-7.3f", *(float *)((char *) tensor->data + batch*tensor->nb[0] + i*tensor->nb[1] + j*tensor->nb[2] + k*tensor->nb[3]));
19607+
if(k < tensor->ne[3] - 1 && k < MAX_ELEMENTS_LAYER - 1)
19608+
printf(", ");
1959219609
}
19593-
printf("|\n");
19610+
if(MAX_ELEMENTS_LAYER < tensor->ne[3]) printf(", ..");
19611+
printf("] ");
1959419612
}
19595-
printf("%s\n", sep);
19613+
printf("\n");
1959619614
}
19615+
if(MAX_ELEMENTS_BATCH < tensor->ne[0]) printf(" ... additional batches\n");
19616+
printf("%s", sep);
1959719617
}
1959819618
}
1959919619
}
@@ -19614,11 +19634,13 @@ void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line
1961419634
// nb[i] = nb[i-1] * ne[i-1]
1961519635
*/
1961619636
{
19617-
pos = 0;
19618-
for (int i = 0; i <= tensor->n_dims; i++) {
19619-
pos += snprintf(strides + pos, sizeof(strides) - pos, "%" PRId64, tensor->nb[i]);
19637+
strides[0] = '\0';
19638+
for (int i = 0; i < tensor->n_dims; i++) {
19639+
char dim_str[20];
19640+
snprintf(dim_str, sizeof(dim_str), "%" PRId64, tensor->nb[i]);
19641+
strncat(strides, dim_str, sizeof(strides) - strlen(strides) - 1);
1962019642
if (i != tensor->n_dims - 1) {
19621-
pos += snprintf(strides + pos, sizeof(strides) - pos, "x");
19643+
strncat(strides, "x", sizeof(strides) - strlen(strides) - 1);
1962219644
}
1962319645
}
1962419646
}

ggml.h

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -397,7 +397,9 @@ extern "C" {
397397
float f_custom[4];
398398
int i_custom[4];
399399

400-
// uint8_t padding;
400+
uint8_t debug_flag;
401+
402+
char padding[15];
401403
} tensor_meta;
402404
static const tensor_meta GGML_DEFAULT_TENSOR_META = {
403405
/*.layer_id =*/ -1,
@@ -410,6 +412,8 @@ extern "C" {
410412
/*.f_custom =*/ {0.0f, 0.0f, 0.0f, 0.0f},
411413
/*.i_custom =*/ {0, 0, 0, 0},
412414

415+
/*.debug_flag =*/ 0,
416+
413417

414418
// /*.padding =*/ 0,
415419
};

0 commit comments

Comments (0)