Skip to content

Commit 5eee0c9

Browse files
committed
survey: add report of "largest" paths
Since we are already walking our reachable objects using the path-walk API, let's now collect lists of the paths that contribute most to different metrics. Specifically, we care about * Number of versions. * Total size on disk. * Total inflated size (no delta or zlib compression). This information can be critical to discovering which parts of the repository are causing the most growth, especially on-disk size. Different packing strategies might help compress data more efficiently, but the total inflated size is a representation of the raw size of all snapshots of those paths. Even when stored efficiently on disk, that size represents how much information must be processed to complete a command such as 'git blame'. Since the on-disk size is likely to be fragile, stop testing the exact output of 'git survey' and check that the correct set of headers is output. Signed-off-by: Derrick Stolee <[email protected]>
1 parent d886c14 commit 5eee0c9

File tree

2 files changed

+121
-31
lines changed

2 files changed

+121
-31
lines changed

builtin/survey.c

Lines changed: 110 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -73,35 +73,38 @@ struct survey_report_object_size_summary {
7373
size_t num_missing;
7474
};
7575

76-
typedef int (*survey_top_size_cmp)(struct survey_report_object_size_summary *s1,
77-
struct survey_report_object_size_summary *s2);
76+
typedef int (*survey_top_cmp)(void *v1,
77+
void *v2);
7878

79-
MAYBE_UNUSED
80-
static int cmp_by_nr(struct survey_report_object_size_summary *s1,
81-
struct survey_report_object_size_summary *s2)
79+
static int cmp_by_nr(void *v1, void *v2)
8280
{
81+
struct survey_report_object_size_summary *s1 = v1;
82+
struct survey_report_object_size_summary *s2 = v2;
83+
8384
if (s1->nr < s2->nr)
8485
return -1;
8586
if (s1->nr > s2->nr)
8687
return 1;
8788
return 0;
8889
}
8990

90-
MAYBE_UNUSED
91-
static int cmp_by_disk_size(struct survey_report_object_size_summary *s1,
92-
struct survey_report_object_size_summary *s2)
91+
static int cmp_by_disk_size(void *v1, void *v2)
9392
{
93+
struct survey_report_object_size_summary *s1 = v1;
94+
struct survey_report_object_size_summary *s2 = v2;
95+
9496
if (s1->disk_size < s2->disk_size)
9597
return -1;
9698
if (s1->disk_size > s2->disk_size)
9799
return 1;
98100
return 0;
99101
}
100102

101-
MAYBE_UNUSED
102-
static int cmp_by_inflated_size(struct survey_report_object_size_summary *s1,
103-
struct survey_report_object_size_summary *s2)
103+
static int cmp_by_inflated_size(void *v1, void *v2)
104104
{
105+
struct survey_report_object_size_summary *s1 = v1;
106+
struct survey_report_object_size_summary *s2 = v2;
107+
105108
if (s1->inflated_size < s2->inflated_size)
106109
return -1;
107110
if (s1->inflated_size > s2->inflated_size)
@@ -114,42 +117,51 @@ static int cmp_by_inflated_size(struct survey_report_object_size_summary *s1,
114117
* inserting a new category, reorder the list and free the one that
115118
* got ejected (if any).
116119
*/
117-
struct survey_report_top_sizes {
120+
struct survey_report_top_table {
118121
const char *name;
119-
survey_top_size_cmp cmp_fn;
120-
struct survey_report_object_size_summary *data;
122+
survey_top_cmp cmp_fn;
121123
size_t nr;
122124
size_t alloc;
125+
126+
/**
127+
* 'data' stores an array of structs and must be cast into
128+
* the proper array type before evaluating an index.
129+
*/
130+
void *data;
123131
};
124132

125-
MAYBE_UNUSED
126-
static void init_top_sizes(struct survey_report_top_sizes *top,
133+
static void init_top_sizes(struct survey_report_top_table *top,
127134
size_t limit, const char *name,
128-
survey_top_size_cmp cmp)
135+
survey_top_cmp cmp)
129136
{
137+
struct survey_report_object_size_summary *sz_array;
138+
130139
top->name = name;
140+
top->cmp_fn = cmp;
131141
top->alloc = limit;
132142
top->nr = 0;
133-
CALLOC_ARRAY(top->data, limit);
134-
top->cmp_fn = cmp;
143+
144+
CALLOC_ARRAY(sz_array, limit);
145+
top->data = sz_array;
135146
}
136147

137148
MAYBE_UNUSED
138-
static void clear_top_sizes(struct survey_report_top_sizes *top)
149+
static void clear_top_sizes(struct survey_report_top_table *top)
139150
{
151+
struct survey_report_object_size_summary *sz_array = top->data;
140152
for (size_t i = 0; i < top->nr; i++)
141-
free(top->data[i].label);
153+
free(sz_array[i].label);
142154
free(top->data);
143155
}
144156

145-
MAYBE_UNUSED
146-
static void maybe_insert_into_top_size(struct survey_report_top_sizes *top,
157+
static void maybe_insert_into_top_size(struct survey_report_top_table *top,
147158
struct survey_report_object_size_summary *summary)
148159
{
160+
struct survey_report_object_size_summary *sz_array = top->data;
149161
size_t pos = top->nr;
150162

151163
/* Compare against list from the bottom. */
152-
while (pos > 0 && top->cmp_fn(&top->data[pos - 1], summary) < 0)
164+
while (pos > 0 && top->cmp_fn(&sz_array[pos - 1], summary) < 0)
153165
pos--;
154166

155167
/* Not big enough! */
@@ -158,15 +170,15 @@ static void maybe_insert_into_top_size(struct survey_report_top_sizes *top,
158170

159171
/* We need to shift the data. */
160172
if (top->nr == top->alloc)
161-
free(top->data[top->nr - 1].label);
173+
free(sz_array[top->nr - 1].label);
162174
else
163175
top->nr++;
164176

165177
for (size_t i = top->nr - 1; i > pos; i--)
166-
memcpy(&top->data[i], &top->data[i - 1], sizeof(*top->data));
178+
memcpy(&sz_array[i], &sz_array[i - 1], sizeof(*sz_array));
167179

168-
memcpy(&top->data[pos], summary, sizeof(*summary));
169-
top->data[pos].label = xstrdup(summary->label);
180+
memcpy(&sz_array[pos], summary, sizeof(*summary));
181+
sz_array[pos].label = xstrdup(summary->label);
170182
}
171183

172184
/**
@@ -178,6 +190,10 @@ struct survey_report {
178190
struct survey_report_object_summary reachable_objects;
179191

180192
struct survey_report_object_size_summary *by_type;
193+
194+
struct survey_report_top_table *top_paths_by_count;
195+
struct survey_report_top_table *top_paths_by_disk;
196+
struct survey_report_top_table *top_paths_by_inflate;
181197
};
182198

183199
#define REPORT_TYPE_COMMIT 0
@@ -420,6 +436,13 @@ static void survey_report_object_sizes(const char *title,
420436
clear_table(&table);
421437
}
422438

439+
static void survey_report_plaintext_sorted_size(
440+
struct survey_report_top_table *top)
441+
{
442+
survey_report_object_sizes(top->name, _("Path"),
443+
top->data, top->nr);
444+
}
445+
423446
static void survey_report_plaintext(struct survey_context *ctx)
424447
{
425448
printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree);
@@ -430,6 +453,21 @@ static void survey_report_plaintext(struct survey_context *ctx)
430453
_("Object Type"),
431454
ctx->report.by_type,
432455
REPORT_TYPE_COUNT);
456+
457+
survey_report_plaintext_sorted_size(
458+
&ctx->report.top_paths_by_count[REPORT_TYPE_TREE]);
459+
survey_report_plaintext_sorted_size(
460+
&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB]);
461+
462+
survey_report_plaintext_sorted_size(
463+
&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE]);
464+
survey_report_plaintext_sorted_size(
465+
&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB]);
466+
467+
survey_report_plaintext_sorted_size(
468+
&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]);
469+
survey_report_plaintext_sorted_size(
470+
&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]);
433471
}
434472

435473
/*
@@ -670,7 +708,8 @@ static void increment_totals(struct survey_context *ctx,
670708

671709
static void increment_object_totals(struct survey_context *ctx,
672710
struct oid_array *oids,
673-
enum object_type type)
711+
enum object_type type,
712+
const char *path)
674713
{
675714
struct survey_report_object_size_summary *total;
676715
struct survey_report_object_size_summary summary = { 0 };
@@ -702,6 +741,27 @@ static void increment_object_totals(struct survey_context *ctx,
702741
total->disk_size += summary.disk_size;
703742
total->inflated_size += summary.inflated_size;
704743
total->num_missing += summary.num_missing;
744+
745+
if (type == OBJ_TREE || type == OBJ_BLOB) {
746+
int index = type == OBJ_TREE ?
747+
REPORT_TYPE_TREE : REPORT_TYPE_BLOB;
748+
struct survey_report_top_table *top;
749+
750+
/*
751+
* Temporarily store (const char *) here, but it will
752+
* be duped if inserted and will not be freed.
753+
*/
754+
summary.label = (char *)path;
755+
756+
top = ctx->report.top_paths_by_count;
757+
maybe_insert_into_top_size(&top[index], &summary);
758+
759+
top = ctx->report.top_paths_by_disk;
760+
maybe_insert_into_top_size(&top[index], &summary);
761+
762+
top = ctx->report.top_paths_by_inflate;
763+
maybe_insert_into_top_size(&top[index], &summary);
764+
}
705765
}
706766

707767
static int survey_objects_path_walk_fn(const char *path,
@@ -713,7 +773,7 @@ static int survey_objects_path_walk_fn(const char *path,
713773

714774
increment_object_counts(&ctx->report.reachable_objects,
715775
type, oids->nr);
716-
increment_object_totals(ctx, oids, type);
776+
increment_object_totals(ctx, oids, type, path);
717777

718778
ctx->progress_nr += oids->nr;
719779
display_progress(ctx->progress, ctx->progress_nr);
@@ -723,11 +783,31 @@ static int survey_objects_path_walk_fn(const char *path,
723783

724784
static void initialize_report(struct survey_context *ctx)
725785
{
786+
const int top_limit = 100;
787+
726788
CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
727789
ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
728790
ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
729791
ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
730792
ctx->report.by_type[REPORT_TYPE_TAG].label = xstrdup(_("Tags"));
793+
794+
CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT);
795+
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE],
796+
top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr);
797+
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB],
798+
top_limit, _("TOP FILES BY COUNT"), cmp_by_nr);
799+
800+
CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT);
801+
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE],
802+
top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size);
803+
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB],
804+
top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size);
805+
806+
CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT);
807+
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE],
808+
top_limit, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size);
809+
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB],
810+
top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size);
731811
}
732812

733813
static void survey_phase_objects(struct survey_context *ctx)

t/t8100-git-survey.sh

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,17 @@ test_expect_success 'git survey (default)' '
6666
Tags | 4 | 510 | 528
6767
EOF
6868
69-
test_cmp expect out
69+
lines=$(wc -l <expect) &&
70+
head -n $lines out >out-trimmed &&
71+
test_cmp expect out-trimmed &&
72+
73+
for type in "DIRECTORIES" "FILES"
74+
do
75+
for metric in "COUNT" "DISK SIZE" "INFLATED SIZE"
76+
do
77+
grep "TOP $type BY $metric" out || return 1
78+
done || return 1
79+
done
7080
'
7181

7282
test_done

0 commit comments

Comments
 (0)