Skip to content

Commit 96cc8ab

Browse files
derrickstoleegitster
authored andcommitted
sparse-checkout: use hashmaps for cone patterns
The parent and recursive patterns allowed by the "cone mode" option in sparse-checkout are restrictive enough that we can avoid using the regex parsing. Everything is based on prefix matches, so we can use hashsets to store the prefixes from the sparse-checkout file. When checking a path, we can strip path entries from the path and check the hashset for an exact match. As a test, I created a cone-mode sparse-checkout file for the Linux repository that actually includes every file. This was constructed by taking every folder in the Linux repo and creating the pattern pairs here: /$folder/ !/$folder/*/ This resulted in a sparse-checkout file with 8,296 patterns. Running 'git read-tree -mu HEAD' on this file had the following performance: core.sparseCheckout=false: 0.21 s (0.00 s) core.sparseCheckout=true: 3.75 s (3.50 s) core.sparseCheckoutCone=true: 0.23 s (0.01 s) The times in parentheses above correspond to the time spent in the first clear_ce_flags() call, according to the trace2 performance traces. While this example is contrived, it demonstrates how these patterns can slow the sparse-checkout feature. Helped-by: Eric Wong <[email protected]> Helped-by: Johannes Schindelin <[email protected]> Signed-off-by: Derrick Stolee <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 879321e commit 96cc8ab

File tree

4 files changed

+241
-9
lines changed

4 files changed

+241
-9
lines changed

dir.c

Lines changed: 199 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -611,6 +611,150 @@ void parse_path_pattern(const char **pattern,
611611
*patternlen = len;
612612
}
613613

614+
static int pl_hashmap_cmp(const void *unused_cmp_data,
615+
const struct hashmap_entry *a,
616+
const struct hashmap_entry *b,
617+
const void *key)
618+
{
619+
const struct pattern_entry *ee1 =
620+
container_of(a, struct pattern_entry, ent);
621+
const struct pattern_entry *ee2 =
622+
container_of(b, struct pattern_entry, ent);
623+
624+
size_t min_len = ee1->patternlen <= ee2->patternlen
625+
? ee1->patternlen
626+
: ee2->patternlen;
627+
628+
return strncmp(ee1->pattern, ee2->pattern, min_len);
629+
}
630+
631+
static void add_pattern_to_hashsets(struct pattern_list *pl, struct path_pattern *given)
632+
{
633+
struct pattern_entry *translated;
634+
char *truncated;
635+
char *data = NULL;
636+
637+
if (!pl->use_cone_patterns)
638+
return;
639+
640+
if (given->flags & PATTERN_FLAG_NEGATIVE &&
641+
given->flags & PATTERN_FLAG_MUSTBEDIR &&
642+
!strcmp(given->pattern, "/*")) {
643+
pl->full_cone = 0;
644+
return;
645+
}
646+
647+
if (!given->flags && !strcmp(given->pattern, "/*")) {
648+
pl->full_cone = 1;
649+
return;
650+
}
651+
652+
if (given->patternlen > 2 &&
653+
!strcmp(given->pattern + given->patternlen - 2, "/*")) {
654+
if (!(given->flags & PATTERN_FLAG_NEGATIVE)) {
655+
/* Not a cone pattern. */
656+
pl->use_cone_patterns = 0;
657+
warning(_("unrecognized pattern: '%s'"), given->pattern);
658+
goto clear_hashmaps;
659+
}
660+
661+
truncated = xstrdup(given->pattern);
662+
truncated[given->patternlen - 2] = 0;
663+
664+
translated = xmalloc(sizeof(struct pattern_entry));
665+
translated->pattern = truncated;
666+
translated->patternlen = given->patternlen - 2;
667+
hashmap_entry_init(&translated->ent,
668+
memhash(translated->pattern, translated->patternlen));
669+
670+
if (!hashmap_get_entry(&pl->recursive_hashmap,
671+
translated, ent, NULL)) {
672+
/* We did not see the "parent" included */
673+
warning(_("unrecognized negative pattern: '%s'"),
674+
given->pattern);
675+
free(truncated);
676+
free(translated);
677+
goto clear_hashmaps;
678+
}
679+
680+
hashmap_add(&pl->parent_hashmap, &translated->ent);
681+
hashmap_remove(&pl->recursive_hashmap, &translated->ent, &data);
682+
free(data);
683+
return;
684+
}
685+
686+
if (given->flags & PATTERN_FLAG_NEGATIVE) {
687+
warning(_("unrecognized negative pattern: '%s'"),
688+
given->pattern);
689+
goto clear_hashmaps;
690+
}
691+
692+
translated = xmalloc(sizeof(struct pattern_entry));
693+
694+
translated->pattern = xstrdup(given->pattern);
695+
translated->patternlen = given->patternlen;
696+
hashmap_entry_init(&translated->ent,
697+
memhash(translated->pattern, translated->patternlen));
698+
699+
hashmap_add(&pl->recursive_hashmap, &translated->ent);
700+
701+
if (hashmap_get_entry(&pl->parent_hashmap, translated, ent, NULL)) {
702+
/* we already included this at the parent level */
703+
warning(_("your sparse-checkout file may have issues: pattern '%s' is repeated"),
704+
given->pattern);
705+
hashmap_remove(&pl->parent_hashmap, &translated->ent, &data);
706+
free(data);
707+
free(translated);
708+
}
709+
710+
return;
711+
712+
clear_hashmaps:
713+
warning(_("disabling cone pattern matching"));
714+
hashmap_free_entries(&pl->parent_hashmap, struct pattern_entry, ent);
715+
hashmap_free_entries(&pl->recursive_hashmap, struct pattern_entry, ent);
716+
pl->use_cone_patterns = 0;
717+
}
718+
719+
static int hashmap_contains_path(struct hashmap *map,
720+
struct strbuf *pattern)
721+
{
722+
struct pattern_entry p;
723+
724+
/* Check straight mapping */
725+
p.pattern = pattern->buf;
726+
p.patternlen = pattern->len;
727+
hashmap_entry_init(&p.ent, memhash(p.pattern, p.patternlen));
728+
return !!hashmap_get_entry(map, &p, ent, NULL);
729+
}
730+
731+
int hashmap_contains_parent(struct hashmap *map,
732+
const char *path,
733+
struct strbuf *buffer)
734+
{
735+
char *slash_pos;
736+
737+
strbuf_setlen(buffer, 0);
738+
739+
if (path[0] != '/')
740+
strbuf_addch(buffer, '/');
741+
742+
strbuf_addstr(buffer, path);
743+
744+
slash_pos = strrchr(buffer->buf, '/');
745+
746+
while (slash_pos > buffer->buf) {
747+
strbuf_setlen(buffer, slash_pos - buffer->buf);
748+
749+
if (hashmap_contains_path(map, buffer))
750+
return 1;
751+
752+
slash_pos = strrchr(buffer->buf, '/');
753+
}
754+
755+
return 0;
756+
}
757+
614758
void add_pattern(const char *string, const char *base,
615759
int baselen, struct pattern_list *pl, int srcpos)
616760
{
@@ -635,6 +779,8 @@ void add_pattern(const char *string, const char *base,
635779
ALLOC_GROW(pl->patterns, pl->nr + 1, pl->alloc);
636780
pl->patterns[pl->nr++] = pattern;
637781
pattern->pl = pl;
782+
783+
add_pattern_to_hashsets(pl, pattern);
638784
}
639785

640786
static int read_skip_worktree_file_from_index(const struct index_state *istate,
@@ -860,6 +1006,9 @@ static int add_patterns_from_buffer(char *buf, size_t size,
8601006
int i, lineno = 1;
8611007
char *entry;
8621008

1009+
hashmap_init(&pl->recursive_hashmap, pl_hashmap_cmp, NULL, 0);
1010+
hashmap_init(&pl->parent_hashmap, pl_hashmap_cmp, NULL, 0);
1011+
8631012
pl->filebuf = buf;
8641013

8651014
if (skip_utf8_bom(&buf, size))
@@ -1096,16 +1245,58 @@ enum pattern_match_result path_matches_pattern_list(
10961245
struct index_state *istate)
10971246
{
10981247
struct path_pattern *pattern;
1099-
pattern = last_matching_pattern_from_list(pathname, pathlen, basename,
1100-
dtype, pl, istate);
1101-
if (pattern) {
1102-
if (pattern->flags & PATTERN_FLAG_NEGATIVE)
1103-
return NOT_MATCHED;
1104-
else
1105-
return MATCHED;
1248+
struct strbuf parent_pathname = STRBUF_INIT;
1249+
int result = NOT_MATCHED;
1250+
const char *slash_pos;
1251+
1252+
if (!pl->use_cone_patterns) {
1253+
pattern = last_matching_pattern_from_list(pathname, pathlen, basename,
1254+
dtype, pl, istate);
1255+
if (pattern) {
1256+
if (pattern->flags & PATTERN_FLAG_NEGATIVE)
1257+
return NOT_MATCHED;
1258+
else
1259+
return MATCHED;
1260+
}
1261+
1262+
return UNDECIDED;
1263+
}
1264+
1265+
if (pl->full_cone)
1266+
return MATCHED;
1267+
1268+
strbuf_addch(&parent_pathname, '/');
1269+
strbuf_add(&parent_pathname, pathname, pathlen);
1270+
1271+
if (hashmap_contains_path(&pl->recursive_hashmap,
1272+
&parent_pathname)) {
1273+
result = MATCHED;
1274+
goto done;
1275+
}
1276+
1277+
slash_pos = strrchr(parent_pathname.buf, '/');
1278+
1279+
if (slash_pos == parent_pathname.buf) {
1280+
/* include every file in root */
1281+
result = MATCHED;
1282+
goto done;
11061283
}
11071284

1108-
return UNDECIDED;
1285+
strbuf_setlen(&parent_pathname, slash_pos - parent_pathname.buf);
1286+
1287+
if (hashmap_contains_path(&pl->parent_hashmap, &parent_pathname)) {
1288+
result = MATCHED;
1289+
goto done;
1290+
}
1291+
1292+
if (hashmap_contains_parent(&pl->recursive_hashmap,
1293+
pathname,
1294+
&parent_pathname))
1295+
result = MATCHED;
1296+
1297+
done:
1298+
strbuf_release(&parent_pathname);
1299+
return result;
11091300
}
11101301

11111302
static struct path_pattern *last_matching_pattern_from_lists(

dir.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
/* See Documentation/technical/api-directory-listing.txt */
55

66
#include "cache.h"
7+
#include "hashmap.h"
78
#include "strbuf.h"
89

910
struct dir_entry {
@@ -37,6 +38,13 @@ struct path_pattern {
3738
int srcpos;
3839
};
3940

41+
/*
 * Used for hashmaps for cone patterns: one entry per directory prefix
 * stored in a pattern_list's recursive_hashmap or parent_hashmap.
 * Temporary entries of this type are also built as lookup keys.
 */
42+
struct pattern_entry {
43+
/* hashmap linkage; the comparison callback recovers the entry from it */
struct hashmap_entry ent;
44+
/* pattern text that serves as the key */
char *pattern;
45+
/* number of bytes of 'pattern' that are hashed and compared */
size_t patternlen;
46+
};
47+
4048
/*
4149
* Each excludes file will be parsed into a fresh exclude_list which
4250
* is appended to the relevant exclude_list_group (either EXC_DIRS or
@@ -55,6 +63,26 @@ struct pattern_list {
5563
const char *src;
5664

5765
struct path_pattern **patterns;
66+
67+
/*
68+
* While scanning the excludes, we attempt to match the patterns
69+
* with a more restricted set that allows us to use hashsets for
70+
* matching logic, which is faster than the linear lookup in the
71+
* excludes array above. If non-zero, that check succeeded.
72+
*/
73+
unsigned use_cone_patterns;
74+
unsigned full_cone;
75+
76+
/*
77+
* Stores paths where everything starting with those paths
78+
* is included.
79+
*/
80+
struct hashmap recursive_hashmap;
81+
82+
/*
83+
* Used to check single-level parents of blobs.
84+
*/
85+
struct hashmap parent_hashmap;
5886
};
5987

6088
/*
@@ -271,6 +299,9 @@ int is_excluded(struct dir_struct *dir,
271299
struct index_state *istate,
272300
const char *name, int *dtype);
273301

302+
/*
 * Returns 1 if a proper leading directory of 'path' is stored in 'map',
 * 0 otherwise. 'buffer' is used as scratch space and is overwritten.
 */
int hashmap_contains_parent(struct hashmap *map,
303+
const char *path,
304+
struct strbuf *buffer);
274305
struct pattern_list *add_pattern_list(struct dir_struct *dir,
275306
int group_type, const char *src);
276307
int add_patterns_from_file_to_list(const char *fname, const char *base, int baselen,

t/t1091-sparse-checkout-builtin.sh

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,8 @@ test_expect_success 'set sparse-checkout using --stdin' '
151151
test_expect_success 'cone mode: match patterns' '
152152
git -C repo config --worktree core.sparseCheckoutCone true &&
153153
rm -rf repo/a repo/folder1 repo/folder2 &&
154-
git -C repo read-tree -mu HEAD &&
154+
git -C repo read-tree -mu HEAD 2>err &&
155+
test_i18ngrep ! "disabling cone patterns" err &&
155156
git -C repo reset --hard &&
156157
ls repo >dir &&
157158
cat >expect <<-EOF &&
@@ -162,6 +163,14 @@ test_expect_success 'cone mode: match patterns' '
162163
test_cmp expect dir
163164
'
164165

166+
# Append a negative directory-wildcard pattern to the sparse-checkout
# file and check that read-tree reports it on stderr as an
# "unrecognized negative pattern". The original file is saved first and
# restored by test_when_finished so later tests see it unchanged.
# NOTE(review): presumably /deep/deeper has no earlier recursive
# pattern in the file at this point -- confirm against the test setup.
test_expect_success 'cone mode: warn on bad pattern' '
167+
test_when_finished mv sparse-checkout repo/.git/info/ &&
168+
cp repo/.git/info/sparse-checkout . &&
169+
echo "!/deep/deeper/*" >>repo/.git/info/sparse-checkout &&
170+
git -C repo read-tree -mu HEAD 2>err &&
171+
test_i18ngrep "unrecognized negative pattern" err
172+
'
173+
165174
test_expect_success 'sparse-checkout disable' '
166175
git -C repo sparse-checkout disable &&
167176
test_path_is_missing repo/.git/info/sparse-checkout &&

unpack-trees.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1482,6 +1482,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
14821482
o->skip_sparse_checkout = 1;
14831483
if (!o->skip_sparse_checkout) {
14841484
char *sparse = git_pathdup("info/sparse-checkout");
1485+
pl.use_cone_patterns = core_sparse_checkout_cone;
14851486
if (add_patterns_from_file_to_list(sparse, "", 0, &pl, NULL) < 0)
14861487
o->skip_sparse_checkout = 1;
14871488
else

0 commit comments

Comments
 (0)