Skip to content

Commit 0a23331

Browse files
committed
Merge branch 'jk/fast-export-anonym-alt'
"git fast-export --anonymize" learned to take a customized mapping, allowing its users to tweak its output to make it more usable for debugging. * jk/fast-export-anonym-alt: fast-export: use local array to store anonymized oid fast-export: anonymize "master" refname fast-export: allow seeding the anonymized mapping fast-export: add a "data" callback parameter to anonymize_str() fast-export: move global "idents" anonymize hashmap into function fast-export: use a flex array to store anonymized entries fast-export: stop storing lengths in anonymized hashmaps fast-export: tighten anonymize_mem() interface to handle only strings fast-export: store anonymized oids as hex strings fast-export: use xmemdupz() for anonymizing oids t9351: derive anonymized tree checks from original repo
2 parents 0ac0947 + f39ad38 commit 0a23331

File tree

3 files changed

+174
-71
lines changed

3 files changed

+174
-71
lines changed

Documentation/git-fast-export.txt

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,11 @@ by keeping the marks the same across runs.
119119
the shape of the history and stored tree. See the section on
120120
`ANONYMIZING` below.
121121

122+
--anonymize-map=<from>[:<to>]::
123+
Convert token `<from>` to `<to>` in the anonymized output. If
124+
`<to>` is omitted, map `<from>` to itself (i.e., do not
125+
anonymize it). See the section on `ANONYMIZING` below.
126+
122127
--reference-excluded-parents::
123128
By default, running a command such as `git fast-export
124129
master~5..master` will not include the commit master{tilde}5
@@ -238,6 +243,30 @@ collapse "User 0", "User 1", etc into "User X"). This produces a much
238243
smaller output, and it is usually easy to quickly confirm that there is
239244
no private data in the stream.
240245

246+
Reproducing some bugs may require referencing particular commits or
247+
paths, which becomes challenging after refnames and paths have been
248+
anonymized. You can ask for a particular token to be left as-is or
249+
mapped to a new value. For example, if you have a bug which reproduces
250+
with `git rev-list sensitive -- secret.c`, you can run:
251+
252+
---------------------------------------------------
253+
$ git fast-export --anonymize --all \
254+
--anonymize-map=sensitive:foo \
255+
--anonymize-map=secret.c:bar.c \
256+
>stream
257+
---------------------------------------------------
258+
259+
After importing the stream, you can then run `git rev-list foo -- bar.c`
260+
in the anonymized repository.
261+
262+
Note that paths and refnames are split into tokens at slash boundaries.
263+
The command above would anonymize `subdir/secret.c` as something like
264+
`path123/bar.c`; you could then search for `bar.c` in the anonymized
265+
repository to determine the final pathname.
266+
267+
To make referencing the final pathname simpler, you can map each path
268+
component; so if you also anonymize `subdir` to `publicdir`, then the
269+
final pathname would be `publicdir/bar.c`.
241270

242271
LIMITATIONS
243272
-----------

builtin/fast-export.c

Lines changed: 107 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ static struct string_list extra_refs = STRING_LIST_INIT_NODUP;
4545
static struct string_list tag_refs = STRING_LIST_INIT_NODUP;
4646
static struct refspec refspecs = REFSPEC_INIT_FETCH;
4747
static int anonymize;
48+
static struct hashmap anonymized_seeds;
4849
static struct revision_sources revision_sources;
4950

5051
static int parse_opt_signed_tag_mode(const struct option *opt,
@@ -119,57 +120,74 @@ static int has_unshown_parent(struct commit *commit)
119120
}
120121

121122
struct anonymized_entry {
123+
struct hashmap_entry hash;
124+
const char *anon;
125+
const char orig[FLEX_ARRAY];
126+
};
127+
128+
struct anonymized_entry_key {
122129
struct hashmap_entry hash;
123130
const char *orig;
124131
size_t orig_len;
125-
const char *anon;
126-
size_t anon_len;
127132
};
128133

129134
static int anonymized_entry_cmp(const void *unused_cmp_data,
130135
const struct hashmap_entry *eptr,
131136
const struct hashmap_entry *entry_or_key,
132-
const void *unused_keydata)
137+
const void *keydata)
133138
{
134139
const struct anonymized_entry *a, *b;
135140

136141
a = container_of(eptr, const struct anonymized_entry, hash);
137-
b = container_of(entry_or_key, const struct anonymized_entry, hash);
142+
if (keydata) {
143+
const struct anonymized_entry_key *key = keydata;
144+
int equal = !strncmp(a->orig, key->orig, key->orig_len) &&
145+
!a->orig[key->orig_len];
146+
return !equal;
147+
}
138148

139-
return a->orig_len != b->orig_len ||
140-
memcmp(a->orig, b->orig, a->orig_len);
149+
b = container_of(entry_or_key, const struct anonymized_entry, hash);
150+
return strcmp(a->orig, b->orig);
141151
}
142152

143153
/*
144154
* Basically keep a cache of X->Y so that we can repeatedly replace
145155
* the same anonymized string with another. The actual generation
146156
* is farmed out to the generate function.
147157
*/
148-
static const void *anonymize_mem(struct hashmap *map,
149-
void *(*generate)(const void *, size_t *),
150-
const void *orig, size_t *len)
158+
static const char *anonymize_str(struct hashmap *map,
159+
char *(*generate)(void *),
160+
const char *orig, size_t len,
161+
void *data)
151162
{
152-
struct anonymized_entry key, *ret;
163+
struct anonymized_entry_key key;
164+
struct anonymized_entry *ret;
153165

154166
if (!map->cmpfn)
155167
hashmap_init(map, anonymized_entry_cmp, NULL, 0);
156168

157-
hashmap_entry_init(&key.hash, memhash(orig, *len));
169+
hashmap_entry_init(&key.hash, memhash(orig, len));
158170
key.orig = orig;
159-
key.orig_len = *len;
160-
ret = hashmap_get_entry(map, &key, hash, NULL);
171+
key.orig_len = len;
172+
173+
/* First check if it's a token the user configured manually... */
174+
if (anonymized_seeds.cmpfn)
175+
ret = hashmap_get_entry(&anonymized_seeds, &key, hash, &key);
176+
else
177+
ret = NULL;
178+
179+
/* ...otherwise check if we've already seen it in this context... */
180+
if (!ret)
181+
ret = hashmap_get_entry(map, &key, hash, &key);
161182

183+
/* ...and finally generate a new mapping if necessary */
162184
if (!ret) {
163-
ret = xmalloc(sizeof(*ret));
185+
FLEX_ALLOC_MEM(ret, orig, orig, len);
164186
hashmap_entry_init(&ret->hash, key.hash.hash);
165-
ret->orig = xstrdup(orig);
166-
ret->orig_len = *len;
167-
ret->anon = generate(orig, len);
168-
ret->anon_len = *len;
187+
ret->anon = generate(data);
169188
hashmap_put(map, &ret->hash);
170189
}
171190

172-
*len = ret->anon_len;
173191
return ret->anon;
174192
}
175193

@@ -181,13 +199,13 @@ static const void *anonymize_mem(struct hashmap *map,
181199
*/
182200
static void anonymize_path(struct strbuf *out, const char *path,
183201
struct hashmap *map,
184-
void *(*generate)(const void *, size_t *))
202+
char *(*generate)(void *))
185203
{
186204
while (*path) {
187205
const char *end_of_component = strchrnul(path, '/');
188206
size_t len = end_of_component - path;
189-
const char *c = anonymize_mem(map, generate, path, &len);
190-
strbuf_add(out, c, len);
207+
const char *c = anonymize_str(map, generate, path, len, NULL);
208+
strbuf_addstr(out, c);
191209
path = end_of_component;
192210
if (*path)
193211
strbuf_addch(out, *path++);
@@ -361,12 +379,12 @@ static void print_path_1(const char *path)
361379
printf("%s", path);
362380
}
363381

364-
static void *anonymize_path_component(const void *path, size_t *len)
382+
static char *anonymize_path_component(void *data)
365383
{
366384
static int counter;
367385
struct strbuf out = STRBUF_INIT;
368386
strbuf_addf(&out, "path%d", counter++);
369-
return strbuf_detach(&out, len);
387+
return strbuf_detach(&out, NULL);
370388
}
371389

372390
static void print_path(const char *path)
@@ -383,20 +401,23 @@ static void print_path(const char *path)
383401
}
384402
}
385403

386-
static void *generate_fake_oid(const void *old, size_t *len)
404+
static char *generate_fake_oid(void *data)
387405
{
388406
static uint32_t counter = 1; /* avoid null oid */
389407
const unsigned hashsz = the_hash_algo->rawsz;
390-
unsigned char *out = xcalloc(hashsz, 1);
408+
unsigned char out[GIT_MAX_RAWSZ];
409+
char *hex = xmallocz(GIT_MAX_HEXSZ);
410+
411+
hashclr(out);
391412
put_be32(out + hashsz - 4, counter++);
392-
return out;
413+
return hash_to_hex_algop_r(hex, out, the_hash_algo);
393414
}
394415

395-
static const struct object_id *anonymize_oid(const struct object_id *oid)
416+
static const char *anonymize_oid(const char *oid_hex)
396417
{
397418
static struct hashmap objs;
398-
size_t len = the_hash_algo->rawsz;
399-
return anonymize_mem(&objs, generate_fake_oid, oid, &len);
419+
size_t len = strlen(oid_hex);
420+
return anonymize_str(&objs, generate_fake_oid, oid_hex, len, NULL);
400421
}
401422

402423
static void show_filemodify(struct diff_queue_struct *q,
@@ -455,9 +476,9 @@ static void show_filemodify(struct diff_queue_struct *q,
455476
*/
456477
if (no_data || S_ISGITLINK(spec->mode))
457478
printf("M %06o %s ", spec->mode,
458-
oid_to_hex(anonymize ?
459-
anonymize_oid(&spec->oid) :
460-
&spec->oid));
479+
anonymize ?
480+
anonymize_oid(oid_to_hex(&spec->oid)) :
481+
oid_to_hex(&spec->oid));
461482
else {
462483
struct object *object = lookup_object(the_repository,
463484
&spec->oid);
@@ -493,12 +514,12 @@ static const char *find_encoding(const char *begin, const char *end)
493514
return bol;
494515
}
495516

496-
static void *anonymize_ref_component(const void *old, size_t *len)
517+
static char *anonymize_ref_component(void *data)
497518
{
498519
static int counter;
499520
struct strbuf out = STRBUF_INIT;
500521
strbuf_addf(&out, "ref%d", counter++);
501-
return strbuf_detach(&out, len);
522+
return strbuf_detach(&out, NULL);
502523
}
503524

504525
static const char *anonymize_refname(const char *refname)
@@ -517,13 +538,6 @@ static const char *anonymize_refname(const char *refname)
517538
static struct strbuf anon = STRBUF_INIT;
518539
int i;
519540

520-
/*
521-
* We also leave "master" as a special case, since it does not reveal
522-
* anything interesting.
523-
*/
524-
if (!strcmp(refname, "refs/heads/master"))
525-
return refname;
526-
527541
strbuf_reset(&anon);
528542
for (i = 0; i < ARRAY_SIZE(prefixes); i++) {
529543
if (skip_prefix(refname, prefixes[i], &refname)) {
@@ -546,14 +560,13 @@ static char *anonymize_commit_message(const char *old)
546560
return xstrfmt("subject %d\n\nbody\n", counter++);
547561
}
548562

549-
static struct hashmap idents;
550-
static void *anonymize_ident(const void *old, size_t *len)
563+
static char *anonymize_ident(void *data)
551564
{
552565
static int counter;
553566
struct strbuf out = STRBUF_INIT;
554567
strbuf_addf(&out, "User %d <user%d@example.com>", counter, counter);
555568
counter++;
556-
return strbuf_detach(&out, len);
569+
return strbuf_detach(&out, NULL);
557570
}
558571

559572
/*
@@ -563,6 +576,7 @@ static void *anonymize_ident(const void *old, size_t *len)
563576
*/
564577
static void anonymize_ident_line(const char **beg, const char **end)
565578
{
579+
static struct hashmap idents;
566580
static struct strbuf buffers[] = { STRBUF_INIT, STRBUF_INIT };
567581
static unsigned which_buffer;
568582

@@ -588,9 +602,9 @@ static void anonymize_ident_line(const char **beg, const char **end)
588602
size_t len;
589603

590604
len = split.mail_end - split.name_begin;
591-
ident = anonymize_mem(&idents, anonymize_ident,
592-
split.name_begin, &len);
593-
strbuf_add(out, ident, len);
605+
ident = anonymize_str(&idents, anonymize_ident,
606+
split.name_begin, len, NULL);
607+
strbuf_addstr(out, ident);
594608
strbuf_addch(out, ' ');
595609
strbuf_add(out, split.date_begin, split.tz_end - split.date_begin);
596610
} else {
@@ -712,9 +726,10 @@ static void handle_commit(struct commit *commit, struct rev_info *rev,
712726
if (mark)
713727
printf(":%d\n", mark);
714728
else
715-
printf("%s\n", oid_to_hex(anonymize ?
716-
anonymize_oid(&obj->oid) :
717-
&obj->oid));
729+
printf("%s\n",
730+
anonymize ?
731+
anonymize_oid(oid_to_hex(&obj->oid)) :
732+
oid_to_hex(&obj->oid));
718733
i++;
719734
}
720735

@@ -729,12 +744,12 @@ static void handle_commit(struct commit *commit, struct rev_info *rev,
729744
show_progress();
730745
}
731746

732-
static void *anonymize_tag(const void *old, size_t *len)
747+
static char *anonymize_tag(void *data)
733748
{
734749
static int counter;
735750
struct strbuf out = STRBUF_INIT;
736751
strbuf_addf(&out, "tag message %d", counter++);
737-
return strbuf_detach(&out, len);
752+
return strbuf_detach(&out, NULL);
738753
}
739754

740755
static void handle_tail(struct object_array *commits, struct rev_info *revs,
@@ -804,8 +819,8 @@ static void handle_tag(const char *name, struct tag *tag)
804819
name = anonymize_refname(name);
805820
if (message) {
806821
static struct hashmap tags;
807-
message = anonymize_mem(&tags, anonymize_tag,
808-
message, &message_size);
822+
message = anonymize_str(&tags, anonymize_tag,
823+
message, message_size, NULL);
809824
}
810825
}
811826

@@ -1136,6 +1151,37 @@ static void handle_deletes(void)
11361151
}
11371152
}
11381153

1154+
static char *anonymize_seed(void *data)
1155+
{
1156+
return xstrdup(data);
1157+
}
1158+
1159+
static int parse_opt_anonymize_map(const struct option *opt,
1160+
const char *arg, int unset)
1161+
{
1162+
struct hashmap *map = opt->value;
1163+
const char *delim, *value;
1164+
size_t keylen;
1165+
1166+
BUG_ON_OPT_NEG(unset);
1167+
1168+
delim = strchr(arg, ':');
1169+
if (delim) {
1170+
keylen = delim - arg;
1171+
value = delim + 1;
1172+
} else {
1173+
keylen = strlen(arg);
1174+
value = arg;
1175+
}
1176+
1177+
if (!keylen || !*value)
1178+
return error(_("--anonymize-map token cannot be empty"));
1179+
1180+
anonymize_str(map, anonymize_seed, arg, keylen, (void *)value);
1181+
1182+
return 0;
1183+
}
1184+
11391185
int cmd_fast_export(int argc, const char **argv, const char *prefix)
11401186
{
11411187
struct rev_info revs;
@@ -1177,6 +1223,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix)
11771223
OPT_STRING_LIST(0, "refspec", &refspecs_list, N_("refspec"),
11781224
N_("Apply refspec to exported refs")),
11791225
OPT_BOOL(0, "anonymize", &anonymize, N_("anonymize output")),
1226+
OPT_CALLBACK_F(0, "anonymize-map", &anonymized_seeds, N_("from:to"),
1227+
N_("convert <from> to <to> in anonymized output"),
1228+
PARSE_OPT_NONEG, parse_opt_anonymize_map),
11801229
OPT_BOOL(0, "reference-excluded-parents",
11811230
&reference_excluded_commits, N_("Reference parents which are not in fast-export stream by object id")),
11821231
OPT_BOOL(0, "show-original-ids", &show_original_ids,
@@ -1204,6 +1253,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix)
12041253
if (argc > 1)
12051254
usage_with_options (fast_export_usage, options);
12061255

1256+
if (anonymized_seeds.cmpfn && !anonymize)
1257+
die(_("--anonymize-map without --anonymize does not make sense"));
1258+
12071259
if (refspecs_list.nr) {
12081260
int i;
12091261

0 commit comments

Comments
 (0)