Skip to content

Commit 65b5d9f

Browse files
peff authored and gitster committed
fast-export: allow seeding the anonymized mapping
After you anonymize a repository, it can be hard to find which commits correspond between the original and the result, and thus hard to reproduce commands that triggered bugs in the original.

Let's make it possible to seed the anonymization map. This lets users either:

  - mark names to be retained as-is, if they don't consider them secret (in which case their original commands would just work)

  - map names to new values, which lets them adapt the reproduction recipe to the new names without revealing the originals

The implementation is fairly straight-forward. We already store each anonymized token in a hashmap (so that the same token appearing twice is converted to the same result). We can just introduce a new "seed" hashmap which is consulted first.

This does make a few more promises to the user about how we'll anonymize things (e.g., token-splitting pathnames). But it's unlikely that we'd want to change those rules, even if the actual anonymization of a single token changes. And it makes things much easier for the user, who can unblind only a directory name without having to specify each path within it.

One alternative to this approach would be to anonymize as we see fit, and then dump the whole refname and pathname mappings to a file. This does work, but it's a bit awkward to use (you have to manually dig the items you care about out of the mapping).

Helped-by: Eric Sunshine <[email protected]>
Signed-off-by: Jeff King <[email protected]>
Signed-off-by: Junio C Hamano <[email protected]>
1 parent d5bf91f commit 65b5d9f

File tree

3 files changed

+88
-2
lines changed

3 files changed

+88
-2
lines changed

Documentation/git-fast-export.txt

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,11 @@ by keeping the marks the same across runs.
119119
the shape of the history and stored tree. See the section on
120120
`ANONYMIZING` below.
121121

122+
--anonymize-map=<from>[:<to>]::
123+
Convert token `<from>` to `<to>` in the anonymized output. If
124+
`<to>` is omitted, map `<from>` to itself (i.e., do not
125+
anonymize it). See the section on `ANONYMIZING` below.
126+
122127
--reference-excluded-parents::
123128
By default, running a command such as `git fast-export
124129
master~5..master` will not include the commit master{tilde}5
@@ -238,6 +243,30 @@ collapse "User 0", "User 1", etc into "User X"). This produces a much
238243
smaller output, and it is usually easy to quickly confirm that there is
239244
no private data in the stream.
240245

246+
Reproducing some bugs may require referencing particular commits or
247+
paths, which becomes challenging after refnames and paths have been
248+
anonymized. You can ask for a particular token to be left as-is or
249+
mapped to a new value. For example, if you have a bug which reproduces
250+
with `git rev-list sensitive -- secret.c`, you can run:
251+
252+
---------------------------------------------------
253+
$ git fast-export --anonymize --all \
254+
--anonymize-map=sensitive:foo \
255+
--anonymize-map=secret.c:bar.c \
256+
>stream
257+
---------------------------------------------------
258+
259+
After importing the stream, you can then run `git rev-list foo -- bar.c`
260+
in the anonymized repository.
261+
262+
Note that paths and refnames are split into tokens at slash boundaries.
263+
The command above would anonymize `subdir/secret.c` as something like
264+
`path123/bar.c`; you could then search for `bar.c` in the anonymized
265+
repository to determine the final pathname.
266+
267+
To make referencing the final pathname simpler, you can map each path
268+
component; so if you also anonymize `subdir` to `publicdir`, then the
269+
final pathname would be `publicdir/bar.c`.
241270

242271
LIMITATIONS
243272
-----------

builtin/fast-export.c

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ static struct string_list extra_refs = STRING_LIST_INIT_NODUP;
4545
static struct string_list tag_refs = STRING_LIST_INIT_NODUP;
4646
static struct refspec refspecs = REFSPEC_INIT_FETCH;
4747
static int anonymize;
48+
static struct hashmap anonymized_seeds;
4849
static struct revision_sources revision_sources;
4950

5051
static int parse_opt_signed_tag_mode(const struct option *opt,
@@ -168,8 +169,18 @@ static const char *anonymize_str(struct hashmap *map,
168169
hashmap_entry_init(&key.hash, memhash(orig, len));
169170
key.orig = orig;
170171
key.orig_len = len;
171-
ret = hashmap_get_entry(map, &key, hash, &key);
172172

173+
/* First check if it's a token the user configured manually... */
174+
if (anonymized_seeds.cmpfn)
175+
ret = hashmap_get_entry(&anonymized_seeds, &key, hash, &key);
176+
else
177+
ret = NULL;
178+
179+
/* ...otherwise check if we've already seen it in this context... */
180+
if (!ret)
181+
ret = hashmap_get_entry(map, &key, hash, &key);
182+
183+
/* ...and finally generate a new mapping if necessary */
173184
if (!ret) {
174185
FLEX_ALLOC_MEM(ret, orig, orig, len);
175186
hashmap_entry_init(&ret->hash, key.hash.hash);
@@ -1147,6 +1158,37 @@ static void handle_deletes(void)
11471158
}
11481159
}
11491160

1161+
/*
 * Generator callback handed to anonymize_str() when seeding the map from
 * the command line. Instead of inventing an anonymized name, it returns a
 * copy (made with xstrdup()) of the replacement string passed in "data".
 *
 * data: the replacement token; parse_opt_anonymize_map() passes a
 *       NUL-terminated "const char *" cast to "void *".
 */
static char *anonymize_seed(void *data)
1162+
{
1163+
return xstrdup(data);
1164+
}
1165+
1166+
/*
 * Option callback for "--anonymize-map=<from>[:<to>]".
 *
 * opt:   option descriptor; opt->value is the hashmap to seed (the option
 *        table registers it with &anonymized_seeds).
 * arg:   the option argument, "<from>" or "<from>:<to>".
 * unset: negation flag; the option is registered with PARSE_OPT_NONEG and
 *        BUG_ON_OPT_NEG() guards against it being set.
 *
 * The argument is split at the FIRST ':' only, so "<from>" cannot contain
 * a colon while "<to>" may. Without a ':' the token is mapped to itself.
 *
 * Returns 0 on success, or the result of error() when either "<from>" or
 * "<to>" is empty.
 */
static int parse_opt_anonymize_map(const struct option *opt,
1167+
const char *arg, int unset)
1168+
{
1169+
struct hashmap *map = opt->value;
1170+
const char *delim, *value;
1171+
size_t keylen;
1172+
1173+
BUG_ON_OPT_NEG(unset);
1174+
1175+
/* Locate the first ':' separating "<from>" from "<to>", if any. */
delim = strchr(arg, ':');
1176+
if (delim) {
1177+
keylen = delim - arg;
1178+
value = delim + 1;
1179+
} else {
1180+
/* No "<to>": the whole argument is the key and maps to itself. */
keylen = strlen(arg);
1181+
value = arg;
1182+
}
1183+
1184+
/* Reject "", ":to" (empty key) and "from:" (empty replacement). */
if (!keylen || !*value)
1185+
return error(_("--anonymize-map token cannot be empty"));
1186+
1187+
/*
 * Record the mapping for the first keylen bytes of arg, with
 * anonymize_seed as the generator and value as its data; the
 * returned string is not needed here.
 */
anonymize_str(map, anonymize_seed, arg, keylen, (void *)value);
1188+
1189+
return 0;
1190+
}
1191+
11501192
int cmd_fast_export(int argc, const char **argv, const char *prefix)
11511193
{
11521194
struct rev_info revs;
@@ -1188,6 +1230,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix)
11881230
OPT_STRING_LIST(0, "refspec", &refspecs_list, N_("refspec"),
11891231
N_("Apply refspec to exported refs")),
11901232
OPT_BOOL(0, "anonymize", &anonymize, N_("anonymize output")),
1233+
OPT_CALLBACK_F(0, "anonymize-map", &anonymized_seeds, N_("from:to"),
1234+
N_("convert <from> to <to> in anonymized output"),
1235+
PARSE_OPT_NONEG, parse_opt_anonymize_map),
11911236
OPT_BOOL(0, "reference-excluded-parents",
11921237
&reference_excluded_commits, N_("Reference parents which are not in fast-export stream by object id")),
11931238
OPT_BOOL(0, "show-original-ids", &show_original_ids,
@@ -1215,6 +1260,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix)
12151260
if (argc > 1)
12161261
usage_with_options (fast_export_usage, options);
12171262

1263+
if (anonymized_seeds.cmpfn && !anonymize)
1264+
die(_("--anonymize-map without --anonymize does not make sense"));
1265+
12181266
if (refspecs_list.nr) {
12191267
int i;
12201268

t/t9351-fast-export-anonymize.sh

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ test_description='basic tests for fast-export --anonymize'
66
test_expect_success 'setup simple repo' '
77
test_commit base &&
88
test_commit foo &&
9+
test_commit retain-me &&
910
git checkout -b other HEAD^ &&
1011
mkdir subdir &&
1112
test_commit subdir/bar &&
@@ -18,7 +19,10 @@ test_expect_success 'setup simple repo' '
1819
'
1920

2021
test_expect_success 'export anonymized stream' '
21-
git fast-export --anonymize --all >stream
22+
git fast-export --anonymize --all \
23+
--anonymize-map=retain-me \
24+
--anonymize-map=xyzzy:custom-name \
25+
>stream
2226
'
2327

2428
# this also covers commit messages
@@ -30,6 +34,11 @@ test_expect_success 'stream omits path names' '
3034
! grep xyzzy stream
3135
'
3236

37+
# The export above was run with --anonymize-map=retain-me (mapped to itself)
# and --anonymize-map=xyzzy:custom-name, so both "retain-me" and
# "custom-name" must show up in the anonymized stream.
test_expect_success 'stream contains user-specified names' '
38+
grep retain-me stream &&
39+
grep custom-name stream
40+
'
41+
3342
test_expect_success 'stream omits gitlink oids' '
3443
# avoid relying on the whole oid to remain hash-agnostic; this is
3544
# plenty to be unique within our test case

0 commit comments

Comments
 (0)