Skip to content

Commit 2575412

Browse files
avargitster
authored andcommitted
grep: make the behavior for NUL-byte in patterns sane
The behavior of "grep" when patterns contained a NUL-byte has always been haphazard, and has served the vagaries of the implementation more than anything else. A pattern containing a NUL-byte can only be provided via "-f <file>". Since pickaxe (log search) has no such flag the NUL-byte in patterns has only ever been supported by "grep" (and not "log --grep"). Since 9ecedde ("Use kwset in grep", 2011-08-21) patterns containing "\0" were considered fixed. In 966be95 ("grep: add tests to fix blind spots with \0 patterns", 2017-05-20) I added tests for this behavior. Change the behavior to do the obvious thing, i.e. don't silently discard a regex pattern and make it implicitly fixed just because they contain a NUL-byte. Instead die if the backend in question can't handle them, e.g. --basic-regexp is combined with such a pattern. This is desired because from a user's point of view it's the obvious thing to do. Whether we support BRE/ERE/Perl syntax is different from whether our implementation is limited by C-strings. These patterns are obscure enough that I think this behavior change is OK, especially since we never documented the old behavior. Doing this also makes it easier to replace the kwset backend with something else, since we'll no longer strictly need it for anything we can't easily use another fixed-string backend for. Signed-off-by: Ævar Arnfjörð Bjarmason <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent d316af0 commit 2575412

File tree

3 files changed

+110
-89
lines changed

3 files changed

+110
-89
lines changed

Documentation/git-grep.txt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,23 @@ providing this option will cause it to die.
271271

272272
-f <file>::
273273
Read patterns from <file>, one per line.
274+
+
275+
Passing the pattern via <file> allows for providing a search pattern
276+
containing a \0.
277+
+
278+
Not all pattern types support patterns containing \0. Git will error
279+
out if a given pattern type can't support such a pattern. The
280+
`--perl-regexp` pattern type when compiled against the PCRE v2 backend
281+
has the widest support for these types of patterns.
282+
+
283+
In versions of Git before 2.23.0 patterns containing \0 would be
284+
silently considered fixed. This was never documented, there were also
285+
odd and undocumented interactions between e.g. non-ASCII patterns
286+
containing \0 and `--ignore-case`.
287+
+
288+
In future versions we may learn to support patterns containing \0 for
289+
more search backends, until then we'll die when the pattern type in
290+
question doesn't support them.
274291

275292
-e::
276293
The next parameter is the pattern. This option has to be

grep.c

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -368,18 +368,6 @@ static int is_fixed(const char *s, size_t len)
368368
return 1;
369369
}
370370

371-
static int has_null(const char *s, size_t len)
372-
{
373-
/*
374-
* regcomp cannot accept patterns with NULs so when using it
375-
* we consider any pattern containing a NUL fixed.
376-
*/
377-
if (memchr(s, 0, len))
378-
return 1;
379-
380-
return 0;
381-
}
382-
383371
#ifdef USE_LIBPCRE1
384372
static void compile_pcre1_regexp(struct grep_pat *p, const struct grep_opt *opt)
385373
{
@@ -668,17 +656,20 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
668656
* simple string match using kws. p->fixed tells us if we
669657
* want to use kws.
670658
*/
671-
if (opt->fixed ||
672-
has_null(p->pattern, p->patternlen) ||
673-
is_fixed(p->pattern, p->patternlen))
659+
if (opt->fixed || is_fixed(p->pattern, p->patternlen))
674660
p->fixed = !p->ignore_case || !has_non_ascii(p->pattern);
675661

676662
if (p->fixed) {
677663
p->kws = kwsalloc(p->ignore_case ? tolower_trans_tbl : NULL);
678664
kwsincr(p->kws, p->pattern, p->patternlen);
679665
kwsprep(p->kws);
680666
return;
681-
} else if (opt->fixed) {
667+
}
668+
669+
if (memchr(p->pattern, 0, p->patternlen) && !opt->pcre2)
670+
die(_("given pattern contains NULL byte (via -f <file>). This is only supported with -P under PCRE v2"));
671+
672+
if (opt->fixed) {
682673
/*
683674
* We come here when the pattern has the non-ascii
684675
* characters we cannot case-fold, and asked to

t/t7816-grep-binary-pattern.sh

Lines changed: 86 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -2,113 +2,126 @@
22

33
test_description='git grep with a binary pattern files'
44

5-
. ./test-lib.sh
5+
. ./lib-gettext.sh
66

7-
nul_match () {
7+
nul_match_internal () {
88
matches=$1
9-
flags=$2
10-
pattern=$3
9+
prereqs=$2
10+
lc_all=$3
11+
extra_flags=$4
12+
flags=$5
13+
pattern=$6
1114
pattern_human=$(echo "$pattern" | sed 's/Q/<NUL>/g')
1215

1316
if test "$matches" = 1
1417
then
15-
test_expect_success "git grep -f f $flags '$pattern_human' a" "
18+
test_expect_success $prereqs "LC_ALL='$lc_all' git grep $extra_flags -f f $flags '$pattern_human' a" "
1619
printf '$pattern' | q_to_nul >f &&
17-
git grep -f f $flags a
20+
LC_ALL='$lc_all' git grep $extra_flags -f f $flags a
1821
"
1922
elif test "$matches" = 0
2023
then
21-
test_expect_success "git grep -f f $flags '$pattern_human' a" "
24+
test_expect_success $prereqs "LC_ALL='$lc_all' git grep $extra_flags -f f $flags '$pattern_human' a" "
25+
>stderr &&
2226
printf '$pattern' | q_to_nul >f &&
23-
test_must_fail git grep -f f $flags a
27+
test_must_fail env LC_ALL=\"$lc_all\" git grep $extra_flags -f f $flags a 2>stderr &&
28+
test_i18ngrep ! 'This is only supported with -P under PCRE v2' stderr
2429
"
25-
elif test "$matches" = T1
30+
elif test "$matches" = P
2631
then
27-
test_expect_failure "git grep -f f $flags '$pattern_human' a" "
32+
test_expect_success $prereqs "error, PCRE v2 only: LC_ALL='$lc_all' git grep -f f $flags '$pattern_human' a" "
33+
>stderr &&
2834
printf '$pattern' | q_to_nul >f &&
29-
git grep -f f $flags a
30-
"
31-
elif test "$matches" = T0
32-
then
33-
test_expect_failure "git grep -f f $flags '$pattern_human' a" "
34-
printf '$pattern' | q_to_nul >f &&
35-
test_must_fail git grep -f f $flags a
35+
test_must_fail env LC_ALL=\"$lc_all\" git grep -f f $flags a 2>stderr &&
36+
test_i18ngrep 'This is only supported with -P under PCRE v2' stderr
3637
"
3738
else
3839
test_expect_success "PANIC: Test framework error. Unknown matches value $matches" 'false'
3940
fi
4041
}
4142

43+
nul_match () {
44+
matches=$1
45+
matches_pcre2=$2
46+
matches_pcre2_locale=$3
47+
flags=$4
48+
pattern=$5
49+
pattern_human=$(echo "$pattern" | sed 's/Q/<NUL>/g')
50+
51+
nul_match_internal "$matches" "" "C" "" "$flags" "$pattern"
52+
nul_match_internal "$matches_pcre2" "LIBPCRE2" "C" "-P" "$flags" "$pattern"
53+
nul_match_internal "$matches_pcre2_locale" "LIBPCRE2,GETTEXT_LOCALE" "$is_IS_locale" "-P" "$flags" "$pattern"
54+
}
55+
4256
test_expect_success 'setup' "
4357
echo 'binaryQfileQm[*]cQ*æQð' | q_to_nul >a &&
4458
git add a &&
4559
git commit -m.
4660
"
4761

48-
nul_match 1 '-F' 'yQf'
49-
nul_match 0 '-F' 'yQx'
50-
nul_match 1 '-Fi' 'YQf'
51-
nul_match 0 '-Fi' 'YQx'
52-
nul_match 1 '' 'yQf'
53-
nul_match 0 '' 'yQx'
54-
nul_match 1 '' 'æQð'
55-
nul_match 1 '-F' 'eQm[*]c'
56-
nul_match 1 '-Fi' 'EQM[*]C'
62+
# Simple fixed-string matching that can use kwset (no -i && non-ASCII)
63+
nul_match 1 1 1 '-F' 'yQf'
64+
nul_match 0 0 0 '-F' 'yQx'
65+
nul_match 1 1 1 '-Fi' 'YQf'
66+
nul_match 0 0 0 '-Fi' 'YQx'
67+
nul_match 1 1 1 '' 'yQf'
68+
nul_match 0 0 0 '' 'yQx'
69+
nul_match 1 1 1 '' 'æQð'
70+
nul_match 1 1 1 '-F' 'eQm[*]c'
71+
nul_match 1 1 1 '-Fi' 'EQM[*]C'
5772

5873
# Regex patterns that would match but shouldn't with -F
59-
nul_match 0 '-F' 'yQ[f]'
60-
nul_match 0 '-F' '[y]Qf'
61-
nul_match 0 '-Fi' 'YQ[F]'
62-
nul_match 0 '-Fi' '[Y]QF'
63-
nul_match 0 '-F' 'æQ[ð]'
64-
nul_match 0 '-F' '[æ]Qð'
65-
nul_match 0 '-Fi' 'ÆQ[Ð]'
66-
nul_match 0 '-Fi' '[Æ]QÐ'
74+
nul_match 0 0 0 '-F' 'yQ[f]'
75+
nul_match 0 0 0 '-F' '[y]Qf'
76+
nul_match 0 0 0 '-Fi' 'YQ[F]'
77+
nul_match 0 0 0 '-Fi' '[Y]QF'
78+
nul_match 0 0 0 '-F' 'æQ[ð]'
79+
nul_match 0 0 0 '-F' '[æ]Qð'
6780

68-
# kwset is disabled on -i & non-ASCII. No way to match non-ASCII \0
69-
# patterns case-insensitively.
70-
nul_match T1 '-i' 'ÆQÐ'
81+
# The -F kwset codepath can't handle -i && non-ASCII...
82+
nul_match P 1 1 '-i' '[æ]Qð'
7183

72-
# \0 implicitly disables regexes. This is an undocumented internal
73-
# limitation.
74-
nul_match T1 '' 'yQ[f]'
75-
nul_match T1 '' '[y]Qf'
76-
nul_match T1 '-i' 'YQ[F]'
77-
nul_match T1 '-i' '[Y]Qf'
78-
nul_match T1 '' 'æQ[ð]'
79-
nul_match T1 '' '[æ]Qð'
80-
nul_match T1 '-i' 'ÆQ[Ð]'
84+
# ...PCRE v2 only matches non-ASCII with -i casefolding under UTF-8
85+
# semantics
86+
nul_match P P P '-Fi' 'ÆQ[Ð]'
87+
nul_match P 0 1 '-i' 'ÆQ[Ð]'
88+
nul_match P 0 1 '-i' '[Æ]QÐ'
89+
nul_match P 0 1 '-i' '[Æ]Qð'
90+
nul_match P 0 1 '-i' 'ÆQÐ'
8191

82-
# ... because of \0 implicitly disabling regexes regexes that
83-
# should/shouldn't match don't do the right thing.
84-
nul_match T1 '' 'eQm.*cQ'
85-
nul_match T1 '-i' 'EQM.*cQ'
86-
nul_match T0 '' 'eQm[*]c'
87-
nul_match T0 '-i' 'EQM[*]C'
92+
# \0 in regexes can only work with -P & PCRE v2
93+
nul_match P 1 1 '' 'yQ[f]'
94+
nul_match P 1 1 '' '[y]Qf'
95+
nul_match P 1 1 '-i' 'YQ[F]'
96+
nul_match P 1 1 '-i' '[Y]Qf'
97+
nul_match P 1 1 '' 'æQ[ð]'
98+
nul_match P 1 1 '' '[æ]Qð'
99+
nul_match P 0 1 '-i' 'ÆQ[Ð]'
100+
nul_match P 1 1 '' 'eQm.*cQ'
101+
nul_match P 1 1 '-i' 'EQM.*cQ'
102+
nul_match P 0 0 '' 'eQm[*]c'
103+
nul_match P 0 0 '-i' 'EQM[*]C'
88104

89-
# Due to the REG_STARTEND extension when kwset() is disabled on -i &
90-
# non-ASCII the string will be matched in its entirety, but the
91-
# pattern will be cut off at the first \0.
92-
nul_match 0 '-i' 'NOMATCHQð'
93-
nul_match T0 '-i' '[Æ]QNOMATCH'
94-
nul_match T0 '-i' '[æ]QNOMATCH'
95-
# Matches, but for the wrong reasons, just stops at [æ]
96-
nul_match 1 '-i' '[Æ]Qð'
97-
nul_match 1 '-i' '[æ]Qð'
105+
# Assert that we're using REG_STARTEND and the pattern doesn't match
106+
# just because it's cut off at the first \0.
107+
nul_match 0 0 0 '-i' 'NOMATCHQð'
108+
nul_match P 0 0 '-i' '[Æ]QNOMATCH'
109+
nul_match P 0 0 '-i' '[æ]QNOMATCH'
98110

99111
# Ensure that the matcher doesn't regress to something that stops at
100112
# \0
101-
nul_match 0 '-F' 'yQ[f]'
102-
nul_match 0 '-Fi' 'YQ[F]'
103-
nul_match 0 '' 'yQNOMATCH'
104-
nul_match 0 '' 'QNOMATCH'
105-
nul_match 0 '-i' 'YQNOMATCH'
106-
nul_match 0 '-i' 'QNOMATCH'
107-
nul_match 0 '-F' 'æQ[ð]'
108-
nul_match 0 '-Fi' 'ÆQ[Ð]'
109-
nul_match 0 '' 'yQNÓMATCH'
110-
nul_match 0 '' 'QNÓMATCH'
111-
nul_match 0 '-i' 'YQNÓMATCH'
112-
nul_match 0 '-i' 'QNÓMATCH'
113+
nul_match 0 0 0 '-F' 'yQ[f]'
114+
nul_match 0 0 0 '-Fi' 'YQ[F]'
115+
nul_match 0 0 0 '' 'yQNOMATCH'
116+
nul_match 0 0 0 '' 'QNOMATCH'
117+
nul_match 0 0 0 '-i' 'YQNOMATCH'
118+
nul_match 0 0 0 '-i' 'QNOMATCH'
119+
nul_match 0 0 0 '-F' 'æQ[ð]'
120+
nul_match P P P '-Fi' 'ÆQ[Ð]'
121+
nul_match P 0 1 '-i' 'ÆQ[Ð]'
122+
nul_match 0 0 0 '' 'yQNÓMATCH'
123+
nul_match 0 0 0 '' 'QNÓMATCH'
124+
nul_match 0 0 0 '-i' 'YQNÓMATCH'
125+
nul_match 0 0 0 '-i' 'QNÓMATCH'
113126

114127
test_done

0 commit comments

Comments
 (0)