Skip to content

Commit ce025ae

Browse files
rscharfe authored and ttaylorr committed
grep: disable lookahead on error
regexec(3) can fail. E.g. on macOS it fails if it is used with a UTF-8 locale to match a valid regex against a buffer containing invalid UTF-8 characters. git grep has two ways to search for matches in a file: Either it splits its contents into lines and matches them separately, or it matches the whole content and figures out line boundaries later. The latter is done by look_ahead() and it's quicker in the common case where most files don't contain a match. Fall back to line-by-line matching if look_ahead() encounters a regexec(3) error by propagating errors out of patmatch() and bailing out of look_ahead() if there is one. This way we at least can find matches in lines that contain only valid characters. That matches the behavior of grep(1) on macOS. pcre2match() dies if pcre2_jit_match() or pcre2_match() fails, but since we use the flag PCRE2_MATCH_INVALID_UTF it handles invalid UTF-8 characters gracefully. So implement the fall-back only for regexec(3) and leave the PCRE2 matching unchanged. Reported-by: David Gstir <[email protected]> Signed-off-by: René Scharfe <[email protected]> Tested-by: David Gstir <[email protected]> Signed-off-by: Taylor Blau <[email protected]>
1 parent 34b6ce9 commit ce025ae

File tree

2 files changed

+29
-10
lines changed

2 files changed

+29
-10
lines changed

grep.c

Lines changed: 20 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -906,15 +906,17 @@ static int patmatch(struct grep_pat *p,
906906
const char *line, const char *eol,
907907
regmatch_t *match, int eflags)
908908
{
909-
int hit;
910-
911909
if (p->pcre2_pattern)
912-
hit = !pcre2match(p, line, eol, match, eflags);
913-
else
914-
hit = !regexec_buf(&p->regexp, line, eol - line, 1, match,
915-
eflags);
910+
return !pcre2match(p, line, eol, match, eflags);
916911

917-
return hit;
912+
switch (regexec_buf(&p->regexp, line, eol - line, 1, match, eflags)) {
913+
case 0:
914+
return 1;
915+
case REG_NOMATCH:
916+
return 0;
917+
default:
918+
return -1;
919+
}
918920
}
919921

920922
static void strip_timestamp(const char *bol, const char **eol_p)
@@ -952,6 +954,8 @@ static int headerless_match_one_pattern(struct grep_pat *p,
952954

953955
again:
954956
hit = patmatch(p, bol, eol, pmatch, eflags);
957+
if (hit < 0)
958+
hit = 0;
955959

956960
if (hit && p->word_regexp) {
957961
if ((pmatch[0].rm_so < 0) ||
@@ -1461,6 +1465,8 @@ static int look_ahead(struct grep_opt *opt,
14611465
regmatch_t m;
14621466

14631467
hit = patmatch(p, bol, bol + *left_p, &m, 0);
1468+
if (hit < 0)
1469+
return -1;
14641470
if (!hit || m.rm_so < 0 || m.rm_eo < 0)
14651471
continue;
14661472
if (earliest < 0 || m.rm_so < earliest)
@@ -1655,9 +1661,13 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
16551661
if (try_lookahead
16561662
&& !(last_hit
16571663
&& (show_function ||
1658-
lno <= last_hit + opt->post_context))
1659-
&& look_ahead(opt, &left, &lno, &bol))
1660-
break;
1664+
lno <= last_hit + opt->post_context))) {
1665+
hit = look_ahead(opt, &left, &lno, &bol);
1666+
if (hit < 0)
1667+
try_lookahead = 0;
1668+
else if (hit)
1669+
break;
1670+
}
16611671
eol = end_of_line(bol, &left);
16621672

16631673
if ((ctx == GREP_CONTEXT_HEAD) && (eol == bol))

t/t7810-grep.sh

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -87,6 +87,7 @@ test_expect_success setup '
8787
# Still a no-op.
8888
function dummy() {}
8989
EOF
90+
printf "\200\nASCII\n" >invalid-utf8 &&
9091
if test_have_prereq FUNNYNAMES
9192
then
9293
echo unusual >"\"unusual\" pathname" &&
@@ -534,6 +535,14 @@ do
534535
test_cmp expected actual
535536
'
536537

538+
test_expect_success "grep $L searches past invalid lines on UTF-8 locale" '
539+
LC_ALL=en_US.UTF-8 git grep A. invalid-utf8 >actual &&
540+
cat >expected <<-EOF &&
541+
invalid-utf8:ASCII
542+
EOF
543+
test_cmp expected actual
544+
'
545+
537546
test_expect_success FUNNYNAMES "grep $L should quote unusual pathnames" '
538547
cat >expected <<-EOF &&
539548
${HC}"\"unusual\" pathname":unusual

0 commit comments

Comments
 (0)