Skip to content

Commit 65a1453

Browse files
committed
Add Progress instruction to avoid infinite loops in repeated empty expressions
With certain repeated empty expressions similar to (x*)*?, the backtracker can go into an infinite loop. This change adds the Progress instruction which requires the engine to make progress to continue matching a repeated subexpression. Fixes #375
1 parent 2f18730 commit 65a1453

File tree

6 files changed

+126
-6
lines changed

6 files changed

+126
-6
lines changed

src/backtrack.rs

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ pub struct Bounded<'a, 'm, 'r, 's, I> {
5656
input: I,
5757
matches: &'m mut [bool],
5858
slots: &'s mut [Slot],
59+
/// Stores input positions for the Progress instruction.
60+
progress_slots: Vec<Slot>,
5961
m: &'a mut Cache,
6062
}
6163

@@ -84,6 +86,8 @@ impl Cache {
8486
enum Job {
8587
Inst { ip: InstPtr, at: InputAt },
8688
SaveRestore { slot: usize, old_pos: Option<usize> },
89+
/// Restores a saved progress_slot.
90+
SaveRestoreProgress { slot: usize, old_pos: Option<usize> },
8791
}
8892

8993
impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
@@ -102,11 +106,16 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
102106
let mut cache = cache.borrow_mut();
103107
let mut cache = &mut cache.backtrack;
104108
let start = input.at(start);
109+
let mut progress_slots = Vec::new();
110+
for _ in 0..prog.num_progress_slots {
111+
progress_slots.push(None);
112+
}
105113
let mut b = Bounded {
106114
prog: prog,
107115
input: input,
108116
matches: matches,
109117
slots: slots,
118+
progress_slots: progress_slots,
110119
m: cache,
111120
};
112121
b.exec_(start)
@@ -204,6 +213,11 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
204213
self.slots[slot] = old_pos;
205214
}
206215
}
216+
Job::SaveRestoreProgress { slot, old_pos } => {
217+
if slot < self.slots.len() {
218+
self.progress_slots[slot] = old_pos;
219+
}
220+
}
207221
}
208222
}
209223
matched
@@ -274,6 +288,25 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
274288
}
275289
return false;
276290
}
291+
Progress(ref inst) => {
292+
if let Some(&old_pos) = self.progress_slots.get(inst.slot) {
293+
if let Some(p) = old_pos {
294+
if p >= at.pos() {
295+
return false;
296+
}
297+
}
298+
// If this path doesn't work out, then we save the old
299+
// progress index (if one exists) in an alternate
300+
// job. If the next path fails, then the alternate
301+
// job is popped and the old progress index is restored.
302+
self.m.jobs.push(Job::SaveRestoreProgress {
303+
slot: inst.slot,
304+
old_pos: old_pos,
305+
});
306+
self.progress_slots[inst.slot] = Some(at.pos());
307+
}
308+
ip = inst.goto;
309+
}
277310
}
278311
if self.has_visited(ip, at) {
279312
return false;

src/compile.rs

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ use utf8_ranges::{Utf8Range, Utf8Sequence, Utf8Sequences};
2222
use prog::{
2323
Program, Inst, InstPtr, EmptyLook,
2424
InstSave, InstSplit, InstEmptyLook, InstChar, InstRanges, InstBytes,
25+
InstProgress,
2526
};
2627

2728
use Error;
@@ -45,6 +46,8 @@ pub struct Compiler {
4546
suffix_cache: SuffixCache,
4647
utf8_seqs: Option<Utf8Sequences>,
4748
byte_classes: ByteClassSet,
49+
/// Number of progress slots used for the Progress instruction.
50+
progress_count: usize,
4851
}
4952

5053
impl Compiler {
@@ -61,6 +64,7 @@ impl Compiler {
6164
suffix_cache: SuffixCache::new(1000),
6265
utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
6366
byte_classes: ByteClassSet::new(),
67+
progress_count: 0,
6468
}
6569
}
6670

@@ -535,13 +539,17 @@ impl Compiler {
535539
) -> Result {
536540
let split_entry = self.insts.len();
537541
let split = self.push_split_hole();
538-
let Patch { hole: hole_rep, entry: entry_rep } = try!(self.c(expr));
542+
let begin = self.insts.len();
543+
if !greedy && matches_empty(expr) {
544+
self.push_progress();
545+
}
546+
let Patch { hole: hole_rep, entry: _ } = try!(self.c(expr));
539547

540548
self.fill(hole_rep, split_entry);
541549
let split_hole = if greedy {
542-
self.fill_split(split, Some(entry_rep), None)
550+
self.fill_split(split, Some(begin), None)
543551
} else {
544-
self.fill_split(split, None, Some(entry_rep))
552+
self.fill_split(split, None, Some(begin))
545553
};
546554
Ok(Patch { hole: split_hole, entry: split_entry })
547555
}
@@ -553,6 +561,9 @@ impl Compiler {
553561
) -> Result {
554562
let Patch { hole: hole_rep, entry: entry_rep } = try!(self.c(expr));
555563
self.fill_to_next(hole_rep);
564+
if !greedy && matches_empty(expr) {
565+
self.push_progress();
566+
}
556567
let split = self.push_split_hole();
557568

558569
let split_hole = if greedy {
@@ -702,6 +713,14 @@ impl Compiler {
702713
Hole::One(hole)
703714
}
704715

716+
/// Pushes a Progress instruction onto the program.
717+
fn push_progress(&mut self) {
718+
let slot = self.progress_count;
719+
self.progress_count += 1;
720+
let next = self.insts.len() + 1;
721+
self.push_compiled(Inst::Progress(InstProgress{goto: next, slot: slot}));
722+
}
723+
705724
fn check_size(&self) -> result::Result<(), Error> {
706725
use std::mem::size_of;
707726

@@ -1059,6 +1078,30 @@ fn u32_to_usize(n: u32) -> usize {
10591078
n as usize
10601079
}
10611080

1081+
/// Returns true if and only if this expression can match without consuming
1082+
/// characters or bytes.
1083+
fn matches_empty(expr: &Expr) -> bool {
1084+
match expr {
1085+
&Expr::Empty | &Expr::StartLine | &Expr::EndLine | &Expr::StartText
1086+
| &Expr::EndText | &Expr::WordBoundary | &Expr::NotWordBoundary
1087+
| &Expr::WordBoundaryAscii | &Expr::NotWordBoundaryAscii => true,
1088+
&Expr::AnyChar | &Expr::AnyCharNoNL | &Expr::AnyByte
1089+
| &Expr::AnyByteNoNL | &Expr::Class(_) | &Expr::ClassBytes(_)
1090+
=> false,
1091+
&Expr::Literal { ref chars, casei: _ } => chars.len() == 0,
1092+
&Expr::LiteralBytes { ref bytes, casei: _ } => bytes.len() == 0,
1093+
&Expr::Group { ref e, i: _, name: _ } => matches_empty(&e),
1094+
&Expr::Repeat { ref e, r, greedy: _ } => (match r {
1095+
Repeater::ZeroOrOne => true,
1096+
Repeater::ZeroOrMore => true,
1097+
Repeater::OneOrMore => false,
1098+
Repeater::Range { min, .. } => min == 0,
1099+
} || matches_empty(&e)),
1100+
&Expr::Concat(ref v) => v.iter().all(|e| matches_empty(e)),
1101+
&Expr::Alternate(ref v) => v.iter().any(|e| matches_empty(e)),
1102+
}
1103+
}
1104+
10621105
#[cfg(test)]
10631106
mod tests {
10641107
use super::ByteClassSet;

src/dfa.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ pub fn can_exec(insts: &Program) -> bool {
7777
for inst in insts {
7878
match *inst {
7979
Char(_) | Ranges(_) => return false,
80-
EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => {}
80+
EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) | Progress(_) => {}
8181
}
8282
}
8383
true
@@ -980,7 +980,7 @@ impl<'a> Fsm<'a> {
980980
// These states never happen in a byte-based program.
981981
Char(_) | Ranges(_) => unreachable!(),
982982
// These states are handled when following epsilon transitions.
983-
Save(_) | Split(_) | EmptyLook(_) => {}
983+
Save(_) | Split(_) | EmptyLook(_) | Progress(_) => {}
984984
Match(_) => {
985985
state_flags.set_match();
986986
if !self.continue_past_first_match() {
@@ -1125,6 +1125,7 @@ impl<'a> Fsm<'a> {
11251125
}
11261126
}
11271127
Save(ref inst) => self.cache.stack.push(inst.goto as InstPtr),
1128+
Progress(ref inst) => self.cache.stack.push(inst.goto as InstPtr),
11281129
Split(ref inst) => {
11291130
self.cache.stack.push(inst.goto2 as InstPtr);
11301131
self.cache.stack.push(inst.goto1 as InstPtr);
@@ -1217,6 +1218,7 @@ impl<'a> Fsm<'a> {
12171218
match self.prog[ip as usize] {
12181219
Char(_) | Ranges(_) => unreachable!(),
12191220
Save(_) | Split(_) => {}
1221+
Progress(_) => {}
12201222
Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip),
12211223
EmptyLook(_) => {
12221224
state_flags.set_empty();

src/pikevm.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,7 @@ impl<'r, I: Input> Fsm<'r, I> {
275275
}
276276
false
277277
}
278-
EmptyLook(_) | Save(_) | Split(_) => false,
278+
EmptyLook(_) | Save(_) | Split(_) | Progress(_) => false,
279279
}
280280
}
281281

@@ -347,6 +347,11 @@ impl<'r, I: Input> Fsm<'r, I> {
347347
}
348348
return;
349349
}
350+
Progress(ref inst) => {
351+
// Progress instructions are only useful for backtracking,
352+
// so we don't need to manage progress state.
353+
ip = inst.goto;
354+
}
350355
}
351356
}
352357
}

src/prog.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ pub struct Program {
7272
/// simultaneously, then the DFA cache is not shared. Instead, copies are
7373
/// made.
7474
pub dfa_size_limit: usize,
75+
/// The number of progress slots.
76+
pub num_progress_slots: usize,
7577
}
7678

7779
impl Program {
@@ -94,6 +96,7 @@ impl Program {
9496
has_unicode_word_boundary: false,
9597
prefixes: LiteralSearcher::empty(),
9698
dfa_size_limit: 2 * (1<<20),
99+
num_progress_slots: 0,
97100
}
98101
}
99102

@@ -224,6 +227,10 @@ impl fmt::Debug for Program {
224227
try!(write!(f, "{:04} {}",
225228
pc, with_goto(pc, inst.goto, s)));
226229
}
230+
Progress(ref inst) => {
231+
let s = format!("{:04} Progress({})", pc, inst.slot);
232+
try!(write!(f, "{}", with_goto(pc, inst.goto, s)));
233+
}
227234
}
228235
if pc == self.start {
229236
try!(write!(f, " (start)"));
@@ -286,6 +293,12 @@ pub enum Inst {
286293
/// used in conjunction with Split instructions to implement multi-byte
287294
/// character classes.
288295
Bytes(InstBytes),
296+
/// Progress marks a location that a program is required to make progress.
297+
/// If the same progress instruction is encountered multiple times, each
298+
/// encounter only matches if more of the input has been matched than at the
299+
/// previous encounter. This is useful for preventing infinite loops in
300+
/// regexes like `(x*)*`.
301+
Progress(InstProgress),
289302
}
290303

291304
impl Inst {
@@ -423,3 +436,13 @@ impl InstBytes {
423436
self.start <= byte && byte <= self.end
424437
}
425438
}
439+
440+
/// Representation of the Progress instruction.
441+
#[derive(Clone, Debug)]
442+
pub struct InstProgress {
443+
/// The next location to execute in the program if this instruction
444+
/// succeeds.
445+
pub goto: InstPtr,
446+
/// The progress slot to store the minimum index of the next input to read.
447+
pub slot: usize,
448+
}

tests/crazy.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,20 @@ mat!(negclass_space_comma, r"[^,\s]", ", a", Some((2, 3)));
4747
mat!(negclass_comma_space, r"[^\s,]", " ,a", Some((2, 3)));
4848
mat!(negclass_ascii, r"[^[:alpha:]Z]", "A1", Some((1, 2)));
4949

50+
// Test that repeated empty expressions don't loop forever.
51+
mat!(lazy_many_many, r"((?:.*)*?)=", "a=b", Some((0, 2)));
52+
mat!(lazy_many_optional, r"((?:.?)*?)=", "a=b", Some((0, 2)));
53+
mat!(lazy_one_many_many, r"((?:.*)+?)=", "a=b", Some((0, 2)));
54+
mat!(lazy_one_many_optional, r"((?:.?)+?)=", "a=b", Some((0, 2)));
55+
mat!(lazy_range_min_many, r"((?:.*){1,}?)=", "a=b", Some((0, 2)));
56+
mat!(lazy_range_many, r"((?:.*){1,2}?)=", "a=b", Some((0, 2)));
57+
mat!(greedy_many_many, r"((?:.*)*)=", "a=b", Some((0, 2)));
58+
mat!(greedy_many_optional, r"((?:.?)*)=", "a=b", Some((0, 2)));
59+
mat!(greedy_one_many_many, r"((?:.*)+)=", "a=b", Some((0, 2)));
60+
mat!(greedy_one_many_optional, r"((?:.?)+)=", "a=b", Some((0, 2)));
61+
mat!(greedy_range_min_many, r"((?:.*){1,})=", "a=b", Some((0, 2)));
62+
mat!(greedy_range_many, r"((?:.*){1,2})=", "a=b", Some((0, 2)));
63+
5064
// Test that the DFA can handle pathological cases.
5165
// (This should result in the DFA's cache being flushed too frequently, which
5266
// should cause it to quit and fall back to the NFA algorithm.)

0 commit comments

Comments
 (0)