Skip to content

Commit bb2e658

Browse files
committed
expression: add explicit well-formedness check to parser
When we convert the parser to be non-recursive, it will be simpler if we first do a pass to confirm that the expression is well-formed (all parens match and are balanced, commas in the right place, etc.). After that we can assume the string is well-formed and write an algorithm that basically just translates it into a data structure. Conveniently, because this check is independent of the parsing algorithm, we can add it to the existing code as a pre-check. Since all errors during parsing should occur during the pre-check, we can then turn the existing stringly-typed errors into unreachable!()s. By fuzzing the resulting code we can guarantee that our new pre-check is at least as strict as the existing parser. In a later PR we will generalize the pre-check a bit so that it treats *both* () and {} as parentheses, and markes tree nodes based on which parens are used. But until we change the Taproot parser (which is is an ad-hoc mess of combination manual string parsing and tree parsing, and expects tapleaves to show up as the "name"s of childless nodes in a tree of {} combinators) we can't effectively do this. There is a new unit test `parse_tree_taproot` that will track this progress.
1 parent 9b03043 commit bb2e658

File tree

3 files changed

+269
-20
lines changed

3 files changed

+269
-20
lines changed

src/expression/error.rs

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,105 @@ use core::fmt;
77
use crate::prelude::*;
88
use crate::ThresholdError;
99

10+
/// An error parsing an expression tree.
11+
#[derive(Debug, PartialEq, Eq)]
12+
pub enum ParseTreeError {
13+
/// Expression tree had depth exceeding our hard cap.
14+
MaxRecursionDepthExceeded {
15+
/// The depth of the tree that was attempted to be parsed.
16+
actual: usize,
17+
/// The maximum depth.
18+
maximum: u32,
19+
},
20+
/// Character occurred which was not part of the valid descriptor character set.
21+
InvalidCharacter {
22+
/// The character in question.
23+
ch: char,
24+
/// Its byte-index into the string.
25+
pos: usize,
26+
},
27+
/// After a close-paren, the only valid next characters are close-parens and commas. Got
28+
/// something else.
29+
ExpectedParenOrComma {
30+
/// What we got instead.
31+
ch: char,
32+
/// Its byte-index into the string.
33+
pos: usize,
34+
},
35+
/// An open-parenthesis had no corresponding close-parenthesis.
36+
UnmatchedOpenParen {
37+
/// The character in question ('(' or '{')
38+
ch: char,
39+
/// Its byte-index into the string.
40+
pos: usize,
41+
},
42+
/// A close-parenthesis had no corresponding open-parenthesis.
43+
UnmatchedCloseParen {
44+
/// The character in question (')' or '}')
45+
ch: char,
46+
/// Its byte-index into the string.
47+
pos: usize,
48+
},
49+
/// A `(` was matched with a `}` or vice-versa.
50+
MismatchedParens {
51+
/// The opening parenthesis ('(' or '{')
52+
open_ch: char,
53+
/// The position of the opening parethesis.
54+
open_pos: usize,
55+
/// The closing parenthesis (')' or '}')
56+
close_ch: char,
57+
/// The position of the closing parethesis.
58+
close_pos: usize,
59+
},
60+
/// Data occurred after the final ).
61+
TrailingCharacter {
62+
/// The first trailing character.
63+
ch: char,
64+
/// Its byte-index into the string.
65+
pos: usize,
66+
},
67+
}
68+
69+
impl fmt::Display for ParseTreeError {
70+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
71+
match self {
72+
ParseTreeError::MaxRecursionDepthExceeded { actual, maximum } => {
73+
write!(f, "maximum recursion depth exceeded (max {}, got {})", maximum, actual)
74+
}
75+
ParseTreeError::InvalidCharacter { ch, pos } => {
76+
write!(f, "character `{}` (position {}) not allowed in descriptor", ch, pos)
77+
}
78+
ParseTreeError::ExpectedParenOrComma { ch, pos } => {
79+
write!(
80+
f,
81+
"invalid character `{}` (position {}); expected comma or close-paren",
82+
ch, pos
83+
)
84+
}
85+
ParseTreeError::UnmatchedOpenParen { ch, pos } => {
86+
write!(f, "`{}` (position {}) not closed", ch, pos)
87+
}
88+
ParseTreeError::UnmatchedCloseParen { ch, pos } => {
89+
write!(f, "`{}` (position {}) not opened", ch, pos)
90+
}
91+
ParseTreeError::MismatchedParens { open_ch, open_pos, close_ch, close_pos } => {
92+
write!(
93+
f,
94+
"`{}` (position {}) closed by `{}` (position {})",
95+
open_ch, open_pos, close_ch, close_pos
96+
)
97+
}
98+
ParseTreeError::TrailingCharacter { ch, pos } => {
99+
write!(f, "trailing data `{}...` (position {})", ch, pos)
100+
}
101+
}
102+
}
103+
}
104+
#[cfg(feature = "std")]
105+
impl std::error::Error for ParseTreeError {
106+
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { None }
107+
}
108+
10109
/// Error parsing a threshold expression.
11110
#[derive(Clone, Debug, PartialEq, Eq)]
12111
pub enum ParseThresholdError {

src/expression/mod.rs

Lines changed: 165 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ mod error;
88
use core::fmt;
99
use core::str::FromStr;
1010

11-
pub use self::error::ParseThresholdError;
11+
pub use self::error::{ParseThresholdError, ParseTreeError};
1212
use crate::prelude::*;
1313
use crate::{errstr, Error, Threshold, MAX_RECURSION_DEPTH};
1414

@@ -145,13 +145,107 @@ impl<'a> Tree<'a> {
145145
Self::from_slice_delim(sl, 0u32, '(')
146146
}
147147

148+
fn parse_pre_check(s: &str, open: u8, close: u8) -> Result<(), ParseTreeError> {
149+
// First, scan through string to make sure it is well-formed.
150+
let mut max_depth = 0;
151+
// Do ASCII check first; after this we can use .bytes().enumerate() rather
152+
// than .char_indices(), which is *significantly* faster.
153+
for (pos, ch) in s.char_indices() {
154+
if !(32..128).contains(&u32::from(ch)) {
155+
return Err(ParseTreeError::InvalidCharacter { ch, pos });
156+
}
157+
}
158+
159+
let mut open_paren_stack = Vec::with_capacity(128);
160+
161+
for (pos, ch) in s.bytes().enumerate() {
162+
if ch == open {
163+
open_paren_stack.push((ch, pos));
164+
if max_depth < open_paren_stack.len() {
165+
max_depth = open_paren_stack.len();
166+
}
167+
} else if ch == close {
168+
if let Some((open_ch, open_pos)) = open_paren_stack.pop() {
169+
if (open_ch == b'(' && ch == b'}') || (open_ch == b'{' && ch == b')') {
170+
return Err(ParseTreeError::MismatchedParens {
171+
open_ch: open_ch.into(),
172+
open_pos,
173+
close_ch: ch.into(),
174+
close_pos: pos,
175+
});
176+
}
177+
178+
if let Some(&(paren_ch, paren_pos)) = open_paren_stack.last() {
179+
// not last paren; this should not be the end of the string,
180+
// and the next character should be a , ) or }.
181+
if pos == s.len() - 1 {
182+
return Err(ParseTreeError::UnmatchedOpenParen {
183+
ch: paren_ch.into(),
184+
pos: paren_pos,
185+
});
186+
} else {
187+
let next_byte = s.as_bytes()[pos + 1];
188+
if next_byte != b')' && next_byte != b'}' && next_byte != b',' {
189+
return Err(ParseTreeError::ExpectedParenOrComma {
190+
ch: next_byte.into(),
191+
pos: pos + 1,
192+
});
193+
//
194+
}
195+
}
196+
} else {
197+
// last paren; this SHOULD be the end of the string
198+
if pos < s.len() - 1 {
199+
return Err(ParseTreeError::TrailingCharacter {
200+
ch: s.as_bytes()[pos + 1].into(),
201+
pos: pos + 1,
202+
});
203+
}
204+
}
205+
} else {
206+
// In practice, this is only hit if there are no open parens at all.
207+
// If there are open parens, like in "())", then on the first ), we
208+
// would have returned TrailingCharacter in the previous clause.
209+
//
210+
// From a user point of view, UnmatchedCloseParen would probably be
211+
// a clearer error to get, but it complicates the parser to do this,
212+
// and "TralingCharacter" is technically correct, so we leave it for
213+
// now.
214+
return Err(ParseTreeError::UnmatchedCloseParen { ch: ch.into(), pos });
215+
}
216+
} else if ch == b',' && open_paren_stack.is_empty() {
217+
// We consider commas outside of the tree to be "trailing characters"
218+
return Err(ParseTreeError::TrailingCharacter { ch: ch.into(), pos });
219+
}
220+
}
221+
// Catch "early end of string"
222+
if let Some((ch, pos)) = open_paren_stack.pop() {
223+
return Err(ParseTreeError::UnmatchedOpenParen { ch: ch.into(), pos });
224+
}
225+
226+
// FIXME should be able to remove this once we eliminate all recursion
227+
// in the library.
228+
if u32::try_from(max_depth).unwrap_or(u32::MAX) > MAX_RECURSION_DEPTH {
229+
return Err(ParseTreeError::MaxRecursionDepthExceeded {
230+
actual: max_depth,
231+
maximum: MAX_RECURSION_DEPTH,
232+
});
233+
}
234+
235+
Ok(())
236+
}
237+
148238
pub(crate) fn from_slice_delim(
149239
mut sl: &'a str,
150240
depth: u32,
151241
delim: char,
152242
) -> Result<(Tree<'a>, &'a str), Error> {
153-
if depth >= MAX_RECURSION_DEPTH {
154-
return Err(Error::MaxRecursiveDepthExceeded);
243+
if depth == 0 {
244+
if delim == '{' {
245+
Self::parse_pre_check(sl, b'{', b'}').map_err(Error::ParseTree)?;
246+
} else {
247+
Self::parse_pre_check(sl, b'(', b')').map_err(Error::ParseTree)?;
248+
}
155249
}
156250

157251
match next_expr(sl, delim) {
@@ -171,7 +265,7 @@ impl<'a> Tree<'a> {
171265
ret.args.push(arg);
172266

173267
if new_sl.is_empty() {
174-
return Err(Error::ExpectedChar(closing_delim(delim)));
268+
unreachable!()
175269
}
176270

177271
sl = &new_sl[1..];
@@ -181,7 +275,7 @@ impl<'a> Tree<'a> {
181275
if last_byte == closing_delim(delim) as u8 {
182276
break;
183277
} else {
184-
return Err(Error::ExpectedChar(closing_delim(delim)));
278+
unreachable!()
185279
}
186280
}
187281
}
@@ -200,7 +294,7 @@ impl<'a> Tree<'a> {
200294
if rem.is_empty() {
201295
Ok(top)
202296
} else {
203-
Err(errstr(rem))
297+
unreachable!()
204298
}
205299
}
206300

@@ -337,36 +431,88 @@ mod tests {
337431
fn parse_tree_basic() {
338432
assert_eq!(Tree::from_str("thresh").unwrap(), leaf("thresh"));
339433

340-
assert!(matches!(Tree::from_str("thresh,"), Err(Error::Unexpected(s)) if s == ","));
434+
assert!(matches!(
435+
Tree::from_str("thresh,").unwrap_err(),
436+
Error::ParseTree(ParseTreeError::TrailingCharacter { ch: ',', pos: 6 }),
437+
));
341438

342439
assert!(matches!(
343-
Tree::from_str("thresh,thresh"),
344-
Err(Error::Unexpected(s)) if s == ",thresh",
440+
Tree::from_str("thresh,thresh").unwrap_err(),
441+
Error::ParseTree(ParseTreeError::TrailingCharacter { ch: ',', pos: 6 }),
345442
));
346443

347444
assert!(matches!(
348-
Tree::from_str("thresh()thresh()"),
349-
Err(Error::Unexpected(s)) if s == "thresh()",
445+
Tree::from_str("thresh()thresh()").unwrap_err(),
446+
Error::ParseTree(ParseTreeError::TrailingCharacter { ch: 't', pos: 8 }),
350447
));
351448

352449
assert_eq!(Tree::from_str("thresh()").unwrap(), paren_node("thresh", vec![leaf("")]));
353450

354-
// FIXME even for our current crappy error handling, this one is pretty bad
355-
assert!(matches!(Tree::from_str("thresh(a()b)"), Err(Error::ExpectedChar(')'))));
451+
assert!(matches!(
452+
Tree::from_str("thresh(a()b)"),
453+
Err(Error::ParseTree(ParseTreeError::ExpectedParenOrComma { ch: 'b', pos: 10 })),
454+
));
356455

357-
assert!(matches!(Tree::from_str("thresh()xyz"), Err(Error::Unexpected(s)) if s == "xyz"));
456+
assert!(matches!(
457+
Tree::from_str("thresh()xyz"),
458+
Err(Error::ParseTree(ParseTreeError::TrailingCharacter { ch: 'x', pos: 8 })),
459+
));
358460
}
359461

360462
#[test]
361463
fn parse_tree_parens() {
362-
assert!(matches!(Tree::from_str("a("), Err(Error::ExpectedChar(')'))));
464+
assert!(matches!(
465+
Tree::from_str("a(").unwrap_err(),
466+
Error::ParseTree(ParseTreeError::UnmatchedOpenParen { ch: '(', pos: 1 }),
467+
));
468+
469+
assert!(matches!(
470+
Tree::from_str(")").unwrap_err(),
471+
Error::ParseTree(ParseTreeError::UnmatchedCloseParen { ch: ')', pos: 0 }),
472+
));
473+
474+
assert!(matches!(
475+
Tree::from_str("x(y))").unwrap_err(),
476+
Error::ParseTree(ParseTreeError::TrailingCharacter { ch: ')', pos: 4 }),
477+
));
478+
479+
/* Will be enabled in a later PR which unifies TR and non-TR parsing.
480+
assert!(matches!(
481+
Tree::from_str("a{").unwrap_err(),
482+
Error::ParseTree(ParseTreeError::UnmatchedOpenParen { ch: '{', pos: 1 }),
483+
));
484+
485+
assert!(matches!(
486+
Tree::from_str("}").unwrap_err(),
487+
Error::ParseTree(ParseTreeError::UnmatchedCloseParen { ch: '}', pos: 0 }),
488+
));
489+
*/
363490

364-
assert!(matches!(Tree::from_str(")"), Err(Error::Unexpected(s)) if s == ")"));
491+
assert!(matches!(
492+
Tree::from_str("x(y)}").unwrap_err(),
493+
Error::ParseTree(ParseTreeError::TrailingCharacter { ch: '}', pos: 4 }),
494+
));
365495

366-
assert!(matches!(Tree::from_str("x(y))"), Err(Error::Unexpected(s)) if s == ")"));
496+
/* Will be enabled in a later PR which unifies TR and non-TR parsing.
497+
assert!(matches!(
498+
Tree::from_str("x{y)").unwrap_err(),
499+
Error::ParseTree(ParseTreeError::MismatchedParens {
500+
open_ch: '{',
501+
open_pos: 1,
502+
close_ch: ')',
503+
close_pos: 3,
504+
}),
505+
));
506+
*/
507+
}
367508

368-
// In next commit will add tests related to {}s; currently we ignore
369-
// these except in Taproot mode.
509+
#[test]
510+
fn parse_tree_taproot() {
511+
// This test will change in a later PR which unifies TR and non-TR parsing.
512+
assert!(matches!(
513+
Tree::from_str("a{b(c),d}").unwrap_err(),
514+
Error::ParseTree(ParseTreeError::TrailingCharacter { ch: ',', pos: 6 }),
515+
));
370516
}
371517

372518
#[test]

src/lib.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ use bitcoin::{script, Opcode};
137137

138138
pub use crate::blanket_traits::FromStrKey;
139139
pub use crate::descriptor::{DefiniteDescriptorKey, Descriptor, DescriptorPublicKey};
140-
pub use crate::expression::ParseThresholdError;
140+
pub use crate::expression::{ParseThresholdError, ParseTreeError};
141141
pub use crate::interpreter::Interpreter;
142142
pub use crate::miniscript::analyzable::{AnalysisError, ExtParams};
143143
pub use crate::miniscript::context::{BareCtx, Legacy, ScriptContext, Segwitv0, SigType, Tap};
@@ -494,6 +494,8 @@ pub enum Error {
494494
ParseThreshold(ParseThresholdError),
495495
/// Checksum error parsing a descriptor.
496496
Checksum(descriptor::checksum::Error),
497+
/// Invalid expression tree.
498+
ParseTree(ParseTreeError),
497499
}
498500

499501
// https://github.com/sipa/miniscript/pull/5 for discussion on this number
@@ -556,6 +558,7 @@ impl fmt::Display for Error {
556558
Error::Threshold(ref e) => e.fmt(f),
557559
Error::ParseThreshold(ref e) => e.fmt(f),
558560
Error::Checksum(ref e) => e.fmt(f),
561+
Error::ParseTree(ref e) => e.fmt(f),
559562
}
560563
}
561564
}
@@ -607,6 +610,7 @@ impl error::Error for Error {
607610
Threshold(e) => Some(e),
608611
ParseThreshold(e) => Some(e),
609612
Checksum(e) => Some(e),
613+
ParseTree(e) => Some(e),
610614
}
611615
}
612616
}

0 commit comments

Comments
 (0)