Skip to content

Commit e6a0c55

Browse files
committed
syntax: add Utf8Sequence::reverse method
This is very convenient when compiling reverse UTF-8 automata.
1 parent a0f541b commit e6a0c55

File tree

1 file changed

+64
-2
lines changed

1 file changed

+64
-2
lines changed

regex-syntax/src/utf8.rs

Lines changed: 64 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -152,6 +152,31 @@ impl Utf8Sequence {
152152
self.as_slice().len()
153153
}
154154

155+
/// Reverses the ranges in this sequence.
156+
///
157+
/// For example, if this corresponds to the following sequence:
158+
///
159+
/// ```ignore
160+
/// [D0-D3][80-BF]
161+
/// ```
162+
///
163+
/// Then after reversal, it will be
164+
///
165+
/// ```ignore
166+
/// [80-BF][D0-D3]
167+
/// ```
168+
///
169+
/// This is useful when one is constructing a UTF-8 automaton to match
170+
/// character classes in reverse.
171+
pub fn reverse(&mut self) {
172+
match *self {
173+
Utf8Sequence::One(_) => {} // a single range is unchanged by reversal
174+
Utf8Sequence::Two(ref mut x) => x.reverse(), // reverse the stored ranges in place
175+
Utf8Sequence::Three(ref mut x) => x.reverse(),
176+
Utf8Sequence::Four(ref mut x) => x.reverse(),
177+
}
178+
}
179+
155180
/// Returns true if and only if a prefix of `bytes` matches this sequence
156181
/// of byte ranges.
157182
pub fn matches(&self, bytes: &[u8]) -> bool {
@@ -201,7 +226,7 @@ pub struct Utf8Range {
201226

202227
impl Utf8Range {
203228
fn new(start: u8, end: u8) -> Self {
204-
Utf8Range { start: start, end: end }
229+
Utf8Range { start, end }
205230
}
206231

207232
/// Returns true if and only if the given byte is in this range.
@@ -294,7 +319,7 @@ impl Utf8Sequences {
294319
}
295320

296321
fn push(&mut self, start: u32, end: u32) {
297-
self.range_stack.push(ScalarRange { start: start, end: end });
322+
self.range_stack.push(ScalarRange { start, end });
298323
}
299324
}
300325

@@ -507,6 +532,43 @@ mod tests {
507532
);
508533
}
509534

535+
#[test]
536+
fn reverse() { // exercises Utf8Sequence::reverse once per variant (One, Two, Three, Four)
537+
use utf8::Utf8Sequence::*;
538+
539+
let mut s = One(rutf8(0xA, 0xB)); // one range: expected to be unchanged after reverse()
540+
s.reverse();
541+
assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);
542+
543+
let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]); // two ranges: expected to swap
544+
s.reverse();
545+
assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);
546+
547+
let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]); // three ranges: ends swap, middle stays
548+
s.reverse();
549+
assert_eq!(
550+
s.as_slice(),
551+
&[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
552+
);
553+
554+
let mut s = Four([ // four ranges: expected in fully reversed order
555+
rutf8(0xA, 0xB),
556+
rutf8(0xB, 0xC),
557+
rutf8(0xC, 0xD),
558+
rutf8(0xD, 0xE),
559+
]);
560+
s.reverse();
561+
assert_eq!(
562+
s.as_slice(),
563+
&[
564+
rutf8(0xD, 0xE),
565+
rutf8(0xC, 0xD),
566+
rutf8(0xB, 0xC),
567+
rutf8(0xA, 0xB)
568+
]
569+
);
570+
}
571+
510572
fn encode_surrogate(cp: u32) -> [u8; 3] {
511573
const TAG_CONT: u8 = 0b1000_0000;
512574
const TAG_THREE_B: u8 = 0b1110_0000;

0 commit comments

Comments
 (0)