Convert BufferQueue to use Interior Mutability #542

Merged
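This PR converts the tokenizer's entry points from `&mut BufferQueue` to `&BufferQueue`: the queue now manages its own mutation through interior mutability, so callers hold only a shared borrow while the tokenizer drains it. A minimal sketch of the idea, assuming a `RefCell<VecDeque<String>>` layout (the real html5ever type stores `StrTendril` buffers; `String` keeps the sketch self-contained):

```rust
use std::cell::RefCell;
use std::collections::VecDeque;

// Sketch of a queue with interior mutability: all mutation goes through
// a RefCell, so the public methods can take &self instead of &mut self.
#[derive(Default)]
pub struct BufferQueue {
    buffers: RefCell<VecDeque<String>>,
}

impl BufferQueue {
    // &self, not &mut self: the RefCell grants mutable access to the
    // VecDeque at runtime (and panics on overlapping borrows).
    pub fn push_back(&self, buf: String) {
        self.buffers.borrow_mut().push_back(buf);
    }

    pub fn push_front(&self, buf: String) {
        self.buffers.borrow_mut().push_front(buf);
    }

    pub fn pop_front(&self) -> Option<String> {
        self.buffers.borrow_mut().pop_front()
    }

    pub fn is_empty(&self) -> bool {
        self.buffers.borrow().is_empty()
    }
}
```

This is why every `let mut buffer` / `let mut input` binding in the diff below loses its `mut`, and every `&mut` argument becomes a plain `&`.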
6 changes: 3 additions & 3 deletions html5ever/benches/html5ever.rs
@@ -54,14 +54,14 @@ fn run_bench(c: &mut Criterion, name: &str) {
     c.bench_function(&test_name, move |b| {
         b.iter(|| {
             let mut tok = Tokenizer::new(Sink, Default::default());
-            let mut buffer = BufferQueue::default();
+            let buffer = BufferQueue::default();
             // We are doing clone inside the bench function, this is not ideal, but possibly
             // necessary since our iterator consumes the underlying buffer.
             for buf in input.clone().into_iter() {
                 buffer.push_back(buf);
-                let _ = tok.feed(&mut buffer);
+                let _ = tok.feed(&buffer);
             }
-            let _ = tok.feed(&mut buffer);
+            let _ = tok.feed(&buffer);
             tok.end();
         })
     });
4 changes: 2 additions & 2 deletions html5ever/examples/noop-tokenize.rs
@@ -36,11 +36,11 @@ fn main() {
     let mut chunk = ByteTendril::new();
     io::stdin().read_to_tendril(&mut chunk).unwrap();

-    let mut input = BufferQueue::default();
+    let input = BufferQueue::default();
     input.push_back(chunk.try_reinterpret().unwrap());

     let mut tok = Tokenizer::new(Sink(Vec::new()), Default::default());
-    let _ = tok.feed(&mut input);
+    let _ = tok.feed(&input);
     assert!(input.is_empty());
     tok.end();
 }
4 changes: 2 additions & 2 deletions html5ever/examples/tokenize.rs
@@ -90,7 +90,7 @@ fn main() {
     let mut chunk = ByteTendril::new();
     io::stdin().read_to_tendril(&mut chunk).unwrap();

-    let mut input = BufferQueue::default();
+    let input = BufferQueue::default();
     input.push_back(chunk.try_reinterpret().unwrap());

     let mut tok = Tokenizer::new(
@@ -100,7 +100,7 @@ fn main() {
             ..Default::default()
         },
     );
-    let _ = tok.feed(&mut input);
+    let _ = tok.feed(&input);

     assert!(input.is_empty());
     tok.end();
4 changes: 2 additions & 2 deletions html5ever/src/driver.rs
@@ -106,7 +106,7 @@ impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {
     fn process(&mut self, t: StrTendril) {
         self.input_buffer.push_back(t);
         // FIXME: Properly support </script> somehow.
-        while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {}
+        while let TokenizerResult::Script(_) = self.tokenizer.feed(&self.input_buffer) {}
     }

     // FIXME: Is it too noisy to report every character decoding error?
@@ -118,7 +118,7 @@ impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {

     fn finish(mut self) -> Self::Output {
         // FIXME: Properly support </script> somehow.
-        while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {}
+        while let TokenizerResult::Script(_) = self.tokenizer.feed(&self.input_buffer) {}
         assert!(self.input_buffer.is_empty());
         self.tokenizer.end();
         self.tokenizer.sink.sink.finish()
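The driver loop is where the shared borrow pays off: `feed()` pauses at every `</script>` and is re-entered on the same queue. Below is a self-contained toy mirroring that shape, reusing the sketch `BufferQueue` from the top of the page (`ToyTokenizer` and `ToyResult` are stand-ins, not html5ever types):

```rust
// Toy mirror of the driver loop; only the borrow shape matches html5ever.
enum ToyResult {
    Script(String), // the tokenizer parked at a </script>
    Done,
}

struct ToyTokenizer;

impl ToyTokenizer {
    // Same signature shape as Tokenizer::feed after this PR: the queue
    // is behind a shared borrow, yet feed() still consumes from it.
    fn feed(&mut self, input: &BufferQueue) -> ToyResult {
        match input.pop_front() {
            Some(chunk) if chunk.contains("</script>") => ToyResult::Script(chunk),
            _ => ToyResult::Done,
        }
    }
}

fn main() {
    let input = BufferQueue::default(); // no `mut` binding needed anymore
    input.push_back("<script>x()</script>".to_string());
    let mut tok = ToyTokenizer;
    // The driver's FIXME loop: keep feeding until no script pause remains.
    while let ToyResult::Script(s) = tok.feed(&input) {
        println!("would run script chunk: {s}");
    }
}
```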
22 changes: 11 additions & 11 deletions html5ever/src/tokenizer/char_ref/mod.rs
@@ -115,7 +115,7 @@ impl CharRefTokenizer {
     pub(super) fn step<Sink: TokenSink>(
         &mut self,
         tokenizer: &mut Tokenizer<Sink>,
-        input: &mut BufferQueue,
+        input: &BufferQueue,
     ) -> Status {
         if self.result.is_some() {
             return Done;
@@ -135,7 +135,7 @@ impl CharRefTokenizer {
     fn do_begin<Sink: TokenSink>(
         &mut self,
         tokenizer: &mut Tokenizer<Sink>,
-        input: &mut BufferQueue,
+        input: &BufferQueue,
     ) -> Status {
         match unwrap_or_return!(tokenizer.peek(input), Stuck) {
             'a'..='z' | 'A'..='Z' | '0'..='9' => {
@@ -156,7 +156,7 @@ impl CharRefTokenizer {
     fn do_octothorpe<Sink: TokenSink>(
         &mut self,
         tokenizer: &mut Tokenizer<Sink>,
-        input: &mut BufferQueue,
+        input: &BufferQueue,
     ) -> Status {
         let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
         match c {
@@ -177,7 +177,7 @@ impl CharRefTokenizer {
     fn do_numeric<Sink: TokenSink>(
         &mut self,
         tokenizer: &mut Tokenizer<Sink>,
-        input: &mut BufferQueue,
+        input: &BufferQueue,
         base: u32,
     ) -> Status {
         let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
@@ -207,7 +207,7 @@ impl CharRefTokenizer {
     fn do_numeric_semicolon<Sink: TokenSink>(
         &mut self,
         tokenizer: &mut Tokenizer<Sink>,
-        input: &mut BufferQueue,
+        input: &BufferQueue,
     ) -> Status {
         match unwrap_or_return!(tokenizer.peek(input), Stuck) {
             ';' => tokenizer.discard_char(input),
@@ -221,7 +221,7 @@ impl CharRefTokenizer {
     fn unconsume_numeric<Sink: TokenSink>(
         &mut self,
         tokenizer: &mut Tokenizer<Sink>,
-        input: &mut BufferQueue,
+        input: &BufferQueue,
     ) -> Status {
         let mut unconsume = StrTendril::from_char('#');
         if let Some(c) = self.hex_marker {
@@ -270,7 +270,7 @@ impl CharRefTokenizer {
     fn do_named<Sink: TokenSink>(
         &mut self,
         tokenizer: &mut Tokenizer<Sink>,
-        input: &mut BufferQueue,
+        input: &BufferQueue,
     ) -> Status {
         // peek + discard skips over newline normalization, therefore making it easier to
         // un-consume
@@ -304,14 +304,14 @@ impl CharRefTokenizer {
         tokenizer.emit_error(msg);
     }

-    fn unconsume_name(&mut self, input: &mut BufferQueue) {
+    fn unconsume_name(&mut self, input: &BufferQueue) {
         input.push_front(self.name_buf_opt.take().unwrap());
     }

     fn finish_named<Sink: TokenSink>(
         &mut self,
         tokenizer: &mut Tokenizer<Sink>,
-        input: &mut BufferQueue,
+        input: &BufferQueue,
         end_char: Option<char>,
     ) -> Status {
         match self.name_match {
@@ -395,7 +395,7 @@ impl CharRefTokenizer {
     fn do_bogus_name<Sink: TokenSink>(
         &mut self,
         tokenizer: &mut Tokenizer<Sink>,
-        input: &mut BufferQueue,
+        input: &BufferQueue,
     ) -> Status {
         // peek + discard skips over newline normalization, therefore making it easier to
         // un-consume
@@ -414,7 +414,7 @@ impl CharRefTokenizer {
     pub(super) fn end_of_file<Sink: TokenSink>(
         &mut self,
         tokenizer: &mut Tokenizer<Sink>,
-        input: &mut BufferQueue,
+        input: &BufferQueue,
     ) {
         while self.result.is_none() {
             match self.state {
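Every helper in this file only peeks at or discards single characters, so a shared borrow is all it needs. Continuing the `RefCell` sketch from the top of the page, a `peek` might look like:

```rust
impl BufferQueue {
    // Sketch continuation: inspect the next character without consuming
    // it; a shared borrow of the RefCell contents is sufficient.
    pub fn peek(&self) -> Option<char> {
        self.buffers
            .borrow()
            .front()
            .and_then(|buf| buf.chars().next())
    }
}
```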
37 changes: 17 additions & 20 deletions html5ever/src/tokenizer/mod.rs
@@ -206,7 +206,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
     }

     /// Feed an input string into the tokenizer.
-    pub fn feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
+    pub fn feed(&mut self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
         if input.is_empty() {
             return TokenizerResult::Done;
         }
@@ -248,7 +248,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
     //§ preprocessing-the-input-stream
     // Get the next input character, which might be the character
     // 'c' that we already consumed from the buffers.
-    fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char> {
+    fn get_preprocessed_char(&mut self, mut c: char, input: &BufferQueue) -> Option<char> {
         if self.ignore_lf {
             self.ignore_lf = false;
             if c == '\n' {
@@ -283,7 +283,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
     //§ tokenization
     // Get the next input character, if one is available.
-    fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> {
+    fn get_char(&mut self, input: &BufferQueue) -> Option<char> {
         if self.reconsume {
             self.reconsume = false;
             Some(self.current_char)
@@ -294,7 +294,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         }
     }

-    fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> {
+    fn pop_except_from(&mut self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
         // Bail to the slow path for various corner cases.
         // This means that `FromSet` can contain characters not in the set!
         // It shouldn't matter because the fallback `FromSet` case should
@@ -319,12 +319,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
     // BufferQueue::eat.
     //
     // NB: this doesn't set the current input character.
-    fn eat(
-        &mut self,
-        input: &mut BufferQueue,
-        pat: &str,
-        eq: fn(&u8, &u8) -> bool,
-    ) -> Option<bool> {
+    fn eat(&mut self, input: &BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool) -> Option<bool> {
         if self.ignore_lf {
             self.ignore_lf = false;
             if self.peek(input) == Some('\n') {
@@ -336,15 +331,17 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         match input.eat(pat, eq) {
             None if self.at_eof => Some(false),
             None => {
-                self.temp_buf.extend(input);
+                while let Some(data) = input.next() {

[Inline comment — Member Author] @jdm I extracted next() and reimplemented extend

+                    self.temp_buf.push_char(data);
+                }
                 None
             },
             Some(matched) => Some(matched),
         }
     }

     /// Run the state machine for as long as we can.
-    fn run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
+    fn run(&mut self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
         if self.opts.profile {
             loop {
                 let state = self.state;
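As the inline comment above notes, `extend` relied on `BufferQueue` being an `Iterator`, which required `&mut`; the PR replaces it with an inherent `next()` plus a manual loop. A sketch of such a `next` on the RefCell-backed queue from the top of the page:

```rust
impl BufferQueue {
    // Sketch: pop a single char through &self. The RefCell is borrowed
    // mutably only for the duration of this call.
    pub fn next(&self) -> Option<char> {
        let mut buffers = self.buffers.borrow_mut();
        let front = buffers.front_mut()?;
        let c = front.chars().next()?;
        front.drain(..c.len_utf8()); // consume the char we just read
        if front.is_empty() {
            buffers.pop_front(); // drop exhausted chunks
        }
        Some(c)
    }
}
```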
@@ -567,7 +564,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         }
     }

-    fn discard_char(&mut self, input: &mut BufferQueue) {
+    fn discard_char(&mut self, input: &BufferQueue) {
         // peek() deals in un-processed characters (no newline normalization), while get_char()
         // does.
         //
@@ -696,7 +693,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
     // Return true if we should be immediately re-invoked
     // (this just simplifies control flow vs. break / continue).
     #[allow(clippy::never_loop)]
-    fn step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
+    fn step(&mut self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
         if self.char_ref_tokenizer.is_some() {
             return self.step_char_ref_tokenizer(input);
         }
@@ -1382,7 +1379,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         }
     }

-    fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
+    fn step_char_ref_tokenizer(&mut self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
         // FIXME HACK: Take and replace the tokenizer so we don't
         // double-mut-borrow self. This is why it's boxed.
         let mut tok = self.char_ref_tokenizer.take().unwrap();
@@ -1432,19 +1429,19 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
     pub fn end(&mut self) {
         // Handle EOF in the char ref sub-tokenizer, if there is one.
         // Do this first because it might un-consume stuff.
-        let mut input = BufferQueue::default();
+        let input = BufferQueue::default();
         match self.char_ref_tokenizer.take() {
             None => (),
             Some(mut tok) => {
-                tok.end_of_file(self, &mut input);
+                tok.end_of_file(self, &input);
                 self.process_char_ref(tok.get_result());
             },
         }

         // Process all remaining buffered input.
         // If we're waiting for lookahead, we're not gonna get it.
         self.at_eof = true;
-        assert!(matches!(self.run(&mut input), TokenizerResult::Done));
+        assert!(matches!(self.run(&input), TokenizerResult::Done));
         assert!(input.is_empty());

         loop {
@@ -1668,10 +1665,10 @@ mod test {
     fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
         let sink = LinesMatch::new();
         let mut tok = Tokenizer::new(sink, opts);
-        let mut buffer = BufferQueue::default();
+        let buffer = BufferQueue::default();
         for chunk in input.into_iter() {
             buffer.push_back(chunk);
-            let _ = tok.feed(&mut buffer);
+            let _ = tok.feed(&buffer);
         }
         tok.end();
         tok.sink.lines