diff --git a/crates/parser/src/event.rs b/crates/parser/src/event.rs index 5be9cb2a2469..9177f4210b5d 100644 --- a/crates/parser/src/event.rs +++ b/crates/parser/src/event.rs @@ -2,7 +2,7 @@ //! It is intended to be completely decoupled from the //! parser, so as to allow to evolve the tree representation //! and the parser algorithm independently. -use std::mem; +use std::{mem, num::NonZeroU32}; use crate::{ SyntaxKind::{self, *}, @@ -12,6 +12,12 @@ use crate::{ /// `Parser` produces a flat list of `Event`s. /// They are converted to a tree-structure in /// a separate pass, via `TreeBuilder`. +/// +/// Kept to 8 bytes: error messages live in a side table on the `Parser` +/// (the `errors` vec) and `Event::Error` only stores an index into it. +/// `forward_parent` uses `NonZeroU32` so `Option` is niche-optimised away +/// (the offset is always ≥ 1 because the forward parent sits later in the +/// event stream). #[derive(Debug, PartialEq)] pub(crate) enum Event { /// This event signifies the start of the node. @@ -53,10 +59,7 @@ pub(crate) enum Event { /// ``` /// /// See also `CompletedMarker::precede`. - Start { - kind: SyntaxKind, - forward_parent: Option, - }, + Start { kind: SyntaxKind, forward_parent: Option }, /// Complete the previous `Start` event Finish, @@ -65,20 +68,14 @@ pub(crate) enum Event { /// `n_raw_tokens` is used to glue complex contextual tokens. /// For example, lexer tokenizes `>>` as `>`, `>`, and /// `n_raw_tokens = 2` is used to produced a single `>>`. - Token { - kind: SyntaxKind, - n_raw_tokens: u8, - }, + Token { kind: SyntaxKind, n_raw_tokens: u8 }, /// When we parse `foo.0.0` or `foo. 0. 0` the lexer will hand us a float literal /// instead of an integer literal followed by a dot as the lexer has no contextual knowledge. /// This event instructs whatever consumes the events to split the float literal into /// the corresponding parts. - FloatSplitHack { - ends_in_dot: bool, - }, - Error { - msg: String, - }, + FloatSplitHack { ends_in_dot: bool }, + /// Index into the parser's side `errors` vec. + Error { err: u32 }, } impl Event { @@ -87,9 +84,12 @@ impl Event { } } -/// Generate the syntax tree with the control of events. -pub(super) fn process(mut events: Vec) -> Output { - let mut res = Output::default(); +/// Generate the syntax tree with the control of events. `errors` is the +/// side table of error messages built up alongside the `events` stream. +pub(super) fn process(mut events: Vec, mut errors: Vec) -> Output { + // Each event becomes roughly one u32 in Output, so preallocate to avoid + // the amortized grow-one churn we used to see in Output::enter_node. + let mut res = Output::with_event_capacity(events.len()); let mut forward_parents = Vec::new(); for i in 0..events.len() { @@ -104,7 +104,7 @@ pub(super) fn process(mut events: Vec) -> Output { let mut idx = i; let mut fp = forward_parent; while let Some(fwd) = fp { - idx += fwd as usize; + idx += fwd.get() as usize; // append `A`'s forward_parent `B` fp = match mem::replace(&mut events[idx], Event::tombstone()) { Event::Start { kind, forward_parent } => { @@ -131,7 +131,13 @@ pub(super) fn process(mut events: Vec) -> Output { let ev = mem::replace(&mut events[i + 1], Event::tombstone()); assert!(matches!(ev, Event::Finish), "{ev:?}"); } - Event::Error { msg } => res.error(msg), + Event::Error { err } => { + // Move the string out of the side table; each index is visited + // exactly once, so swapping with an empty String is cheap and + // avoids any clone. + let msg = mem::take(&mut errors[err as usize]); + res.error(msg); + } } } diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs index 4478bf4e3733..3e0e0d4c42bf 100644 --- a/crates/parser/src/lib.rs +++ b/crates/parser/src/lib.rs @@ -104,8 +104,8 @@ impl TopEntryPoint { }; let mut p = parser::Parser::new(input); entry_point(&mut p); - let events = p.finish(); - let res = event::process(events); + let (events, errors) = p.finish(); + let res = event::process(events, errors); if cfg!(debug_assertions) { let mut depth = 0; @@ -169,8 +169,8 @@ impl PrefixEntryPoint { }; let mut p = parser::Parser::new(input); entry_point(&mut p); - let events = p.finish(); - event::process(events) + let (events, errors) = p.finish(); + event::process(events, errors) } } @@ -195,7 +195,7 @@ impl Reparser { let Reparser(r) = self; let mut p = parser::Parser::new(tokens); r(&mut p); - let events = p.finish(); - event::process(events) + let (events, errors) = p.finish(); + event::process(events, errors) } } diff --git a/crates/parser/src/output.rs b/crates/parser/src/output.rs index 0ea15a656c1b..2f09b1121891 100644 --- a/crates/parser/src/output.rs +++ b/crates/parser/src/output.rs @@ -33,6 +33,14 @@ pub enum Step<'a> { } impl Output { + /// Preallocate the event buffer. Each `Event` in the input stream maps to + /// roughly one `u32` in the output, so passing the event count as the + /// initial capacity avoids most of the amortized `Vec::grow` allocations + /// during `event::process`. + pub(crate) fn with_event_capacity(cap: usize) -> Self { + Output { event: Vec::with_capacity(cap), error: Vec::new() } + } + const EVENT_MASK: u32 = 0b1; const TAG_MASK: u32 = 0x0000_00F0; const N_INPUT_TOKEN_MASK: u32 = 0x0000_FF00; diff --git a/crates/parser/src/parser.rs b/crates/parser/src/parser.rs index 4557078de991..57c501eda224 100644 --- a/crates/parser/src/parser.rs +++ b/crates/parser/src/parser.rs @@ -1,6 +1,6 @@ //! See [`Parser`]. -use std::cell::Cell; +use std::{cell::Cell, num::NonZeroU32}; use drop_bomb::DropBomb; @@ -12,6 +12,14 @@ use crate::{ input::Input, }; +/// Build a forward-parent offset. The offset is always ≥ 1 because the +/// forward-parent event is created *after* the event it forwards to, so +/// `NonZeroU32` is always valid here. Panics only on a parser bug. +#[inline] +fn fwd_parent(offset: u32) -> NonZeroU32 { + NonZeroU32::new(offset).expect("forward-parent offset must be non-zero") +} + /// `Parser` struct provides the low-level API for /// navigating through the stream of tokens and /// constructing the parse tree. The actual parsing @@ -25,6 +33,9 @@ pub(crate) struct Parser<'t> { inp: &'t Input, pos: usize, events: Vec, + /// Side table of error messages. `Event::Error { err }` carries an index + /// into this vec, keeping `Event` itself a flat 8-byte enum. + errors: Vec, steps: Cell, } @@ -32,11 +43,17 @@ const PARSER_STEP_LIMIT: usize = if cfg!(debug_assertions) { 150_000 } else { 15 impl<'t> Parser<'t> { pub(super) fn new(inp: &'t Input) -> Parser<'t> { - Parser { inp, pos: 0, events: Vec::with_capacity(2 * inp.len()), steps: Cell::new(0) } + Parser { + inp, + pos: 0, + events: Vec::with_capacity(2 * inp.len()), + errors: Vec::new(), + steps: Cell::new(0), + } } - pub(crate) fn finish(self) -> Vec { - self.events + pub(crate) fn finish(self) -> (Vec, Vec) { + (self.events, self.errors) } /// Returns the kind of the current token. @@ -206,7 +223,7 @@ impl<'t> Parser<'t> { match &mut self.events[idx] { Event::Start { forward_parent, kind } => { *kind = SyntaxKind::FIELD_EXPR; - *forward_parent = Some(new_marker.pos - marker.pos); + *forward_parent = Some(fwd_parent(new_marker.pos - marker.pos)); } _ => unreachable!(), } @@ -237,8 +254,9 @@ impl<'t> Parser<'t> { /// structured errors with spans and notes, like rustc /// does. pub(crate) fn error>(&mut self, message: T) { - let msg = message.into(); - self.push_event(Event::Error { msg }); + let err = self.errors.len() as u32; + self.errors.push(message.into()); + self.push_event(Event::Error { err }); } /// Consume the next token if it is `kind` or emit an error @@ -366,7 +384,7 @@ impl CompletedMarker { let idx = self.start_pos as usize; match &mut p.events[idx] { Event::Start { forward_parent, .. } => { - *forward_parent = Some(new_pos.pos - self.start_pos); + *forward_parent = Some(fwd_parent(new_pos.pos - self.start_pos)); } _ => unreachable!(), } @@ -379,7 +397,7 @@ impl CompletedMarker { let idx = m.pos as usize; match &mut p.events[idx] { Event::Start { forward_parent, .. } => { - *forward_parent = Some(self.start_pos - m.pos); + *forward_parent = Some(fwd_parent(self.start_pos - m.pos)); } _ => unreachable!(), }