//! Contains an implementation of pull-based XML parser. use crate::common::{is_xml10_char, is_xml11_char, is_xml11_char_not_restricted, is_name_char, is_name_start_char, is_whitespace_char}; use crate::common::{Position, TextPosition, XmlVersion}; use crate::name::OwnedName; use crate::namespace::NamespaceStack; use crate::reader::config::ParserConfig2; use crate::reader::error::SyntaxError; use crate::reader::events::XmlEvent; use crate::reader::indexset::AttributesSet; use crate::reader::lexer::{Lexer, Token}; use super::{Error, ErrorKind}; use std::collections::HashMap; use std::io::Read; macro_rules! gen_takes( ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => ( $( impl MarkupData { #[inline] #[allow(clippy::mem_replace_option_with_none)] fn $method(&mut self) -> $t { std::mem::replace(&mut self.$field, $def) } } )+ ) ); gen_takes!( name -> take_name, String, String::new(); ref_data -> take_ref_data, String, String::new(); encoding -> take_encoding, Option, None; element_name -> take_element_name, Option, None; attr_name -> take_attr_name, Option, None; attributes -> take_attributes, AttributesSet, AttributesSet::new() ); mod inside_cdata; mod inside_closing_tag_name; mod inside_comment; mod inside_declaration; mod inside_doctype; mod inside_opening_tag; mod inside_processing_instruction; mod inside_reference; mod outside_tag; static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10; static DEFAULT_STANDALONE: Option = None; type ElementStack = Vec; pub type Result = super::Result; /// Pull-based XML parser. pub(crate) struct PullParser { config: ParserConfig2, lexer: Lexer, st: State, state_after_reference: State, buf: String, /// From DTD internal subset entities: HashMap, nst: NamespaceStack, data: MarkupData, final_result: Option, next_event: Option, est: ElementStack, pos: Vec, encountered: Encountered, inside_whitespace: bool, read_prefix_separator: bool, pop_namespace: bool, } // Keeps track when XML declaration can happen #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] enum Encountered { None = 0, AnyChars, // whitespace before ) -> PullParser { let config = config.into(); Self::new_with_config2(config) } #[inline] fn new_with_config2(config: ParserConfig2) -> PullParser { let mut lexer = Lexer::new(&config); if let Some(enc) = config.override_encoding { lexer.set_encoding(enc); } let mut pos = Vec::with_capacity(16); pos.push(TextPosition::new()); PullParser { config, lexer, st: State::DocumentStart, state_after_reference: State::OutsideTag, buf: String::new(), entities: HashMap::new(), nst: NamespaceStack::default(), data: MarkupData { name: String::new(), version: None, encoding: None, standalone: None, ref_data: String::new(), element_name: None, quote: None, attr_name: None, attributes: AttributesSet::new(), }, final_result: None, next_event: None, est: Vec::new(), pos, encountered: Encountered::None, inside_whitespace: true, read_prefix_separator: false, pop_namespace: false, } } /// Checks if this parser ignores the end of stream errors. pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.c.ignore_end_of_stream } #[inline(never)] fn set_encountered(&mut self, new_encounter: Encountered) -> Option { if new_encounter <= self.encountered { return None; } let prev_enc = self.encountered; self.encountered = new_encounter; // If declaration was not parsed and we have encountered an element, // emit this declaration as the next event. if prev_enc == Encountered::None { self.push_pos(); Some(Ok(XmlEvent::StartDocument { version: DEFAULT_VERSION, encoding: self.lexer.encoding().to_string(), standalone: DEFAULT_STANDALONE, })) } else { None } } } impl Position for PullParser { /// Returns the position of the last event produced by the parser #[inline] fn position(&self) -> TextPosition { self.pos[0] } } #[derive(Copy, Clone, PartialEq)] pub enum State { OutsideTag, InsideOpeningTag(OpeningTagSubstate), InsideClosingTag(ClosingTagSubstate), InsideProcessingInstruction(ProcessingInstructionSubstate), InsideComment, InsideCData, InsideDeclaration(DeclarationSubstate), InsideDoctype(DoctypeSubstate), InsideReference, DocumentStart, } #[derive(Copy, Clone, PartialEq)] pub enum DoctypeSubstate { Outside, String, InsideName, BeforeEntityName, EntityName, BeforeEntityValue, EntityValue, NumericReferenceStart, NumericReference, /// expansion PEReferenceInValue, PEReferenceInDtd, /// name definition PEReferenceDefinitionStart, PEReferenceDefinition, SkipDeclaration, Comment, } #[derive(Copy, Clone, PartialEq)] pub enum OpeningTagSubstate { InsideName, InsideTag, InsideAttributeName, AfterAttributeName, InsideAttributeValue, AfterAttributeValue, } #[derive(Copy, Clone, PartialEq)] pub enum ClosingTagSubstate { CTInsideName, CTAfterName, } #[derive(Copy, Clone, PartialEq)] pub enum ProcessingInstructionSubstate { PIInsideName, PIInsideData, } #[derive(Copy, Clone, PartialEq)] pub enum DeclarationSubstate { BeforeVersion, InsideVersion, AfterVersion, InsideVersionValue, AfterVersionValue, BeforeEncoding, InsideEncoding, AfterEncoding, InsideEncodingValue, AfterEncodingValue, BeforeStandaloneDecl, InsideStandaloneDecl, AfterStandaloneDecl, InsideStandaloneDeclValue, AfterStandaloneDeclValue, } #[derive(PartialEq)] enum QualifiedNameTarget { AttributeNameTarget, OpeningTagNameTarget, ClosingTagNameTarget, } #[derive(Copy, Clone, PartialEq, Eq)] enum QuoteToken { SingleQuoteToken, DoubleQuoteToken, } impl QuoteToken { fn from_token(t: &Token) -> QuoteToken { match *t { Token::SingleQuote => QuoteToken::SingleQuoteToken, Token::DoubleQuote => QuoteToken::DoubleQuoteToken, _ => panic!("Unexpected token: {t}"), } } fn as_token(self) -> Token { match self { QuoteToken::SingleQuoteToken => Token::SingleQuote, QuoteToken::DoubleQuoteToken => Token::DoubleQuote, } } } struct MarkupData { name: String, // used for processing instruction name ref_data: String, // used for reference content version: Option, // used for XML declaration version encoding: Option, // used for XML declaration encoding standalone: Option, // used for XML declaration standalone parameter element_name: Option, // used for element name quote: Option, // used to hold opening quote for attribute value attr_name: Option, // used to hold attribute name attributes: AttributesSet, // used to hold all accumulated attributes } impl PullParser { /// Returns next event read from the given buffer. /// /// This method should be always called with the same buffer. If you call it /// providing different buffers each time, the result will be undefined. pub fn next(&mut self, r: &mut R) -> Result { if let Some(ref ev) = self.final_result { return ev.clone(); } if let Some(ev) = self.next_event.take() { return ev; } if self.pop_namespace { self.pop_namespace = false; self.nst.pop(); } loop { debug_assert!(self.next_event.is_none()); debug_assert!(!self.pop_namespace); // While lexer gives us Ok(maybe_token) -- we loop. // Upon having a complete XML-event -- we return from the whole function. match self.lexer.next_token(r) { Ok(Some(token)) => { match self.dispatch_token(token) { None => {} // continue Some(Ok(xml_event)) => { self.next_pos(); return Ok(xml_event) }, Some(Err(xml_error)) => { self.next_pos(); return self.set_final_result(Err(xml_error)) }, } }, Ok(None) => break, Err(lexer_error) => { return self.set_final_result(Err(lexer_error)) }, } } self.handle_eof() } /// Handle end of stream fn handle_eof(&mut self) -> std::result::Result { // Forward pos to the lexer head self.next_pos(); let ev = if self.depth() == 0 { if self.encountered == Encountered::Element && self.st == State::OutsideTag { // all is ok Ok(XmlEvent::EndDocument) } else if self.encountered < Encountered::Element { self.error(SyntaxError::NoRootElement) } else { // self.st != State::OutsideTag self.error(SyntaxError::UnexpectedEof) // TODO: add expected hint? } } else if self.config.c.ignore_end_of_stream { self.final_result = None; self.lexer.reset_eof_handled(); return self.error(SyntaxError::UnbalancedRootElement); } else { self.error(SyntaxError::UnbalancedRootElement) }; self.set_final_result(ev) } // This function is to be called when a terminal event is reached. // The function sets up the `self.final_result` into `Some(result)` and return `result`. #[inline] fn set_final_result(&mut self, result: Result) -> Result { self.final_result = Some(result.clone()); result } #[cold] fn error(&self, e: SyntaxError) -> Result { Err(Error { pos: self.lexer.position(), kind: ErrorKind::Syntax(e.to_cow()), }) } #[inline] fn next_pos(&mut self) { // unfortunately calls to next_pos will never be perfectly balanced with push_pos, // at very least because parse errors and EOF can happen unexpectedly without a prior push. if !self.pos.is_empty() { if self.pos.len() > 1 { self.pos.remove(0); } else { self.pos[0] = self.lexer.position(); } } } #[inline] #[track_caller] fn push_pos(&mut self) { debug_assert!(self.pos.len() != self.pos.capacity(), "You've found a bug in xml-rs, caused by calls to push_pos() in states that don't end up emitting events. This case is ignored in release mode, and merely causes document positions to be out of sync. Please file a bug and include the XML document that triggers this assert."); // it has capacity preallocated for more than it ever needs, so this reduces code size if self.pos.len() != self.pos.capacity() { self.pos.push(self.lexer.position()); } else if self.pos.len() > 1 { self.pos.remove(0); // this mitigates the excessive push_pos() call } } #[inline(never)] fn dispatch_token(&mut self, t: Token) -> Option { match self.st { State::OutsideTag => self.outside_tag(t), State::InsideOpeningTag(s) => self.inside_opening_tag(t, s), State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s), State::InsideReference => self.inside_reference(t), State::InsideComment => self.inside_comment(t), State::InsideCData => self.inside_cdata(t), State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s), State::InsideDoctype(s) => self.inside_doctype(t, s), State::InsideDeclaration(s) => self.inside_declaration(t, s), State::DocumentStart => self.document_start(t), } } #[inline] fn depth(&self) -> usize { self.est.len() } #[inline] fn buf_has_data(&self) -> bool { !self.buf.is_empty() } #[inline] fn take_buf(&mut self) -> String { std::mem::take(&mut self.buf) } #[inline] fn into_state(&mut self, st: State, ev: Option) -> Option { self.st = st; ev } #[inline] fn into_state_continue(&mut self, st: State) -> Option { self.into_state(st, None) } #[inline] fn into_state_emit(&mut self, st: State, ev: Result) -> Option { self.into_state(st, Some(ev)) } /// Dispatches tokens in order to process qualified name. If qualified name cannot be parsed, /// an error is returned. /// /// # Parameters /// * `t` --- next token; /// * `on_name` --- a callback which is executed when whitespace is encountered. fn read_qualified_name(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option where F: Fn(&mut PullParser, Token, OwnedName) -> Option { // We can get here for the first time only when self.data.name contains zero or one character, // but first character cannot be a colon anyway if self.buf.len() <= 1 { self.read_prefix_separator = false; } let invoke_callback = move |this: &mut PullParser, t| { let name = this.take_buf(); match name.parse() { Ok(name) => on_name(this, t, name), Err(_) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into()))), } }; match t { // There can be only one colon, and not as the first character Token::Character(':') if self.buf_has_data() && !self.read_prefix_separator => { self.buf.push(':'); self.read_prefix_separator = true; None } Token::Character(c) if c != ':' && (self.buf.is_empty() && is_name_start_char(c) || self.buf_has_data() && is_name_char(c)) => { if self.buf.len() > self.config.max_name_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } self.buf.push(c); None }, Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t), Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t), Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget || target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t), Token::Character(c) if is_whitespace_char(c) => invoke_callback(self, t), _ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t))), } } /// Dispatches tokens in order to process attribute value. /// /// # Parameters /// * `t` --- next token; /// * `on_value` --- a callback which is called when terminating quote is encountered. fn read_attribute_value(&mut self, t: Token, on_value: F) -> Option where F: Fn(&mut PullParser, String) -> Option { match t { Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None, // skip leading whitespace Token::DoubleQuote | Token::SingleQuote => match self.data.quote { None => { // Entered attribute value self.data.quote = Some(QuoteToken::from_token(&t)); None } Some(q) if q.as_token() == t => { self.data.quote = None; let value = self.take_buf(); on_value(self, value) } _ => { if let Token::Character(c) = t { if !self.is_valid_xml_char_not_restricted(c) { return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))); } } if self.buf.len() > self.config.max_attribute_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } t.push_to_string(&mut self.buf); None } }, Token::ReferenceStart if self.data.quote.is_some() => { self.state_after_reference = self.st; self.into_state_continue(State::InsideReference) }, Token::OpeningTagStart => Some(self.error(SyntaxError::UnexpectedOpeningTag)), Token::Character(c) if !self.is_valid_xml_char_not_restricted(c) => { Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) }, // Every character except " and ' and < is okay _ if self.data.quote.is_some() => { if self.buf.len() > self.config.max_attribute_length { return Some(self.error(SyntaxError::ExceededConfiguredLimit)); } t.push_to_string(&mut self.buf); None } _ => Some(self.error(SyntaxError::UnexpectedToken(t))), } } fn emit_start_element(&mut self, emit_end_element: bool) -> Option { let mut name = self.data.take_element_name()?; let mut attributes = self.data.take_attributes().into_vec(); // check whether the name prefix is bound and fix its namespace match self.nst.get(name.borrow().prefix_repr()) { Some("") => name.namespace = None, // default namespace Some(ns) => name.namespace = Some(ns.into()), None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))) } // check and fix accumulated attributes prefixes for attr in &mut attributes { if let Some(ref pfx) = attr.name.prefix { let new_ns = match self.nst.get(pfx) { Some("") => None, // default namespace Some(ns) => Some(ns.into()), None => return Some(self.error(SyntaxError::UnboundAttribute(attr.name.to_string().into()))) }; attr.name.namespace = new_ns; } } if emit_end_element { self.pop_namespace = true; self.next_event = Some(Ok(XmlEvent::EndElement { name: name.clone() })); } else { self.est.push(name.clone()); } let namespace = self.nst.squash(); self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement { name, attributes, namespace })) } fn emit_end_element(&mut self) -> Option { let mut name = self.data.take_element_name()?; // check whether the name prefix is bound and fix its namespace match self.nst.get(name.borrow().prefix_repr()) { Some("") => name.namespace = None, // default namespace Some(ns) => name.namespace = Some(ns.into()), None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))) } let op_name = self.est.pop()?; if name == op_name { self.pop_namespace = true; self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name })) } else { Some(self.error(SyntaxError::UnexpectedClosingTag(format!("{name} != {op_name}").into()))) } } #[inline] fn is_valid_xml_char(&self, c: char) -> bool { if Some(XmlVersion::Version11) == self.data.version { is_xml11_char(c) } else { is_xml10_char(c) } } #[inline] fn is_valid_xml_char_not_restricted(&self, c: char) -> bool { if Some(XmlVersion::Version11) == self.data.version { is_xml11_char_not_restricted(c) } else { is_xml10_char(c) } } } #[cfg(test)] mod tests { use std::io::BufReader; use crate::attribute::OwnedAttribute; use crate::common::TextPosition; use crate::name::OwnedName; use crate::reader::events::XmlEvent; use crate::reader::parser::PullParser; use crate::reader::ParserConfig; fn new_parser() -> PullParser { PullParser::new(ParserConfig::new()) } macro_rules! expect_event( ($r:expr, $p:expr, $t:pat) => ( match $p.next(&mut $r) { $t => {} e => panic!("Unexpected event: {e:?}\nExpected: {}", stringify!($t)) } ); ($r:expr, $p:expr, $t:pat => $c:expr ) => ( match $p.next(&mut $r) { $t if $c => {} e => panic!("Unexpected event: {e:?}\nExpected: {} if {}", stringify!($t), stringify!($c)) } ) ); macro_rules! test_data( ($d:expr) => ({ static DATA: &'static str = $d; let r = BufReader::new(DATA.as_bytes()); let p = new_parser(); (r, p) }) ); #[test] fn issue_3_semicolon_in_attribute_value() { let (mut r, mut p) = test_data!(r#" "#); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) => *name == OwnedName::local("a") && attributes.len() == 1 && attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") && namespace.is_essentially_empty() ); expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a")); expect_event!(r, p, Ok(XmlEvent::EndDocument)); } #[test] fn issue_140_entity_reference_inside_tag() { let (mut r, mut p) = test_data!(r#" "#); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla")); expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}"); expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla")); expect_event!(r, p, Ok(XmlEvent::EndDocument)); } #[test] fn issue_220_comment() { let (mut r, mut p) = test_data!(r#""#); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); expect_event!(r, p, Ok(XmlEvent::EndElement { .. })); expect_event!(r, p, Ok(XmlEvent::EndDocument)); let (mut r, mut p) = test_data!(r#""#); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); expect_event!(r, p, Err(_)); // ---> is forbidden in comments let (mut r, mut p) = test_data!(r#""#); p.config.c.ignore_comments = false; expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); expect_event!(r, p, Ok(XmlEvent::Comment(s)) => s == " "#); expect_event!(r, p, Err(_)); let (mut r, mut p) = test_data!(r#""#); expect_event!(r, p, Err(_)); let (mut r, mut p) = test_data!(r#""#); expect_event!(r, p, Err(_)); let (mut r, mut p) = test_data!(r#""#); expect_event!(r, p, Err(_)); let (mut r, mut p) = test_data!(r#""#); expect_event!(r, p, Err(_)); } #[test] fn opening_tag_in_attribute_value() { use crate::reader::error::{SyntaxError, Error, ErrorKind}; let (mut r, mut p) = test_data!(r#" "#); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Err(ref e) => *e == Error { kind: ErrorKind::Syntax(SyntaxError::UnexpectedOpeningTag.to_cow()), pos: TextPosition { row: 1, column: 24 } } ); } #[test] fn reference_err() { let (mut r, mut p) = test_data!(r#" && "#); expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); expect_event!(r, p, Ok(XmlEvent::StartElement { .. })); expect_event!(r, p, Err(_)); } #[test] fn state_size() { assert_eq!(2, std::mem::size_of::()); assert_eq!(1, std::mem::size_of::()); } }