Skip to main content
Module

x/deno_dom/html-parser/core/src/rcdom.rs

Browser DOM & HTML parser in Deno
Extremely Popular
Go to Latest
File
// Copyright 2014-2017 The html5ever Project Developers. See the// COPYRIGHT file at the top-level directory of this distribution.//// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your// option. This file may not be copied, modified, or distributed// except according to those terms.
//! A simple reference-counted DOM.//!//! This is sufficient as a static parse tree, but don't build a//! web browser using it. :)//!//! A DOM is a [tree structure] with ordered children that can be represented in an XML-like//! format. For example, the following graph//!//! ```text//! div//! +- "text node"//! +- span//! ```//! in HTML would be serialized as//!//! ```html//! <div>text node<span></span></div>//! ```//!//! See the [document object model article on wikipedia][dom wiki] for more information.//!//! This implementation stores the information associated with each node once, and then hands out//! refs to children. The nodes themselves are reference-counted to avoid copying - you can create//! a new ref and then a node will outlive the document. Nodes own their children, but only have//! weak references to their parents.//!//! [tree structure]: https://en.wikipedia.org/wiki/Tree_(data_structure)//! [dom wiki]: https://en.wikipedia.org/wiki/Document_Object_Model
use html5ever::tendril;
use std::borrow::Cow;use std::cell::{Cell, RefCell};use std::collections::HashSet;use std::default::Default;use std::fmt;use std::io;use std::mem;use std::rc::{Rc, Weak};
use tendril::StrTendril;
use markup5ever::interface::tree_builder;use markup5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};use markup5ever::serialize::TraversalScope;use markup5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};use markup5ever::serialize::{Serialize, Serializer};use markup5ever::Attribute;use markup5ever::ExpandedName;use markup5ever::QualName;
/// The different kinds of nodes in the DOM.#[derive(Debug)]pub enum NodeData { /// The `Document` itself - the root node of a HTML document. Document,
/// A `DOCTYPE` with name, public id, and system id. See /// [document type declaration on wikipedia][dtd wiki]. /// /// [dtd wiki]: https://en.wikipedia.org/wiki/Document_type_declaration Doctype { name: StrTendril, public_id: StrTendril, system_id: StrTendril, },
/// A text node. Text { contents: RefCell<StrTendril> },
/// A comment. Comment { contents: StrTendril },
/// An element with attributes. Element { name: QualName, attrs: RefCell<Vec<Attribute>>,
/// For HTML \<template\> elements, the [template contents]. /// /// [template contents]: https://html.spec.whatwg.org/multipage/#template-contents template_contents: Option<Handle>,
/// Whether the node is a [HTML integration point]. /// /// [HTML integration point]: https://html.spec.whatwg.org/multipage/#html-integration-point mathml_annotation_xml_integration_point: bool, },
/// A Processing instruction. ProcessingInstruction { target: StrTendril, contents: StrTendril, },}
/// A DOM node.pub struct Node { /// Parent node. pub parent: Cell<Option<WeakHandle>>, /// Child nodes of this node. pub children: RefCell<Vec<Handle>>, /// Represents this node's data. pub data: NodeData,}
impl Node { /// Create a new node from its contents pub fn new(data: NodeData) -> Rc<Self> { Rc::new(Node { data, parent: Cell::new(None), children: RefCell::new(vec![]), }) }}
impl fmt::Debug for Node { fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { fmt.debug_struct("Node") .field("data", &self.data) .field("children", &self.children) .finish() }}
/// Reference to a DOM node.pub type Handle = Rc<Node>;
/// Weak reference to a DOM node, used for parent pointers.pub type WeakHandle = Weak<Node>;
/// Append a parentless node to another nodes' childrenfn append(new_parent: &Handle, child: Handle) { let previous_parent = child.parent.replace(Some(Rc::downgrade(new_parent)));
assert!( previous_parent.is_none(), "child passed to append cannot have existing parent" );
new_parent.children.borrow_mut().push(child);}
/// If the node has a parent, get it and this node's position in its childrenfn get_parent_and_index(target: &Handle) -> Option<(Handle, usize)> { let weak = target.parent.take()?; let parent = weak.upgrade().expect("dangling weak pointer to parent"); target.parent.set(Some(weak)); let i = match parent .children .borrow() .iter() .enumerate() .find(|&(_, child)| Rc::ptr_eq(&child, &target)) { Some((i, _)) => i, None => panic!("have parent but couldn't find in parent's children!"), }; Some((parent, i))}
fn append_to_existing_text(prev: &Handle, text: &str) -> bool { match prev.data { NodeData::Text { ref contents } => { contents.borrow_mut().push_slice(text); true } _ => false, }}
fn remove_from_parent(target: &Handle) { if let Some((parent, i)) = get_parent_and_index(target) { parent.children.borrow_mut().remove(i); target.parent.set(None); }}
/// The DOM itself; the result of parsing.pub struct RcDom { /// The `Document` itself. pub document: Handle,
/// Errors that occurred during parsing. pub errors: Vec<Cow<'static, str>>,
/// The document's quirks mode. pub quirks_mode: QuirksMode,}
impl TreeSink for RcDom { type Output = Self; fn finish(self) -> Self { self }
type Handle = Handle;
fn parse_error(&mut self, msg: Cow<'static, str>) { self.errors.push(msg); }
fn get_document(&mut self) -> Handle { self.document.clone() }
fn get_template_contents(&mut self, target: &Handle) -> Handle { if let NodeData::Element { template_contents: Some(ref contents), .. } = target.data { contents.clone() } else { panic!("not a template element!") } }
fn set_quirks_mode(&mut self, mode: QuirksMode) { self.quirks_mode = mode; }
fn same_node(&self, x: &Handle, y: &Handle) -> bool { Rc::ptr_eq(x, y) }
fn elem_name<'a>(&self, target: &'a Handle) -> ExpandedName<'a> { match target.data { NodeData::Element { ref name, .. } => name.expanded(), _ => panic!("not an element!"), } }
fn create_element( &mut self, name: QualName, attrs: Vec<Attribute>, flags: ElementFlags, ) -> Handle { Node::new(NodeData::Element { name, attrs: RefCell::new(attrs), template_contents: if flags.template { Some(Node::new(NodeData::Document)) } else { None }, mathml_annotation_xml_integration_point: flags.mathml_annotation_xml_integration_point, }) }
fn create_comment(&mut self, text: StrTendril) -> Handle { Node::new(NodeData::Comment { contents: text }) }
fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Handle { Node::new(NodeData::ProcessingInstruction { target, contents: data, }) }
fn append(&mut self, parent: &Handle, child: NodeOrText<Handle>) { // Append to an existing Text node if we have one. if let NodeOrText::AppendText(ref text) = child { if let Some(h) = parent.children.borrow().last() { if append_to_existing_text(h, &text) { return; } } }
append( &parent, match child { NodeOrText::AppendText(text) => Node::new(NodeData::Text { contents: RefCell::new(text), }), NodeOrText::AppendNode(node) => node, }, ); }
fn append_before_sibling(&mut self, sibling: &Handle, child: NodeOrText<Handle>) { let (parent, i) = get_parent_and_index(&sibling) .expect("append_before_sibling called on node without parent");
let child = match (child, i) { // No previous node. (NodeOrText::AppendText(text), 0) => Node::new(NodeData::Text { contents: RefCell::new(text), }),
// Look for a text node before the insertion point. (NodeOrText::AppendText(text), i) => { let children = parent.children.borrow(); let prev = &children[i - 1]; if append_to_existing_text(prev, &text) { return; } Node::new(NodeData::Text { contents: RefCell::new(text), }) }
// The tree builder promises we won't have a text node after // the insertion point.
// Any other kind of node. (NodeOrText::AppendNode(node), _) => node, };
remove_from_parent(&child);
child.parent.set(Some(Rc::downgrade(&parent))); parent.children.borrow_mut().insert(i, child); }
fn append_based_on_parent_node( &mut self, element: &Self::Handle, prev_element: &Self::Handle, child: NodeOrText<Self::Handle>, ) { let parent = element.parent.take(); let has_parent = parent.is_some(); element.parent.set(parent);
if has_parent { self.append_before_sibling(element, child); } else { self.append(prev_element, child); } }
fn append_doctype_to_document( &mut self, name: StrTendril, public_id: StrTendril, system_id: StrTendril, ) { append( &self.document, Node::new(NodeData::Doctype { name, public_id, system_id, }), ); }
fn add_attrs_if_missing(&mut self, target: &Handle, attrs: Vec<Attribute>) { let mut existing = if let NodeData::Element { ref attrs, .. } = target.data { attrs.borrow_mut() } else { panic!("not an element") };
let existing_names = existing .iter() .map(|e| e.name.clone()) .collect::<HashSet<_>>(); existing.extend( attrs .into_iter() .filter(|attr| !existing_names.contains(&attr.name)), ); }
fn remove_from_parent(&mut self, target: &Handle) { remove_from_parent(&target); }
fn reparent_children(&mut self, node: &Handle, new_parent: &Handle) { let mut children = node.children.borrow_mut(); let mut new_children = new_parent.children.borrow_mut(); for child in children.iter() { let previous_parent = child.parent.replace(Some(Rc::downgrade(&new_parent))); assert!(Rc::ptr_eq( &node, &previous_parent.unwrap().upgrade().expect("dangling weak") )) } new_children.extend(mem::replace(&mut *children, Vec::new())); }
fn is_mathml_annotation_xml_integration_point(&self, target: &Handle) -> bool { if let NodeData::Element { mathml_annotation_xml_integration_point, .. } = target.data { mathml_annotation_xml_integration_point } else { panic!("not an element!") } }}
impl Default for RcDom { fn default() -> RcDom { RcDom { document: Node::new(NodeData::Document), errors: vec![], quirks_mode: tree_builder::NoQuirks, } }}
enum SerializeOp { Open(Handle), Close(QualName),}
pub struct SerializableHandle(Handle);
impl From<Handle> for SerializableHandle { fn from(h: Handle) -> SerializableHandle { SerializableHandle(h) }}
impl Serialize for SerializableHandle { fn serialize<S>(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()> where S: Serializer, { let mut ops = match traversal_scope { IncludeNode => vec![SerializeOp::Open(self.0.clone())], ChildrenOnly(_) => self .0 .children .borrow() .iter() .rev() .map(|h| SerializeOp::Open(h.clone())) .collect(), };
while let Some(op) = ops.pop() { match op { SerializeOp::Open(handle) => match &handle.data { NodeData::Element { name, attrs, .. } => { serializer.start_elem( name.clone(), attrs.borrow().iter().map(|at| (&at.name, &at.value[..])), )?;
ops.push(SerializeOp::Close(name.clone()));
for child in handle.children.borrow().iter().rev() { ops.push(SerializeOp::Open(child.clone())); } }
NodeData::Doctype { name, .. } => serializer.write_doctype(&name)?,
NodeData::Text { contents } => serializer.write_text(&contents.borrow())?,
NodeData::Comment { contents } => serializer.write_comment(&contents)?,
NodeData::ProcessingInstruction { target, contents } => { serializer.write_processing_instruction(target, contents)? }
NodeData::Document => panic!("Can't serialize Document node itself"), },
SerializeOp::Close(name) => { serializer.end_elem(name)?; } } }
Ok(()) }}