aboutsummaryrefslogtreecommitdiffstats
path: root/components/script/parse/html.rs
diff options
context:
space:
mode:
author: Keegan McAllister <kmcallister@mozilla.com> 2014-09-19 14:02:22 -0700
committer: Keegan McAllister <kmcallister@mozilla.com> 2014-10-16 13:06:34 -0700
commit: 9da7679367eba53d0f86bff30bc8b005940f044e (patch)
tree: ac2dadb2f3f98939c3245b5ed58c6beba1730c8c /components/script/parse/html.rs
parent: 3fbb25cc430c9dcf3ed06b6e86b8a64738493e86 (diff)
download: servo-9da7679367eba53d0f86bff30bc8b005940f044e.tar.gz
download: servo-9da7679367eba53d0f86bff30bc8b005940f044e.zip
Use html5ever for HTML parsing
Diffstat (limited to 'components/script/parse/html.rs')
-rw-r--r-- components/script/parse/html.rs 521
1 files changed, 521 insertions, 0 deletions
diff --git a/components/script/parse/html.rs b/components/script/parse/html.rs
new file mode 100644
index 00000000000..829d5d0c40b
--- /dev/null
+++ b/components/script/parse/html.rs
@@ -0,0 +1,521 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+use dom::attr::AttrHelpers;
+use dom::bindings::codegen::Bindings::AttrBinding::AttrMethods;
+use dom::bindings::codegen::Bindings::NodeBinding::NodeMethods;
+use dom::bindings::codegen::InheritTypes::{NodeCast, ElementCast, HTMLScriptElementCast};
+use dom::bindings::js::{JS, JSRef, Temporary, OptionalRootable, Root};
+use dom::document::{Document, DocumentHelpers};
+use dom::element::{AttributeHandlers, ElementHelpers};
+use dom::htmlelement::HTMLElement;
+use dom::htmlheadingelement::{Heading1, Heading2, Heading3, Heading4, Heading5, Heading6};
+use dom::htmlformelement::HTMLFormElement;
+use dom::htmlscriptelement::HTMLScriptElementHelpers;
+use dom::node::{Node, NodeHelpers, TrustedNodeAddress};
+use dom::servohtmlparser;
+use dom::servohtmlparser::ServoHTMLParser;
+use dom::types::*;
+use page::Page;
+
+use encoding::all::UTF_8;
+use encoding::types::{Encoding, DecodeReplace};
+
+use servo_net::resource_task::{Load, LoadData, Payload, Done, ResourceTask, load_whole_resource};
+use servo_msg::constellation_msg::LoadData as MsgLoadData;
+use servo_util::task::spawn_named;
+use servo_util::str::DOMString;
+use std::ascii::StrAsciiExt;
+use std::comm::{channel, Sender, Receiver};
+use std::str::MaybeOwned;
+use url::{Url, UrlParser};
+use http::headers::HeaderEnum;
+use time;
+use html5ever::Attribute;
+use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText};
+use string_cache::QualName;
+
+/// One script collected while parsing a document.
+///
+/// Built by `js_script_listener`, either from a fetched external resource
+/// or from the text of an inline `<script>` element.
+pub struct JSFile {
+ /// The script's source text.
+ pub data: String,
+ /// For an external script, the final URL reported by the resource load;
+ /// for an inline script, whatever URL the parser attached (the sink
+ /// passes its base URL, which may be `None`).
+ pub url: Option<Url>,
+}
+
+/// The scripts gathered for one parse, in the order their messages were
+/// received by `js_script_listener`.
+pub type JSResult = Vec<JSFile>;
+
+/// Where `parse_html` gets its markup from.
+pub enum HTMLInput {
+ /// Markup already in memory; it is fed to the tokenizer as a single chunk.
+ InputString(String),
+ /// A URL to request through the resource task; the response is fed to
+ /// the tokenizer as its payload chunks arrive.
+ InputUrl(Url),
+}
+
+/// Messages sent from the tree sink / `parse_html` to the script-collection
+/// task (`js_script_listener`).
+pub enum JSMessage {
+ /// An external script was found; load it from this URL.
+ JSTaskNewFile(Url),
+ /// An inline script was found: its text and the URL to associate with it.
+ JSTaskNewInlineScript(String, Option<Url>),
+ /// Parsing is finished; stop listening and report the collected scripts.
+ JSTaskExit
+}
+
+/// Messages generated by the HTML parser upon discovery of additional resources
+pub enum HtmlDiscoveryMessage {
+ /// Every script collected during the parse, delivered as one batch once
+ /// the script-collection task has stopped listening.
+ HtmlDiscoveredScript(JSResult)
+}
+
+/// What `parse_html` hands back to its caller.
+pub struct HtmlParserResult {
+ /// Receiving end of the channel on which the script-collection task
+ /// sends its `HtmlDiscoveredScript` message.
+ pub discovery_port: Receiver<HtmlDiscoveryMessage>,
+}
+
+/// Body of the script-collection task spawned by `parse_html`.
+///
+/// Drains `from_parent` until a `JSTaskExit` arrives or the sending side
+/// hangs up, turning each message into a `JSFile` in arrival order:
+///
+/// * `JSTaskNewFile` loads the whole resource through `resource_task` and
+///   decodes it as UTF-8, replacing malformed sequences. A failed load is
+///   logged and the script is skipped.
+/// * `JSTaskNewInlineScript` is stored exactly as received.
+///
+/// The accumulated list is then sent on `to_parent` as one
+/// `HtmlDiscoveredScript`; the task panics if that send fails.
+fn js_script_listener(to_parent: Sender<HtmlDiscoveryMessage>,
+                      from_parent: Receiver<JSMessage>,
+                      resource_task: ResourceTask) {
+    let mut scripts: JSResult = Vec::new();
+
+    loop {
+        let file = match from_parent.recv_opt() {
+            // Explicit shutdown, or the parser dropped its sender.
+            Ok(JSTaskExit) | Err(()) => break,
+
+            Ok(JSTaskNewInlineScript(data, url)) => JSFile { data: data, url: url },
+
+            Ok(JSTaskNewFile(url)) => {
+                match load_whole_resource(&resource_task, url.clone()) {
+                    Ok((metadata, bytes)) => {
+                        let decoded = UTF_8.decode(bytes.as_slice(), DecodeReplace).unwrap();
+                        JSFile {
+                            data: decoded.to_string(),
+                            url: Some(metadata.final_url),
+                        }
+                    }
+                    Err(_) => {
+                        error!("error loading script {:s}", url.serialize());
+                        // Nothing to record for this script; wait for the next message.
+                        continue;
+                    }
+                }
+            }
+        };
+
+        scripts.push(file);
+    }
+
+    assert!(to_parent.send_opt(HtmlDiscoveredScript(scripts)).is_ok());
+}
+
+// Converts an HTTP date/time string (any of the three layouts RFC 2616
+// allows) into a local-time string formatted for document.lastModified.
+// Returns the empty string when the input matches none of the layouts.
+fn parse_last_modified(timestamp: &str) -> String {
+    let output_format = "%m/%d/%Y %H:%M:%S";
+
+    // Tried in this order; the first layout that parses wins.
+    let accepted_formats = [
+        // RFC 822, updated by RFC 1123
+        "%a, %d %b %Y %T %Z",
+        // RFC 850, obsoleted by RFC 1036
+        "%A, %d-%b-%y %T %Z",
+        // ANSI C's asctime() format
+        "%c",
+    ];
+
+    for input_format in accepted_formats.iter() {
+        match time::strptime(timestamp, *input_format) {
+            Ok(t) => return t.to_local().strftime(output_format),
+            Err(_) => {}
+        }
+    }
+
+    String::new()
+}
+
+/// Creates the DOM element object for a qualified tag name.
+///
+/// * `name` - namespace and local name of the element to create.
+/// * `prefix` - namespace prefix to record on the element, if any.
+/// * `document` - the document the new element is created for.
+///
+/// Elements outside the HTML namespace become a plain `Element`. Inside the
+/// HTML namespace the local name selects the concrete interface
+/// (`HTMLAnchorElement`, `HTMLDivElement`, ...); names with no dedicated
+/// interface here use `HTMLElement`, and names not listed at all fall back
+/// to `HTMLUnknownElement`.
+pub fn build_element_from_tag(name: QualName,
+                              prefix: Option<DOMString>,
+                              document: JSRef<Document>) -> Temporary<Element> {
+    if name.ns != ns!(HTML) {
+        // Forward the caller's prefix; the HTML path below does the same
+        // through `make!`, so prefixed foreign elements keep their prefix.
+        return Element::new(name.local.as_slice().to_string(), name.ns, prefix, document);
+    }
+
+    // Expands to `$ctor::new(local_name, prefix, document, extra args...)`
+    // upcast to `Temporary<Element>`.
+    macro_rules! make(
+        ($ctor:ident $(, $arg:expr)*) => ({
+            let obj = $ctor::new(name.local.as_slice().to_string(), prefix, document $(, $arg)*);
+            ElementCast::from_temporary(obj)
+        })
+    )
+
+    // This is a big match, and the IDs for inline-interned atoms are not very structured.
+    // Perhaps we should build a perfect hash from those IDs instead.
+    match name.local {
+        atom!("a") => make!(HTMLAnchorElement),
+        atom!("abbr") => make!(HTMLElement),
+        atom!("acronym") => make!(HTMLElement),
+        atom!("address") => make!(HTMLElement),
+        atom!("applet") => make!(HTMLAppletElement),
+        atom!("area") => make!(HTMLAreaElement),
+        atom!("article") => make!(HTMLElement),
+        atom!("aside") => make!(HTMLElement),
+        atom!("audio") => make!(HTMLAudioElement),
+        atom!("b") => make!(HTMLElement),
+        atom!("base") => make!(HTMLBaseElement),
+        atom!("bdi") => make!(HTMLElement),
+        atom!("bdo") => make!(HTMLElement),
+        atom!("bgsound") => make!(HTMLElement),
+        atom!("big") => make!(HTMLElement),
+        atom!("blockquote") => make!(HTMLElement),
+        atom!("body") => make!(HTMLBodyElement),
+        atom!("br") => make!(HTMLBRElement),
+        atom!("button") => make!(HTMLButtonElement),
+        atom!("canvas") => make!(HTMLCanvasElement),
+        atom!("caption") => make!(HTMLTableCaptionElement),
+        atom!("center") => make!(HTMLElement),
+        atom!("cite") => make!(HTMLElement),
+        atom!("code") => make!(HTMLElement),
+        atom!("col") => make!(HTMLTableColElement),
+        atom!("colgroup") => make!(HTMLTableColElement),
+        atom!("data") => make!(HTMLDataElement),
+        atom!("datalist") => make!(HTMLDataListElement),
+        atom!("dd") => make!(HTMLElement),
+        atom!("del") => make!(HTMLModElement),
+        atom!("details") => make!(HTMLElement),
+        atom!("dfn") => make!(HTMLElement),
+        atom!("dir") => make!(HTMLDirectoryElement),
+        atom!("div") => make!(HTMLDivElement),
+        atom!("dl") => make!(HTMLDListElement),
+        atom!("dt") => make!(HTMLElement),
+        atom!("em") => make!(HTMLElement),
+        atom!("embed") => make!(HTMLEmbedElement),
+        atom!("fieldset") => make!(HTMLFieldSetElement),
+        atom!("figcaption") => make!(HTMLElement),
+        atom!("figure") => make!(HTMLElement),
+        atom!("font") => make!(HTMLFontElement),
+        atom!("footer") => make!(HTMLElement),
+        atom!("form") => make!(HTMLFormElement),
+        atom!("frame") => make!(HTMLFrameElement),
+        atom!("frameset") => make!(HTMLFrameSetElement),
+        atom!("h1") => make!(HTMLHeadingElement, Heading1),
+        atom!("h2") => make!(HTMLHeadingElement, Heading2),
+        atom!("h3") => make!(HTMLHeadingElement, Heading3),
+        atom!("h4") => make!(HTMLHeadingElement, Heading4),
+        atom!("h5") => make!(HTMLHeadingElement, Heading5),
+        atom!("h6") => make!(HTMLHeadingElement, Heading6),
+        atom!("head") => make!(HTMLHeadElement),
+        atom!("header") => make!(HTMLElement),
+        atom!("hgroup") => make!(HTMLElement),
+        atom!("hr") => make!(HTMLHRElement),
+        atom!("html") => make!(HTMLHtmlElement),
+        atom!("i") => make!(HTMLElement),
+        atom!("iframe") => make!(HTMLIFrameElement),
+        atom!("img") => make!(HTMLImageElement),
+        atom!("input") => make!(HTMLInputElement),
+        atom!("ins") => make!(HTMLModElement),
+        atom!("isindex") => make!(HTMLElement),
+        atom!("kbd") => make!(HTMLElement),
+        atom!("label") => make!(HTMLLabelElement),
+        atom!("legend") => make!(HTMLLegendElement),
+        atom!("li") => make!(HTMLLIElement),
+        atom!("link") => make!(HTMLLinkElement),
+        atom!("main") => make!(HTMLElement),
+        atom!("map") => make!(HTMLMapElement),
+        atom!("mark") => make!(HTMLElement),
+        atom!("marquee") => make!(HTMLElement),
+        atom!("meta") => make!(HTMLMetaElement),
+        atom!("meter") => make!(HTMLMeterElement),
+        atom!("nav") => make!(HTMLElement),
+        atom!("nobr") => make!(HTMLElement),
+        atom!("noframes") => make!(HTMLElement),
+        atom!("noscript") => make!(HTMLElement),
+        atom!("object") => make!(HTMLObjectElement),
+        atom!("ol") => make!(HTMLOListElement),
+        atom!("optgroup") => make!(HTMLOptGroupElement),
+        atom!("option") => make!(HTMLOptionElement),
+        atom!("output") => make!(HTMLOutputElement),
+        atom!("p") => make!(HTMLParagraphElement),
+        atom!("param") => make!(HTMLParamElement),
+        atom!("pre") => make!(HTMLPreElement),
+        atom!("progress") => make!(HTMLProgressElement),
+        atom!("q") => make!(HTMLQuoteElement),
+        atom!("rp") => make!(HTMLElement),
+        atom!("rt") => make!(HTMLElement),
+        atom!("ruby") => make!(HTMLElement),
+        atom!("s") => make!(HTMLElement),
+        atom!("samp") => make!(HTMLElement),
+        atom!("script") => make!(HTMLScriptElement),
+        atom!("section") => make!(HTMLElement),
+        atom!("select") => make!(HTMLSelectElement),
+        atom!("small") => make!(HTMLElement),
+        atom!("source") => make!(HTMLSourceElement),
+        atom!("spacer") => make!(HTMLElement),
+        atom!("span") => make!(HTMLSpanElement),
+        atom!("strike") => make!(HTMLElement),
+        atom!("strong") => make!(HTMLElement),
+        atom!("style") => make!(HTMLStyleElement),
+        atom!("sub") => make!(HTMLElement),
+        atom!("summary") => make!(HTMLElement),
+        atom!("sup") => make!(HTMLElement),
+        atom!("table") => make!(HTMLTableElement),
+        atom!("tbody") => make!(HTMLTableSectionElement),
+        atom!("td") => make!(HTMLTableDataCellElement),
+        atom!("template") => make!(HTMLTemplateElement),
+        atom!("textarea") => make!(HTMLTextAreaElement),
+        atom!("th") => make!(HTMLTableHeaderCellElement),
+        atom!("time") => make!(HTMLTimeElement),
+        atom!("title") => make!(HTMLTitleElement),
+        atom!("tr") => make!(HTMLTableRowElement),
+        atom!("tt") => make!(HTMLElement),
+        atom!("track") => make!(HTMLTrackElement),
+        atom!("u") => make!(HTMLElement),
+        atom!("ul") => make!(HTMLUListElement),
+        atom!("var") => make!(HTMLElement),
+        atom!("video") => make!(HTMLVideoElement),
+        atom!("wbr") => make!(HTMLElement),
+        _ => make!(HTMLUnknownElement),
+    }
+}
+
+/// Private helpers shared by the `TreeSink` methods of `servohtmlparser::Sink`.
+trait SinkHelpers {
+ /// Turns a parser-supplied child into a DOM node: an existing node is
+ /// returned as-is, text is wrapped in a newly created `Text` node.
+ fn get_or_create(&self, child: NodeOrText<TrustedNodeAddress>) -> Temporary<Node>;
+}
+
+impl SinkHelpers for servohtmlparser::Sink {
+    /// Resolves `child` to a node that can be inserted into the tree.
+    ///
+    /// Text gets a fresh `Text` node owned by the sink's document; a node
+    /// handle is converted back from its trusted address without creating
+    /// anything new.
+    fn get_or_create(&self, child: NodeOrText<TrustedNodeAddress>) -> Temporary<Node> {
+        match child {
+            AppendText(contents) => {
+                let document = self.document.root();
+                NodeCast::from_temporary(Text::new(contents, *document))
+            }
+            AppendNode(address) => {
+                let existing = unsafe { JS::from_trusted_node_address(address) };
+                Temporary::new(existing)
+            }
+        }
+    }
+}
+
+/// html5ever tree-construction callbacks, implemented on top of the DOM.
+///
+/// Node handles exchanged with the parser are `TrustedNodeAddress` values;
+/// each method converts them back to rooted DOM nodes before touching them.
+impl TreeSink<TrustedNodeAddress> for servohtmlparser::Sink {
+    /// Returns the handle of the document being parsed into.
+    fn get_document(&mut self) -> TrustedNodeAddress {
+        let doc = self.document.root();
+        let node: JSRef<Node> = NodeCast::from_ref(*doc);
+        node.to_trusted_node_address()
+    }
+
+    /// Two handles denote the same node exactly when the addresses are equal.
+    fn same_node(&self, x: TrustedNodeAddress, y: TrustedNodeAddress) -> bool {
+        x == y
+    }
+
+    /// Returns the namespace and local name of `target`.
+    ///
+    /// Panics if `target` is not an element.
+    fn elem_name(&self, target: TrustedNodeAddress) -> QualName {
+        let node: Root<Node> = unsafe { JS::from_trusted_node_address(target).root() };
+        let elem: JSRef<Element> = ElementCast::to_ref(*node)
+            .expect("tried to get name of non-Element in HTML parsing");
+        QualName {
+            ns: elem.get_namespace().clone(),
+            local: elem.get_local_name().clone(),
+        }
+    }
+
+    /// Creates an element named `name` (with no prefix) in the sink's
+    /// document and applies `attrs` to it in order.
+    fn create_element(&mut self, name: QualName, attrs: Vec<Attribute>)
+            -> TrustedNodeAddress {
+        let doc = self.document.root();
+        let elem = build_element_from_tag(name, None, *doc).root();
+
+        for attr in attrs.into_iter() {
+            elem.set_attribute_from_parser(attr.name, attr.value, None);
+        }
+
+        let node: JSRef<Node> = NodeCast::from_ref(*elem);
+        node.to_trusted_node_address()
+    }
+
+    /// Creates a comment node holding `text` in the sink's document.
+    fn create_comment(&mut self, text: String) -> TrustedNodeAddress {
+        let doc = self.document.root();
+        let comment = Comment::new(text, *doc);
+        let node: Root<Node> = NodeCast::from_temporary(comment).root();
+        node.to_trusted_node_address()
+    }
+
+    /// Inserts `new_node` immediately before `sibling`.
+    ///
+    /// Returns `Err(new_node)` untouched when `sibling` has no parent, so
+    /// the parser can place it elsewhere. Panics if the DOM insertion fails.
+    fn append_before_sibling(&mut self,
+            sibling: TrustedNodeAddress,
+            new_node: NodeOrText<TrustedNodeAddress>) -> Result<(), NodeOrText<TrustedNodeAddress>> {
+        // If there is no parent, return the node to the parser.
+        let sibling: Root<Node> = unsafe { JS::from_trusted_node_address(sibling).root() };
+        let parent = match sibling.parent_node() {
+            Some(p) => p.root(),
+            None => return Err(new_node),
+        };
+
+        let child = self.get_or_create(new_node).root();
+        assert!(parent.InsertBefore(*child, Some(*sibling)).is_ok());
+        Ok(())
+    }
+
+    /// Logs a parse error reported by html5ever; parsing continues.
+    fn parse_error(&mut self, msg: MaybeOwned<'static>) {
+        error!("Parse error: {:s}", msg);
+    }
+
+    /// Records the quirks mode chosen by the parser on the document.
+    fn set_quirks_mode(&mut self, mode: QuirksMode) {
+        let doc = self.document.root();
+        doc.set_quirks_mode(mode);
+    }
+
+    /// Appends `child` (a node, or text wrapped in a new `Text` node) as the
+    /// last child of `parent`. Panics if the DOM insertion fails.
+    fn append(&mut self, parent: TrustedNodeAddress, child: NodeOrText<TrustedNodeAddress>) {
+        let parent: Root<Node> = unsafe { JS::from_trusted_node_address(parent).root() };
+        let child = self.get_or_create(child).root();
+
+        // FIXME(#3701): Use a simpler algorithm and merge adjacent text nodes
+        assert!(parent.AppendChild(*child).is_ok());
+    }
+
+    /// Creates a doctype node and appends it to the document.
+    /// Panics if the DOM insertion fails.
+    fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) {
+        let doc = self.document.root();
+        let doc_node: JSRef<Node> = NodeCast::from_ref(*doc);
+        let doctype = DocumentType::new(name, Some(public_id), Some(system_id), *doc);
+        let node: Root<Node> = NodeCast::from_temporary(doctype).root();
+
+        assert!(doc_node.AppendChild(*node).is_ok());
+    }
+
+    /// Applies `attrs` to the element `target`.
+    ///
+    /// Panics if `target` is not an element.
+    fn add_attrs_if_missing(&mut self, target: TrustedNodeAddress, attrs: Vec<Attribute>) {
+        let node: Root<Node> = unsafe { JS::from_trusted_node_address(target).root() };
+        let elem: JSRef<Element> = ElementCast::to_ref(*node)
+            .expect("tried to set attrs on non-Element in HTML parsing");
+        // NOTE(review): nothing here checks for an existing attribute; this
+        // presumably relies on `set_attribute_from_parser` leaving existing
+        // attributes alone - TODO confirm.
+        for attr in attrs.into_iter() {
+            elem.set_attribute_from_parser(attr.name, attr.value, None);
+        }
+    }
+
+    /// Not implemented: only logs an error, the node is left where it is.
+    fn remove_from_parent(&mut self, _target: TrustedNodeAddress) {
+        error!("remove_from_parent not implemented!");
+    }
+
+    /// Not implemented: only logs an error.
+    fn mark_script_already_started(&mut self, _node: TrustedNodeAddress) {
+        error!("mark_script_already_started not implemented!");
+    }
+
+    /// Called when a `<script>` element is complete; queues it with the
+    /// script-collection task.
+    ///
+    /// Anything that is not a JavaScript `HTMLScriptElement` is ignored.
+    /// With a `src` attribute, the value is resolved against the sink's base
+    /// URL (when there is one) and sent as `JSTaskNewFile`; an unparsable
+    /// URL is only logged at debug level. Without `src`, the element's text
+    /// content is sent as `JSTaskNewInlineScript` tagged with the base URL.
+    fn complete_script(&mut self, node: TrustedNodeAddress) {
+        let node: Root<Node> = unsafe { JS::from_trusted_node_address(node).root() };
+        let script: Option<JSRef<HTMLScriptElement>> =
+            HTMLScriptElementCast::to_ref(*node);
+        let script = match script {
+            Some(script) if script.is_javascript() => script,
+            _ => return,
+        };
+
+        let script_element: JSRef<Element> = ElementCast::from_ref(script);
+        match script_element.get_attribute(ns!(""), &atom!("src")).root() {
+            Some(src) => {
+                debug!("found script: {:s}", src.deref().Value());
+                let mut url_parser = UrlParser::new();
+                match self.base_url {
+                    None => (),
+                    Some(ref base_url) => {
+                        url_parser.base_url(base_url);
+                    }
+                };
+                match url_parser.parse(src.deref().value().as_slice()) {
+                    Ok(new_url) => self.js_chan.send(JSTaskNewFile(new_url)),
+                    Err(e) => debug!("Parsing url {:s} failed: {:?}", src.deref().Value(), e)
+                };
+            }
+            None => {
+                let scriptnode: JSRef<Node> = NodeCast::from_ref(script);
+                let data = Node::collect_text_contents(scriptnode.children());
+                debug!("script data = {:?}", data);
+                self.js_chan.send(JSTaskNewInlineScript(data, self.base_url.clone()));
+            }
+        }
+    }
+}
+
+/// Parses HTML into `document` and collects the scripts found on the way.
+///
+/// * `page` - the page being loaded; for URL input its URL is overwritten
+///   with the final URL of the response before parsing starts.
+/// * `document` - the document the parsed tree is built in.
+/// * `input` - markup held in memory, or a URL to fetch it from.
+/// * `resource_task` - used to fetch the page and any external scripts.
+/// * `msg_load_data` - optional headers, method and body for the page
+///   request. The url from msg_load_data is ignored here.
+///
+/// Returns once the whole input has been fed to the tokenizer. The scripts
+/// are delivered afterwards on the returned `discovery_port` by a separate
+/// task.
+///
+/// Fails the task if the resource task reports an error while the page is
+/// being streamed.
+pub fn parse_html(page: &Page,
+                  document: JSRef<Document>,
+                  input: HTMLInput,
+                  resource_task: ResourceTask,
+                  msg_load_data: Option<MsgLoadData>)
+                  -> HtmlParserResult {
+    // Spawn a JS parser to receive JavaScript.
+    let (js_result_chan, discovery_port) = channel();
+    let resource_task2 = resource_task.clone();
+    let (js_chan, js_msg_port) = channel();
+    spawn_named("parse_html:js", proc() {
+        js_script_listener(js_result_chan, js_msg_port, resource_task2);
+    });
+
+    let (base_url, load_response) = match input {
+        InputUrl(ref url) => {
+            // Wait for the LoadResponse so that the parser knows the final URL.
+            let (input_chan, input_port) = channel();
+            let mut load_data = LoadData::new(url.clone());
+            match msg_load_data {
+                Some(m) => {
+                    load_data.headers = m.headers;
+                    load_data.method = m.method;
+                    load_data.data = m.data;
+                }
+                None => {}
+            }
+            resource_task.send(Load(load_data, input_chan));
+
+            let load_response = input_port.recv();
+
+            debug!("Fetched page; metadata is {:?}", load_response.metadata);
+
+            match load_response.metadata.headers {
+                Some(ref headers) => {
+                    // Header names are case-insensitive.
+                    let header = headers.iter().find(|h|
+                        h.header_name().as_slice().eq_ignore_ascii_case("last-modified")
+                    );
+
+                    match header {
+                        Some(h) => document.set_last_modified(
+                            parse_last_modified(h.header_value().as_slice())),
+                        None => {},
+                    };
+                }
+                None => {}
+            }
+
+            let base_url = load_response.metadata.final_url.clone();
+
+            {
+                // Store the final URL before we start parsing, so that DOM routines
+                // (e.g. HTMLImageElement::update_image) can resolve relative URLs
+                // correctly.
+                *page.mut_url() = Some((base_url.clone(), true));
+            }
+
+            (Some(base_url), Some(load_response))
+        },
+        InputString(_) => {
+            match *page.url() {
+                Some((ref page_url, _)) => (Some(page_url.clone()), None),
+                None => (None, None),
+            }
+        },
+    };
+
+    let parser = ServoHTMLParser::new(js_chan.clone(), base_url.clone(), document).root();
+    let parser: JSRef<ServoHTMLParser> = *parser;
+
+    match input {
+        InputString(s) => {
+            parser.tokenizer().borrow_mut().feed(s);
+        }
+        InputUrl(url) => {
+            let load_response = load_response.unwrap();
+            match load_response.metadata.content_type {
+                Some((ref t, _)) if t.as_slice().eq_ignore_ascii_case("image") => {
+                    // Images are shown by parsing a synthetic wrapper document.
+                    let page = format!("<html><body><img src='{:s}' /></body></html>", base_url.as_ref().unwrap().serialize());
+                    parser.tokenizer().borrow_mut().feed(page);
+                },
+                _ => {
+                    for msg in load_response.progress_port.iter() {
+                        match msg {
+                            Payload(data) => {
+                                // FIXME: use Vec<u8> (html5ever #34)
+                                // Decode lossily: network bytes are untrusted, and
+                                // malformed UTF-8 must not take the whole task down.
+                                // FIXME: a multi-byte sequence split across two
+                                // payloads still decodes as replacement characters.
+                                let data = UTF_8.decode(data.as_slice(), DecodeReplace).unwrap();
+                                parser.tokenizer().borrow_mut().feed(data);
+                            }
+                            Done(Err(err)) => {
+                                fail!("Failed to load page URL {:s}, error: {:s}", url.serialize(), err);
+                            }
+                            Done(Ok(())) => break,
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    parser.tokenizer().borrow_mut().end();
+
+    debug!("finished parsing");
+    js_chan.send(JSTaskExit);
+
+    HtmlParserResult {
+        discovery_port: discovery_port,
+    }
+}