diff options
author | Keegan McAllister <kmcallister@mozilla.com> | 2014-09-19 14:02:22 -0700 |
---|---|---|
committer | Keegan McAllister <kmcallister@mozilla.com> | 2014-10-16 13:06:34 -0700 |
commit | 9da7679367eba53d0f86bff30bc8b005940f044e (patch) | |
tree | ac2dadb2f3f98939c3245b5ed58c6beba1730c8c /components/script/parse/html.rs | |
parent | 3fbb25cc430c9dcf3ed06b6e86b8a64738493e86 (diff) | |
download | servo-9da7679367eba53d0f86bff30bc8b005940f044e.tar.gz servo-9da7679367eba53d0f86bff30bc8b005940f044e.zip |
Use html5ever for HTML parsing
Diffstat (limited to 'components/script/parse/html.rs')
-rw-r--r-- | components/script/parse/html.rs | 521 |
1 files changed, 521 insertions, 0 deletions
diff --git a/components/script/parse/html.rs b/components/script/parse/html.rs new file mode 100644 index 00000000000..829d5d0c40b --- /dev/null +++ b/components/script/parse/html.rs @@ -0,0 +1,521 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +use dom::attr::AttrHelpers; +use dom::bindings::codegen::Bindings::AttrBinding::AttrMethods; +use dom::bindings::codegen::Bindings::NodeBinding::NodeMethods; +use dom::bindings::codegen::InheritTypes::{NodeCast, ElementCast, HTMLScriptElementCast}; +use dom::bindings::js::{JS, JSRef, Temporary, OptionalRootable, Root}; +use dom::document::{Document, DocumentHelpers}; +use dom::element::{AttributeHandlers, ElementHelpers}; +use dom::htmlelement::HTMLElement; +use dom::htmlheadingelement::{Heading1, Heading2, Heading3, Heading4, Heading5, Heading6}; +use dom::htmlformelement::HTMLFormElement; +use dom::htmlscriptelement::HTMLScriptElementHelpers; +use dom::node::{Node, NodeHelpers, TrustedNodeAddress}; +use dom::servohtmlparser; +use dom::servohtmlparser::ServoHTMLParser; +use dom::types::*; +use page::Page; + +use encoding::all::UTF_8; +use encoding::types::{Encoding, DecodeReplace}; + +use servo_net::resource_task::{Load, LoadData, Payload, Done, ResourceTask, load_whole_resource}; +use servo_msg::constellation_msg::LoadData as MsgLoadData; +use servo_util::task::spawn_named; +use servo_util::str::DOMString; +use std::ascii::StrAsciiExt; +use std::comm::{channel, Sender, Receiver}; +use std::str::MaybeOwned; +use url::{Url, UrlParser}; +use http::headers::HeaderEnum; +use time; +use html5ever::Attribute; +use html5ever::tree_builder::{TreeSink, QuirksMode, NodeOrText, AppendNode, AppendText}; +use string_cache::QualName; + +pub struct JSFile { + pub data: String, + pub url: Option<Url>, +} + +pub type JSResult = Vec<JSFile>; + +pub enum HTMLInput { + InputString(String), + InputUrl(Url), +} + +pub enum JSMessage { + JSTaskNewFile(Url), + JSTaskNewInlineScript(String, Option<Url>), + JSTaskExit +} + +/// Messages generated by the HTML parser upon discovery of additional resources +pub enum HtmlDiscoveryMessage { + HtmlDiscoveredScript(JSResult) +} + +pub struct HtmlParserResult { + pub discovery_port: Receiver<HtmlDiscoveryMessage>, +} + +fn js_script_listener(to_parent: Sender<HtmlDiscoveryMessage>, + from_parent: Receiver<JSMessage>, + resource_task: ResourceTask) { + let mut result_vec = vec!(); + + loop { + match from_parent.recv_opt() { + Ok(JSTaskNewFile(url)) => { + match load_whole_resource(&resource_task, url.clone()) { + Err(_) => { + error!("error loading script {:s}", url.serialize()); + } + Ok((metadata, bytes)) => { + let decoded = UTF_8.decode(bytes.as_slice(), DecodeReplace).unwrap(); + result_vec.push(JSFile { + data: decoded.to_string(), + url: Some(metadata.final_url), + }); + } + } + } + Ok(JSTaskNewInlineScript(data, url)) => { + result_vec.push(JSFile { data: data, url: url }); + } + Ok(JSTaskExit) | Err(()) => { + break; + } + } + } + + assert!(to_parent.send_opt(HtmlDiscoveredScript(result_vec)).is_ok()); +} + +// Parses an RFC 2616 compliant date/time string, and returns a localized +// date/time string in a format suitable for document.lastModified. +fn parse_last_modified(timestamp: &str) -> String { + let format = "%m/%d/%Y %H:%M:%S"; + + // RFC 822, updated by RFC 1123 + match time::strptime(timestamp, "%a, %d %b %Y %T %Z") { + Ok(t) => return t.to_local().strftime(format), + Err(_) => () + } + + // RFC 850, obsoleted by RFC 1036 + match time::strptime(timestamp, "%A, %d-%b-%y %T %Z") { + Ok(t) => return t.to_local().strftime(format), + Err(_) => () + } + + // ANSI C's asctime() format + match time::strptime(timestamp, "%c") { + Ok(t) => t.to_local().strftime(format), + Err(_) => String::from_str("") + } +} + +pub fn build_element_from_tag(name: QualName, + prefix: Option<DOMString>, + document: JSRef<Document>) -> Temporary<Element> { + if name.ns != ns!(HTML) { + return Element::new(name.local.as_slice().to_string(), name.ns, None, document); + } + + macro_rules! make( + ($ctor:ident $(, $arg:expr)*) => ({ + let obj = $ctor::new(name.local.as_slice().to_string(), prefix, document $(, $arg)*); + ElementCast::from_temporary(obj) + }) + ) + + // This is a big match, and the IDs for inline-interned atoms are not very structured. + // Perhaps we should build a perfect hash from those IDs instead. + match name.local { + atom!("a") => make!(HTMLAnchorElement), + atom!("abbr") => make!(HTMLElement), + atom!("acronym") => make!(HTMLElement), + atom!("address") => make!(HTMLElement), + atom!("applet") => make!(HTMLAppletElement), + atom!("area") => make!(HTMLAreaElement), + atom!("article") => make!(HTMLElement), + atom!("aside") => make!(HTMLElement), + atom!("audio") => make!(HTMLAudioElement), + atom!("b") => make!(HTMLElement), + atom!("base") => make!(HTMLBaseElement), + atom!("bdi") => make!(HTMLElement), + atom!("bdo") => make!(HTMLElement), + atom!("bgsound") => make!(HTMLElement), + atom!("big") => make!(HTMLElement), + atom!("blockquote") => make!(HTMLElement), + atom!("body") => make!(HTMLBodyElement), + atom!("br") => make!(HTMLBRElement), + atom!("button") => make!(HTMLButtonElement), + atom!("canvas") => make!(HTMLCanvasElement), + atom!("caption") => make!(HTMLTableCaptionElement), + atom!("center") => make!(HTMLElement), + atom!("cite") => make!(HTMLElement), + atom!("code") => make!(HTMLElement), + atom!("col") => make!(HTMLTableColElement), + atom!("colgroup") => make!(HTMLTableColElement), + atom!("data") => make!(HTMLDataElement), + atom!("datalist") => make!(HTMLDataListElement), + atom!("dd") => make!(HTMLElement), + atom!("del") => make!(HTMLModElement), + atom!("details") => make!(HTMLElement), + atom!("dfn") => make!(HTMLElement), + atom!("dir") => make!(HTMLDirectoryElement), + atom!("div") => make!(HTMLDivElement), + atom!("dl") => make!(HTMLDListElement), + atom!("dt") => make!(HTMLElement), + atom!("em") => make!(HTMLElement), + atom!("embed") => make!(HTMLEmbedElement), + atom!("fieldset") => make!(HTMLFieldSetElement), + atom!("figcaption") => make!(HTMLElement), + atom!("figure") => make!(HTMLElement), + atom!("font") => make!(HTMLFontElement), + atom!("footer") => make!(HTMLElement), + atom!("form") => make!(HTMLFormElement), + atom!("frame") => make!(HTMLFrameElement), + atom!("frameset") => make!(HTMLFrameSetElement), + atom!("h1") => make!(HTMLHeadingElement, Heading1), + atom!("h2") => make!(HTMLHeadingElement, Heading2), + atom!("h3") => make!(HTMLHeadingElement, Heading3), + atom!("h4") => make!(HTMLHeadingElement, Heading4), + atom!("h5") => make!(HTMLHeadingElement, Heading5), + atom!("h6") => make!(HTMLHeadingElement, Heading6), + atom!("head") => make!(HTMLHeadElement), + atom!("header") => make!(HTMLElement), + atom!("hgroup") => make!(HTMLElement), + atom!("hr") => make!(HTMLHRElement), + atom!("html") => make!(HTMLHtmlElement), + atom!("i") => make!(HTMLElement), + atom!("iframe") => make!(HTMLIFrameElement), + atom!("img") => make!(HTMLImageElement), + atom!("input") => make!(HTMLInputElement), + atom!("ins") => make!(HTMLModElement), + atom!("isindex") => make!(HTMLElement), + atom!("kbd") => make!(HTMLElement), + atom!("label") => make!(HTMLLabelElement), + atom!("legend") => make!(HTMLLegendElement), + atom!("li") => make!(HTMLLIElement), + atom!("link") => make!(HTMLLinkElement), + atom!("main") => make!(HTMLElement), + atom!("map") => make!(HTMLMapElement), + atom!("mark") => make!(HTMLElement), + atom!("marquee") => make!(HTMLElement), + atom!("meta") => make!(HTMLMetaElement), + atom!("meter") => make!(HTMLMeterElement), + atom!("nav") => make!(HTMLElement), + atom!("nobr") => make!(HTMLElement), + atom!("noframes") => make!(HTMLElement), + atom!("noscript") => make!(HTMLElement), + atom!("object") => make!(HTMLObjectElement), + atom!("ol") => make!(HTMLOListElement), + atom!("optgroup") => make!(HTMLOptGroupElement), + atom!("option") => make!(HTMLOptionElement), + atom!("output") => make!(HTMLOutputElement), + atom!("p") => make!(HTMLParagraphElement), + atom!("param") => make!(HTMLParamElement), + atom!("pre") => make!(HTMLPreElement), + atom!("progress") => make!(HTMLProgressElement), + atom!("q") => make!(HTMLQuoteElement), + atom!("rp") => make!(HTMLElement), + atom!("rt") => make!(HTMLElement), + atom!("ruby") => make!(HTMLElement), + atom!("s") => make!(HTMLElement), + atom!("samp") => make!(HTMLElement), + atom!("script") => make!(HTMLScriptElement), + atom!("section") => make!(HTMLElement), + atom!("select") => make!(HTMLSelectElement), + atom!("small") => make!(HTMLElement), + atom!("source") => make!(HTMLSourceElement), + atom!("spacer") => make!(HTMLElement), + atom!("span") => make!(HTMLSpanElement), + atom!("strike") => make!(HTMLElement), + atom!("strong") => make!(HTMLElement), + atom!("style") => make!(HTMLStyleElement), + atom!("sub") => make!(HTMLElement), + atom!("summary") => make!(HTMLElement), + atom!("sup") => make!(HTMLElement), + atom!("table") => make!(HTMLTableElement), + atom!("tbody") => make!(HTMLTableSectionElement), + atom!("td") => make!(HTMLTableDataCellElement), + atom!("template") => make!(HTMLTemplateElement), + atom!("textarea") => make!(HTMLTextAreaElement), + atom!("th") => make!(HTMLTableHeaderCellElement), + atom!("time") => make!(HTMLTimeElement), + atom!("title") => make!(HTMLTitleElement), + atom!("tr") => make!(HTMLTableRowElement), + atom!("tt") => make!(HTMLElement), + atom!("track") => make!(HTMLTrackElement), + atom!("u") => make!(HTMLElement), + atom!("ul") => make!(HTMLUListElement), + atom!("var") => make!(HTMLElement), + atom!("video") => make!(HTMLVideoElement), + atom!("wbr") => make!(HTMLElement), + _ => make!(HTMLUnknownElement), + } +} + +trait SinkHelpers { + fn get_or_create(&self, child: NodeOrText<TrustedNodeAddress>) -> Temporary<Node>; +} + +impl SinkHelpers for servohtmlparser::Sink { + fn get_or_create(&self, child: NodeOrText<TrustedNodeAddress>) -> Temporary<Node> { + match child { + AppendNode(n) => Temporary::new(unsafe { JS::from_trusted_node_address(n) }), + AppendText(t) => { + let doc = self.document.root(); + let text = Text::new(t, *doc); + NodeCast::from_temporary(text) + } + } + } +} + +impl<'a> TreeSink<TrustedNodeAddress> for servohtmlparser::Sink { + fn get_document(&mut self) -> TrustedNodeAddress { + let doc = self.document.root(); + let node: JSRef<Node> = NodeCast::from_ref(*doc); + node.to_trusted_node_address() + } + + fn same_node(&self, x: TrustedNodeAddress, y: TrustedNodeAddress) -> bool { + x == y + } + + fn elem_name(&self, target: TrustedNodeAddress) -> QualName { + let node: Root<Node> = unsafe { JS::from_trusted_node_address(target).root() }; + let elem: JSRef<Element> = ElementCast::to_ref(*node) + .expect("tried to get name of non-Element in HTML parsing"); + QualName { + ns: elem.get_namespace().clone(), + local: elem.get_local_name().clone(), + } + } + + fn create_element(&mut self, name: QualName, attrs: Vec<Attribute>) + -> TrustedNodeAddress { + let doc = self.document.root(); + let elem = build_element_from_tag(name, None, *doc).root(); + + for attr in attrs.into_iter() { + elem.set_attribute_from_parser(attr.name, attr.value, None); + } + + let node: JSRef<Node> = NodeCast::from_ref(*elem); + node.to_trusted_node_address() + } + + fn create_comment(&mut self, text: String) -> TrustedNodeAddress { + let doc = self.document.root(); + let comment = Comment::new(text, *doc); + let node: Root<Node> = NodeCast::from_temporary(comment).root(); + node.to_trusted_node_address() + } + + fn append_before_sibling(&mut self, + sibling: TrustedNodeAddress, + new_node: NodeOrText<TrustedNodeAddress>) -> Result<(), NodeOrText<TrustedNodeAddress>> { + // If there is no parent, return the node to the parser. + let sibling: Root<Node> = unsafe { JS::from_trusted_node_address(sibling).root() }; + let parent = match sibling.parent_node() { + Some(p) => p.root(), + None => return Err(new_node), + }; + + let child = self.get_or_create(new_node).root(); + assert!(parent.InsertBefore(*child, Some(*sibling)).is_ok()); + Ok(()) + } + + fn parse_error(&mut self, msg: MaybeOwned<'static>) { + error!("Parse error: {:s}", msg); + } + + fn set_quirks_mode(&mut self, mode: QuirksMode) { + let doc = self.document.root(); + doc.set_quirks_mode(mode); + } + + fn append(&mut self, parent: TrustedNodeAddress, child: NodeOrText<TrustedNodeAddress>) { + let parent: Root<Node> = unsafe { JS::from_trusted_node_address(parent).root() }; + let child = self.get_or_create(child).root(); + + // FIXME(#3701): Use a simpler algorithm and merge adjacent text nodes + assert!(parent.AppendChild(*child).is_ok()); + } + + fn append_doctype_to_document(&mut self, name: String, public_id: String, system_id: String) { + let doc = self.document.root(); + let doc_node: JSRef<Node> = NodeCast::from_ref(*doc); + let doctype = DocumentType::new(name, Some(public_id), Some(system_id), *doc); + let node: Root<Node> = NodeCast::from_temporary(doctype).root(); + + assert!(doc_node.AppendChild(*node).is_ok()); + } + + fn add_attrs_if_missing(&mut self, target: TrustedNodeAddress, attrs: Vec<Attribute>) { + let node: Root<Node> = unsafe { JS::from_trusted_node_address(target).root() }; + let elem: JSRef<Element> = ElementCast::to_ref(*node) + .expect("tried to set attrs on non-Element in HTML parsing"); + for attr in attrs.into_iter() { + elem.set_attribute_from_parser(attr.name, attr.value, None); + } + } + + fn remove_from_parent(&mut self, _target: TrustedNodeAddress) { + error!("remove_from_parent not implemented!"); + } + + fn mark_script_already_started(&mut self, _node: TrustedNodeAddress) { + error!("mark_script_already_started not implemented!"); + } + + fn complete_script(&mut self, node: TrustedNodeAddress) { + let node: Root<Node> = unsafe { JS::from_trusted_node_address(node).root() }; + let script: Option<JSRef<HTMLScriptElement>> = + HTMLScriptElementCast::to_ref(*node); + let script = match script { + Some(script) if script.is_javascript() => script, + _ => return, + }; + + let script_element: JSRef<Element> = ElementCast::from_ref(script); + match script_element.get_attribute(ns!(""), &atom!("src")).root() { + Some(src) => { + debug!("found script: {:s}", src.deref().Value()); + let mut url_parser = UrlParser::new(); + match self.base_url { + None => (), + Some(ref base_url) => { + url_parser.base_url(base_url); + } + }; + match url_parser.parse(src.deref().value().as_slice()) { + Ok(new_url) => self.js_chan.send(JSTaskNewFile(new_url)), + Err(e) => debug!("Parsing url {:s} failed: {:?}", src.deref().Value(), e) + }; + } + None => { + let scriptnode: JSRef<Node> = NodeCast::from_ref(script); + let data = Node::collect_text_contents(scriptnode.children()); + debug!("script data = {:?}", data); + self.js_chan.send(JSTaskNewInlineScript(data, self.base_url.clone())); + } + } + } +} + +// The url from msg_load_data is ignored here +pub fn parse_html(page: &Page, + document: JSRef<Document>, + input: HTMLInput, + resource_task: ResourceTask, + msg_load_data: Option<MsgLoadData>) + -> HtmlParserResult { + // Spawn a JS parser to receive JavaScript. + let (discovery_chan, discovery_port) = channel(); + let resource_task2 = resource_task.clone(); + let js_result_chan = discovery_chan.clone(); + let (js_chan, js_msg_port) = channel(); + spawn_named("parse_html:js", proc() { + js_script_listener(js_result_chan, js_msg_port, resource_task2.clone()); + }); + + let (base_url, load_response) = match input { + InputUrl(ref url) => { + // Wait for the LoadResponse so that the parser knows the final URL. + let (input_chan, input_port) = channel(); + let mut load_data = LoadData::new(url.clone()); + msg_load_data.map(|m| { + load_data.headers = m.headers; + load_data.method = m.method; + load_data.data = m.data; + }); + resource_task.send(Load(load_data, input_chan)); + + let load_response = input_port.recv(); + + debug!("Fetched page; metadata is {:?}", load_response.metadata); + + load_response.metadata.headers.as_ref().map(|headers| { + let header = headers.iter().find(|h| + h.header_name().as_slice().to_ascii_lower() == "last-modified".to_string() + ); + + match header { + Some(h) => document.set_last_modified( + parse_last_modified(h.header_value().as_slice())), + None => {}, + }; + }); + + let base_url = load_response.metadata.final_url.clone(); + + { + // Store the final URL before we start parsing, so that DOM routines + // (e.g. HTMLImageElement::update_image) can resolve relative URLs + // correctly. + *page.mut_url() = Some((base_url.clone(), true)); + } + + (Some(base_url), Some(load_response)) + }, + InputString(_) => { + match *page.url() { + Some((ref page_url, _)) => (Some(page_url.clone()), None), + None => (None, None), + } + }, + }; + + let parser = ServoHTMLParser::new(js_chan.clone(), base_url.clone(), document).root(); + let parser: JSRef<ServoHTMLParser> = *parser; + + match input { + InputString(s) => { + parser.tokenizer().borrow_mut().feed(s); + } + InputUrl(url) => { + let load_response = load_response.unwrap(); + match load_response.metadata.content_type { + Some((ref t, _)) if t.as_slice().eq_ignore_ascii_case("image") => { + let page = format!("<html><body><img src='{:s}' /></body></html>", base_url.as_ref().unwrap().serialize()); + parser.tokenizer().borrow_mut().feed(page); + }, + _ => { + for msg in load_response.progress_port.iter() { + match msg { + Payload(data) => { + // FIXME: use Vec<u8> (html5ever #34) + let data = String::from_utf8(data).unwrap(); + parser.tokenizer().borrow_mut().feed(data); + } + Done(Err(err)) => { + fail!("Failed to load page URL {:s}, error: {:s}", url.serialize(), err); + } + Done(Ok(())) => break, + } + } + } + } + } + } + + parser.tokenizer().borrow_mut().end(); + + debug!("finished parsing"); + js_chan.send(JSTaskExit); + + HtmlParserResult { + discovery_port: discovery_port, + } +} |