Auto merge of #10796 - servo:character-data-surrogates, r=nox

Make /dom/nodes/CharacterData-surrogates.html not panic. It now fails since `DOMString` is currently based on `std::string::String` on the Rust side, which is strictly well-formed UTF-8 and can not contain unpaired surrogate code points. Fixes #10780 r? @Ms2ger  --- This change is [<img src="https://reviewable.io/review_button.svg" height="35" align="absmiddle" alt="Reviewable"/>](https://reviewable.io/reviews/servo/servo/10796)
author: bors-servo <lbergstrom+bors@mozilla.com> 2016-05-24 15:59:31 -0700
committer: bors-servo <lbergstrom+bors@mozilla.com> 2016-05-24 15:59:31 -0700
commit: a04e30d2471a92e4214d81302f46bcaed9503b3c (patch)
tree: 9f6ae973b1f01d1d4015472d60b39a06593e2261 /components/script/dom/characterdata.rs
parent: 2a2b88f42c55614f949f218b0444778e8e2bfc78 (diff)
parent: c11a3b958de8943ef85b7a10d63ce88a6a90c645 (diff)
download: servo-a04e30d2471a92e4214d81302f46bcaed9503b3c.tar.gz
servo-a04e30d2471a92e4214d81302f46bcaed9503b3c.zip
1 files changed, 92 insertions, 24 deletions
diff --git a/components/script/dom/characterdata.rs b/components/script/dom/characterdata.rs
index 74fd0db94c9..01f11e3ac50 100644
--- a/components/script/dom/characterdata.rs
+++ b/components/script/dom/characterdata.rs
@@ -20,6 +20,7 @@ use dom::node::{Node, NodeDamage};
 use dom::processinginstruction::ProcessingInstruction;
 use dom::text::Text;
 use std::cell::Ref;
+use util::opts;
 
 // https://dom.spec.whatwg.org/#characterdata
 #[dom_struct]
@@ -94,16 +95,34 @@ impl CharacterDataMethods for CharacterData {
     fn SubstringData(&self, offset: u32, count: u32) -> Fallible<DOMString> {
         let data = self.data.borrow();
         // Step 1.
-        let data_from_offset = match find_utf16_code_unit_offset(&data, offset) {
-            Some(offset_bytes) => &data[offset_bytes..],
+        let mut substring = String::new();
+        let remaining;
+        match split_at_utf16_code_unit_offset(&data, offset) {
+            Ok((_, astral, s)) => {
+                // As if we had split the UTF-16 surrogate pair in half
+                // and then transcoded that to UTF-8 lossily,
+                // since our DOMString is currently strict UTF-8.
+                if astral.is_some() {
+                    substring = substring + "\u{FFFD}";
+                }
+                remaining = s;
+            }
             // Step 2.
-            None => return Err(Error::IndexSize),
-        };
-        let substring = match find_utf16_code_unit_offset(data_from_offset, count) {
+            Err(()) => return Err(Error::IndexSize),
+        }
+        match split_at_utf16_code_unit_offset(remaining, count) {
             // Steps 3.
-            None => data_from_offset,
+            Err(()) => substring = substring + remaining,
             // Steps 4.
-            Some(count_bytes) => &data_from_offset[..count_bytes],
+            Ok((s, astral, _)) => {
+                substring = substring + s;
+                // As if we had split the UTF-16 surrogate pair in half
+                // and then transcoded that to UTF-8 lossily,
+                // since our DOMString is currently strict UTF-8.
+                if astral.is_some() {
+                    substring = substring + "\u{FFFD}";
+                }
+            }
         };
         Ok(DOMString::from(substring))
     }
@@ -126,26 +145,54 @@ impl CharacterDataMethods for CharacterData {
 
     // https://dom.spec.whatwg.org/#dom-characterdata-replacedata
     fn ReplaceData(&self, offset: u32, count: u32, arg: DOMString) -> ErrorResult {
-        let new_data = {
+        let mut new_data;
+        {
             let data = self.data.borrow();
-            let (prefix, data_from_offset) = match find_utf16_code_unit_offset(&data, offset) {
-                Some(offset_bytes) => data.split_at(offset_bytes),
+            let prefix;
+            let replacement_before;
+            let remaining;
+            match split_at_utf16_code_unit_offset(&data, offset) {
+                Ok((p, astral, r)) => {
+                    prefix = p;
+                    // As if we had split the UTF-16 surrogate pair in half
+                    // and then transcoded that to UTF-8 lossily,
+                    // since our DOMString is currently strict UTF-8.
+                    replacement_before = if astral.is_some() { "\u{FFFD}" } else { "" };
+                    remaining = r;
+                }
                 // Step 2.
-                None => return Err(Error::IndexSize),
+                Err(()) => return Err(Error::IndexSize),
             };
-            let suffix = match find_utf16_code_unit_offset(data_from_offset, count) {
+            let replacement_after;
+            let suffix;
+            match split_at_utf16_code_unit_offset(remaining, count) {
                 // Steps 3.
-                None => "",
-                Some(count_bytes) => &data_from_offset[count_bytes..],
+                Err(()) => {
+                    replacement_after = "";
+                    suffix = "";
+                }
+                Ok((_, astral, s)) => {
+                    // As if we had split the UTF-16 surrogate pair in half
+                    // and then transcoded that to UTF-8 lossily,
+                    // since our DOMString is currently strict UTF-8.
+                    replacement_after = if astral.is_some() { "\u{FFFD}" } else { "" };
+                    suffix = s;
+                }
             };
             // Step 4: Mutation observers.
             // Step 5 to 7.
-            let mut new_data = String::with_capacity(prefix.len() + arg.len() + suffix.len());
+            new_data = String::with_capacity(
+                prefix.len() +
+                replacement_before.len() +
+                arg.len() +
+                replacement_after.len() +
+                suffix.len());
             new_data.push_str(prefix);
+            new_data.push_str(replacement_before);
             new_data.push_str(&arg);
+            new_data.push_str(replacement_after);
             new_data.push_str(suffix);
-            new_data
-        };
+        }
         *self.data.borrow_mut() = DOMString::from(new_data);
         self.content_changed();
         // Steps 8-11.
@@ -200,19 +247,40 @@ impl LayoutCharacterDataHelpers for LayoutJS<CharacterData> {
     }
 }
 
-/// Given a number of UTF-16 code units from the start of the given string,
-/// return the corresponding number of UTF-8 bytes.
+/// Split the given string at the given position measured in UTF-16 code units from the start.
+///
+/// * `Err(())` indicates that `offset` if after the end of the string
+/// * `Ok((before, None, after))` indicates that `offset` is between Unicode code points.
+///   The two string slices are such that:
+///   `before == s.to_utf16()[..offset].to_utf8()` and
+///   `after == s.to_utf16()[offset..].to_utf8()`
+/// * `Ok((before, Some(ch), after))` indicates that `offset` is "in the middle"
+///   of a single Unicode code point that would be represented in UTF-16 by a surrogate pair
+///   of two 16-bit code units.
+///   `ch` is that code point.
+///   The two string slices are such that:
+///   `before == s.to_utf16()[..offset - 1].to_utf8()` and
+///   `after == s.to_utf16()[offset + 1..].to_utf8()`
+///
+/// # Panics
 ///
-/// s[find_utf16_code_unit_offset(s, o).unwrap()..] == s.to_utf16()[o..].to_utf8()
-fn find_utf16_code_unit_offset(s: &str, offset: u32) -> Option<usize> {
+/// Note that the third variant is only ever returned when the `-Z replace-surrogates`
+/// command-line option is specified.
+/// When it *would* be returned but the option is *not* specified, this function panics.
+fn split_at_utf16_code_unit_offset(s: &str, offset: u32) -> Result<(&str, Option<char>, &str), ()> {
     let mut code_units = 0;
     for (i, c) in s.char_indices() {
         if code_units == offset {
-            return Some(i);
+            let (a, b) = s.split_at(i);
+            return Ok((a, None, b));
         }
         code_units += 1;
         if c > '\u{FFFF}' {
             if code_units == offset {
+                if opts::get().replace_surrogates {
+                    debug_assert!(c.len_utf8() == 4);
+                    return Ok((&s[..i], Some(c), &s[i + c.len_utf8()..]))
+                }
                 panic!("\n\n\
                     Would split a surrogate pair in CharacterData API.\n\
                     If you see this in real content, please comment with the URL\n\
@@ -223,8 +291,8 @@ fn find_utf16_code_unit_offset(s: &str, offset: u32) -> Option<usize> {
         }
     }
     if code_units == offset {
-        Some(s.len())
+        Ok((s, None, ""))
     } else {
-        None
+        Err(())
     }
 }
author	bors-servo <lbergstrom+bors@mozilla.com>	2016-05-24 15:59:31 -0700
committer	bors-servo <lbergstrom+bors@mozilla.com>	2016-05-24 15:59:31 -0700
commit	a04e30d2471a92e4214d81302f46bcaed9503b3c (patch)
tree	9f6ae973b1f01d1d4015472d60b39a06593e2261 /components/script/dom/characterdata.rs
parent	2a2b88f42c55614f949f218b0444778e8e2bfc78 (diff)
parent	c11a3b958de8943ef85b7a10d63ce88a6a90c645 (diff)
download	servo-a04e30d2471a92e4214d81302f46bcaed9503b3c.tar.gz servo-a04e30d2471a92e4214d81302f46bcaed9503b3c.zip