bindings: Move string-related bindings code to script_bindings. (#35172)

Signed-off-by: Josh Matthews <josh@joshmatthews.net>
author: Josh Matthews <josh@joshmatthews.net> 2025-01-29 02:40:25 -0500
committer: GitHub <noreply@github.com> 2025-01-29 07:40:25 +0000
commit: a6218b42eafa9a1e6f845d7f709a108005ed58a8 (patch)
tree: fa3c1bf16caad7f45cdf2f700b7007965952d10b /components/script_bindings/str.rs
parent: 1188d2b2e7c18434f06df5505bed7cfd859f47e2 (diff)
download: servo-a6218b42eafa9a1e6f845d7f709a108005ed58a8.tar.gz
servo-a6218b42eafa9a1e6f845d7f709a108005ed58a8.zip
1 files changed, 422 insertions, 0 deletions
diff --git a/components/script_bindings/str.rs b/components/script_bindings/str.rs
new file mode 100644
index 00000000000..d7968e74523
--- /dev/null
+++ b/components/script_bindings/str.rs
@@ -0,0 +1,422 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
+
+//! Rust representations of the WebIDL string types: `ByteString`, `USVString` and `DOMString`.
+use std::borrow::{Borrow, Cow, ToOwned};
+use std::default::Default;
+use std::hash::{Hash, Hasher};
+use std::marker::PhantomData;
+use std::ops::{Deref, DerefMut};
+use std::str::FromStr;
+use std::sync::LazyLock;
+use std::{fmt, ops, str};
+
+use cssparser::CowRcStr;
+use html5ever::{LocalName, Namespace};
+use num_traits::Zero;
+use regex::Regex;
+use servo_atoms::Atom;
+
+/// Owned byte buffer backing the IDL `ByteString` type; the bytes need not be valid UTF-8.
+#[derive(Clone, Debug, Default, Eq, JSTraceable, MallocSizeOf, PartialEq)]
+pub struct ByteString(Vec<u8>);
+
+impl ByteString {
+    /// Wraps an existing byte vector in a `ByteString` without copying it.
+    pub fn new(value: Vec<u8>) -> ByteString {
+        Self(value)
+    }
+
+    /// Borrows the bytes as a `&str` when they form valid UTF-8; yields
+    /// `None` for any other byte sequence.
+    pub fn as_str(&self) -> Option<&str> {
+        match str::from_utf8(self.0.as_slice()) {
+            Ok(text) => Some(text),
+            Err(_) => None,
+        }
+    }
+
+    /// Number of bytes stored.
+    pub fn len(&self) -> usize {
+        let Self(bytes) = self;
+        bytes.len()
+    }
+
+    /// Whether the `ByteString` holds no bytes at all.
+    pub fn is_empty(&self) -> bool {
+        let Self(bytes) = self;
+        bytes.is_empty()
+    }
+
+    /// Produces a copy in which ASCII `A`–`Z` are mapped to `a`–`z`; every
+    /// other byte is carried over unchanged.
+    pub fn to_lower(&self) -> ByteString {
+        let lowered = self.0.to_ascii_lowercase();
+        Self::new(lowered)
+    }
+}
+
+impl From<ByteString> for Vec<u8> {
+    /// Unwraps the `ByteString`, handing back its underlying byte vector.
+    fn from(ByteString(bytes): ByteString) -> Self {
+        bytes
+    }
+}
+
+impl Hash for ByteString {
+    /// Feeds the wrapped byte vector into `state`, exactly as hashing the `Vec<u8>` itself would.
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let Self(bytes) = self;
+        Hash::hash(bytes, state);
+    }
+}
+
+impl FromStr for ByteString {
+    /// The conversion never fails, so the error type carries no information.
+    type Err = ();
+
+    /// Copies the UTF-8 bytes of `s` into a new `ByteString`; always returns `Ok`.
+    fn from_str(s: &str) -> Result<ByteString, ()> {
+        let bytes = s.as_bytes().to_vec();
+        Ok(Self(bytes))
+    }
+}
+
+impl ops::Deref for ByteString {
+    type Target = [u8];
+
+    /// Exposes the stored bytes as a slice, so slice methods can be called on a `ByteString`.
+    fn deref(&self) -> &Self::Target {
+        self.0.as_slice()
+    }
+}
+
+/// A string constructed from a UCS-2 buffer, with every invalid code point
+/// replaced by the replacement character; wraps a publicly accessible `String`.
+#[derive(Clone, Default, Eq, Hash, MallocSizeOf, Ord, PartialEq, PartialOrd)]
+pub struct USVString(pub String);
+
+impl Borrow<str> for USVString {
+    /// Lends the wrapped `String` as a `str`.
+    #[inline]
+    fn borrow(&self) -> &str {
+        self.0.as_str()
+    }
+}
+
+impl Deref for USVString {
+    type Target = str;
+
+    /// Dereferences to the wrapped string slice, making `str` methods available directly.
+    #[inline]
+    fn deref(&self) -> &Self::Target {
+        let Self(inner) = self;
+        inner
+    }
+}
+
+impl DerefMut for USVString {
+    /// Gives mutable access to the wrapped string as a `str`.
+    #[inline]
+    fn deref_mut(&mut self) -> &mut str {
+        self.0.as_mut_str()
+    }
+}
+
+impl AsRef<str> for USVString {
+    /// Views the wrapped `String` as a `&str` without copying.
+    fn as_ref(&self) -> &str {
+        let Self(inner) = self;
+        inner
+    }
+}
+
+impl fmt::Display for USVString {
+    /// Formats exactly as the wrapped `str` does, passing the formatter through untouched.
+    #[inline]
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let text: &str = self;
+        fmt::Display::fmt(text, f)
+    }
+}
+
+impl PartialEq<str> for USVString {
+    /// Compares the wrapped string with `other` by content.
+    fn eq(&self, other: &str) -> bool {
+        self.0.as_str() == other
+    }
+}
+
+impl<'a> PartialEq<&'a str> for USVString {
+    /// Compares the wrapped string by content against a borrowed string slice.
+    fn eq(&self, other: &&'a str) -> bool {
+        let rhs: &str = other;
+        self.0.as_str() == rhs
+    }
+}
+
+impl From<String> for USVString {
+    /// Wraps `contents` as-is; the string is moved in and not altered.
+    fn from(contents: String) -> Self {
+        Self(contents)
+    }
+}
+
+/// Reports whether `s` is a `token`, as defined by
+/// [RFC 2616](http://tools.ietf.org/html/rfc2616#page-17).
+///
+/// A token is a non-empty run of 7-bit characters, none of which is a control
+/// character (0–31 or 127) or one of the separator characters listed below.
+pub fn is_token(s: &[u8]) -> bool {
+    // http://tools.ietf.org/html/rfc2616#section-2.2
+    // Horizontal tab is a separator too, but it is already excluded as a control character.
+    const SEPARATORS: &[u8] = b"()<>@,;:\\\"/[]?={} ";
+
+    // `all` is vacuously true on an empty slice, yet a token needs at least one character.
+    !s.is_empty() &&
+        s.iter().all(|byte| {
+            let is_ctl = *byte <= 31 || *byte == 127;
+            let is_char = *byte <= 127;
+            is_char && !is_ctl && !SEPARATORS.contains(byte)
+        })
+}
+
+/// A DOMString: the Rust-side representation of the WebIDL string type of the same name.
+///
+/// This type corresponds to the [`DOMString`] type in WebIDL.
+///
+/// [`DOMString`]: https://webidl.spec.whatwg.org/#idl-DOMString
+///
+/// Conceptually, a DOMString has the same value space as a JavaScript String,
+/// i.e., an array of 16-bit *code units* representing UTF-16, potentially with
+/// unpaired surrogates present (also sometimes called WTF-16).
+///
+/// Currently, this type stores a Rust `String`, in order to avoid issues when
+/// integrating with the rest of the Rust ecosystem and even the rest of the
+/// browser itself.
+///
+/// However, Rust `String`s are guaranteed to be valid UTF-8, and as such have
+/// a *smaller value space* than WTF-16 (i.e., some JavaScript String values
+/// cannot be represented as a Rust `String`). This raises the question of
+/// what to do with values being passed from JavaScript to Rust that contain
+/// unpaired surrogates.
+///
+/// The hypothesis is that it does not matter much how exactly those values are
+/// transformed, because passing unpaired surrogates into the DOM is very rare.
+/// In order to test this hypothesis, Servo will panic when encountering any
+/// unpaired surrogates on conversion to `DOMString` by default. (The command
+/// line option `-Z replace-surrogates` instead causes Servo to replace each
+/// unpaired surrogate with a U+FFFD replacement character.)
+///
+/// Currently, the lack of crash reports about this issue provides some
+/// evidence to support the hypothesis. This evidence will hopefully be used to
+/// convince other browser vendors that it would be safe to replace unpaired
+/// surrogates at the boundary between JavaScript and native code. (This would
+/// unify the `DOMString` and `USVString` types, both in the WebIDL standard
+/// and in Servo.)
+///
+/// This type is currently `!Send` (because of its `PhantomData<*const ()>` field), in order
+/// to help with an independent experiment to store `JSString`s rather than Rust `String`s.
+#[derive(Clone, Debug, Eq, Hash, MallocSizeOf, Ord, PartialEq, PartialOrd)]
+pub struct DOMString(String, PhantomData<*const ()>);
+
+impl DOMString {
+    /// Creates an empty `DOMString`.
+    pub fn new() -> DOMString {
+        Self::from_string(String::new())
+    }
+
+    /// Takes ownership of `s` and wraps it as a `DOMString`.
+    pub fn from_string(s: String) -> DOMString {
+        Self(s, PhantomData)
+    }
+
+    /// Borrows the contents of this [`DOMString`] as a `&str`.
+    pub fn str(&self) -> &str {
+        self.0.as_str()
+    }
+
+    /// Appends `string` to the end of this `DOMString`.
+    pub fn push_str(&mut self, string: &str) {
+        self.0 += string;
+    }
+
+    /// Empties this `DOMString`, discarding everything it contained.
+    pub fn clear(&mut self) {
+        let Self(text, _) = self;
+        text.clear();
+    }
+
+    /// Shortens the string to `new_len`, delegating to `String::truncate`.
+    pub fn truncate(&mut self, new_len: usize) {
+        let Self(text, _) = self;
+        text.truncate(new_len);
+    }
+
+    /// Deletes every CR and LF character, following
+    /// <https://infra.spec.whatwg.org/#strip-newlines>.
+    pub fn strip_newlines(&mut self) {
+        self.0.retain(|c| !matches!(c, '\r' | '\n'));
+    }
+
+    /// Trims ASCII whitespace from both ends in place, following
+    /// <https://infra.spec.whatwg.org/#strip-leading-and-trailing-ascii-whitespace>.
+    pub fn strip_leading_and_trailing_ascii_whitespace(&mut self) {
+        let is_space = |c: char| c.is_ascii_whitespace();
+
+        // Cut the trailing run first: truncation leaves the front of the string untouched.
+        let kept_len = self.0.trim_end_matches(is_space).len();
+        self.0.truncate(kept_len);
+
+        // What remains is either empty or ends in a non-whitespace character, so the
+        // leading run is always strictly shorter than a non-empty remainder.
+        let leading_len = self.0.len() - self.0.trim_start_matches(is_space).len();
+        self.0.replace_range(..leading_len, "");
+    }
+
+    /// Checks the string against
+    /// <https://html.spec.whatwg.org/multipage/#valid-floating-point-number>.
+    ///
+    /// The string must match the floating-point syntax *and* be accepted by
+    /// [`DOMString::parse_floating_point_number`].
+    pub fn is_valid_floating_point_number_string(&self) -> bool {
+        // Compiled on first use, then shared by every later call.
+        static RE: LazyLock<Regex> = LazyLock::new(|| {
+            Regex::new(r"^-?(?:\d+\.\d+|\d+|\.\d+)(?:(e|E)(\+|\-)?\d+)?$").unwrap()
+        });
+
+        if !RE.is_match(&self.0) {
+            return false;
+        }
+        // A syntactically matching string can still be refused by the parser, for
+        // example when its value is not finite.
+        self.parse_floating_point_number().is_some()
+    }
+
+    /// Parses the string as an `f64`, following
+    /// <https://html.spec.whatwg.org/multipage/#rules-for-parsing-floating-point-number-values>.
+    ///
+    /// Returns `None` when Rust cannot parse the trimmed text, when the value is
+    /// infinite or NaN, or when the text starts with `+` or ends with `.`.
+    pub fn parse_floating_point_number(&self) -> Option<f64> {
+        // Steps 15-16 of the specification describe IEEE rounding modes for
+        // floating-point significands. This code assumes the Rust compiler
+        // already behaves that way wherever it matters; those steps have
+        // nothing to do with f64::round(), which rounds to integers.
+        let input = self.0.as_str();
+
+        // Rust's own parser accepts a leading `+` and a trailing `.`; both are refused here.
+        // NOTE(review): whitespace is trimmed before parsing, but these two checks look at
+        // the untrimmed text, so " +1" is accepted while "+1" is not. Confirm this is intended.
+        if input.starts_with('+') || input.ends_with('.') {
+            return None;
+        }
+
+        // Infinity and NaN parse successfully in Rust but are not valid results here.
+        input.trim().parse::<f64>().ok().filter(|val| val.is_finite())
+    }
+
+    /// Rewrites the string as the best representation of the number it parses to,
+    /// using `parse_floating_point_number` plus ECMA's rule for zero.
+    ///
+    /// Used for specific elements when handling floating point values, namely the `number` and
+    /// `range` inputs, as well as `meter` and `progress` elements. The string is left
+    /// unchanged when it does not parse.
+    ///
+    /// <https://html.spec.whatwg.org/multipage/#best-representation-of-the-number-as-a-floating-point-number>
+    /// <https://tc39.es/ecma262/#sec-numeric-types-number-tostring>
+    pub fn set_best_representation_of_the_floating_point_number(&mut self) {
+        let Some(val) = self.parse_floating_point_number() else {
+            return;
+        };
+
+        // [tc39] Step 2: If x is either +0 or -0, return "0".
+        // Formatting a literal positive zero drops the sign a parsed -0 would carry.
+        self.0 = if val.is_zero() {
+            0.0_f64.to_string()
+        } else {
+            val.to_string()
+        };
+    }
+}
+
+impl Borrow<str> for DOMString {
+    /// Lends the stored `String` as a `str`.
+    #[inline]
+    fn borrow(&self) -> &str {
+        self.0.as_str()
+    }
+}
+
+impl Default for DOMString {
+    /// The default `DOMString` is the empty string.
+    fn default() -> Self {
+        Self(String::default(), PhantomData)
+    }
+}
+
+impl Deref for DOMString {
+    type Target = str;
+
+    /// Dereferences to the stored string slice, making `str` methods available directly.
+    #[inline]
+    fn deref(&self) -> &Self::Target {
+        let Self(inner, _) = self;
+        inner
+    }
+}
+
+impl DerefMut for DOMString {
+    /// Gives mutable access to the stored string as a `str`.
+    #[inline]
+    fn deref_mut(&mut self) -> &mut str {
+        self.0.as_mut_str()
+    }
+}
+
+impl AsRef<str> for DOMString {
+    /// Views the stored `String` as a `&str` without copying.
+    fn as_ref(&self) -> &str {
+        let Self(inner, _) = self;
+        inner
+    }
+}
+
+impl fmt::Display for DOMString {
+    /// Formats exactly as the stored `str` does, passing the formatter through untouched.
+    #[inline]
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let text: &str = self;
+        fmt::Display::fmt(text, f)
+    }
+}
+
+impl PartialEq<str> for DOMString {
+    /// Compares the stored string with `other` by content.
+    fn eq(&self, other: &str) -> bool {
+        self.0.as_str() == other
+    }
+}
+
+impl<'a> PartialEq<&'a str> for DOMString {
+    /// Compares the stored string by content against a borrowed string slice.
+    fn eq(&self, other: &&'a str) -> bool {
+        let rhs: &str = other;
+        self.0.as_str() == rhs
+    }
+}
+
+impl From<String> for DOMString {
+    /// Wraps an owned `String`; the string is moved in and not altered.
+    fn from(contents: String) -> Self {
+        Self(contents, PhantomData)
+    }
+}
+
+impl From<&str> for DOMString {
+    /// Copies `contents` into a newly allocated `DOMString`.
+    fn from(contents: &str) -> Self {
+        let owned: String = contents.to_owned();
+        owned.into()
+    }
+}
+
+impl<'a> From<Cow<'a, str>> for DOMString {
+    /// Converts a `Cow`: an owned string is moved in, a borrowed slice is copied.
+    fn from(contents: Cow<'a, str>) -> Self {
+        // `into_owned` hands back the existing `String` or allocates one from the borrow.
+        let owned: String = contents.into_owned();
+        DOMString::from(owned)
+    }
+}
+
+impl From<DOMString> for LocalName {
+    /// Consumes the `DOMString` and builds a `LocalName` from its contents.
+    fn from(contents: DOMString) -> Self {
+        let DOMString(text, _) = contents;
+        Self::from(text)
+    }
+}
+
+impl From<DOMString> for Namespace {
+    /// Consumes the `DOMString` and builds a `Namespace` from its contents.
+    fn from(contents: DOMString) -> Self {
+        let DOMString(text, _) = contents;
+        Self::from(text)
+    }
+}
+
+impl From<DOMString> for Atom {
+    /// Consumes the `DOMString` and builds an `Atom` from its contents.
+    fn from(contents: DOMString) -> Self {
+        let DOMString(text, _) = contents;
+        Self::from(text)
+    }
+}
+
+impl From<DOMString> for String {
+    /// Unwraps the `DOMString`, returning the Rust `String` it holds.
+    fn from(DOMString(text, _): DOMString) -> Self {
+        text
+    }
+}
+
+impl From<DOMString> for Vec<u8> {
+    /// Consumes the `DOMString` and returns its UTF-8 bytes.
+    fn from(contents: DOMString) -> Self {
+        contents.0.into_bytes()
+    }
+}
+
+impl<'a> From<DOMString> for Cow<'a, str> {
+    /// Consumes the `DOMString` and yields an owned `Cow` holding its string.
+    fn from(contents: DOMString) -> Self {
+        Cow::Owned(contents.0)
+    }
+}
+
+impl<'a> From<DOMString> for CowRcStr<'a> {
+    /// Consumes the `DOMString` and builds a `CowRcStr` from its owned string.
+    fn from(contents: DOMString) -> Self {
+        let DOMString(text, _) = contents;
+        Self::from(text)
+    }
+}
+
+impl Extend<char> for DOMString {
+    /// Appends every `char` yielded by `iterable` to the end of the string.
+    fn extend<I>(&mut self, iterable: I)
+    where
+        I: IntoIterator<Item = char>,
+    {
+        let Self(text, _) = self;
+        text.extend(iterable);
+    }
+}
author	Josh Matthews <josh@joshmatthews.net>	2025-01-29 02:40:25 -0500
committer	GitHub <noreply@github.com>	2025-01-29 07:40:25 +0000
commit	a6218b42eafa9a1e6f845d7f709a108005ed58a8 (patch)
tree	fa3c1bf16caad7f45cdf2f700b7007965952d10b /components/script_bindings/str.rs
parent	1188d2b2e7c18434f06df5505bed7cfd859f47e2 (diff)
download	servo-a6218b42eafa9a1e6f845d7f709a108005ed58a8.tar.gz servo-a6218b42eafa9a1e6f845d7f709a108005ed58a8.zip