diff options
author | Josh Matthews <josh@joshmatthews.net> | 2025-01-29 02:40:25 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-01-29 07:40:25 +0000 |
commit | a6218b42eafa9a1e6f845d7f709a108005ed58a8 (patch) | |
tree | fa3c1bf16caad7f45cdf2f700b7007965952d10b /components/script_bindings/str.rs | |
parent | 1188d2b2e7c18434f06df5505bed7cfd859f47e2 (diff) | |
download | servo-a6218b42eafa9a1e6f845d7f709a108005ed58a8.tar.gz servo-a6218b42eafa9a1e6f845d7f709a108005ed58a8.zip |
bindings: Move string-related bindings code to script_bindings. (#35172)
Signed-off-by: Josh Matthews <josh@joshmatthews.net>
Diffstat (limited to 'components/script_bindings/str.rs')
-rw-r--r-- | components/script_bindings/str.rs | 422 |
1 files changed, 422 insertions, 0 deletions
diff --git a/components/script_bindings/str.rs b/components/script_bindings/str.rs new file mode 100644 index 00000000000..d7968e74523 --- /dev/null +++ b/components/script_bindings/str.rs @@ -0,0 +1,422 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ + +//! The `ByteString` struct. +use std::borrow::{Borrow, Cow, ToOwned}; +use std::default::Default; +use std::hash::{Hash, Hasher}; +use std::marker::PhantomData; +use std::ops::{Deref, DerefMut}; +use std::str::FromStr; +use std::sync::LazyLock; +use std::{fmt, ops, str}; + +use cssparser::CowRcStr; +use html5ever::{LocalName, Namespace}; +use num_traits::Zero; +use regex::Regex; +use servo_atoms::Atom; + +/// Encapsulates the IDL `ByteString` type. +#[derive(Clone, Debug, Default, Eq, JSTraceable, MallocSizeOf, PartialEq)] +pub struct ByteString(Vec<u8>); + +impl ByteString { + /// Creates a new `ByteString`. + pub fn new(value: Vec<u8>) -> ByteString { + ByteString(value) + } + + /// Returns `self` as a string, if it encodes valid UTF-8, and `None` + /// otherwise. + pub fn as_str(&self) -> Option<&str> { + str::from_utf8(&self.0).ok() + } + + /// Returns the length. + pub fn len(&self) -> usize { + self.0.len() + } + + /// Checks if the ByteString is empty. + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Returns `self` with A–Z replaced by a–z. + pub fn to_lower(&self) -> ByteString { + ByteString::new(self.0.to_ascii_lowercase()) + } +} + +impl From<ByteString> for Vec<u8> { + fn from(byte_string: ByteString) -> Vec<u8> { + byte_string.0 + } +} + +impl Hash for ByteString { + fn hash<H: Hasher>(&self, state: &mut H) { + self.0.hash(state); + } +} + +impl FromStr for ByteString { + type Err = (); + fn from_str(s: &str) -> Result<ByteString, ()> { + Ok(ByteString::new(s.to_owned().into_bytes())) + } +} + +impl ops::Deref for ByteString { + type Target = [u8]; + fn deref(&self) -> &[u8] { + &self.0 + } +} + +/// A string that is constructed from a UCS-2 buffer by replacing invalid code +/// points with the replacement character. +#[derive(Clone, Default, Eq, Hash, MallocSizeOf, Ord, PartialEq, PartialOrd)] +pub struct USVString(pub String); + +impl Borrow<str> for USVString { + #[inline] + fn borrow(&self) -> &str { + &self.0 + } +} + +impl Deref for USVString { + type Target = str; + + #[inline] + fn deref(&self) -> &str { + &self.0 + } +} + +impl DerefMut for USVString { + #[inline] + fn deref_mut(&mut self) -> &mut str { + &mut self.0 + } +} + +impl AsRef<str> for USVString { + fn as_ref(&self) -> &str { + &self.0 + } +} + +impl fmt::Display for USVString { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(&**self, f) + } +} + +impl PartialEq<str> for USVString { + fn eq(&self, other: &str) -> bool { + &**self == other + } +} + +impl<'a> PartialEq<&'a str> for USVString { + fn eq(&self, other: &&'a str) -> bool { + &**self == *other + } +} + +impl From<String> for USVString { + fn from(contents: String) -> USVString { + USVString(contents) + } +} + +/// Returns whether `s` is a `token`, as defined by +/// [RFC 2616](http://tools.ietf.org/html/rfc2616#page-17). +pub fn is_token(s: &[u8]) -> bool { + if s.is_empty() { + return false; // A token must be at least a single character + } + s.iter().all(|&x| { + // http://tools.ietf.org/html/rfc2616#section-2.2 + match x { + 0..=31 | 127 => false, // CTLs + 40 | 41 | 60 | 62 | 64 | 44 | 59 | 58 | 92 | 34 | 47 | 91 | 93 | 63 | 61 | 123 | + 125 | 32 => false, // separators + x if x > 127 => false, // non-CHARs + _ => true, + } + }) +} + +/// A DOMString. +/// +/// This type corresponds to the [`DOMString`] type in WebIDL. +/// +/// [`DOMString`]: https://webidl.spec.whatwg.org/#idl-DOMString +/// +/// Conceptually, a DOMString has the same value space as a JavaScript String, +/// i.e., an array of 16-bit *code units* representing UTF-16, potentially with +/// unpaired surrogates present (also sometimes called WTF-16). +/// +/// Currently, this type stores a Rust `String`, in order to avoid issues when +/// integrating with the rest of the Rust ecosystem and even the rest of the +/// browser itself. +/// +/// However, Rust `String`s are guaranteed to be valid UTF-8, and as such have +/// a *smaller value space* than WTF-16 (i.e., some JavaScript String values +/// can not be represented as a Rust `String`). This introduces the question of +/// what to do with values being passed from JavaScript to Rust that contain +/// unpaired surrogates. +/// +/// The hypothesis is that it does not matter much how exactly those values are +/// transformed, because passing unpaired surrogates into the DOM is very rare. +/// In order to test this hypothesis, Servo will panic when encountering any +/// unpaired surrogates on conversion to `DOMString` by default. (The command +/// line option `-Z replace-surrogates` instead causes Servo to replace the +/// unpaired surrogate by a U+FFFD replacement character.) +/// +/// Currently, the lack of crash reports about this issue provides some +/// evidence to support the hypothesis. This evidence will hopefully be used to +/// convince other browser vendors that it would be safe to replace unpaired +/// surrogates at the boundary between JavaScript and native code. (This would +/// unify the `DOMString` and `USVString` types, both in the WebIDL standard +/// and in Servo.) +/// +/// This type is currently `!Send`, in order to help with an independent +/// experiment to store `JSString`s rather than Rust `String`s. +#[derive(Clone, Debug, Eq, Hash, MallocSizeOf, Ord, PartialEq, PartialOrd)] +pub struct DOMString(String, PhantomData<*const ()>); + +impl DOMString { + /// Creates a new `DOMString`. + pub fn new() -> DOMString { + DOMString(String::new(), PhantomData) + } + + /// Creates a new `DOMString` from a `String`. + pub fn from_string(s: String) -> DOMString { + DOMString(s, PhantomData) + } + + /// Get the internal `&str` value of this [`DOMString`]. + pub fn str(&self) -> &str { + &self.0 + } + + /// Appends a given string slice onto the end of this String. + pub fn push_str(&mut self, string: &str) { + self.0.push_str(string) + } + + /// Clears this `DOMString`, removing all contents. + pub fn clear(&mut self) { + self.0.clear() + } + + /// Shortens this String to the specified length. + pub fn truncate(&mut self, new_len: usize) { + self.0.truncate(new_len); + } + + /// Removes newline characters according to <https://infra.spec.whatwg.org/#strip-newlines>. + pub fn strip_newlines(&mut self) { + self.0.retain(|c| c != '\r' && c != '\n'); + } + + /// Removes leading and trailing ASCII whitespaces according to + /// <https://infra.spec.whatwg.org/#strip-leading-and-trailing-ascii-whitespace>. + pub fn strip_leading_and_trailing_ascii_whitespace(&mut self) { + if self.0.is_empty() { + return; + } + + let trailing_whitespace_len = self + .0 + .trim_end_matches(|ref c| char::is_ascii_whitespace(c)) + .len(); + self.0.truncate(trailing_whitespace_len); + if self.0.is_empty() { + return; + } + + let first_non_whitespace = self.0.find(|ref c| !char::is_ascii_whitespace(c)).unwrap(); + self.0.replace_range(0..first_non_whitespace, ""); + } + + /// <https://html.spec.whatwg.org/multipage/#valid-floating-point-number> + pub fn is_valid_floating_point_number_string(&self) -> bool { + static RE: LazyLock<Regex> = LazyLock::new(|| { + Regex::new(r"^-?(?:\d+\.\d+|\d+|\.\d+)(?:(e|E)(\+|\-)?\d+)?$").unwrap() + }); + + RE.is_match(&self.0) && self.parse_floating_point_number().is_some() + } + + /// <https://html.spec.whatwg.org/multipage/#rules-for-parsing-floating-point-number-values> + pub fn parse_floating_point_number(&self) -> Option<f64> { + // Steps 15-16 are telling us things about IEEE rounding modes + // for floating-point significands; this code assumes the Rust + // compiler already matches them in any cases where + // that actually matters. They are not + // related to f64::round(), which is for rounding to integers. + let input = &self.0; + if let Ok(val) = input.trim().parse::<f64>() { + if !( + // A valid number is the same as what rust considers to be valid, + // except for +1., NaN, and Infinity. + val.is_infinite() || val.is_nan() || input.ends_with('.') || input.starts_with('+') + ) { + return Some(val); + } + } + None + } + + /// Applies the same processing as `parse_floating_point_number` with some additional handling + /// according to ECMA's string conversion steps. + /// + /// Used for specific elements when handling floating point values, namely the `number` and + /// `range` inputs, as well as `meter` and `progress` elements. + /// + /// <https://html.spec.whatwg.org/multipage/#best-representation-of-the-number-as-a-floating-point-number> + /// <https://tc39.es/ecma262/#sec-numeric-types-number-tostring> + pub fn set_best_representation_of_the_floating_point_number(&mut self) { + if let Some(val) = self.parse_floating_point_number() { + // [tc39] Step 2: If x is either +0 or -0, return "0". + let parsed_value = if val.is_zero() { 0.0_f64 } else { val }; + + self.0 = parsed_value.to_string() + } + } +} + +impl Borrow<str> for DOMString { + #[inline] + fn borrow(&self) -> &str { + &self.0 + } +} + +impl Default for DOMString { + fn default() -> Self { + DOMString(String::new(), PhantomData) + } +} + +impl Deref for DOMString { + type Target = str; + + #[inline] + fn deref(&self) -> &str { + &self.0 + } +} + +impl DerefMut for DOMString { + #[inline] + fn deref_mut(&mut self) -> &mut str { + &mut self.0 + } +} + +impl AsRef<str> for DOMString { + fn as_ref(&self) -> &str { + &self.0 + } +} + +impl fmt::Display for DOMString { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(&**self, f) + } +} + +impl PartialEq<str> for DOMString { + fn eq(&self, other: &str) -> bool { + &**self == other + } +} + +impl<'a> PartialEq<&'a str> for DOMString { + fn eq(&self, other: &&'a str) -> bool { + &**self == *other + } +} + +impl From<String> for DOMString { + fn from(contents: String) -> DOMString { + DOMString(contents, PhantomData) + } +} + +impl From<&str> for DOMString { + fn from(contents: &str) -> DOMString { + DOMString::from(String::from(contents)) + } +} + +impl<'a> From<Cow<'a, str>> for DOMString { + fn from(contents: Cow<'a, str>) -> DOMString { + match contents { + Cow::Owned(s) => DOMString::from(s), + Cow::Borrowed(s) => DOMString::from(s), + } + } +} + +impl From<DOMString> for LocalName { + fn from(contents: DOMString) -> LocalName { + LocalName::from(contents.0) + } +} + +impl From<DOMString> for Namespace { + fn from(contents: DOMString) -> Namespace { + Namespace::from(contents.0) + } +} + +impl From<DOMString> for Atom { + fn from(contents: DOMString) -> Atom { + Atom::from(contents.0) + } +} + +impl From<DOMString> for String { + fn from(contents: DOMString) -> String { + contents.0 + } +} + +impl From<DOMString> for Vec<u8> { + fn from(contents: DOMString) -> Vec<u8> { + contents.0.into() + } +} + +impl<'a> From<DOMString> for Cow<'a, str> { + fn from(contents: DOMString) -> Cow<'a, str> { + contents.0.into() + } +} + +impl<'a> From<DOMString> for CowRcStr<'a> { + fn from(contents: DOMString) -> CowRcStr<'a> { + contents.0.into() + } +} + +impl Extend<char> for DOMString { + fn extend<I>(&mut self, iterable: I) + where + I: IntoIterator<Item = char>, + { + self.0.extend(iterable) + } +} |