From f10640199d25645785c260fcf23f9ef182773687 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20W=C3=BClker?= Date: Mon, 7 Apr 2025 21:48:05 +0200 Subject: Split up the URLPattern implementation (#36391) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current implementation is already rather large at ~2.5k lines (~2k LoC). There is still quite a lot of functionality left to implement, so let's split it up while it's still manageable. Testing: Covered by existing web platform tests Signed-off-by: Simon Wülker --- components/script/dom/urlpattern/mod.rs | 810 ++++++++++++++++++++++++++++++++ 1 file changed, 810 insertions(+) create mode 100644 components/script/dom/urlpattern/mod.rs (limited to 'components/script/dom/urlpattern/mod.rs') diff --git a/components/script/dom/urlpattern/mod.rs b/components/script/dom/urlpattern/mod.rs new file mode 100644 index 00000000000..e92963c672b --- /dev/null +++ b/components/script/dom/urlpattern/mod.rs @@ -0,0 +1,810 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ + +mod pattern_parser; +mod preprocessing; +mod tokenizer; + +use std::ptr; + +use dom_struct::dom_struct; +use js::jsapi::{Heap, JSObject, RegExpFlag_IgnoreCase, RegExpFlag_UnicodeSets, RegExpFlags}; +use js::rust::HandleObject; +use pattern_parser::parse_a_pattern_string; +use preprocessing::{ + canonicalize_a_hash, canonicalize_a_hostname, canonicalize_a_password, canonicalize_a_pathname, + canonicalize_a_port, canonicalize_a_protocol, canonicalize_a_search, canonicalize_a_username, + escape_a_regexp_string, process_a_url_pattern_init, +}; +use script_bindings::error::{Error, Fallible}; +use script_bindings::reflector::Reflector; +use script_bindings::root::DomRoot; +use script_bindings::script_runtime::CanGc; +use script_bindings::str::USVString; + +use crate::dom::bindings::cell::RefCell; +use crate::dom::bindings::codegen::Bindings::URLPatternBinding::{ + URLPatternInit, URLPatternMethods, URLPatternOptions, +}; +use crate::dom::bindings::reflector::reflect_dom_object_with_proto; +use crate::dom::globalscope::GlobalScope; +use crate::dom::htmlinputelement::new_js_regex; + +/// +const FULL_WILDCARD_REGEXP_VALUE: &str = ".*"; + +/// +#[dom_struct] +pub(crate) struct URLPattern { + reflector: Reflector, + + /// + associated_url_pattern: RefCell, +} + +#[derive(JSTraceable, MallocSizeOf)] +#[cfg_attr(crown, crown::unrooted_must_root_lint::must_root)] +struct URLPatternInternal { + /// + protocol: Component, + + /// + username: Component, + + /// + password: Component, + + /// + hostname: Component, + + /// + port: Component, + + /// + pathname: Component, + + /// + search: Component, + + /// + hash: Component, +} + +/// +#[derive(JSTraceable, MallocSizeOf)] +#[cfg_attr(crown, crown::unrooted_must_root_lint::must_root)] +struct Component { + /// + pattern_string: USVString, + + /// + #[ignore_malloc_size_of = "mozjs"] + regular_expression: Box>, + + /// + group_name_list: Vec, + + /// + has_regexp_groups: bool, +} + +/// +#[derive(Debug)] +struct Part { + /// + part_type: PartType, + + /// + value: String, + + /// + modifier: PartModifier, + + /// + name: String, + + /// + prefix: String, + + /// + suffix: String, +} + +/// +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum PartType { + /// + FixedText, + + /// + Regexp, + + /// + SegmentWildcard, + + /// + FullWildcard, +} + +/// +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[allow(dead_code)] // Parser is not implemented yet +enum PartModifier { + /// + None, + + /// + Optional, + + /// + ZeroOrMore, + + /// + OneOrMore, +} + +/// +#[derive(Clone, Copy, Default)] +#[allow(dead_code)] // Parser is not fully implemented yet +struct Options { + /// + delimiter_code_point: Option, + + /// + prefix_code_point: Option, + + /// + ignore_case: bool, +} + +impl Component { + fn new_unrooted() -> Self { + Self { + pattern_string: Default::default(), + regular_expression: Heap::boxed(ptr::null_mut()), + group_name_list: Default::default(), + has_regexp_groups: false, + } + } +} + +impl URLPattern { + #[cfg_attr(crown, allow(crown::unrooted_must_root))] + fn new_inherited() -> URLPattern { + let associated_url_pattern = URLPatternInternal { + protocol: Component::new_unrooted(), + username: Component::new_unrooted(), + password: Component::new_unrooted(), + hostname: Component::new_unrooted(), + port: Component::new_unrooted(), + pathname: Component::new_unrooted(), + search: Component::new_unrooted(), + hash: Component::new_unrooted(), + }; + + URLPattern { + reflector: Reflector::new(), + associated_url_pattern: RefCell::new(associated_url_pattern), + } + } + + #[cfg_attr(crown, allow(crown::unrooted_must_root))] + pub(crate) fn new_with_proto( + global: &GlobalScope, + proto: Option, + can_gc: CanGc, + ) -> DomRoot { + reflect_dom_object_with_proto(Box::new(URLPattern::new_inherited()), global, proto, can_gc) + } + + /// + fn initialize( + global: &GlobalScope, + proto: Option, + input: &URLPatternInit, + options: &URLPatternOptions, + can_gc: CanGc, + ) -> Fallible> { + // Step 1. Set this’s associated URL pattern to the result of create given input, baseURL, and options. + let pattern = URLPattern::new_with_proto(global, proto, can_gc); + URLPatternInternal::create( + input, + options, + &mut pattern.associated_url_pattern.borrow_mut(), + )?; + + Ok(pattern) + } +} + +impl URLPatternMethods for URLPattern { + /// + fn Constructor( + global: &GlobalScope, + proto: Option, + can_gc: CanGc, + input: &URLPatternInit, + options: &URLPatternOptions, + ) -> Fallible> { + // Step 1. Run initialize given this, input, null, and options. + URLPattern::initialize(global, proto, input, options, can_gc) + } + + /// + fn Protocol(&self) -> USVString { + // Step 1. Return this’s associated URL pattern’s protocol component’s pattern string. + self.associated_url_pattern + .borrow() + .protocol + .pattern_string + .clone() + } + + /// + fn Username(&self) -> USVString { + // Step 1. Return this’s associated URL pattern’s username component’s pattern string. + self.associated_url_pattern + .borrow() + .username + .pattern_string + .clone() + } + + /// + fn Password(&self) -> USVString { + // Step 1. Return this’s associated URL pattern’s password component’s pattern string. + self.associated_url_pattern + .borrow() + .password + .pattern_string + .clone() + } + + /// + fn Hostname(&self) -> USVString { + // Step 1. Return this’s associated URL pattern’s hostname component’s pattern string. + self.associated_url_pattern + .borrow() + .hostname + .pattern_string + .clone() + } + + /// + fn Port(&self) -> USVString { + // Step 1. Return this’s associated URL pattern’s port component’s pattern string. + self.associated_url_pattern + .borrow() + .port + .pattern_string + .clone() + } + + /// + fn Pathname(&self) -> USVString { + // Step 1. Return this’s associated URL pattern’s pathname component’s pattern string. + self.associated_url_pattern + .borrow() + .pathname + .pattern_string + .clone() + } + + /// + fn Search(&self) -> USVString { + // Step 1. Return this’s associated URL pattern’s search component’s pattern string. + self.associated_url_pattern + .borrow() + .search + .pattern_string + .clone() + } + + /// + fn Hash(&self) -> USVString { + // Step 1. Return this’s associated URL pattern’s hash component’s pattern string. + self.associated_url_pattern + .borrow() + .hash + .pattern_string + .clone() + } + + /// + fn HasRegExpGroups(&self) -> bool { + // Step 1. If this’s associated URL pattern’s has regexp groups, then return true. + // Step 2. Return false. + self.associated_url_pattern.borrow().has_regexp_groups() + } +} + +impl URLPatternInternal { + /// + fn create(input: &URLPatternInit, options: &URLPatternOptions, out: &mut Self) -> Fallible<()> { + // Step 1. Let init be null. + // Step 2. If input is a scalar value string then: + // NOTE: We don't support strings as input yet + // Step 3. Otherwise: + // Step 3.1 Assert: input is a URLPatternInit. + // Step 3.2 If baseURL is not null, then throw a TypeError. + if input.baseURL.is_some() { + return Err(Error::Type("baseURL must be none".into())); + } + + // Step 3.3 Set init to input. + let init = input; + + // Step 4. Let processedInit be the result of process a URLPatternInit given init, "pattern", null, null, + // null, null, null, null, null, and null. + let mut processed_init = process_a_url_pattern_init(init, PatternInitType::Pattern)?; + + // Step 5. For each componentName of « "protocol", "username", "password", "hostname", "port", + // "pathname", "search", "hash" »: + // Step 5.1 If processedInit[componentName] does not exist, then set processedInit[componentName] to "*". + // NOTE: We do this later on + + // Step 6. If processedInit["protocol"] is a special scheme and processedInit["port"] is a string + // which represents its corresponding default port in radix-10 using ASCII digits then set + // processedInit["port"] to the empty string. + let default_port = processed_init + .protocol + .as_deref() + .and_then(default_port_for_special_scheme); + let given_port = processed_init + .port + .as_deref() + .map(str::parse) + .transpose() + .ok() + .flatten(); + if default_port.is_some() && default_port == given_port { + processed_init.port = Some(Default::default()); + } + + // Step 7. Let urlPattern be a new URL pattern. + // NOTE: We construct the pattern provided as the out parameter. + + // Step 8. Set urlPattern’s protocol component to the result of compiling a component given + // processedInit["protocol"], canonicalize a protocol, and default options. + Component::compile( + processed_init.protocol.as_deref().unwrap_or("*"), + Box::new(canonicalize_a_protocol), + Options::default(), + &mut out.protocol, + )?; + + // Step 9. Set urlPattern’s username component to the result of compiling a component given + // processedInit["username"], canonicalize a username, and default options. + Component::compile( + processed_init.username.as_deref().unwrap_or("*"), + Box::new(|i| Ok(canonicalize_a_username(i))), + Options::default(), + &mut out.username, + )?; + + // Step 10. Set urlPattern’s password component to the result of compiling a component given + // processedInit["password"], canonicalize a password, and default options. + Component::compile( + processed_init.password.as_deref().unwrap_or("*"), + Box::new(|i| Ok(canonicalize_a_password(i))), + Options::default(), + &mut out.password, + )?; + + // FIXME: Steps 11 and 12: Compile host pattern correctly + Component::compile( + processed_init.hostname.as_deref().unwrap_or("*"), + Box::new(canonicalize_a_hostname), + Options::HOSTNAME, + &mut out.hostname, + )?; + + // Step 13. Set urlPattern’s port component to the result of compiling a component given + // processedInit["port"], canonicalize a port, and default options. + Component::compile( + processed_init.port.as_deref().unwrap_or("*"), + Box::new(|i| canonicalize_a_port(i, None)), + Options::default(), + &mut out.port, + )?; + + // FIXME: Step 14: respect ignore case option from here on out + let _ = options; + + // FIXME: Steps 15-16: Compile path pattern correctly + Component::compile( + processed_init.pathname.as_deref().unwrap_or("*"), + Box::new(|i| Ok(canonicalize_a_pathname(i))), + Options::PATHNAME, + &mut out.pathname, + )?; + + // Step 17. Set urlPattern’s search component to the result of compiling a component given + // processedInit["search"], canonicalize a search, and compileOptions. + Component::compile( + processed_init.search.as_deref().unwrap_or("*"), + Box::new(|i| Ok(canonicalize_a_search(i))), + Options::default(), + &mut out.search, + )?; + + // Step 18. Set urlPattern’s hash component to the result of compiling a component given + // processedInit["hash"], canonicalize a hash, and compileOptions. + Component::compile( + processed_init.hash.as_deref().unwrap_or("*"), + Box::new(|i| Ok(canonicalize_a_hash(i))), + Options::default(), + &mut out.hash, + )?; + + // Step 19. Return urlPattern. + // NOTE: not necessary since we use an out parameter + Ok(()) + } + + /// + fn has_regexp_groups(&self) -> bool { + self.protocol.has_regexp_groups || + self.username.has_regexp_groups || + self.password.has_regexp_groups || + self.hostname.has_regexp_groups || + self.port.has_regexp_groups || + self.pathname.has_regexp_groups || + self.search.has_regexp_groups || + self.hash.has_regexp_groups + } +} + +impl Component { + /// + fn compile( + input: &str, + encoding_callback: EncodingCallback, + options: Options, + out: &mut Self, + ) -> Fallible<()> { + // Step 1. Let part list be the result of running parse a pattern string given input, options, + // and encoding callback. + let part_list = parse_a_pattern_string(input, options, encoding_callback)?; + + // Step 2. Let (regular expression string, name list) be the result of running generate a regular expression and + // name list given part list and options. + let (regular_expression_string, name_list) = + generate_a_regular_expression_and_name_list(&part_list, options); + + log::debug!("Compiled {input:?} (URLPattern) to {regular_expression_string:?} (Regex)"); + + // Step 3. Let flags be an empty string. + // Step 4. If options’s ignore case is true then set flags to "vi". + let flags = if options.ignore_case { + RegExpFlags { + flags_: RegExpFlag_UnicodeSets | RegExpFlag_IgnoreCase, + } + } + // Step 5. Otherwise set flags to "v" + else { + RegExpFlags { + flags_: RegExpFlag_UnicodeSets, + } + }; + + // Step 6. Let regular expression be RegExpCreate(regular expression string, flags). + // If this throws an exception, catch it, and throw a TypeError. + let cx = GlobalScope::get_cx(); + rooted!(in(*cx) let mut regular_expression: *mut JSObject = ptr::null_mut()); + let succeeded = new_js_regex( + cx, + ®ular_expression_string, + flags, + regular_expression.handle_mut(), + ); + if !succeeded { + return Err(Error::Type(format!( + "Failed to compile {regular_expression_string:?} as a regular expression" + ))); + } + + // TODO Step 7. Let pattern string be the result of running generate a pattern string given + // part list and options. + let pattern_string = Default::default(); + + // Step 8. Let has regexp groups be false. + // Step 9. For each part of part list: + // Step 9.1 If part’s type is "regexp", then set has regexp groups to true. + let has_regexp_groups = part_list + .iter() + .any(|part| part.part_type == PartType::Regexp); + + // Step 10. Return a new component whose pattern string is pattern string, regular expression + // is regular expression, group name list is name list, and has regexp groups is has regexp groups. + out.pattern_string = pattern_string; + out.regular_expression.set(*regular_expression.handle()); + out.group_name_list = name_list; + out.has_regexp_groups = has_regexp_groups; + + Ok(()) + } +} + +/// +fn generate_a_regular_expression_and_name_list( + part_list: &[Part], + options: Options, +) -> (String, Vec) { + // Step 1. Let result be "^". + let mut result = String::from("^"); + + // Step 2. Let name list be a new list. + let mut name_list = vec![]; + + // Step 3. For each part of part list: + for part in part_list { + // Step 3.1 If part’s type is "fixed-text": + if part.part_type == PartType::FixedText { + // Step 3.1.1 If part’s modifier is "none", then append the result of running escape a regexp string given + // part’s value to the end of result. + if part.modifier == PartModifier::None { + result.push_str(&escape_a_regexp_string(&part.value)); + } + // Step 3.1.2 Otherwise: + else { + // Step 3.1.2.1 Append "(?:" to the end of result. + result.push_str("(?:"); + + // Step 3.1.2.2 Append the result of running escape a regexp string given part’s value + // to the end of result. + result.push_str(&escape_a_regexp_string(&part.value)); + + // Step 3.1.2.3 Append ")" to the end of result. + result.push(')'); + + // Step 3.1.2.4 Append the result of running convert a modifier to a string given part’s + // modifier to the end of result. + result.push_str(part.modifier.convert_to_string()); + } + + // Step 3.1.3 Continue. + continue; + } + + // Step 3.2 Assert: part’s name is not the empty string. + debug_assert!(!part.name.is_empty()); + + // Step 3.3 Append part’s name to name list. + name_list.push(USVString(part.name.to_string())); + + // Step 3.4 Let regexp value be part’s value. + let mut regexp_value = part.value.clone(); + + // Step 3.5 If part’s type is "segment-wildcard", then set regexp value to the result of running + // generate a segment wildcard regexp given options. + if part.part_type == PartType::SegmentWildcard { + regexp_value = generate_a_segment_wildcard_regexp(options); + } + // Step 3.6 Otherwise if part’s type is "full-wildcard", then set regexp value to full wildcard regexp value. + else if part.part_type == PartType::FullWildcard { + regexp_value = FULL_WILDCARD_REGEXP_VALUE.into(); + } + + // Step 3.7 If part’s prefix is the empty string and part’s suffix is the empty string: + if part.prefix.is_empty() && part.suffix.is_empty() { + // Step 3.7.1 If part’s modifier is "none" or "optional", then: + if matches!(part.modifier, PartModifier::None | PartModifier::Optional) { + // Step 3.7.1.1 Append "(" to the end of result. + result.push('('); + + // Step 3.7.1.2 Append regexp value to the end of result. + result.push_str(®exp_value); + + // Step 3.7.1.3 Append ")" to the end of result. + result.push(')'); + + // Step 3.7.1.4 Append the result of running convert a modifier to a string given part’s modifier + // to the end of result. + result.push_str(part.modifier.convert_to_string()); + } + // Step 3.7.2 Otherwise: + else { + // Step 3.7.2.1 Append "((?:" to the end of result. + result.push_str("((?:"); + + // Step 3.7.2.2 Append regexp value to the end of result. + result.push_str(®exp_value); + + // Step 3.7.2.3 Append ")" to the end of result. + result.push(')'); + + // Step 3.7.2.4 Append the result of running convert a modifier to a string given part’s modifier + // to the end of result. + result.push_str(part.modifier.convert_to_string()); + + // Step 3.7.2.5 Append ")" to the end of result. + result.push(')'); + } + + // Step 3.7.3 Continue. + continue; + } + + // Step 3.8 If part’s modifier is "none" or "optional": + if matches!(part.modifier, PartModifier::None | PartModifier::Optional) { + // Step 3.8.1 Append "(?:" to the end of result. + result.push_str("(?:"); + + // Step 3.8.2 Append the result of running escape a regexp string given part’s prefix + // to the end of result. + result.push_str(&escape_a_regexp_string(&part.prefix)); + + // Step 3.8.3 Append "(" to the end of result. + result.push('('); + + // Step 3.8.4 Append regexp value to the end of result. + result.push_str(®exp_value); + + // Step 3.8.5 Append ")" to the end of result. + result.push(')'); + + // Step 3.8.6 Append the result of running escape a regexp string given part’s suffix + // to the end of result. + result.push_str(&escape_a_regexp_string(&part.suffix)); + + // Step 3.8.7 Append ")" to the end of result. + result.push(')'); + + // Step 3.8.8 Append the result of running convert a modifier to a string given part’s modifier to + // the end of result. + result.push_str(part.modifier.convert_to_string()); + + // Step 3.8.9 Continue. + continue; + } + + // Step 3.9 Assert: part’s modifier is "zero-or-more" or "one-or-more". + debug_assert!(matches!( + part.modifier, + PartModifier::ZeroOrMore | PartModifier::OneOrMore + )); + + // Step 3.10 Assert: part’s prefix is not the empty string or part’s suffix is not the empty string. + debug_assert!(!part.prefix.is_empty() || !part.suffix.is_empty()); + + // Step 3.11 Append "(?:" to the end of result. + result.push_str("(?:"); + + // Step 3.12 Append the result of running escape a regexp string given part’s prefix to the end of result. + result.push_str(&escape_a_regexp_string(&part.prefix)); + + // Step 3.13 Append "((?:" to the end of result. + result.push_str("((?:"); + + // Step 3.14 Append regexp value to the end of result. + result.push_str(®exp_value); + + // Step 3.15 Append ")(?:" to the end of result. + result.push_str(")(?:"); + + // Step 3.16 Append the result of running escape a regexp string given part’s suffix to the end of result. + result.push_str(&escape_a_regexp_string(&part.suffix)); + + // Step 3.17 Append the result of running escape a regexp string given part’s prefix to the end of result. + result.push_str(&escape_a_regexp_string(&part.prefix)); + + // Step 3.18 Append "(?:" to the end of result. + result.push_str("(?:"); + + // Step 3.19 Append regexp value to the end of result. + result.push_str(®exp_value); + + // Step 3.20 Append "))*)" to the end of result. + result.push_str("))*)"); + + // Step 3.21 Append the result of running escape a regexp string given part’s suffix to the end of result. + result.push_str(&escape_a_regexp_string(&part.suffix)); + + // Step 3.22 Append ")" to the end of result. + result.push(')'); + + // Step 3.23 If part’s modifier is "zero-or-more" then append "?" to the end of result. + if part.modifier == PartModifier::ZeroOrMore { + result.push('?'); + } + } + + // Step 4. Append "$" to the end of result. + result.push('$'); + + // Step 5. Return (result, name list). + (result, name_list) +} + +/// +type EncodingCallback = Box Fallible>; + +// FIXME: Deduplicate this with the url crate +/// +fn default_port_for_special_scheme(scheme: &str) -> Option { + match scheme { + "ftp" => Some(21), + "http" | "ws" => Some(80), + "https" | "wss" => Some(443), + _ => None, + } +} + +/// +fn is_special_scheme(scheme: &str) -> bool { + matches!(scheme, "ftp" | "http" | "https" | "ws" | "wss") +} + +/// +fn generate_a_segment_wildcard_regexp(options: Options) -> String { + // Step 1. Let result be "[^". + let mut result = String::from("[^"); + + // Step 2. Append the result of running escape a regexp string given options’s + // delimiter code point to the end of result. + result.push_str(&escape_a_regexp_string( + &options + .delimiter_code_point + .map(|c| c.to_string()) + .unwrap_or_default(), + )); + + // Step 3. Append "]+?" to the end of result. + result.push_str("]+?"); + + // Step 4. Return result. + result +} + +impl PartModifier { + /// + fn convert_to_string(&self) -> &'static str { + match self { + // Step 1. If modifier is "zero-or-more", then return "*". + Self::ZeroOrMore => "*", + // Step 2. If modifier is "optional", then return "?". + Self::Optional => "?", + // Step 3. If modifier is "one-or-more", then return "+". + Self::OneOrMore => "+", + // Step 4. Return the empty string. + _ => "", + } + } +} + +impl Options { + /// + const HOSTNAME: Self = Self { + delimiter_code_point: Some('.'), + prefix_code_point: None, + ignore_case: false, + }; + + /// + const PATHNAME: Self = Self { + delimiter_code_point: Some('/'), + prefix_code_point: Some('/'), + ignore_case: false, + }; +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum PatternInitType { + Pattern, + Url, +} + +impl Part { + fn new(part_type: PartType, value: String, modifier: PartModifier) -> Self { + Self { + part_type, + value, + modifier, + name: String::new(), + prefix: String::new(), + suffix: String::new(), + } + } +} -- cgit v1.2.3