diff options
author | Timo Tijhof <krinkle@fastmail.com> | 2023-08-05 03:53:16 +0100 |
---|---|---|
committer | Timo Tijhof <krinkle@fastmail.com> | 2023-08-10 03:39:42 +0100 |
commit | b862174dc0fd09aae147a0c2f328298bcf1a7f28 (patch) | |
tree | 34078969015c0742a7164230333f1bac88cf7189 /includes/utils | |
parent | 1dca5ba36c9358f1df25e5bb849e9c0e41bc789c (diff) | |
download | mediawikicore-b862174dc0fd09aae147a0c2f328298bcf1a7f28.tar.gz mediawikicore-b862174dc0fd09aae147a0c2f328298bcf1a7f28.zip |
UrlUtils: Make assemble() and removeDotSegments() stateless
Follows-up 472a914c63baa (I706ef8a50aafb51), which moved various
functions here en masse, but these two don't require any state.
The warnings about parse_url() in UrlUtils.php have been obsolete
since about PHP 5.4, when it started to support protocol-relative
URLs, non-slash protocols like "mailto", and deal with spaces/newlines
correctly (https://3v4l.org/YWUkl).
Rather than complicate many components that would otherwise remain
stateless and unit-testable, fix these to once again be the static
functions they were.
For impact, see up to PS17 of change I5117eab95f57297eb02bed.
Bug: T227900
Change-Id: Ifb3f720fc429b107348644c98eb9cd8e1113a42a
Diffstat (limited to 'includes/utils')
-rw-r--r-- | includes/utils/UrlUtils.php | 37 |
1 files changed, 19 insertions, 18 deletions
diff --git a/includes/utils/UrlUtils.php b/includes/utils/UrlUtils.php index 3e1cd7a534f0..8c3fcfff9942 100644 --- a/includes/utils/UrlUtils.php +++ b/includes/utils/UrlUtils.php @@ -180,14 +180,14 @@ class UrlUtils { $bits = $this->parse( $url ); if ( $bits && isset( $bits['path'] ) ) { - $bits['path'] = $this->removeDotSegments( $bits['path'] ); - return $this->assemble( $bits ); + $bits['path'] = self::removeDotSegments( $bits['path'] ); + return self::assemble( $bits ); } elseif ( $bits ) { # No path to expand return $url; } elseif ( !str_starts_with( $url, '/' ) ) { # URL is a relative path - return $this->removeDotSegments( $url ); + return self::removeDotSegments( $url ); } # Expanded URL is not valid. @@ -227,12 +227,11 @@ class UrlUtils { * This is the basic structure used (brackets contain keys for $urlParts): * [scheme][delimiter][user]:[pass]@[host]:[port][path]?[query]#[fragment] * - * @todo Need to integrate this into expand() (see T34168) - * + * @since 1.41 * @param array $urlParts URL parts, as output from parse() * @return string URL assembled from its component parts */ - public function assemble( array $urlParts ): string { + public static function assemble( array $urlParts ): string { $result = ''; if ( isset( $urlParts['delimiter'] ) ) { @@ -278,12 +277,11 @@ class UrlUtils { * Remove all dot-segments in the provided URL path. For example, '/a/./b/../c/' becomes * '/a/c/'. For details on the algorithm, please see RFC3986 section 5.2.4. * - * @todo Need to integrate this into expand() (see T34168) - * + * @since 1.41 * @param string $urlPath URL path, potentially containing dot-segments * @return string URL path with all dot-segments removed */ - public function removeDotSegments( string $urlPath ): string { + public static function removeDotSegments( string $urlPath ): string { $output = ''; $inputOffset = 0; $inputLength = strlen( $urlPath ); @@ -400,14 +398,17 @@ class UrlUtils { } /** - * parse_url() work-alike, but non-broken. 
Differences: + * Advanced and configurable version of parse_url(). + * + * 1) Add a "delimiter" element to the array, which helps permits to blindly re-assemble + * any URL regardless of protocol, including those that don't use `://`, + * such as "mailto:" and "news:". + * 2) Reject URLs with protocols not in $wgUrlProtocols. + * 3) Reject relative or incomplete URLs that parse_url would return a partial array for. * - * 1) Handles protocols that don't use :// (e.g., mailto: and news:, as well as - * protocol-relative URLs) correctly. - * 2) Adds a "delimiter" element to the array (see (2)). - * 3) Verifies that the protocol is on the UrlProtocols allowed list. - * 4) Rejects some invalid URLs that parse_url doesn't, e.g. the empty string or URLs starting - * with a line feed character. + * If all you need is to extract parts of an HTTP or HTTPS URL (i.e. not specific to + * site-configurable extra protocols, or user input) then `parse_url()` can be used + * directly instead. * * @param string $url A URL to parse * @return ?string[] Bits of the URL in an associative array, or null on failure. @@ -432,8 +433,8 @@ class UrlUtils { $url = "http:$url"; } $bits = parse_url( $url ); - // parse_url() returns an array without scheme for some invalid URLs, e.g. - // parse_url("%0Ahttp://example.com") == [ 'host' => '%0Ahttp', 'path' => 'example.com' ] + // parse_url() returns an array without scheme for invalid URLs, e.g. + // parse_url("something bad://example") == [ 'path' => 'something bad://example' ] if ( !$bits || !isset( $bits['scheme'] ) ) { return null; } |