diff options
Diffstat (limited to 'includes/language/LanguageCode.php')
-rw-r--r-- | includes/language/LanguageCode.php | 53 |
1 files changed, 52 insertions, 1 deletions
diff --git a/includes/language/LanguageCode.php b/includes/language/LanguageCode.php index 6f6b216b5c1a..a456ba699d10 100644 --- a/includes/language/LanguageCode.php +++ b/includes/language/LanguageCode.php @@ -163,7 +163,7 @@ class LanguageCode { } /** - * Get the normalised IETF language tag + * Get the normalised IANA language tag * See unit test for examples. * See mediawiki.language.bcp47 for the JavaScript implementation. * @@ -199,6 +199,57 @@ class LanguageCode { } /** + * Convert standardized BCP 47 codes to the internal names used + * by MediaWiki and returned by Language::getCode(). This function + * should be the inverse of LanguageCode::bcp47(). Note that BCP 47 + * explicitly states that language codes are case insensitive. + * + * Since LanguageFactory::getLanguage() is pretty generous about + * accepting aliases (as long as they are lowercased), this function + * should be equivalent to: + * LanguageFactory::getLanguage(strtolower($code))->getCode() + * but (a) better describes the caller's intention, and (b) should + * be much more efficient in practice. + * + * @param string $code The standard BCP-47 language code + * @return string A MediaWiki-internal code, as returned for example by + * Language::getCode() + * @since 1.40 + */ + public static function bcp47ToInternal( $code ) { + static $invertedLookup = []; + if ( !$invertedLookup ) { + // There should never be two different entries in + // NON_STANDARD_LANGUAGE_CODE_MAPPING which map *different* + // internal codes to the same external BCP-47 code. That is, + // BCP-47 should preserve all the information from the internal + // code (discussed further above)[*]. But note the converse isn't + // true: multiple BCP-47 codes can alias to the same internal code: + // BCP-47 internal + // zh-Hans-CN => zh-cn (in NON_STANDARD_LANGUAGE_CODE_MAPPING) + // zh-Hans => zh-hans (not in " ) + // zh-CN => zh-cn (not in " ) + // + // [*] eml/egl are the "exception that proves the rule": `egl` *is* + // (prematurely?) defined as an internal code, but only + // eml.wikipedia.org exists, and it defines its language as `eml`; + // for internal purposes `egl` should map back into `eml` until + // `eml` is deprecated (aka an `eml => egl` entry is added to + // DEPRECATED_LANGUAGE_CODE_MAPPING): T36217. + foreach ( self::NON_STANDARD_LANGUAGE_CODE_MAPPING as $internal => $bcp47 ) { + $invertedLookup[strtolower( $bcp47 )] = $internal; + } + // We deliberately do *not* use DEPRECATED_LANGUAGE_CODE_MAPPING + // here: deprecated codes are no longer valid mediawiki internal + // codes and we should never return them. + } + // Internal codes are all lowercase. This also achieves + // case-insensitivity in the lookup. + $code = strtolower( $code ); + return $invertedLookup[$code] ?? $code; + } + + /** * Returns true if a language code string is a well-formed language tag * according to RFC 5646. * This function only checks well-formedness; it doesn't check that |