aboutsummaryrefslogtreecommitdiffstats
path: root/includes/language
diff options
context:
space:
mode:
author Fomafix <fomafix@googlemail.com> 2022-08-29 12:43:21 +0000
committer Fomafix <fomafix@googlemail.com> 2022-08-30 10:54:18 +0000
commit 3e279faba1aac3926b23f7078025635d6fba1ec3 (patch)
tree 4efaaa13d0d403d9cb40597733b2b04199abbd29 /includes/language
parent 7b88ee10c417748158ddda577f3a79d3244bf6b4 (diff)
download mediawikicore-3e279faba1aac3926b23f7078025635d6fba1ec3.tar.gz
mediawikicore-3e279faba1aac3926b23f7078025635d6fba1ec3.zip
Update Language::isWellFormedLanguageTag

Source: https://github.com/unicode-org/icu/blob/37e295627156bc334e1f1e88807025fac984da0e/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/langtagRegex.txt

Also update the URL in the test cases from
http://www.bortzmeyer.org/gabuzomeu-parsing-language-tags.html
to
https://www.bortzmeyer.org/gabuzomeu-parsing-language-tags.html

Change-Id: I08c11081afda84514892e0cdeac7e2023eb44118

Diffstat (limited to 'includes/language')
-rw-r--r-- includes/language/Language.php 13
1 files changed, 7 insertions, 6 deletions
diff --git a/includes/language/Language.php b/includes/language/Language.php
index ff9e29402a75..fb11c7d4f4ac 100644
--- a/includes/language/Language.php
+++ b/includes/language/Language.php
@@ -320,7 +320,7 @@ class Language {
* language, script or variant codes actually exist in the repositories.
*
* Based on regexes by Mark Davis of the Unicode Consortium:
- * https://www.unicode.org/repos/cldr/trunk/tools/java/org/unicode/cldr/util/data/langtagRegex.txt
+ * https://github.com/unicode-org/icu/blob/37e295627156bc334e1f1e88807025fac984da0e/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/langtagRegex.txt
*
* @param string $code
* @param bool $lenient Whether to allow '_' as separator. The default is only '-'.
@@ -343,13 +343,14 @@ class Language {
$extension = "$singleton(?:$s$alphanum{2,8})+";
$privateUse = "$x(?:$s$alphanum{1,8})+";
- # Define certain grandfathered codes, since otherwise the regex is pretty useless.
+ # Define certain legacy language tags (marked as “Type: grandfathered” in BCP 47),
+ # since otherwise the regex is pretty useless.
# Since these are limited, this is safe even later changes to the registry --
# the only oddity is that it might change the type of the tag, and thus
# the results from the capturing groups.
# https://www.iana.org/assignments/language-subtag-registry
- $grandfathered = "en{$s}GB{$s}oed"
+ $legacy = "en{$s}GB{$s}oed"
. "|i{$s}(?:ami|bnn|default|enochian|hak|klingon|lux|mingo|navajo|pwn|tao|tay|tsu)"
. "|no{$s}(?:bok|nyn)"
. "|sgn{$s}(?:BE{$s}(?:fr|nl)|CH{$s}de)"
@@ -365,10 +366,10 @@ class Language {
. "(?:$s$extensionList)?"
. "(?:$s$privateUse)?)";
- # The final breakdown, with capturing groups for each of these components
- # The variants, extensions, grandfathered, and private-use may have interior '-'
+ # Here is the final breakdown, with capturing groups for each of these components
+ # The variants, extensions, legacy, and private-use may have interior '-'
- $root = "^(?:$langtag|$privateUse|$grandfathered)$";
+ $root = "^(?:$langtag|$privateUse|$legacy)$";
return (bool)preg_match( "/$root/", strtolower( $code ) );
}