diff options
author | Fomafix <fomafix@googlemail.com> | 2022-08-29 12:43:21 +0000 |
---|---|---|
committer | Fomafix <fomafix@googlemail.com> | 2022-08-30 10:54:18 +0000 |
commit | 3e279faba1aac3926b23f7078025635d6fba1ec3 (patch) | |
tree | 4efaaa13d0d403d9cb40597733b2b04199abbd29 /includes/language | |
parent | 7b88ee10c417748158ddda577f3a79d3244bf6b4 (diff) | |
download | mediawikicore-3e279faba1aac3926b23f7078025635d6fba1ec3.tar.gz mediawikicore-3e279faba1aac3926b23f7078025635d6fba1ec3.zip |
Update Language::isWellFormedLanguageTag
Source: https://github.com/unicode-org/icu/blob/37e295627156bc334e1f1e88807025fac984da0e/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/langtagRegex.txt
Also update the URL in the test cases from
http://www.bortzmeyer.org/gabuzomeu-parsing-language-tags.html
to
https://www.bortzmeyer.org/gabuzomeu-parsing-language-tags.html
Change-Id: I08c11081afda84514892e0cdeac7e2023eb44118
Diffstat (limited to 'includes/language')
-rw-r--r-- | includes/language/Language.php | 13 |
1 file changed, 7 insertions, 6 deletions
diff --git a/includes/language/Language.php b/includes/language/Language.php index ff9e29402a75..fb11c7d4f4ac 100644 --- a/includes/language/Language.php +++ b/includes/language/Language.php @@ -320,7 +320,7 @@ class Language { * language, script or variant codes actually exist in the repositories. * * Based on regexes by Mark Davis of the Unicode Consortium: - * https://www.unicode.org/repos/cldr/trunk/tools/java/org/unicode/cldr/util/data/langtagRegex.txt + * https://github.com/unicode-org/icu/blob/37e295627156bc334e1f1e88807025fac984da0e/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/langtagRegex.txt * * @param string $code * @param bool $lenient Whether to allow '_' as separator. The default is only '-'. @@ -343,13 +343,14 @@ class Language { $extension = "$singleton(?:$s$alphanum{2,8})+"; $privateUse = "$x(?:$s$alphanum{1,8})+"; - # Define certain grandfathered codes, since otherwise the regex is pretty useless. + # Define certain legacy language tags (marked as “Type: grandfathered” in BCP 47), + # since otherwise the regex is pretty useless. # Since these are limited, this is safe even later changes to the registry -- # the only oddity is that it might change the type of the tag, and thus # the results from the capturing groups. # https://www.iana.org/assignments/language-subtag-registry - $grandfathered = "en{$s}GB{$s}oed" + $legacy = "en{$s}GB{$s}oed" . "|i{$s}(?:ami|bnn|default|enochian|hak|klingon|lux|mingo|navajo|pwn|tao|tay|tsu)" . "|no{$s}(?:bok|nyn)" . "|sgn{$s}(?:BE{$s}(?:fr|nl)|CH{$s}de)" @@ -365,10 +366,10 @@ class Language { . "(?:$s$extensionList)?" . 
"(?:$s$privateUse)?)"; - # The final breakdown, with capturing groups for each of these components - # The variants, extensions, grandfathered, and private-use may have interior '-' + # Here is the final breakdown, with capturing groups for each of these components + # The variants, extensions, legacy, and private-use may have interior '-' - $root = "^(?:$langtag|$privateUse|$grandfathered)$"; + $root = "^(?:$langtag|$privateUse|$legacy)$"; return (bool)preg_match( "/$root/", strtolower( $code ) ); } |