aboutsummaryrefslogtreecommitdiffstats
path: root/tests/phpunit/unit/includes/language/LanguageCodeTest.php
blob: 718c43b536f1fa0cc64a8717c8e181cfc9b17c5d (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
<?php

use MediaWiki\Languages\LanguageNameUtils;
use MediaWiki\Tests\Unit\DummyServicesTrait;
use Wikimedia\Bcp47Code\Bcp47CodeValue;

/**
 * @group Language
 * @covers \LanguageCode
 *
 * @author Thiemo Kreuz
 */
class LanguageCodeTest extends MediaWikiUnitTestCase {
	use DummyServicesTrait;

	public function testConstructor() {
		$instance = new LanguageCode( 'fa' );

		$this->assertInstanceOf( LanguageCode::class, $instance );
		$this->assertSame( 'fa', $instance->toString() );
	}

	public function testGetDeprecatedCodeMapping() {
		$map = LanguageCode::getDeprecatedCodeMapping();

		$this->assertIsArray( $map );
		$this->assertContainsOnly( 'string', array_keys( $map ) );
		$this->assertArrayNotHasKey( '', $map );
		$this->assertContainsOnly( 'string', $map );
		$this->assertNotContains( '', $map );

		// Codes special to MediaWiki should never appear in a map of "deprecated" codes
		$this->assertArrayNotHasKey( 'qqq', $map, 'documentation' );
		$this->assertNotContains( 'qqq', $map, 'documentation' );
		$this->assertArrayNotHasKey( 'qqx', $map, 'debug code' );
		$this->assertNotContains( 'qqx', $map, 'debug code' );

		// Valid language codes that are currently not "deprecated"
		$this->assertArrayNotHasKey( 'bh', $map, 'family of Bihari languages' );
		$this->assertArrayNotHasKey( 'no', $map, 'family of Norwegian languages' );
		$this->assertArrayNotHasKey( 'simple', $map );
	}

	public function testReplaceDeprecatedCodes() {
		$this->assertEquals( 'gsw', LanguageCode::replaceDeprecatedCodes( 'als' ) );
		$this->assertEquals( 'gsw', LanguageCode::replaceDeprecatedCodes( 'gsw' ) );
		$this->assertNull( LanguageCode::replaceDeprecatedCodes( null ) );
	}

	/**
	 * test @see LanguageCode::bcp47().
	 * Please note the BCP 47 explicitly state that language codes are case
	 * insensitive, there are some exceptions to the rule :)
	 * This test is used to verify our formatting against all lower and
	 * all upper cases language code.
	 *
	 * @see https://tools.ietf.org/html/bcp47
	 * @dataProvider provideLanguageCodes()
	 */
	public function testBcp47( $code, $expected ) {
		$this->assertEquals( $expected, LanguageCode::bcp47( $code ),
			"Applying BCP 47 standard to '$code'"
		);

		$code = strtolower( $code );
		$this->assertEquals( $expected, LanguageCode::bcp47( $code ),
			"Applying BCP 47 standard to lower case '$code'"
		);

		$code = strtoupper( $code );
		$this->assertEquals( $expected, LanguageCode::bcp47( $code ),
			"Applying BCP 47 standard to upper case '$code'"
		);
	}

	/**
	 * Array format is ($code, $expected)
	 */
	public static function provideLanguageCodes() {
		return [
			// Extracted from BCP 47 (list not exhaustive)
			# 2.1.1
			[ 'en-ca-x-ca', 'en-CA-x-ca' ],
			[ 'sgn-be-fr', 'sgn-BE-FR' ],
			[ 'az-latn-x-latn', 'az-Latn-x-latn' ],
			# 2.2
			[ 'sr-Latn-RS', 'sr-Latn-RS' ],
			[ 'az-arab-ir', 'az-Arab-IR' ],

			# 2.2.5
			[ 'sl-nedis', 'sl-nedis' ],
			[ 'de-ch-1996', 'de-CH-1996' ],

			# 2.2.6
			[
				'en-latn-gb-boont-r-extended-sequence-x-private',
				'en-Latn-GB-boont-r-extended-sequence-x-private'
			],

			// Examples from BCP 47 Appendix A
			# Simple language subtag:
			[ 'DE', 'de' ],
			[ 'fR', 'fr' ],
			[ 'ja', 'ja' ],

			# Language subtag plus script subtag:
			[ 'zh-hans', 'zh-Hans' ],
			[ 'sr-cyrl', 'sr-Cyrl' ],
			[ 'sr-latn', 'sr-Latn' ],

			# Extended language subtags and their primary language subtag
			# counterparts:
			[ 'zh-cmn-hans-cn', 'zh-cmn-Hans-CN' ],
			[ 'cmn-hans-cn', 'cmn-Hans-CN' ],
			[ 'zh-yue-hk', 'zh-yue-HK' ],
			[ 'yue-hk', 'yue-HK' ],

			# Language-Script-Region:
			[ 'zh-hans-cn', 'zh-Hans-CN' ],
			[ 'sr-latn-RS', 'sr-Latn-RS' ],

			# Language-Variant:
			[ 'sl-rozaj', 'sl-rozaj' ],
			[ 'sl-rozaj-biske', 'sl-rozaj-biske' ],
			[ 'sl-nedis', 'sl-nedis' ],

			# Language-Region-Variant:
			[ 'de-ch-1901', 'de-CH-1901' ],
			[ 'sl-it-nedis', 'sl-IT-nedis' ],

			# Language-Script-Region-Variant:
			[ 'hy-latn-it-arevela', 'hy-Latn-IT-arevela' ],

			# Language-Region:
			[ 'de-de', 'de-DE' ],
			[ 'en-us', 'en-US' ],
			[ 'es-419', 'es-419' ],

			# Private use subtags:
			[ 'de-ch-x-phonebk', 'de-CH-x-phonebk' ],
			[ 'az-arab-x-aze-derbend', 'az-Arab-x-aze-derbend' ],
			/**
			 * Previous test does not reflect the BCP 47 which states:
			 *  az-Arab-x-AZE-derbend
			 * AZE being private, it should be lower case, hence the test above
			 * should probably be:
			 * [ 'az-arab-x-aze-derbend', 'az-Arab-x-AZE-derbend' ],
			 */

			# Private use registry values:
			[ 'x-whatever', 'x-whatever' ],
			[ 'qaa-qaaa-qm-x-southern', 'qaa-Qaaa-QM-x-southern' ],
			[ 'de-qaaa', 'de-Qaaa' ],
			[ 'sr-latn-qm', 'sr-Latn-QM' ],
			[ 'sr-qaaa-rs', 'sr-Qaaa-RS' ],

			# Tags that use extensions
			[ 'en-us-u-islamcal', 'en-US-u-islamcal' ],
			[ 'zh-cn-a-myext-x-private', 'zh-CN-a-myext-x-private' ],
			[ 'en-a-myext-b-another', 'en-a-myext-b-another' ],

			# Invalid:
			// de-419-DE
			// a-DE
			// ar-a-aaa-b-bbb-a-ccc

			# Non-standard and deprecated language codes used by MediaWiki
			[ 'als', 'gsw' ],
			[ 'bat-smg', 'sgs' ],
			[ 'be-x-old', 'be-tarask' ],
			[ 'fiu-vro', 'vro' ],
			[ 'roa-rup', 'rup' ],
			[ 'zh-classical', 'lzh' ],
			[ 'zh-min-nan', 'nan' ],
			[ 'zh-yue', 'yue' ],
			[ 'cbk-zam', 'cbk' ],
			[ 'de-formal', 'de-x-formal' ],
			[ 'eml', 'egl' ],
			[ 'en-rtl', 'en-x-rtl' ],
			[ 'es-formal', 'es-x-formal' ],
			[ 'hu-formal', 'hu-x-formal' ],
			[ 'kk-Arab', 'kk-Arab' ],
			[ 'kk-Cyrl', 'kk-Cyrl' ],
			[ 'kk-Latn', 'kk-Latn' ],
			[ 'map-bms', 'jv-x-bms' ],
			[ 'mo', 'ro-Cyrl-MD' ],
			[ 'nrm', 'nrf' ],
			[ 'nl-informal', 'nl-x-informal' ],
			[ 'roa-tara', 'nap-x-tara' ],
			[ 'simple', 'en-simple' ],
			[ 'sr-ec', 'sr-Cyrl' ],
			[ 'sr-el', 'sr-Latn' ],
			[ 'zh-cn', 'zh-Hans-CN' ],
			[ 'zh-sg', 'zh-Hans-SG' ],
			[ 'zh-my', 'zh-Hans-MY' ],
			[ 'zh-tw', 'zh-Hant-TW' ],
			[ 'zh-hk', 'zh-Hant-HK' ],
			[ 'zh-mo', 'zh-Hant-MO' ],
			[ 'zh-hans', 'zh-Hans' ],
			[ 'zh-hant', 'zh-Hant' ],
		];
	}

	/**
	 * @dataProvider provideBcp47ToInternal()
	 */
	public function testBcp47ToInternal( $expected, $bcp47 ) {
		$result = LanguageCode::bcp47ToInternal( $bcp47 );
		$this->assertEquals( $expected, $result );
	}

	/**
	 * @dataProvider provideSupportedLanguageCodes()
	 */
	public function testBcp47ToInternalLanguage( $internalCode ) {
		if ( $internalCode === 'egl' ) {
			# 'egl' was added as an internal code prematurely; 'eml' hasn't
			# been added to the deprecated list yet (T36217) and so only
			# 'eml' is a "real" internal code.
			$internalCode = 'eml';
		}
		$lang = $this->createMock( Language::class );
		$lang->method( 'getCode' )->willReturn( $internalCode );
		$result = LanguageCode::bcp47ToInternal( $lang );
		$this->assertEquals( $internalCode, $result );
	}

	public function provideSupportedLanguageCodes() {
		$lnu = $this->getDummyLanguageNameUtils();
		$languages = $lnu->getLanguageNames(
			LanguageNameUtils::AUTONYMS, LanguageNameUtils::SUPPORTED
		);
		foreach ( $languages as $code => $autonym ) {
			yield [ $code ];
		}
	}

	public function provideBcp47ToInternal() {
		foreach ( $this->provideSupportedLanguageCodes() as $args ) {
			$code = $args[0];
			if ( $code === 'egl' ) {
				# 'egl' was added as an internal code prematurely; 'eml' hasn't
				# been added to the deprecated list yet (T36217) and so only
				# 'eml' is a "real" internal code.
				continue;
			}
			$bcp47 = LanguageCode::bcp47( $code );
			yield "$code as string" => [ $code, $bcp47 ];
			yield "$code as Bcp47Code object" => [ $code, new Bcp47CodeValue( $bcp47 ) ];
			// Verify case-insensitivity: lowercase
			$bcp47 = strtolower( $bcp47 );
			yield "$code as lowercase string" => [ $code, $bcp47 ];
			yield "$code as lowercase Bcp47Code object" => [ $code, new Bcp47CodeValue( $bcp47 ) ];
			// Verify case-insensitivity: uppercase
			$bcp47 = strtoupper( $bcp47 );
			yield "$code as uppercase string" => [ $code, $bcp47 ];
			yield "$code as uppercase Bcp47Code object" => [ $code, new Bcp47CodeValue( $bcp47 ) ];
		}
	}

	/**
	 * @dataProvider provideWellFormedLanguageTags
	 */
	public function testWellFormedLanguageTag( $code, $message = '' ) {
		$this->assertTrue(
			LanguageCode::isWellFormedLanguageTag( $code ),
			"validating code $code $message"
		);
	}

	/**
	 * The test cases are based on the tests in the GaBuZoMeu parser
	 * written by Stéphane Bortzmeyer <bortzmeyer@nic.fr>
	 * and distributed as free software, under the GNU General Public Licence.
	 * https://www.bortzmeyer.org/gabuzomeu-parsing-language-tags.html
	 */
	public static function provideWellFormedLanguageTags() {
		return [
			[ 'fr', 'two-letter code' ],
			[ 'fr-latn', 'two-letter code with lower case script code' ],
			[ 'fr-Latn-FR', 'two-letter code with title case script code and uppercase country code' ],
			[ 'fr-Latn-419', 'two-letter code with title case script code and region number' ],
			[ 'fr-FR', 'two-letter code with uppercase' ],
			[ 'ax-TZ', 'Not in the registry, but well-formed' ],
			[ 'fr-shadok', 'two-letter code with variant' ],
			[ 'fr-y-myext-myext2', 'non-x singleton' ],
			[ 'fra-Latn', 'ISO 639 can be 3-letters' ],
			[ 'fra', 'three-letter language code' ],
			[ 'fra-FX', 'three-letter language code with country code' ],
			[ 'i-klingon', 'grandfathered with singleton' ],
			[ 'I-kLINgon', 'tags are case-insensitive...' ],
			[ 'no-bok', 'grandfathered without singleton' ],
			[ 'i-enochian', 'Grandfathered' ],
			[ 'x-fr-CH', 'private use' ],
			[ 'es-419', 'two-letter code with region number' ],
			[ 'en-Latn-GB-boont-r-extended-sequence-x-private', 'weird, but well-formed' ],
			[ 'ab-x-abc-x-abc', 'anything goes after x' ],
			[ 'ab-x-abc-a-a', 'anything goes after x, including several non-x singletons' ],
			[ 'i-default', 'grandfathered' ],
			[ 'abcd-Latn', 'Language of 4 chars reserved for future use' ],
			[ 'AaBbCcDd-x-y-any-x', 'Language of 5-8 chars, registered' ],
			[ 'de-CH-1901', 'with country and year' ],
			[ 'en-US-x-twain', 'with country and singleton' ],
			[ 'zh-cmn', 'three-letter variant' ],
			[ 'zh-cmn-Hant', 'three-letter variant and script' ],
			[ 'zh-cmn-Hant-HK', 'three-letter variant, script and country' ],
			[ 'xr-p-lze', 'Extension' ],
			[ 'en-GB-oed', 'grandfathered language tag, mixed capitalisation' ],
			[ 'en-gb-oed', 'grandfathered language tag, all lowercase' ],
		];
	}

	/**
	 * @dataProvider provideMalformedLanguageTags
	 */
	public function testMalformedLanguageTag( $code, $message = '' ) {
		$this->assertFalse(
			LanguageCode::isWellFormedLanguageTag( $code ),
			"validating that code $code is a malformed language tag - $message"
		);
	}

	/**
	 * The test cases are based on the tests in the GaBuZoMeu parser
	 * written by Stéphane Bortzmeyer <bortzmeyer@nic.fr>
	 * and distributed as free software, under the GNU General Public Licence.
	 * https://www.bortzmeyer.org/gabuzomeu-parsing-language-tags.html
	 */
	public static function provideMalformedLanguageTags() {
		return [
			[ 'f', 'language too short' ],
			[ 'f-Latn', 'language too short with script' ],
			[ 'xr-lxs-qut', 'variants too short' ], # extlangS
			[ 'fr-Latn-F', 'region too short' ],
			[ 'a-value', 'language too short with region' ],
			[ 'tlh-a-b-foo', 'valid three-letter with wrong variant' ],
			[
				'i-notexist',
				'grandfathered but not registered: invalid, even if we only test well-formedness'
			],
			[ 'abcdefghi-012345678', 'numbers too long' ],
			[ 'ab-abc-abc-abc-abc', 'invalid extensions' ],
			[ 'ab-abcd-abc', 'invalid extensions' ],
			[ 'ab-ab-abc', 'invalid extensions' ],
			[ 'ab-123-abc', 'invalid extensions' ],
			[ 'a-Hant-ZH', 'short language with valid extensions' ],
			[ 'a1-Hant-ZH', 'invalid character in language' ],
			[ 'ab-abcde-abc', 'invalid extensions' ],
			[ 'ab-1abc-abc', 'invalid characters in extensions' ],
			[ 'ab-ab-abcd', 'invalid order of extensions' ],
			[ 'ab-123-abcd', 'invalid order of extensions' ],
			[ 'ab-abcde-abcd', 'invalid extensions' ],
			[ 'ab-1abc-abcd', 'invalid characters in extensions' ],
			[ 'ab-a-b', 'extensions too short' ],
			[ 'ab-a-x', 'extensions too short, even with singleton' ],
			[ 'ab--ab', 'two separators' ],
			[ 'ab-abc-', 'separator in the end' ],
			[ '-ab-abc', 'separator in the beginning' ],
			[ 'abcd-efg', 'language too long' ],
			[ 'aabbccddE', 'tag too long' ],
			[ 'pa_guru', 'A tag with underscore is invalid in strict mode' ],
			[ 'de-f', 'subtag too short' ],
			[ 'zh-classical', 'internal language code zh-classical is not a well-formed language tag' ],
		];
	}

	public function testLenientLanguageTag() {
		$this->assertTrue(
			LanguageCode::isWellFormedLanguageTag( 'pa_guru', true ),
			'pa_guru is a well-formed language tag in lenient mode'
		);
	}

}