1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
|
<?php
namespace MediaWiki\Parser\Parsoid;
use MediaWiki\Language\LanguageCode;
use MediaWiki\Languages\LanguageConverterFactory;
use MediaWiki\Languages\LanguageFactory;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Parser\ParserOutput;
use MediaWiki\Parser\Parsoid\Config\PageConfigFactory;
use MediaWiki\Rest\HttpException;
use MediaWiki\Rest\LocalizedHttpException;
use MediaWiki\Revision\RevisionAccessException;
use MediaWiki\Title\Title;
use MediaWiki\Title\TitleFactory;
use Wikimedia\Bcp47Code\Bcp47Code;
use Wikimedia\Bcp47Code\Bcp47CodeValue;
use Wikimedia\Message\MessageValue;
use Wikimedia\Parsoid\Config\PageConfig;
use Wikimedia\Parsoid\Config\SiteConfig;
use Wikimedia\Parsoid\Core\PageBundle;
use Wikimedia\Parsoid\DOM\Element;
use Wikimedia\Parsoid\Parsoid;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMUtils;
/**
 * Performs language variant conversion on Parsoid output.
 *
 * Conversion is delegated to Parsoid when Parsoid reports that it implements
 * language conversion for the target variant. Otherwise the core
 * LanguageConverter is used as a fallback, unless that fallback has been
 * turned off with ::disableFallbackLanguageConverter().
 *
 * @since 1.40
 * @unstable should be marked stable before 1.40 release
 */
class LanguageVariantConverter {
	private PageConfigFactory $pageConfigFactory;

	/** Lazily created by ::getPageConfig(), or injected through ::setPageConfig(). */
	private ?PageConfig $pageConfig = null;

	private PageIdentity $pageIdentity;

	/** Title derived from $pageIdentity; used as the last resort for the page language. */
	private Title $pageTitle;

	private Parsoid $parsoid;

	private SiteConfig $siteConfig;

	private LanguageConverterFactory $languageConverterFactory;

	private LanguageFactory $languageFactory;

	/**
	 * Page language override from the Content-Language header.
	 */
	private ?Bcp47Code $pageLanguageOverride = null;

	/** Whether the core LanguageConverter may be used when Parsoid cannot convert. */
	private bool $isFallbackLanguageConverterEnabled = true;

	/**
	 * @param PageIdentity $pageIdentity The page whose content is being converted.
	 * @param PageConfigFactory $pageConfigFactory
	 * @param Parsoid $parsoid
	 * @param SiteConfig $siteConfig
	 * @param TitleFactory $titleFactory Used once, to build the Title for $pageIdentity.
	 * @param LanguageConverterFactory $languageConverterFactory
	 * @param LanguageFactory $languageFactory
	 */
	public function __construct(
		PageIdentity $pageIdentity,
		PageConfigFactory $pageConfigFactory,
		Parsoid $parsoid,
		SiteConfig $siteConfig,
		TitleFactory $titleFactory,
		LanguageConverterFactory $languageConverterFactory,
		LanguageFactory $languageFactory
	) {
		$this->pageConfigFactory = $pageConfigFactory;
		$this->pageIdentity = $pageIdentity;
		$this->parsoid = $parsoid;
		$this->siteConfig = $siteConfig;
		$this->pageTitle = $titleFactory->newFromPageIdentity( $this->pageIdentity );
		$this->languageConverterFactory = $languageConverterFactory;
		$this->languageFactory = $languageFactory;
	}

	/**
	 * Set the PageConfig object to be used during language variant conversion.
	 * If not provided, the object will be created.
	 *
	 * @param PageConfig $pageConfig
	 * @return void
	 */
	public function setPageConfig( PageConfig $pageConfig ) {
		$this->pageConfig = $pageConfig;
	}

	/**
	 * Set the page content language override.
	 *
	 * Once set, this value takes precedence over every other source of the
	 * page language (see ::getPageLanguage()).
	 *
	 * @param Bcp47Code $language
	 * @return void
	 */
	public function setPageLanguageOverride( Bcp47Code $language ) {
		$this->pageLanguageOverride = $language;
	}

	/**
	 * Perform variant conversion on a PageBundle object.
	 *
	 * @param PageBundle $pageBundle
	 * @param Bcp47Code $targetVariant
	 * @param ?Bcp47Code $sourceVariant Null triggers auto-detection of the source variant.
	 *
	 * @return PageBundle The converted PageBundle, or the object passed in as
	 *   $pageBundle if the conversion is not supported. When the core
	 *   LanguageConverter fallback runs, a new PageBundle is always returned,
	 *   even if the converter does not know the target variant (in which case
	 *   it carries the original HTML plus the added marker comment and headers).
	 * @throws HttpException If the PageConfig cannot be created because the
	 *   revision is not accessible (see ::getPageConfig()).
	 */
	public function convertPageBundleVariant(
		PageBundle $pageBundle,
		Bcp47Code $targetVariant,
		?Bcp47Code $sourceVariant = null
	): PageBundle {
		[ $pageLanguage, $sourceVariant ] =
			$this->getBaseAndSourceLanguage( $pageBundle, $sourceVariant );

		if ( !$this->siteConfig->langConverterEnabledBcp47( $pageLanguage ) ) {
			// If the language doesn't support variants, just return the content unmodified.
			return $pageBundle;
		}

		$pageConfig = $this->getPageConfig( $pageLanguage, $sourceVariant );

		if ( $this->parsoid->implementsLanguageConversionBcp47( $pageConfig, $targetVariant ) ) {
			// Parsoid can do the conversion itself.
			return $this->parsoid->pb2pb(
				$pageConfig, 'variant', $pageBundle,
				[
					'variant' => [
						'source' => $sourceVariant,
						'target' => $targetVariant,
					]
				]
			);
		}

		if ( !$this->isFallbackLanguageConverterEnabled ) {
			// Fallback variant conversion is not enabled, return the page bundle as is.
			return $pageBundle;
		}

		// LanguageConverter::hasVariant and LanguageConverter::convertTo
		// could take a string|Bcp47Code in the future, which would
		// allow us to avoid the $targetVariantCode conversion here.
		$baseLanguage = $this->languageFactory->getParentLanguage( $targetVariant );
		$languageConverter = $this->languageConverterFactory->getLanguageConverter( $baseLanguage );
		$targetVariantCode = $this->languageFactory->getLanguage( $targetVariant )->getCode();

		if ( $languageConverter->hasVariant( $targetVariantCode ) ) {
			// NOTE: This is not a convert() because we have the exact desired variant
			// and don't need to compute a preferred variant based on a base language.
			// Also see T267067 for why convert() should be avoided.
			$convertedHtml = $languageConverter->convertTo( $pageBundle->html, $targetVariantCode );
			$pageVariant = $targetVariant;
		} else {
			// No conversion possible - pass through original HTML in original language
			$convertedHtml = $pageBundle->html;
			$pageVariant = $pageConfig->getPageLanguageBcp47();
		}

		// Add a note so that we can identify what was used to perform the variant conversion
		$msg = "<!-- Variant conversion performed using the core LanguageConverter -->";
		$convertedHtml = $msg . $convertedHtml;

		// NOTE: Keep this in sync with code in Parsoid.php in Parsoid repo
		// Add meta information that Parsoid normally adds
		$headers = [
			'content-language' => $pageVariant->toBcp47Code(),
			'vary' => [ 'Accept', 'Accept-Language' ]
		];
		$doc = DOMUtils::parseHTML( '' );
		$doc->appendChild( $doc->createElement( 'head' ) );
		DOMUtils::addHttpEquivHeaders( $doc, $headers );
		$docElt = $doc->documentElement;
		'@phan-var Element $docElt';
		$docHtml = DOMCompat::getOuterHTML( $docElt );

		// Splice the serialized header document in place of the "</body>" that is
		// appended here.
		// NOTE(review): preg_replace() replaces *every* "</body>" in the subject and
		// interprets "$n" / "\n" references in the replacement string; left as is
		// to stay in sync with Parsoid - confirm this is intended.
		$convertedHtml = preg_replace( "#</body>#", $docHtml, "$convertedHtml</body>" );

		return new PageBundle(
			$convertedHtml, [], [], $pageBundle->version, $headers
		);
	}

	/**
	 * Perform variant conversion on a ParserOutput object.
	 *
	 * The ParserOutput is turned into a PageBundle, converted with
	 * ::convertPageBundleVariant(), and the result is turned back into a
	 * ParserOutput based on the original one.
	 *
	 * @param ParserOutput $parserOutput
	 * @param Bcp47Code $targetVariant
	 * @param ?Bcp47Code $sourceVariant
	 *
	 * @return ParserOutput
	 */
	public function convertParserOutputVariant(
		ParserOutput $parserOutput,
		Bcp47Code $targetVariant,
		?Bcp47Code $sourceVariant = null
	): ParserOutput {
		$pageBundle = PageBundleParserOutputConverter::pageBundleFromParserOutput( $parserOutput );
		$modifiedPageBundle = $this->convertPageBundleVariant( $pageBundle, $targetVariant, $sourceVariant );

		return PageBundleParserOutputConverter::parserOutputFromPageBundle( $modifiedPageBundle, $parserOutput );
	}

	/**
	 * Disable fallback language variant converter
	 */
	public function disableFallbackLanguageConverter(): void {
		$this->isFallbackLanguageConverterEnabled = false;
	}

	/**
	 * Return the PageConfig to use for conversion, creating and caching it on
	 * first use.
	 *
	 * If a PageConfig is already present (injected via ::setPageConfig() or
	 * created by an earlier call), it is returned unchanged and both
	 * parameters are ignored.
	 *
	 * @param Bcp47Code $pageLanguage Page language passed to the factory.
	 * @param ?Bcp47Code $sourceVariant If given, set as the variant on a newly
	 *   created PageConfig.
	 *
	 * @return PageConfig
	 * @throws LocalizedHttpException With status 404 if the factory raises a
	 *   RevisionAccessException.
	 */
	private function getPageConfig( Bcp47Code $pageLanguage, ?Bcp47Code $sourceVariant ): PageConfig {
		if ( $this->pageConfig ) {
			return $this->pageConfig;
		}

		try {
			$this->pageConfig = $this->pageConfigFactory->create(
				$this->pageIdentity,
				null,
				null,
				null,
				$pageLanguage
			);

			if ( $sourceVariant ) {
				$this->pageConfig->setVariantBcp47( $sourceVariant );
			}
		} catch ( RevisionAccessException $exception ) {
			// TODO: Throw a different exception, this class should not know
			//       about HTTP status codes.
			throw new LocalizedHttpException( new MessageValue( "rest-specified-revision-unavailable" ), 404 );
		}

		return $this->pageConfig;
	}

	/**
	 * Try to determine the page's language code as follows:
	 *
	 * First consider any value set by calling ::setPageLanguageOverride();
	 * this would have come from a Content-Language header.
	 *
	 * If ::setPageLanguageOverride() has not been called, check for a
	 * content-language header in $pageBundle, which should be
	 * equivalent. These are used when the title/article doesn't
	 * (yet) exist.
	 *
	 * If these are not given, use the $default if given; this is used
	 * to allow additional parameters to the request to be used as
	 * fallbacks.
	 *
	 * If we don't have $default, but we do have a PageConfig in
	 * $this->pageConfig, return $this->pageConfig->getPageLanguageBcp47().
	 *
	 * Finally, fall back to $this->pageTitle->getPageLanguage().
	 *
	 * @param PageBundle $pageBundle
	 * @param Bcp47Code|null $default A default language, used after
	 *   Content-Language but before PageConfig/Title lookup.
	 *
	 * @return Bcp47Code the page language; may be a variant.
	 */
	private function getPageLanguage( PageBundle $pageBundle, ?Bcp47Code $default = null ): Bcp47Code {
		// If a language was set by calling setPageLanguageOverride(), always use it!
		if ( $this->pageLanguageOverride ) {
			return $this->pageLanguageOverride;
		}

		// If the page bundle contains a language code, use that.
		$pageBundleLanguage = $pageBundle->headers[ 'content-language' ] ?? null;
		if ( $pageBundleLanguage ) {
			// The HTTP header will contain a BCP-47 language code, not a
			// mediawiki-internal one.
			return new Bcp47CodeValue( $pageBundleLanguage );
		}

		// NOTE: Use explicit default *before* we try PageConfig, because PageConfig::getPageLanguage()
		//       falls back to Title::getPageLanguage(). If we did that first, $default would never be used.
		if ( $default ) {
			return $default;
		}

		// If we have a PageConfig, we can ask it for the page's language. Note that this will fall back to
		// Title::getPageLanguage(), so it has to be the last thing we try.
		if ( $this->pageConfig ) {
			return $this->pageConfig->getPageLanguageBcp47();
		}

		// Finally, just go by the code associated with the title. This may come from the database or
		// it may be determined based on the title itself.
		return $this->pageTitle->getPageLanguage();
	}

	/**
	 * Determine the codes of the base language and the source variant.
	 *
	 * The base language will be used to find the appropriate LanguageConverter.
	 * It should never be a variant.
	 *
	 * The source variant will be used to instruct the LanguageConverter.
	 * It should always be a variant (or null to trigger auto-detection of
	 * the source variant).
	 *
	 * If no parent language can be determined for the page language, the
	 * source variant is reported as null, since there is no converter it
	 * could be checked against.
	 *
	 * @param PageBundle $pageBundle
	 * @param ?Bcp47Code $sourceLanguage
	 *
	 * @return array{0:Bcp47Code,1:?Bcp47Code} [ Bcp47Code $pageLanguage, ?Bcp47Code $sourceLanguage ]
	 */
	private function getBaseAndSourceLanguage( PageBundle $pageBundle, ?Bcp47Code $sourceLanguage ): array {
		// Try to determine the language code associated with the content of the page.
		// The result may be a variant code.
		$baseLanguage = $this->getPageLanguage( $pageBundle, $sourceLanguage );

		// To find out if $baseLanguage is actually a variant, get the parent language and compare.
		$parentLang = $this->languageFactory->getParentLanguage( $baseLanguage );

		// If $parentLang is not the same language as $baseLanguage, this means that
		// $baseLanguage is a variant. In that case, set $sourceLanguage to that
		// variant (unless $sourceLanguage is already set), and set $baseLanguage
		// to the $parentLang
		if ( $parentLang && strcasecmp( $parentLang->toBcp47Code(), $baseLanguage->toBcp47Code() ) !== 0 ) {
			if ( !$sourceLanguage ) {
				$sourceLanguage = $baseLanguage;
			}
			$baseLanguage = $parentLang;
		}

		if ( $sourceLanguage !== null ) {
			$sourceIsVariant = false;

			// $parentLang may be null (the check above already allows for that).
			// Without a parent language there is nothing $sourceLanguage could be
			// a variant of, so skip the lookup instead of dereferencing null.
			if ( $parentLang ) {
				$parentConverter = $this->languageConverterFactory->getLanguageConverter( $parentLang );
				$sourceIsVariant = (
					strcasecmp( $parentLang->toBcp47Code(), $sourceLanguage->toBcp47Code() ) !== 0 &&
					$parentConverter->hasVariant(
						LanguageCode::bcp47ToInternal( $sourceLanguage->toBcp47Code() )
					)
				);
			}

			// If the source variant isn't actually a variant, trigger auto-detection
			if ( !$sourceIsVariant ) {
				$sourceLanguage = null;
			}
		}

		return [ $baseLanguage, $sourceLanguage ];
	}
}
|