* @author fdcn * @author shinjiman * @author PhiLiP */ namespace MediaWiki\Language; use InvalidArgumentException; use MediaWiki\Context\RequestContext; use MediaWiki\Debug\DeprecationHelper; use MediaWiki\HookContainer\HookRunner; use MediaWiki\Html\Html; use MediaWiki\Linker\LinkTarget; use MediaWiki\Logger\LoggerFactory; use MediaWiki\MainConfigNames; use MediaWiki\MediaWikiServices; use MediaWiki\Page\PageIdentity; use MediaWiki\Parser\Parser; use MediaWiki\Parser\Sanitizer; use MediaWiki\Revision\RevisionRecord; use MediaWiki\Revision\SlotRecord; use MediaWiki\StubObject\StubUserLang; use MediaWiki\Title\Title; use MediaWiki\User\User; use RuntimeException; use StringUtils; use UnexpectedValueException; use Wikimedia\ObjectCache\BagOStuff; /** * Base class for multi-variant language conversion. * * @ingroup Language */ abstract class LanguageConverter implements ILanguageConverter { use DeprecationHelper; /** * languages supporting variants * @since 1.20 * @var string[] */ public static $languagesWithVariants = [ 'ban', 'en', 'crh', 'gan', 'iu', 'ku', 'mni', 'sh', 'shi', 'sr', 'tg', 'tly', 'uz', 'wuu', 'zgh', 'zh', ]; /** * static default variant of languages supporting variants * for use with DefaultOptionsLookup.php * @since 1.40 * @var array */ public static $languagesWithStaticDefaultVariant = [ 'ban' => 'ban', 'en' => 'en', 'crh' => 'crh', 'gan' => 'gan', 'iu' => 'iu', 'ku' => 'ku', 'mni' => 'mni', 'sh' => 'sh-latn', 'shi' => 'shi', 'sr' => 'sr', 'tg' => 'tg', 'tly' => 'tly', 'uz' => 'uz', 'wuu' => 'wuu', 'zgh' => 'zgh', 'zh' => 'zh', ]; /** @var bool */ private $mTablesLoaded = false; /** @var ReplacementArray[] */ protected $mTables = []; /** @var Language|StubUserLang */ private $mLangObj; /** @var string|false */ private $mConvRuleTitle = false; /** @var string|null */ private $mURLVariant; /** @var string|null */ private $mUserVariant; /** @var string|null */ private $mHeaderVariant; /** @var int */ private $mMaxDepth = 10; /** @var string|null */ 
private $mVarSeparatorPattern;

	/** Version marker for cached data; its consumers are outside this part of the file. */
	private const CACHE_VERSION_KEY = 'VERSION 7';

	/**
	 * Store the language object this converter works for.
	 *
	 * @param Language|StubUserLang $langobj
	 */
	public function __construct( $langobj ) {
		$this->mLangObj = $langobj;
	}

	/**
	 * Return the code of the language that owns this converter (the "main"
	 * language code).
	 * The page language code is the same as this code.
	 * The main code is not necessarily one of the variant codes.
	 * @since 1.36
	 *
	 * @return string
	 */
	abstract public function getMainCode(): string;

	/**
	 * Return the static default variant.
	 * This specifies the default variant form for languages where it differs
	 * from the default "unconverted/mixed-variant form".
	 * @since 1.40
	 *
	 * @return string Entry of $languagesWithStaticDefaultVariant for the main
	 *  code, or the main code itself when there is no entry
	 */
	protected function getStaticDefaultVariant(): string {
		$mainCode = $this->getMainCode();
		if ( isset( self::$languagesWithStaticDefaultVariant[$mainCode] ) ) {
			return self::$languagesWithStaticDefaultVariant[$mainCode];
		}

		return $mainCode;
	}

	/**
	 * Return the variants supported by the language.
	 * @since 1.36
	 *
	 * @return array
	 */
	abstract protected function getLanguageVariants(): array;

	/**
	 * Return the fallbacks of the language variants.
	 * @since 1.36
	 *
	 * @return array
	 */
	abstract public function getVariantsFallbacks(): array;

	/**
	 * Return the strings that map to the flags.
	 *
	 * The result is made of the built-in flags, the flags contributed by
	 * getAdditionalFlags(), and one self-mapped entry per enabled variant.
	 * @since 1.36
	 *
	 * @return array
	 */
	final public function getFlags(): array {
		$flags = array_merge(
			[
				// Reserved for the program itself:
				// 'S' show the converted text
				// '+' add rules for alltext
				// 'E' the flags have an error
				// add rule for convert code (all text converted)
				'A' => 'A',
				// title convert
				'T' => 'T',
				// raw content
				'R' => 'R',
				// convert description (subclass implement)
				'D' => 'D',
				// remove convert (not implement)
				'-' => '-',
				// add rule for convert code (but no display in placed code)
				'H' => 'H',
				// current variant name
				'N' => 'N',
			],
			$this->getAdditionalFlags()
		);

		// Every enabled variant code doubles as its own flag.
		foreach ( $this->getVariants() as $variantCode ) {
			$flags[$variantCode] = $variantCode;
		}

		return $flags;
	}

	/**
	 * Provides additional flags for converter.
By default, it returns an empty array and
	 * typically should be overridden by implementation of converter.
	 *
	 * @return array
	 */
	protected function getAdditionalFlags(): array {
		return [];
	}

	/**
	 * Get manual level limit for supported variants.
	 *
	 * Every variant returned by getVariants() gets an entry: the value from
	 * getAdditionalManualLevel() when it has one for that variant, and
	 * 'bidirectional' otherwise.
	 * @since 1.36
	 *
	 * @return array
	 */
	final public function getManualLevel() {
		$manualLevel = $this->getAdditionalManualLevel();
		$result = [];
		foreach ( $this->getVariants() as $v ) {
			// array_key_exists() rather than ??, so that an explicit null entry is kept
			if ( array_key_exists( $v, $manualLevel ) ) {
				$result[$v] = $manualLevel[$v];
			} else {
				$result[$v] = 'bidirectional';
			}
		}
		return $result;
	}

	/**
	 * Provides the manual levels of the variants, keyed by variant code.
	 * By default, this function returns an empty array and typically should
	 * be overridden by the implementation of converter. Variants without an
	 * entry are reported as 'bidirectional' by getManualLevel().
	 * @since 1.36
	 *
	 * @return array
	 */
	protected function getAdditionalManualLevel(): array {
		return [];
	}

	/**
	 * Get desc code separator. By default returns ":", can be overridden by
	 * implementation of converter.
	 * @since 1.36
	 *
	 * @return string
	 */
	public function getDescCodeSeparator(): string {
		return ':';
	}

	/**
	 * Get desc var separator. By default returns ";", can be overridden by
	 * implementation of converter.
	 * @since 1.36
	 *
	 * @return string
	 */
	public function getDescVarSeparator(): string {
		return ';';
	}

	/**
	 * Get variant names.
	 *
	 * This returns the language names provided by the LanguageNameUtils service.
	 *
	 * @return array
	 */
	public function getVariantNames(): array {
		return MediaWikiServices::getInstance()
			->getLanguageNameUtils()
			->getLanguageNames();
	}

	/**
	 * Get the variants of the language, without those listed in the
	 * DisabledVariants configuration setting.
	 *
	 * The array keys of getLanguageVariants() are preserved (array_diff), so
	 * the result is not necessarily a gap-free list.
	 *
	 * @return array
	 */
	final public function getVariants() {
		$disabledVariants = MediaWikiServices::getInstance()->getMainConfig()->get(
			MainConfigNames::DisabledVariants
		);
		return array_diff( $this->getLanguageVariants(), $disabledVariants );
	}

	/**
	 * Get the fallback entry of a variant.
	 *
	 * @param string $variant
	 * @return mixed The entry of getVariantsFallbacks() for $variant, or the
	 *  static default variant when there is no such entry (or it is null)
	 */
	public function getVariantFallbacks( $variant ) {
		return $this->getVariantsFallbacks()[$variant] ?? $this->getStaticDefaultVariant();
	}

	/**
	 * Get the current conversion rule title.
	 *
	 * @return string|false
	 */
	public function getConvRuleTitle() {
		return $this->mConvRuleTitle;
	}

	/**
	 * Get the preferred variant for the current request.
	 *
	 * Sources are tried in this order: the URL variant (which the
	 * GetLangPreferredVariant hook may replace), then the user's variant
	 * (for a registered user that is safe to load) or otherwise the
	 * Accept-Language header, then the DefaultLanguageVariant setting. The
	 * result is validated; when nothing valid is left, the static default
	 * variant is returned.
	 *
	 * @return string
	 */
	public function getPreferredVariant() {
		$req = $this->getURLVariant();

		$services = MediaWikiServices::getInstance();
		( new HookRunner( $services->getHookContainer() ) )->onGetLangPreferredVariant( $req );

		if ( !$req ) {
			$user = RequestContext::getMain()->getUser();
			// NOTE: For some calls there may not be a context user or session that is safe
			// to use, see (T235360)
			// Use case: During user autocreation, UserNameUtils::isUsable is called which uses interface
			// messages for reserved usernames.
			if ( $user->isSafeToLoad() && $user->isRegistered() ) {
				$req = $this->getUserVariant( $user );
			} else {
				$req = $this->getHeaderVariant();
			}
		}

		$defaultLanguageVariant = $services->getMainConfig()
			->get( MainConfigNames::DefaultLanguageVariant );
		if ( !$req && $defaultLanguageVariant ) {
			$req = $this->validateVariant( $defaultLanguageVariant );
		}

		// The hook may have supplied an arbitrary value, so validate once more.
		$req = $this->validateVariant( $req );

		// This function, unlike the other get*Variant functions, is
		// not memoized (i.e., its return value is not cached) since
		// new information might appear during processing after this
		// is first called.
		return $req ?? $this->getStaticDefaultVariant();
	}

	/**
	 * Get the default variant, ignoring any user preference.
	 *
	 * Sources are tried in this order: the URL variant, the Accept-Language
	 * header, the DefaultLanguageVariant setting, and finally the static
	 * default variant.
	 *
	 * @return string
	 */
	public function getDefaultVariant() {
		$defaultLanguageVariant = MediaWikiServices::getInstance()->getMainConfig()->get(
			MainConfigNames::DefaultLanguageVariant
		);

		$req = $this->getURLVariant() ?? $this->getHeaderVariant();

		if ( !$req && $defaultLanguageVariant ) {
			$req = $this->validateVariant( $defaultLanguageVariant );
		}

		return $req ?? $this->getStaticDefaultVariant();
	}

	/**
	 * Validate a variant code against the enabled variants.
	 *
	 * The code is lower-cased and deprecated codes are replaced before the
	 * comparison. A code that is the BCP 47 form of an enabled variant is
	 * mapped to the internal code of that variant.
	 *
	 * @param string|null $variant The variant to validate
	 * @return string|null The internal variant code if $variant is valid,
	 *  null otherwise
	 */
	public function validateVariant( $variant = null ) {
		if ( $variant === null ) {
			return null;
		}

		// Our internal variants are always lower-case; the variant we
		// are validating may have mixed cases.
		$variant = LanguageCode::replaceDeprecatedCodes( strtolower( $variant ) );

		// getVariants() looks up the configuration and filters the list on
		// every call, so fetch it only once.
		$variants = $this->getVariants();

		// Strict comparison, as in the BCP 47 loop below: numeric-looking
		// strings must not match each other by type juggling.
		if ( in_array( $variant, $variants, true ) ) {
			return $variant;
		}

		// Browsers are supposed to use BCP 47 standard in the
		// Accept-Language header, but not all of our internal
		// mediawiki variant codes are BCP 47. Map BCP 47 code
		// to our internal code.
		foreach ( $variants as $v ) {
			// Case-insensitive match (BCP 47 is mixed-case)
			if ( strtolower( LanguageCode::bcp47( $v ) ) === $variant ) {
				return $v;
			}
		}

		return null;
	}

	/**
	 * Get the variant requested through the URL.
	 *
	 * The 'variant' request parameter is used, with 'uselang' as a
	 * substitute when 'variant' is empty. A valid result is remembered for
	 * later calls; as long as nothing valid was found, the request is
	 * examined again on every call.
	 *
	 * @return string|null Variant if one found, null otherwise
	 */
	public function getURLVariant() {
		if ( $this->mURLVariant ) {
			return $this->mURLVariant;
		}

		$request = RequestContext::getMain()->getRequest();
		// see if the preference is set in the request
		$ret = $request->getText( 'variant' );

		if ( !$ret ) {
			$ret = $request->getVal( 'uselang' );
		}

		$this->mURLVariant = $this->validateVariant( $ret );
		return $this->mURLVariant;
	}

	/**
	 * Determine if the user has a variant set.
	 *
	 * For a registered user the 'variant' option is read when the main code
	 * of this converter is the content language code, and the
	 * 'variant-<main code>' option otherwise. A valid result is remembered
	 * in this object and returned by later calls, whichever user they pass.
	 *
	 * @param User $user
	 * @return string|null Variant if one found, null otherwise
	 */
	protected function getUserVariant( User $user ) {
		// This should only be called within the class after the user is known to be
		// safe to load and logged in, but check just in case.
		if ( !$user->isSafeToLoad() ) {
			return null;
		}

		if ( !$this->mUserVariant ) {
			$services = MediaWikiServices::getInstance();
			if ( $user->isRegistered() ) {
				// Get language variant preference from logged in users
				if (
					$this->getMainCode() ===
					$services->getContentLanguage()->getCode()
				) {
					$optionName = 'variant';
				} else {
					$optionName = 'variant-' . $this->getMainCode();
				}
			} else {
				// figure out user lang without constructing wgLang to avoid
				// infinite recursion
				$optionName = 'language';
			}
			$ret = $services->getUserOptionsLookup()->getOption( $user, $optionName );

			$this->mUserVariant = $this->validateVariant( $ret );
		}

		return $this->mUserVariant;
	}

	/**
	 * Determine the language variant from the Accept-Language header.
*
	 * Accepted languages that are valid variants themselves take precedence;
	 * the fallbacks of the accepted languages are only tried when none of
	 * them is. A variant found this way is remembered for later calls.
	 *
	 * @return string|null Variant if one found, null otherwise
	 */
	protected function getHeaderVariant() {
		// Reuse the variant found by an earlier call.
		if ( $this->mHeaderVariant ) {
			return $this->mHeaderVariant;
		}

		// Language codes announced by the client in the HTTP header.
		$acceptedCodes = array_keys( RequestContext::getMain()->getRequest()->getAcceptLang() );
		if ( !$acceptedCodes ) {
			return null;
		}

		// Fallback candidates are only collected here and examined once all
		// accepted codes have failed to validate.
		$deferred = [];
		foreach ( $acceptedCodes as $code ) {
			$this->mHeaderVariant = $this->validateVariant( $code );
			if ( $this->mHeaderVariant ) {
				return $this->mHeaderVariant;
			}

			$fallbacks = $this->getVariantFallbacks( $code );
			if ( is_array( $fallbacks ) ) {
				$deferred = array_merge( $deferred, $fallbacks );
			} elseif ( is_string( $fallbacks ) && $fallbacks !== $this->getStaticDefaultVariant() ) {
				// A single fallback equal to the static default is not recorded.
				$deferred[] = $fallbacks;
			}
		}

		// No accepted code was a variant: try the recorded fallbacks.
		foreach ( array_unique( $deferred ) as $code ) {
			$this->mHeaderVariant = $this->validateVariant( $code );
			if ( $this->mHeaderVariant ) {
				return $this->mHeaderVariant;
			}
		}

		return $this->mHeaderVariant;
	}

	public function autoConvert( $text, $toVariant = false ) {
		$this->loadTables();

		if ( !$toVariant ) {
			$toVariant = $this->getPreferredVariant();
			if ( !$toVariant ) {
				return $text;
			}
		}

		if ( $this->guessVariant( $text, $toVariant ) ) {
			return $text;
		}
		/**
		 * We convert everything except:
		 * 1. HTML markups (anything between < and >)
		 * 2. HTML entities
		 * 3. placeholders created by the parser
		 * IMPORTANT: Beware of failure from pcre.backtrack_limit (T124404).
		 * Minimize the use of backtracking where possible.
		 */
		static $reg;
		if ( $reg === null ) {
			$marker = '|' . Parser::MARKER_PREFIX .
'[^\x7f]++\x7f'; // this one is needed when the text is inside an HTML markup $htmlfix = '|<[^>\004]++(?=\004$)|^[^<>]*+>'; // Optimize for the common case where these tags have // few or no children. Thus try and possessively get as much as // possible, and only engage in backtracking when we hit a '<'. // disable convert to variants between tags $codefix = '[^<]*+(?:(?:(?!<\/code>).)[^<]*+)*+<\/code>|'; // disable conversion of