diff options
author | Reedy <reedy@wikimedia.org> | 2023-08-23 12:13:16 +0100 |
---|---|---|
committer | Thiemo Kreuz (WMDE) <thiemo.kreuz@wikimedia.de> | 2024-02-01 08:01:27 +0000 |
commit | d3b14ec8628372b607e938d8ca5003cc0783b14d (patch) | |
tree | 27c7fabd4ec70d807c35f432f6ea6f5db1d754d1 /includes/content | |
parent | 5dca00526add34d5c566e882e6763baed9d80a38 (diff) | |
download | mediawikicore-d3b14ec8628372b607e938d8ca5003cc0783b14d.tar.gz mediawikicore-d3b14ec8628372b607e938d8ca5003cc0783b14d.zip |
WikiTextStructure/WikitextContentHandler: Minor cleanup
Change-Id: If2f8243867994609d82618e61ddaaacca3516990
Diffstat (limited to 'includes/content')
-rw-r--r-- | includes/content/WikiTextStructure.php | 84 | ||||
-rw-r--r-- | includes/content/WikitextContentHandler.php | 70 |
2 files changed, 63 insertions, 91 deletions
diff --git a/includes/content/WikiTextStructure.php b/includes/content/WikiTextStructure.php index 003eb2761da4..1046a4329b89 100644 --- a/includes/content/WikiTextStructure.php +++ b/includes/content/WikiTextStructure.php @@ -5,30 +5,20 @@ use MediaWiki\Parser\ParserOutput; use MediaWiki\Parser\Sanitizer; /** - * Class allowing to explore structure of parsed wikitext. + * Class allowing to explore the structure of parsed wikitext. */ class WikiTextStructure { - /** - * @var string - */ - private $openingText; - /** - * @var string - */ - private $allText; - /** - * @var string[] - */ - private $auxText = []; - /** - * @var ParserOutput - */ - private $parserOutput; + + private ?string $openingText = null; + private ?string $allText = null; + /** @var string[] */ + private array $auxText = []; + private ParserOutput $parserOutput; /** - * @var string[] selectors to elements that are excluded entirely from search + * Selectors to elements that are excluded entirely from search */ - private $excludedElementSelectors = [ + private const EXCLUDED_ELEMENT_SELECTORS = [ // "it looks like you don't have javascript enabled..." – do not need to index 'audio', 'video', // CSS stylesheets aren't content @@ -39,7 +29,7 @@ class WikiTextStructure { '.mw-cite-backlink', // Headings are already indexed in their own field. 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', - // Collapsed fields are hidden by default so we don't want them showing up. + // Collapsed fields are hidden by default, so we don't want them showing up. '.autocollapse', // Content explicitly decided to be not searchable by editors such // as custom navigation templates. 
@@ -49,9 +39,9 @@ class WikiTextStructure { ]; /** - * @var string[] selectors to elements that are considered auxiliary to article text for search + * Selectors to elements that are considered auxiliary to the article text for search */ - private $auxiliaryElementSelectors = [ + private const AUXILIARY_ELEMENT_SELECTORS = [ // Thumbnail captions aren't really part of the text proper '.thumbcaption', 'figcaption', @@ -73,15 +63,18 @@ class WikiTextStructure { } /** - * Get headings on the page. + * Gets headings from the page. * @return string[] * First strip out things that look like references. We can't use HTML filtering because * the references come back as <sup> tags without a class. To keep from breaking stuff like * ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>== - * we don't remove the whole <sup> tag. We also don't want to strip the <sup> tag and remove - * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo - * or something. Whatever. So we only strip things that look like <sup> tags wrapping a - * reference. And since the data looks like: + * we don't remove the whole <sup> tag. + * + * We also don't want to strip the <sup> tag and remove everything that looks like [2] because, + * I don't know, maybe there is a band named Word [2] Foo or something. Whatever. + * + * So we only strip things that look like <sup> tags wrapping a reference. And since the data + * looks like: * Reference in heading <sup>[1]</sup><sup>[2]</sup> * we can not really use HtmlFormatter as we have no suitable selector. 
*/ @@ -123,14 +116,16 @@ class WikiTextStructure { */ public static function parseSettingsInMessage( $message ) { $lines = explode( "\n", $message ); - $lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments - $lines = array_map( 'trim', $lines ); // Remove extra spaces - $lines = array_filter( $lines ); // Remove empty lines - return $lines; + // Remove comments + $lines = preg_replace( '/#.*$/', '', $lines ); + // Remove extra spaces + $lines = array_map( 'trim', $lines ); + // Remove empty lines + return array_filter( $lines ); } /** - * Get list of heading to ignore. + * Gets a list of headings to ignore. * @return string[] */ private function getIgnoredHeadings() { @@ -139,12 +134,13 @@ class WikiTextStructure { $ignoredHeadings = []; $source = wfMessage( 'search-ignored-headings' )->inContentLanguage(); if ( $source->isBlank() ) { - // Try old version too, just in case + // Try the old version too, just in case $source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage(); } if ( !$source->isDisabled() ) { $lines = self::parseSettingsInMessage( $source->plain() ); - $ignoredHeadings = $lines; // Now we just have headings! + // Now we just have headings! + $ignoredHeadings = $lines; } } return $ignoredHeadings; @@ -172,13 +168,13 @@ class WikiTextStructure { $formatter = new HtmlFormatter( $text ); // Strip elements from the page that we never want in the search text. - $formatter->remove( $this->excludedElementSelectors ); + $formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS ); $formatter->filterContent(); // Strip elements from the page that are auxiliary text. These will still be - // searched but matches will be ranked lower and non-auxiliary matches will be + // searched, but matches will be ranked lower and non-auxiliary matches will be // preferred in highlighting. 
- $formatter->remove( $this->auxiliaryElementSelectors ); + $formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS ); $auxiliaryElements = $formatter->filterContent(); $this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) ); foreach ( $auxiliaryElements as $auxiliaryElement ) { @@ -195,25 +191,25 @@ class WikiTextStructure { private function extractTextBeforeFirstHeading( $text ) { $matches = []; if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) { - // There isn't a first heading so we interpret this as the article + // There isn't a first heading, so we interpret this as the article // being entirely without heading. return null; } $text = substr( $text, 0, $matches[ 0 ][ 1 ] ); if ( !$text ) { - // There isn't any text before the first heading so we declare there isn't + // There isn't any text before the first heading, so we declare there isn't // a first heading. return null; } $formatter = new HtmlFormatter( $text ); - $formatter->remove( $this->excludedElementSelectors ); - $formatter->remove( $this->auxiliaryElementSelectors ); + $formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS ); + $formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS ); $formatter->filterContent(); $text = trim( Sanitizer::stripAllTags( $formatter->getText() ) ); if ( !$text ) { - // There isn't any text after filtering before the first heading so we declare + // There isn't any text after filtering before the first heading, so we declare // that there isn't a first heading. 
return null; } @@ -222,7 +218,7 @@ class WikiTextStructure { } /** - * @return string + * @return string|null */ public function getOpeningText() { $this->extractWikitextParts(); @@ -246,7 +242,7 @@ class WikiTextStructure { } /** - * Get the defaultsort property + * Get the "defaultsort" property * @return string|null */ public function getDefaultSort() { diff --git a/includes/content/WikitextContentHandler.php b/includes/content/WikitextContentHandler.php index 24a93e63e177..1420665e73a6 100644 --- a/includes/content/WikitextContentHandler.php +++ b/includes/content/WikitextContentHandler.php @@ -44,37 +44,14 @@ use Wikimedia\UUID\GlobalIdGenerator; */ class WikitextContentHandler extends TextContentHandler { - /** @var TitleFactory */ - private $titleFactory; + private TitleFactory $titleFactory; + private ParserFactory $parserFactory; + private GlobalIdGenerator $globalIdGenerator; + private LanguageNameUtils $languageNameUtils; + private LinkRenderer $linkRenderer; + private MagicWordFactory $magicWordFactory; + private ParsoidParserFactory $parsoidParserFactory; - /** @var ParserFactory */ - private $parserFactory; - - /** @var GlobalIdGenerator */ - private $globalIdGenerator; - - /** @var LanguageNameUtils */ - private $languageNameUtils; - - /** @var LinkRenderer */ - private $linkRenderer; - - /** @var MagicWordFactory */ - private $magicWordFactory; - - /** @var ParsoidParserFactory */ - private $parsoidParserFactory; - - /** - * @param string $modelId - * @param TitleFactory $titleFactory - * @param ParserFactory $parserFactory - * @param GlobalIdGenerator $globalIdGenerator - * @param LanguageNameUtils $languageNameUtils - * @param LinkRenderer $linkRenderer - * @param MagicWordFactory $magicWordFactory - * @param ParsoidParserFactory $parsoidParserFactory - */ public function __construct( string $modelId, TitleFactory $titleFactory, @@ -121,9 +98,9 @@ class WikitextContentHandler extends TextContentHandler { } else { $iw = 
$destination->getInterwiki(); if ( $iw && $this->languageNameUtils->getLanguageName( $iw, - LanguageNameUtils::AUTONYMS, - LanguageNameUtils::DEFINED ) - ) { + LanguageNameUtils::AUTONYMS, + LanguageNameUtils::DEFINED + ) ) { $optionalColon = ':'; } } @@ -212,10 +189,8 @@ class WikitextContentHandler extends TextContentHandler { $fields['opening_text']->setFlag( SearchIndexField::FLAG_SCORING | SearchIndexField::FLAG_NO_HIGHLIGHT ); - // Until we have full first-class content handler for files, we invoke it explicitly here - $fields = array_merge( $fields, $this->getFileHandler()->getFieldsForSearchIndex( $engine ) ); - - return $fields; + // Until we have the full first-class content handler for files, we invoke it explicitly here + return array_merge( $fields, $this->getFileHandler()->getFieldsForSearchIndex( $engine ) ); } public function getDataForSearchIndex( @@ -235,10 +210,12 @@ class WikitextContentHandler extends TextContentHandler { $fields['defaultsort'] = $structure->getDefaultSort(); $fields['file_text'] = null; - // Until we have full first-class content handler for files, we invoke it explicitly here + // Until we have the full first-class content handler for files, we invoke it explicitly here if ( $page->getTitle()->getNamespace() === NS_FILE ) { - $fields = array_merge( $fields, - $this->getFileHandler()->getDataForSearchIndex( $page, $parserOutput, $engine, $revision ) ); + $fields = array_merge( + $fields, + $this->getFileHandler()->getDataForSearchIndex( $page, $parserOutput, $engine, $revision ) + ); } return $fields; } @@ -304,13 +281,12 @@ class WikitextContentHandler extends TextContentHandler { '@phan-var WikitextContent $content'; $text = $content->getText(); - $plt = $this->parserFactory->getInstance() - ->getPreloadText( - $text, - $pltParams->getPage(), - $pltParams->getParserOptions(), - $pltParams->getParams() - ); + $plt = $this->parserFactory->getInstance()->getPreloadText( + $text, + $pltParams->getPage(), + 
$pltParams->getParserOptions(), + $pltParams->getParams() + ); $contentClass = $this->getContentClass(); return new $contentClass( $plt ); |