diff options
Diffstat (limited to 'includes/content/WikiTextStructure.php')
-rw-r--r-- | includes/content/WikiTextStructure.php | 84 |
1 files changed, 40 insertions, 44 deletions
diff --git a/includes/content/WikiTextStructure.php b/includes/content/WikiTextStructure.php index 003eb2761da4..1046a4329b89 100644 --- a/includes/content/WikiTextStructure.php +++ b/includes/content/WikiTextStructure.php @@ -5,30 +5,20 @@ use MediaWiki\Parser\ParserOutput; use MediaWiki\Parser\Sanitizer; /** - * Class allowing to explore structure of parsed wikitext. + * Class allowing to explore the structure of parsed wikitext. */ class WikiTextStructure { - /** - * @var string - */ - private $openingText; - /** - * @var string - */ - private $allText; - /** - * @var string[] - */ - private $auxText = []; - /** - * @var ParserOutput - */ - private $parserOutput; + + private ?string $openingText = null; + private ?string $allText = null; + /** @var string[] */ + private array $auxText = []; + private ParserOutput $parserOutput; /** - * @var string[] selectors to elements that are excluded entirely from search + * Selectors to elements that are excluded entirely from search */ - private $excludedElementSelectors = [ + private const EXCLUDED_ELEMENT_SELECTORS = [ // "it looks like you don't have javascript enabled..." – do not need to index 'audio', 'video', // CSS stylesheets aren't content @@ -39,7 +29,7 @@ class WikiTextStructure { '.mw-cite-backlink', // Headings are already indexed in their own field. 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', - // Collapsed fields are hidden by default so we don't want them showing up. + // Collapsed fields are hidden by default, so we don't want them showing up. '.autocollapse', // Content explicitly decided to be not searchable by editors such // as custom navigation templates. @@ -49,9 +39,9 @@ class WikiTextStructure { ]; /** - * @var string[] selectors to elements that are considered auxiliary to article text for search + * Selectors to elements that are considered auxiliary to the article text for search */ - private $auxiliaryElementSelectors = [ + private const AUXILIARY_ELEMENT_SELECTORS = [ // Thumbnail captions aren't really part of the text proper '.thumbcaption', 'figcaption', @@ -73,15 +63,18 @@ class WikiTextStructure { } /** - * Get headings on the page. + * Gets headings from the page. * @return string[] * First strip out things that look like references. We can't use HTML filtering because * the references come back as <sup> tags without a class. To keep from breaking stuff like * ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>== - * we don't remove the whole <sup> tag. We also don't want to strip the <sup> tag and remove - * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo - * or something. Whatever. So we only strip things that look like <sup> tags wrapping a - * reference. And since the data looks like: + * we don't remove the whole <sup> tag. + * + * We also don't want to strip the <sup> tag and remove everything that looks like [2] because, + * I don't know, maybe there is a band named Word [2] Foo r something. Whatever. + * + * So we only strip things that look like <sup> tags wrapping a reference. And since the data + * looks like: * Reference in heading <sup>[1]</sup><sup>[2]</sup> * we can not really use HtmlFormatter as we have no suitable selector. */ @@ -123,14 +116,16 @@ class WikiTextStructure { */ public static function parseSettingsInMessage( $message ) { $lines = explode( "\n", $message ); - $lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments - $lines = array_map( 'trim', $lines ); // Remove extra spaces - $lines = array_filter( $lines ); // Remove empty lines - return $lines; + // Remove comments + $lines = preg_replace( '/#.*$/', '', $lines ); + // Remove extra spaces + $lines = array_map( 'trim', $lines ); + // Remove empty lines + return array_filter( $lines ); } /** - * Get list of heading to ignore. + * Gets a list of heading to ignore. * @return string[] */ private function getIgnoredHeadings() { @@ -139,12 +134,13 @@ class WikiTextStructure { $ignoredHeadings = []; $source = wfMessage( 'search-ignored-headings' )->inContentLanguage(); if ( $source->isBlank() ) { - // Try old version too, just in case + // Try the old version too, just in case $source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage(); } if ( !$source->isDisabled() ) { $lines = self::parseSettingsInMessage( $source->plain() ); - $ignoredHeadings = $lines; // Now we just have headings! + // Now we just have headings! + $ignoredHeadings = $lines; } } return $ignoredHeadings; @@ -172,13 +168,13 @@ class WikiTextStructure { $formatter = new HtmlFormatter( $text ); // Strip elements from the page that we never want in the search text. - $formatter->remove( $this->excludedElementSelectors ); + $formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS ); $formatter->filterContent(); // Strip elements from the page that are auxiliary text. These will still be - // searched but matches will be ranked lower and non-auxiliary matches will be + // searched, but matches will be ranked lower and non-auxiliary matches will be // preferred in highlighting. - $formatter->remove( $this->auxiliaryElementSelectors ); + $formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS ); $auxiliaryElements = $formatter->filterContent(); $this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) ); foreach ( $auxiliaryElements as $auxiliaryElement ) { @@ -195,25 +191,25 @@ class WikiTextStructure { private function extractTextBeforeFirstHeading( $text ) { $matches = []; if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) { - // There isn't a first heading so we interpret this as the article + // There isn't a first heading, so we interpret this as the article // being entirely without heading. return null; } $text = substr( $text, 0, $matches[ 0 ][ 1 ] ); if ( !$text ) { - // There isn't any text before the first heading so we declare there isn't + // There isn't any text before the first heading, so we declare there isn't // a first heading. return null; } $formatter = new HtmlFormatter( $text ); - $formatter->remove( $this->excludedElementSelectors ); - $formatter->remove( $this->auxiliaryElementSelectors ); + $formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS ); + $formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS ); $formatter->filterContent(); $text = trim( Sanitizer::stripAllTags( $formatter->getText() ) ); if ( !$text ) { - // There isn't any text after filtering before the first heading so we declare + // There isn't any text after filtering before the first heading, so we declare // that there isn't a first heading. return null; } @@ -222,7 +218,7 @@ class WikiTextStructure { } /** - * @return string + * @return string|null */ public function getOpeningText() { $this->extractWikitextParts(); @@ -246,7 +242,7 @@ class WikiTextStructure { } /** - * Get the defaultsort property + * Get the "defaultsort" property * @return string|null */ public function getDefaultSort() { |