WikiTextStructure/WikitextContentHandler: Minor cleanup

Change-Id: If2f8243867994609d82618e61ddaaacca3516990
author: Reedy <reedy@wikimedia.org> 2023-08-23 12:13:16 +0100
committer: Thiemo Kreuz (WMDE) <thiemo.kreuz@wikimedia.de> 2024-02-01 08:01:27 +0000
commit: d3b14ec8628372b607e938d8ca5003cc0783b14d (patch)
tree: 27c7fabd4ec70d807c35f432f6ea6f5db1d754d1 /includes/content
parent: 5dca00526add34d5c566e882e6763baed9d80a38 (diff)
download: mediawikicore-d3b14ec8628372b607e938d8ca5003cc0783b14d.tar.gz
mediawikicore-d3b14ec8628372b607e938d8ca5003cc0783b14d.zip
2 files changed, 63 insertions, 91 deletions
diff --git a/includes/content/WikiTextStructure.php b/includes/content/WikiTextStructure.php
index 003eb2761da4..1046a4329b89 100644
--- a/includes/content/WikiTextStructure.php
+++ b/includes/content/WikiTextStructure.php
@@ -5,30 +5,20 @@ use MediaWiki\Parser\ParserOutput;
 use MediaWiki\Parser\Sanitizer;
 
 /**
- * Class allowing to explore structure of parsed wikitext.
+ * Class allowing to explore the structure of parsed wikitext.
  */
 class WikiTextStructure {
-	/**
-	 * @var string
-	 */
-	private $openingText;
-	/**
-	 * @var string
-	 */
-	private $allText;
-	/**
-	 * @var string[]
-	 */
-	private $auxText = [];
-	/**
-	 * @var ParserOutput
-	 */
-	private $parserOutput;
+
+	private ?string $openingText = null;
+	private ?string $allText = null;
+	/** @var string[] */
+	private array $auxText = [];
+	private ParserOutput $parserOutput;
 
 	/**
-	 * @var string[] selectors to elements that are excluded entirely from search
+	 * Selectors to elements that are excluded entirely from search
 	 */
-	private $excludedElementSelectors = [
+	private const EXCLUDED_ELEMENT_SELECTORS = [
 		// "it looks like you don't have javascript enabled..." – do not need to index
 		'audio', 'video',
 		// CSS stylesheets aren't content
@@ -39,7 +29,7 @@ class WikiTextStructure {
 		'.mw-cite-backlink',
 		// Headings are already indexed in their own field.
 		'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-		// Collapsed fields are hidden by default so we don't want them showing up.
+		// Collapsed fields are hidden by default, so we don't want them showing up.
 		'.autocollapse',
 		// Content explicitly decided to be not searchable by editors such
 		// as custom navigation templates.
@@ -49,9 +39,9 @@ class WikiTextStructure {
 	];
 
 	/**
-	 * @var string[] selectors to elements that are considered auxiliary to article text for search
+	 * Selectors to elements that are considered auxiliary to the article text for search
 	 */
-	private $auxiliaryElementSelectors = [
+	private const AUXILIARY_ELEMENT_SELECTORS = [
 		// Thumbnail captions aren't really part of the text proper
 		'.thumbcaption',
 		'figcaption',
@@ -73,15 +63,18 @@ class WikiTextStructure {
 	}
 
 	/**
-	 * Get headings on the page.
+	 * Gets headings from the page.
 	 * @return string[]
 	 * First strip out things that look like references.  We can't use HTML filtering because
 	 * the references come back as <sup> tags without a class.  To keep from breaking stuff like
 	 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
-	 * we don't remove the whole <sup> tag.  We also don't want to strip the <sup> tag and remove
-	 * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
-	 * or something.  Whatever.  So we only strip things that look like <sup> tags wrapping a
-	 * reference.  And since the data looks like:
+	 * we don't remove the whole <sup> tag.
+	 *
+	 * We also don't want to strip the <sup> tag and remove everything that looks like [2] because,
+	 * I don't know, maybe there is a band named Word [2] Foo or something. Whatever.
+	 *
+	 * So we only strip things that look like <sup> tags wrapping a reference. And since the data
+	 * looks like:
 	 *      Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
 	 * we can not really use HtmlFormatter as we have no suitable selector.
 	 */
@@ -123,14 +116,16 @@ class WikiTextStructure {
 	 */
 	public static function parseSettingsInMessage( $message ) {
 		$lines = explode( "\n", $message );
-		$lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
-		$lines = array_map( 'trim', $lines );          // Remove extra spaces
-		$lines = array_filter( $lines );               // Remove empty lines
-		return $lines;
+		// Remove comments
+		$lines = preg_replace( '/#.*$/', '', $lines );
+		// Remove extra spaces
+		$lines = array_map( 'trim', $lines );
+		// Remove empty lines
+		return array_filter( $lines );
 	}
 
 	/**
-	 * Get list of heading to ignore.
+	 * Gets a list of headings to ignore.
 	 * @return string[]
 	 */
 	private function getIgnoredHeadings() {
@@ -139,12 +134,13 @@ class WikiTextStructure {
 			$ignoredHeadings = [];
 			$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
 			if ( $source->isBlank() ) {
-				// Try old version too, just in case
+				// Try the old version too, just in case
 				$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
 			}
 			if ( !$source->isDisabled() ) {
 				$lines = self::parseSettingsInMessage( $source->plain() );
-				$ignoredHeadings = $lines;               // Now we just have headings!
+				// Now we just have headings!
+				$ignoredHeadings = $lines;
 			}
 		}
 		return $ignoredHeadings;
@@ -172,13 +168,13 @@ class WikiTextStructure {
 		$formatter = new HtmlFormatter( $text );
 
 		// Strip elements from the page that we never want in the search text.
-		$formatter->remove( $this->excludedElementSelectors );
+		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
 		$formatter->filterContent();
 
 		// Strip elements from the page that are auxiliary text.  These will still be
-		// searched but matches will be ranked lower and non-auxiliary matches will be
+		// searched, but matches will be ranked lower and non-auxiliary matches will be
 		// preferred in highlighting.
-		$formatter->remove( $this->auxiliaryElementSelectors );
+		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
 		$auxiliaryElements = $formatter->filterContent();
 		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
 		foreach ( $auxiliaryElements as $auxiliaryElement ) {
@@ -195,25 +191,25 @@ class WikiTextStructure {
 	private function extractTextBeforeFirstHeading( $text ) {
 		$matches = [];
 		if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
-			// There isn't a first heading so we interpret this as the article
+			// There isn't a first heading, so we interpret this as the article
 			// being entirely without heading.
 			return null;
 		}
 		$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
 		if ( !$text ) {
-			// There isn't any text before the first heading so we declare there isn't
+			// There isn't any text before the first heading, so we declare there isn't
 			// a first heading.
 			return null;
 		}
 
 		$formatter = new HtmlFormatter( $text );
-		$formatter->remove( $this->excludedElementSelectors );
-		$formatter->remove( $this->auxiliaryElementSelectors );
+		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
+		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
 		$formatter->filterContent();
 		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
 
 		if ( !$text ) {
-			// There isn't any text after filtering before the first heading so we declare
+			// There isn't any text after filtering before the first heading, so we declare
 			// that there isn't a first heading.
 			return null;
 		}
@@ -222,7 +218,7 @@ class WikiTextStructure {
 	}
 
 	/**
-	 * @return string
+	 * @return string|null
 	 */
 	public function getOpeningText() {
 		$this->extractWikitextParts();
@@ -246,7 +242,7 @@ class WikiTextStructure {
 	}
 
 	/**
-	 * Get the defaultsort property
+	 * Get the "defaultsort" property
 	 * @return string|null
 	 */
 	public function getDefaultSort() {
diff --git a/includes/content/WikitextContentHandler.php b/includes/content/WikitextContentHandler.php
index 24a93e63e177..1420665e73a6 100644
--- a/includes/content/WikitextContentHandler.php
+++ b/includes/content/WikitextContentHandler.php
@@ -44,37 +44,14 @@ use Wikimedia\UUID\GlobalIdGenerator;
  */
 class WikitextContentHandler extends TextContentHandler {
 
-	/** @var TitleFactory */
-	private $titleFactory;
+	private TitleFactory $titleFactory;
+	private ParserFactory $parserFactory;
+	private GlobalIdGenerator $globalIdGenerator;
+	private LanguageNameUtils $languageNameUtils;
+	private LinkRenderer $linkRenderer;
+	private MagicWordFactory $magicWordFactory;
+	private ParsoidParserFactory $parsoidParserFactory;
 
-	/** @var ParserFactory */
-	private $parserFactory;
-
-	/** @var GlobalIdGenerator */
-	private $globalIdGenerator;
-
-	/** @var LanguageNameUtils */
-	private $languageNameUtils;
-
-	/** @var LinkRenderer */
-	private $linkRenderer;
-
-	/** @var MagicWordFactory */
-	private $magicWordFactory;
-
-	/** @var ParsoidParserFactory */
-	private $parsoidParserFactory;
-
-	/**
-	 * @param string $modelId
-	 * @param TitleFactory $titleFactory
-	 * @param ParserFactory $parserFactory
-	 * @param GlobalIdGenerator $globalIdGenerator
-	 * @param LanguageNameUtils $languageNameUtils
-	 * @param LinkRenderer $linkRenderer
-	 * @param MagicWordFactory $magicWordFactory
-	 * @param ParsoidParserFactory $parsoidParserFactory
-	 */
 	public function __construct(
 		string $modelId,
 		TitleFactory $titleFactory,
@@ -121,9 +98,9 @@ class WikitextContentHandler extends TextContentHandler {
 		} else {
 			$iw = $destination->getInterwiki();
 			if ( $iw && $this->languageNameUtils->getLanguageName( $iw,
-						LanguageNameUtils::AUTONYMS,
-						LanguageNameUtils::DEFINED )
-			) {
+				LanguageNameUtils::AUTONYMS,
+				LanguageNameUtils::DEFINED
+			) ) {
 				$optionalColon = ':';
 			}
 		}
@@ -212,10 +189,8 @@ class WikitextContentHandler extends TextContentHandler {
 		$fields['opening_text']->setFlag(
 			SearchIndexField::FLAG_SCORING | SearchIndexField::FLAG_NO_HIGHLIGHT
 		);
-		// Until we have full first-class content handler for files, we invoke it explicitly here
-		$fields = array_merge( $fields, $this->getFileHandler()->getFieldsForSearchIndex( $engine ) );
-
-		return $fields;
+		// Until we have the full first-class content handler for files, we invoke it explicitly here
+		return array_merge( $fields, $this->getFileHandler()->getFieldsForSearchIndex( $engine ) );
 	}
 
 	public function getDataForSearchIndex(
@@ -235,10 +210,12 @@ class WikitextContentHandler extends TextContentHandler {
 		$fields['defaultsort'] = $structure->getDefaultSort();
 		$fields['file_text'] = null;
 
-		// Until we have full first-class content handler for files, we invoke it explicitly here
+		// Until we have the full first-class content handler for files, we invoke it explicitly here
 		if ( $page->getTitle()->getNamespace() === NS_FILE ) {
-			$fields = array_merge( $fields,
-					$this->getFileHandler()->getDataForSearchIndex( $page, $parserOutput, $engine, $revision ) );
+			$fields = array_merge(
+				$fields,
+				$this->getFileHandler()->getDataForSearchIndex( $page, $parserOutput, $engine, $revision )
+			);
 		}
 		return $fields;
 	}
@@ -304,13 +281,12 @@ class WikitextContentHandler extends TextContentHandler {
 		'@phan-var WikitextContent $content';
 		$text = $content->getText();
 
-		$plt = $this->parserFactory->getInstance()
-			->getPreloadText(
-				$text,
-				$pltParams->getPage(),
-				$pltParams->getParserOptions(),
-				$pltParams->getParams()
-			);
+		$plt = $this->parserFactory->getInstance()->getPreloadText(
+			$text,
+			$pltParams->getPage(),
+			$pltParams->getParserOptions(),
+			$pltParams->getParams()
+		);
 
 		$contentClass = $this->getContentClass();
 		return new $contentClass( $plt );
author	Reedy <reedy@wikimedia.org>	2023-08-23 12:13:16 +0100
committer	Thiemo Kreuz (WMDE) <thiemo.kreuz@wikimedia.de>	2024-02-01 08:01:27 +0000
commit	d3b14ec8628372b607e938d8ca5003cc0783b14d (patch)
tree	27c7fabd4ec70d807c35f432f6ea6f5db1d754d1 /includes/content
parent	5dca00526add34d5c566e882e6763baed9d80a38 (diff)
download	mediawikicore-d3b14ec8628372b607e938d8ca5003cc0783b14d.tar.gz mediawikicore-d3b14ec8628372b607e938d8ca5003cc0783b14d.zip