aboutsummaryrefslogtreecommitdiffstats
path: root/includes/content
diff options
context:
space:
mode:
authorReedy <reedy@wikimedia.org>2023-08-23 12:13:16 +0100
committerThiemo Kreuz (WMDE) <thiemo.kreuz@wikimedia.de>2024-02-01 08:01:27 +0000
commitd3b14ec8628372b607e938d8ca5003cc0783b14d (patch)
tree27c7fabd4ec70d807c35f432f6ea6f5db1d754d1 /includes/content
parent5dca00526add34d5c566e882e6763baed9d80a38 (diff)
downloadmediawikicore-d3b14ec8628372b607e938d8ca5003cc0783b14d.tar.gz
mediawikicore-d3b14ec8628372b607e938d8ca5003cc0783b14d.zip
WikiTextStructure/WikitextContentHandler: Minor cleanup
Change-Id: If2f8243867994609d82618e61ddaaacca3516990
Diffstat (limited to 'includes/content')
-rw-r--r--includes/content/WikiTextStructure.php84
-rw-r--r--includes/content/WikitextContentHandler.php70
2 files changed, 63 insertions, 91 deletions
diff --git a/includes/content/WikiTextStructure.php b/includes/content/WikiTextStructure.php
index 003eb2761da4..1046a4329b89 100644
--- a/includes/content/WikiTextStructure.php
+++ b/includes/content/WikiTextStructure.php
@@ -5,30 +5,20 @@ use MediaWiki\Parser\ParserOutput;
use MediaWiki\Parser\Sanitizer;
/**
- * Class allowing to explore structure of parsed wikitext.
+ * Class allowing to explore the structure of parsed wikitext.
*/
class WikiTextStructure {
- /**
- * @var string
- */
- private $openingText;
- /**
- * @var string
- */
- private $allText;
- /**
- * @var string[]
- */
- private $auxText = [];
- /**
- * @var ParserOutput
- */
- private $parserOutput;
+
+ private ?string $openingText = null;
+ private ?string $allText = null;
+ /** @var string[] */
+ private array $auxText = [];
+ private ParserOutput $parserOutput;
/**
- * @var string[] selectors to elements that are excluded entirely from search
+ * Selectors to elements that are excluded entirely from search
*/
- private $excludedElementSelectors = [
+ private const EXCLUDED_ELEMENT_SELECTORS = [
// "it looks like you don't have javascript enabled..." – do not need to index
'audio', 'video',
// CSS stylesheets aren't content
@@ -39,7 +29,7 @@ class WikiTextStructure {
'.mw-cite-backlink',
// Headings are already indexed in their own field.
'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
- // Collapsed fields are hidden by default so we don't want them showing up.
+ // Collapsed fields are hidden by default, so we don't want them showing up.
'.autocollapse',
// Content explicitly decided to be not searchable by editors such
// as custom navigation templates.
@@ -49,9 +39,9 @@ class WikiTextStructure {
];
/**
- * @var string[] selectors to elements that are considered auxiliary to article text for search
+ * Selectors to elements that are considered auxiliary to the article text for search
*/
- private $auxiliaryElementSelectors = [
+ private const AUXILIARY_ELEMENT_SELECTORS = [
// Thumbnail captions aren't really part of the text proper
'.thumbcaption',
'figcaption',
@@ -73,15 +63,18 @@ class WikiTextStructure {
}
/**
- * Get headings on the page.
+ * Gets headings from the page.
* @return string[]
* First strip out things that look like references. We can't use HTML filtering because
* the references come back as <sup> tags without a class. To keep from breaking stuff like
* ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
- * we don't remove the whole <sup> tag. We also don't want to strip the <sup> tag and remove
- * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
- * or something. Whatever. So we only strip things that look like <sup> tags wrapping a
- * reference. And since the data looks like:
+ * we don't remove the whole <sup> tag.
+ *
+ * We also don't want to strip the <sup> tag and remove everything that looks like [2] because,
+ * I don't know, maybe there is a band named Word [2] Foo or something. Whatever.
+ *
+ * So we only strip things that look like <sup> tags wrapping a reference. And since the data
+ * looks like:
* Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
* we can not really use HtmlFormatter as we have no suitable selector.
*/
@@ -123,14 +116,16 @@ class WikiTextStructure {
*/
public static function parseSettingsInMessage( $message ) {
$lines = explode( "\n", $message );
- $lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
- $lines = array_map( 'trim', $lines ); // Remove extra spaces
- $lines = array_filter( $lines ); // Remove empty lines
- return $lines;
+ // Remove comments
+ $lines = preg_replace( '/#.*$/', '', $lines );
+ // Remove extra spaces
+ $lines = array_map( 'trim', $lines );
+ // Remove empty lines
+ return array_filter( $lines );
}
/**
- * Get list of heading to ignore.
+ * Gets a list of headings to ignore.
* @return string[]
*/
private function getIgnoredHeadings() {
@@ -139,12 +134,13 @@ class WikiTextStructure {
$ignoredHeadings = [];
$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
if ( $source->isBlank() ) {
- // Try old version too, just in case
+ // Try the old version too, just in case
$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
}
if ( !$source->isDisabled() ) {
$lines = self::parseSettingsInMessage( $source->plain() );
- $ignoredHeadings = $lines; // Now we just have headings!
+ // Now we just have headings!
+ $ignoredHeadings = $lines;
}
}
return $ignoredHeadings;
@@ -172,13 +168,13 @@ class WikiTextStructure {
$formatter = new HtmlFormatter( $text );
// Strip elements from the page that we never want in the search text.
- $formatter->remove( $this->excludedElementSelectors );
+ $formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
$formatter->filterContent();
// Strip elements from the page that are auxiliary text. These will still be
- // searched but matches will be ranked lower and non-auxiliary matches will be
+ // searched, but matches will be ranked lower and non-auxiliary matches will be
// preferred in highlighting.
- $formatter->remove( $this->auxiliaryElementSelectors );
+ $formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
$auxiliaryElements = $formatter->filterContent();
$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
foreach ( $auxiliaryElements as $auxiliaryElement ) {
@@ -195,25 +191,25 @@ class WikiTextStructure {
private function extractTextBeforeFirstHeading( $text ) {
$matches = [];
if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
- // There isn't a first heading so we interpret this as the article
+ // There isn't a first heading, so we interpret this as the article
// being entirely without heading.
return null;
}
$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
if ( !$text ) {
- // There isn't any text before the first heading so we declare there isn't
+ // There isn't any text before the first heading, so we declare there isn't
// a first heading.
return null;
}
$formatter = new HtmlFormatter( $text );
- $formatter->remove( $this->excludedElementSelectors );
- $formatter->remove( $this->auxiliaryElementSelectors );
+ $formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
+ $formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
$formatter->filterContent();
$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
if ( !$text ) {
- // There isn't any text after filtering before the first heading so we declare
+ // There isn't any text after filtering before the first heading, so we declare
// that there isn't a first heading.
return null;
}
@@ -222,7 +218,7 @@ class WikiTextStructure {
}
/**
- * @return string
+ * @return string|null
*/
public function getOpeningText() {
$this->extractWikitextParts();
@@ -246,7 +242,7 @@ class WikiTextStructure {
}
/**
- * Get the defaultsort property
+ * Get the "defaultsort" property
* @return string|null
*/
public function getDefaultSort() {
diff --git a/includes/content/WikitextContentHandler.php b/includes/content/WikitextContentHandler.php
index 24a93e63e177..1420665e73a6 100644
--- a/includes/content/WikitextContentHandler.php
+++ b/includes/content/WikitextContentHandler.php
@@ -44,37 +44,14 @@ use Wikimedia\UUID\GlobalIdGenerator;
*/
class WikitextContentHandler extends TextContentHandler {
- /** @var TitleFactory */
- private $titleFactory;
+ private TitleFactory $titleFactory;
+ private ParserFactory $parserFactory;
+ private GlobalIdGenerator $globalIdGenerator;
+ private LanguageNameUtils $languageNameUtils;
+ private LinkRenderer $linkRenderer;
+ private MagicWordFactory $magicWordFactory;
+ private ParsoidParserFactory $parsoidParserFactory;
- /** @var ParserFactory */
- private $parserFactory;
-
- /** @var GlobalIdGenerator */
- private $globalIdGenerator;
-
- /** @var LanguageNameUtils */
- private $languageNameUtils;
-
- /** @var LinkRenderer */
- private $linkRenderer;
-
- /** @var MagicWordFactory */
- private $magicWordFactory;
-
- /** @var ParsoidParserFactory */
- private $parsoidParserFactory;
-
- /**
- * @param string $modelId
- * @param TitleFactory $titleFactory
- * @param ParserFactory $parserFactory
- * @param GlobalIdGenerator $globalIdGenerator
- * @param LanguageNameUtils $languageNameUtils
- * @param LinkRenderer $linkRenderer
- * @param MagicWordFactory $magicWordFactory
- * @param ParsoidParserFactory $parsoidParserFactory
- */
public function __construct(
string $modelId,
TitleFactory $titleFactory,
@@ -121,9 +98,9 @@ class WikitextContentHandler extends TextContentHandler {
} else {
$iw = $destination->getInterwiki();
if ( $iw && $this->languageNameUtils->getLanguageName( $iw,
- LanguageNameUtils::AUTONYMS,
- LanguageNameUtils::DEFINED )
- ) {
+ LanguageNameUtils::AUTONYMS,
+ LanguageNameUtils::DEFINED
+ ) ) {
$optionalColon = ':';
}
}
@@ -212,10 +189,8 @@ class WikitextContentHandler extends TextContentHandler {
$fields['opening_text']->setFlag(
SearchIndexField::FLAG_SCORING | SearchIndexField::FLAG_NO_HIGHLIGHT
);
- // Until we have full first-class content handler for files, we invoke it explicitly here
- $fields = array_merge( $fields, $this->getFileHandler()->getFieldsForSearchIndex( $engine ) );
-
- return $fields;
+ // Until we have the full first-class content handler for files, we invoke it explicitly here
+ return array_merge( $fields, $this->getFileHandler()->getFieldsForSearchIndex( $engine ) );
}
public function getDataForSearchIndex(
@@ -235,10 +210,12 @@ class WikitextContentHandler extends TextContentHandler {
$fields['defaultsort'] = $structure->getDefaultSort();
$fields['file_text'] = null;
- // Until we have full first-class content handler for files, we invoke it explicitly here
+ // Until we have the full first-class content handler for files, we invoke it explicitly here
if ( $page->getTitle()->getNamespace() === NS_FILE ) {
- $fields = array_merge( $fields,
- $this->getFileHandler()->getDataForSearchIndex( $page, $parserOutput, $engine, $revision ) );
+ $fields = array_merge(
+ $fields,
+ $this->getFileHandler()->getDataForSearchIndex( $page, $parserOutput, $engine, $revision )
+ );
}
return $fields;
}
@@ -304,13 +281,12 @@ class WikitextContentHandler extends TextContentHandler {
'@phan-var WikitextContent $content';
$text = $content->getText();
- $plt = $this->parserFactory->getInstance()
- ->getPreloadText(
- $text,
- $pltParams->getPage(),
- $pltParams->getParserOptions(),
- $pltParams->getParams()
- );
+ $plt = $this->parserFactory->getInstance()->getPreloadText(
+ $text,
+ $pltParams->getPage(),
+ $pltParams->getParserOptions(),
+ $pltParams->getParams()
+ );
$contentClass = $this->getContentClass();
return new $contentClass( $plt );