aboutsummaryrefslogtreecommitdiffstats
path: root/includes/tidy/RemexCompatFormatter.php
blob: 54f6d2890b83c282ba97c948b1c04cffbc92d9bf (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
<?php

namespace MediaWiki\Tidy;

use MediaWiki\Parser\Sanitizer;
use Wikimedia\RemexHtml\HTMLData;
use Wikimedia\RemexHtml\Serializer\HtmlFormatter;
use Wikimedia\RemexHtml\Serializer\SerializerNode;

/**
 * @internal
 *
 * WATCH OUT! Unlike normal HtmlFormatter, this class requires the 'ignoreCharRefs' option
 * in Tokenizer to be used. If that option is not used, it will produce wrong results (T354361).
 */
class RemexCompatFormatter extends HtmlFormatter {
	/**
	 * Element names that are tagged with class="mw-empty-elt" when they are
	 * serialized with no attributes and with contents consisting solely of
	 * whitespace (tab, LF, FF, CR, space).
	 */
	private const MARKED_EMPTY_ELEMENTS = [
		'li' => true,
		'p' => true,
		'tr' => true,
	];

	/**
	 * Optional callback applied to text nodes that are not inside an HTML
	 * raw text element; null when no 'textProcessor' option was supplied.
	 *
	 * @var ?callable
	 */
	private $textProcessor;

	/**
	 * @param array $options Forwarded unchanged to the parent formatter. The
	 *   additional key 'textProcessor' may hold a callable taking the
	 *   serialized text and returning its replacement.
	 */
	public function __construct( $options = [] ) {
		parent::__construct( $options );

		$this->textProcessor = $options['textProcessor'] ?? null;

		// Input is expected to still contain character references (because of
		// 'ignoreCharRefs'), so '&' must be passed through untouched.
		unset( $this->attributeEscapes["&"], $this->textEscapes["&"] );

		// Non-breaking spaces are always written as a numeric reference.
		$this->attributeEscapes["\u{00A0}"] = '&#160;';
		$this->textEscapes["\u{00A0}"] = '&#160;';
	}

	/**
	 * Emit nothing at the start of the document.
	 *
	 * @param string|null $fragmentNamespace Unused
	 * @param string|null $fragmentName Unused
	 * @return string Always the empty string
	 */
	public function startDocument( $fragmentNamespace, $fragmentName ) {
		return '';
	}

	/**
	 * Serialize a run of text.
	 *
	 * The parent formatter's output is passed through the configured text
	 * processor (unless the text belongs to an HTML raw text element), and
	 * then has its character references normalized.
	 *
	 * WATCH OUT! Unlike normal HtmlFormatter, this class expects that the $text argument contains
	 * unexpanded character references (entities), as a result of using the 'ignoreCharRefs' option
	 * in Tokenizer. If that option is not used, this method will produce wrong results (T354361).
	 *
	 * @inheritDoc
	 */
	public function characters( SerializerNode $parent, $text, $start, $length ) {
		$serialized = parent::characters( $parent, $text, $start, $length );

		$insideRawText = $parent->namespace === HTMLData::NS_HTML
			&& isset( $this->rawTextElements[$parent->name] );

		if ( !$insideRawText && $this->textProcessor !== null ) {
			$serialized = call_user_func( $this->textProcessor, $serialized );
		}

		// Every entity should come out in one consistent representation.
		return Sanitizer::normalizeCharReferences( $serialized );
	}

	/**
	 * Serialize an element together with its already-serialized contents.
	 *
	 * Handled in order:
	 *  - nodes whose snData is flagged isPWrapper become "<p>…</p>" when they
	 *    hold at least one non-blank node, and otherwise just their contents;
	 *  - attribute-less, whitespace-only li/p/tr elements get
	 *    class="mw-empty-elt";
	 *  - anything else is written out with escaped, normalized attributes,
	 *    using " />" for HTML void elements.
	 *
	 * @inheritDoc
	 */
	public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
		$wrapInfo = $node->snData;
		if ( $wrapInfo && $wrapInfo->isPWrapper ) {
			return $wrapInfo->nonblankNodeCount ? "<p>$contents</p>" : $contents;
		}

		$tag = $node->name;
		$attributes = $node->attrs;

		if ( isset( self::MARKED_EMPTY_ELEMENTS[$tag] )
			&& $attributes->count() === 0
			&& strlen( $contents ) === strspn( $contents, "\t\n\f\r " )
		) {
			return "<{$tag} class=\"mw-empty-elt\">$contents</{$tag}>";
		}

		// Build the attribute list: escape first, then normalize entities.
		$attrHtml = '';
		foreach ( $attributes->getValues() as $attrName => $attrValue ) {
			$escaped = Sanitizer::normalizeCharReferences(
				strtr( $attrValue, $this->attributeEscapes )
			);
			$attrHtml .= " $attrName=\"$escaped\"";
		}

		$inHtmlNamespace = $node->namespace === HTMLData::NS_HTML;

		if ( $inHtmlNamespace && isset( $this->voidElements[$tag] ) ) {
			// Void elements have no contents and no end tag.
			return "<{$tag}{$attrHtml} />";
		}

		// For elements listed in prefixLfElements, contents starting with a
		// newline get one more newline put in front of them.
		// NOTE(review): presumably this compensates for the HTML parser
		// dropping the first newline in such elements -- confirm upstream.
		$needsExtraLf = $inHtmlNamespace
			&& isset( $contents[0] ) && $contents[0] === "\n"
			&& isset( $this->prefixLfElements[$tag] );
		$leadingLf = $needsExtraLf ? "\n" : '';

		return "<{$tag}{$attrHtml}>{$leadingLf}{$contents}</{$tag}>";
	}
}