1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
|
<?php
namespace MediaWiki\Tidy;
use MediaWiki\Parser\Sanitizer;
use Wikimedia\RemexHtml\HTMLData;
use Wikimedia\RemexHtml\Serializer\HtmlFormatter;
use Wikimedia\RemexHtml\Serializer\SerializerNode;
/**
 * Serializer formatter producing the HTML output of the Remex-based tidy.
 *
 * Compared to the parent HtmlFormatter, the code in this class:
 *  - emits nothing at the start of the document,
 *  - writes non-breaking spaces as the numeric character reference &#160;,
 *  - leaves '&' unescaped, because the input already contains character references,
 *  - normalizes character references in text and attribute values via
 *    Sanitizer::normalizeCharReferences(),
 *  - serializes p-wrapper nodes as a real <p> only when they have nonblank content,
 *  - adds class="mw-empty-elt" to attribute-less <li>, <p> and <tr> elements whose
 *    contents are empty or whitespace only,
 *  - closes void elements with ' />'.
 *
 * @internal
 *
 * WATCH OUT! Unlike normal HtmlFormatter, this class requires the 'ignoreCharRefs' option
 * in Tokenizer to be used. If that option is not used, it will produce wrong results (T354361).
 */
class RemexCompatFormatter extends HtmlFormatter {
	/**
	 * Element names that get class="mw-empty-elt" when they have no attributes
	 * and only whitespace contents. Keyed by name for isset() lookups.
	 */
	private const MARKED_EMPTY_ELEMENTS = [
		'li' => true,
		'p' => true,
		'tr' => true,
	];

	/** Character reference written in place of a literal U+00A0 (non-breaking space). */
	private const NBSP_CHAR_REF = '&#160;';

	/** @var ?callable Optional callback applied to text outside of raw text elements */
	private $textProcessor;

	/**
	 * @param array $options Options passed through to HtmlFormatter. Additionally:
	 *   - textProcessor: (callable|null) called with each piece of serialized text that
	 *     is not inside an HTML raw text element; its return value replaces the text.
	 */
	public function __construct( $options = [] ) {
		parent::__construct( $options );

		// Escape non-breaking space. The replacement has to be a character reference:
		// mapping U+00A0 to a literal whitespace character would either do nothing or
		// silently turn the non-breaking space into an ordinary one.
		$this->attributeEscapes["\u{00A0}"] = self::NBSP_CHAR_REF;
		$this->textEscapes["\u{00A0}"] = self::NBSP_CHAR_REF;

		// Disable escaping of '&', because we expect to see entities, due to 'ignoreCharRefs'
		unset( $this->attributeEscapes["&"] );
		unset( $this->textEscapes["&"] );

		$this->textProcessor = $options['textProcessor'] ?? null;
	}

	/**
	 * Emit nothing at the start of the document.
	 *
	 * @inheritDoc
	 */
	public function startDocument( $fragmentNamespace, $fragmentName ) {
		return '';
	}

	/**
	 * WATCH OUT! Unlike normal HtmlFormatter, this class expects that the $text argument contains
	 * unexpanded character references (entities), as a result of using the 'ignoreCharRefs' option
	 * in Tokenizer. If that option is not used, this method will produce wrong results (T354361).
	 *
	 * The text is first serialized by the parent class. Unless the parent node is an HTML
	 * raw text element, the configured text processor (if any) is then applied. Finally,
	 * character references are normalized.
	 *
	 * @inheritDoc
	 */
	public function characters( SerializerNode $parent, $text, $start, $length ) {
		$text = parent::characters( $parent, $text, $start, $length );

		$isRawText = $parent->namespace === HTMLData::NS_HTML
			&& isset( $this->rawTextElements[$parent->name] );
		if ( !$isRawText && $this->textProcessor !== null ) {
			$text = ( $this->textProcessor )( $text );
		}

		// Ensure a consistent representation for all entities
		return Sanitizer::normalizeCharReferences( $text );
	}

	/**
	 * Serialize an element together with its already serialized contents.
	 *
	 * @inheritDoc
	 */
	public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
		$data = $node->snData;
		if ( $data && $data->isPWrapper ) {
			// A p-wrapper only becomes a real paragraph if it holds nonblank content;
			// otherwise its contents are emitted bare.
			return $data->nonblankNodeCount ? "<p>$contents</p>" : $contents;
		}

		$name = $node->name;
		$attrs = $node->attrs;

		// Mark attribute-less li/p/tr elements that contain nothing but whitespace
		if ( isset( self::MARKED_EMPTY_ELEMENTS[$name] ) && $attrs->count() === 0
			&& strspn( $contents, "\t\n\f\r " ) === strlen( $contents )
		) {
			return "<{$name} class=\"mw-empty-elt\">$contents</{$name}>";
		}

		$s = "<$name";
		foreach ( $attrs->getValues() as $attrName => $attrValue ) {
			$encValue = strtr( $attrValue, $this->attributeEscapes );
			// Same consistent entity representation as used for text
			$encValue = Sanitizer::normalizeCharReferences( $encValue );
			$s .= " $attrName=\"$encValue\"";
		}

		if ( $node->namespace === HTMLData::NS_HTML && isset( $this->voidElements[$name] ) ) {
			// Void elements have no contents and no end tag
			return $s . ' />';
		}

		$s .= '>';
		if ( $node->namespace === HTMLData::NS_HTML
			&& isset( $contents[0] ) && $contents[0] === "\n"
			&& isset( $this->prefixLfElements[$name] )
		) {
			// Contents starting with a newline get one more newline in front of them.
			// NOTE(review): presumably prefixLfElements are the elements for which an HTML
			// parser drops the first newline after the start tag — confirm in HtmlFormatter.
			$s .= "\n";
		}

		return $s . "$contents</$name>";
	}
}
|