aboutsummaryrefslogtreecommitdiffstats
path: root/includes/tidy/RemexDriver.php
blob: 0738e8406b87824ad3f7b78d0f675c151018d6bd (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
<?php

namespace MediaWiki\Tidy;

use MediaWiki\Config\ServiceOptions;
use MediaWiki\MainConfigNames;
use Wikimedia\RemexHtml\HTMLData;
use Wikimedia\RemexHtml\Serializer\Serializer;
use Wikimedia\RemexHtml\Serializer\SerializerWithTracer;
use Wikimedia\RemexHtml\Tokenizer\Tokenizer;
use Wikimedia\RemexHtml\TreeBuilder\Dispatcher;
use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder;
use Wikimedia\RemexHtml\TreeBuilder\TreeMutationTracer;

/**
 * Tidy driver that runs HTML through the RemexHtml tokenizer, tree builder
 * and serializer, with optional tracing stages selected by configuration.
 */
class RemexDriver extends TidyDriverBase {
	/** @var bool When true, tidy() wraps the tree handler in a TreeMutationTracer */
	private $treeMutationTrace;
	/** @var bool When true, tidy() uses SerializerWithTracer instead of Serializer */
	private $serializerTrace;
	/** @var bool Passed as the second argument to RemexCompatMunger */
	private $mungerTrace;
	/** @var bool When true, tidy() routes tree events through RemexCompatMunger */
	private $pwrap;
	/** @var bool When true, tidy() builds the tree with TreeBuilder rather than RemexCompatBuilder */
	private $enableLegacyMediaDOM;

	/** @internal */
	public const CONSTRUCTOR_OPTIONS = [
		MainConfigNames::TidyConfig,
		MainConfigNames::ParserEnableLegacyMediaDOM,
	];

	/**
	 * Read the driver settings out of the service options.
	 *
	 * Keys missing from the TidyConfig array fall back to: tracing disabled,
	 * 'pwrap' enabled. The completed array is handed on to the parent constructor.
	 *
	 * @param ServiceOptions $options Must provide every option in CONSTRUCTOR_OPTIONS
	 */
	public function __construct( ServiceOptions $options ) {
		$options->assertRequiredOptions( self::CONSTRUCTOR_OPTIONS );

		$tidyConfig = $options->get( MainConfigNames::TidyConfig );
		$this->enableLegacyMediaDOM = $options->get( MainConfigNames::ParserEnableLegacyMediaDOM );

		// Array union: keys already present in the configured array win over the defaults.
		$tidyConfig = $tidyConfig + [
			'treeMutationTrace' => false,
			'serializerTrace' => false,
			'mungerTrace' => false,
			'pwrap' => true
		];

		$this->treeMutationTrace = $tidyConfig['treeMutationTrace'];
		$this->serializerTrace = $tidyConfig['serializerTrace'];
		$this->mungerTrace = $tidyConfig['mungerTrace'];
		$this->pwrap = $tidyConfig['pwrap'];

		parent::__construct( $tidyConfig );
	}

	/** @inheritDoc */
	public function tidy( $text, ?callable $textProcessor = null ) {
		// Shared sink for every tracing stage; forwards messages to the debug log.
		$logTrace = static function ( $msg ) {
			wfDebug( "RemexHtml: $msg" );
		};

		// The serializer is the final stage and the object the result is read from.
		$formatter = new RemexCompatFormatter( [ 'textProcessor' => $textProcessor ] );
		$serializer = $this->serializerTrace
			? new SerializerWithTracer( $formatter, null, $logTrace )
			: new Serializer( $formatter );

		// Stack the optional stages in front of the serializer.
		$handler = $this->pwrap
			? new RemexCompatMunger( $serializer, $this->mungerTrace )
			: $serializer;
		if ( $this->treeMutationTrace ) {
			$handler = new TreeMutationTracer( $handler, $logTrace );
		}

		$builderOptions = [
			'ignoreErrors' => true,
			'ignoreNulls' => true,
		];
		if ( $this->enableLegacyMediaDOM ) {
			$treeBuilder = new TreeBuilder( $handler, $builderOptions );
		} else {
			$treeBuilder = new RemexCompatBuilder( $handler, $builderOptions );
		}

		$tokenizerOptions = [
			'ignoreErrors' => true,
			'ignoreCharRefs' => true,
			'ignoreNulls' => true,
			'skipPreprocess' => true,
		];
		$tokenizer = new Tokenizer( new Dispatcher( $treeBuilder ), $text, $tokenizerOptions );

		// Tokenize the input as a fragment whose context element is an HTML <body>.
		$tokenizer->execute( [
			'fragmentNamespace' => HTMLData::NS_HTML,
			'fragmentName' => 'body'
		] );

		return $serializer->getResult();
	}
}