diff options
author | C. Scott Ananian <cscott@cscott.net> | 2022-01-21 17:03:26 -0500 |
---|---|---|
committer | C. Scott Ananian <cscott@cscott.net> | 2022-03-04 14:06:02 -0500 |
commit | 9f14fbd002713abcf65a5b5cddc5d52dee90a977 (patch) | |
tree | 35553d39a02079a3aa561478f5bb28a629064595 /maintenance/benchmarks | |
parent | ccaeb8368072bb00c6bbf6e2447a42da26acbbf2 (diff) | |
download | mediawikicore-9f14fbd002713abcf65a5b5cddc5d52dee90a977.tar.gz mediawikicore-9f14fbd002713abcf65a5b5cddc5d52dee90a977.zip |
Add Sanitizer::removeSomeTags() which uses Remex to tokenize
The existing Sanitizer::removeHTMLtags() method, in addition to having
dodgy capitalization, uses regular expressions to parse the HTML.
That produces corner cases like T298401 and T67747 and is not guaranteed
to yield balanced or well-formed HTML.
Instead, introduce and use a new Sanitizer::removeSomeTags() method
which is guaranteed to always return balanced and well-formed HTML.
Note that Sanitizer::removeHTMLtags()/::removeSomeTags() take a callback
argument which (as far as I can tell) is never used outside core. Mark
that argument as @internal, and clean up the version used by
::removeSomeTags().
Use the new ::removeSomeTags() method in the two places where
DISPLAYTITLE is handled (following up on T67747). The use by the
legacy parser is more difficult to replace (and would have a
performance cost), so leave the old ::removeHTMLtags() method in place
for that call site for now: when the legacy parser is replaced by
Parsoid the need for the old ::removeHTMLtags() will go away. In a
follow-up patch we'll rename ::removeHTMLtags() and mark it @internal
so that we can deprecate ::removeHTMLtags() for external use.
Some benchmarking code added. On my machine, with PHP 7.4, the new
method tidies short 30-character title strings at a rate of about
6764/s while the tidy-based method being replaced here managed 6384/s.
Sanitizer::removeHTMLtags blazes through short strings 20x faster
(120,915/s); some of this difference is due to the setup cost of
creating the tag whitelist and the Remex pipeline, so further
optimizations could doubtless be done if Sanitizer::removeSomeTags()
is more widely used.
Bug: T299722
Bug: T67747
Change-Id: Ic864c01471c292f11799c4fbdac4d7d30b8bc50f
Diffstat (limited to 'maintenance/benchmarks')
-rw-r--r-- | maintenance/benchmarks/benchmarkSanitizer.php | 41 |
1 files changed, 40 insertions, 1 deletions
diff --git a/maintenance/benchmarks/benchmarkSanitizer.php b/maintenance/benchmarks/benchmarkSanitizer.php index 6054e0f563c0..9ba41ba321bf 100644 --- a/maintenance/benchmarks/benchmarkSanitizer.php +++ b/maintenance/benchmarks/benchmarkSanitizer.php @@ -31,11 +31,13 @@ class BenchmarkSanitizer extends Benchmarker { parent::__construct(); $this->addDescription( 'Benchmark for Sanitizer methods.' ); $this->addOption( 'method', 'One of "validateEmail", "encodeAttribute", ' - . '"safeEncodeAttribute", "removeHTMLtags", or "stripAllTags". ' + . '"safeEncodeAttribute", "removeHTMLtags", "removeSomeTags", "tidy", or "stripAllTags". ' . 'Default: (All)', false, true ); } public function execute() { + # text with no html simulates an interface message string or a title + $textWithNoHtml = 'This could be an article title'; $textWithHtmlSm = 'Before <wrap><in>and</in> another <unclose> <in>word</in></wrap>.'; $textWithHtmlLg = str_repeat( // 28K (28 chars * 1000) @@ -71,8 +73,12 @@ class BenchmarkSanitizer extends Benchmarker { }; } if ( !$method || $method === 'removeHTMLtags' ) { + $tiny = strlen( $textWithNoHtml ); $sm = strlen( $textWithHtmlSm ); $lg = round( strlen( $textWithHtmlLg ) / 1000 ) . 'K'; + $benches["Sanitizer::removeHTMLtags (input: $tiny)"] = static function () use ( $textWithNoHtml ) { + Sanitizer::removeHTMLtags( $textWithNoHtml ); + }; $benches["Sanitizer::removeHTMLtags (input: $sm)"] = static function () use ( $textWithHtmlSm ) { Sanitizer::removeHTMLtags( $textWithHtmlSm ); }; @@ -80,6 +86,39 @@ class BenchmarkSanitizer extends Benchmarker { Sanitizer::removeHTMLtags( $textWithHtmlLg ); }; } + if ( !$method || $method === 'tidy' ) { + # This matches what DISPLAYTITLE was previously doing to sanitize + # title strings + $tiny = strlen( $textWithNoHtml ); + $sm = strlen( $textWithHtmlSm ); + $lg = round( strlen( $textWithHtmlLg ) / 1000 ) . 
'K'; + $doit = static function ( $text ) { + return static function () use ( $text ) { + $tidy = new \MediaWiki\Tidy\RemexDriver( new \MediaWiki\Config\ServiceOptions( [ 'TidyConfig' ], [ + 'TidyConfig' => [ 'pwrap' => false ], + ] ) ); + $textWithTags = $tidy->tidy( $text, [ Sanitizer::class, 'armorFrenchSpaces' ] ); + $textWithTags = Sanitizer::normalizeCharReferences( Sanitizer::removeHTMLtags( $textWithTags ) ); + }; + }; + $benches["DISPLAYTITLE tidy (input: $tiny)"] = $doit( $textWithNoHtml ); + $benches["DISPLAYTITLE tidy (input: $sm)"] = $doit( $textWithHtmlSm ); + $benches["DISPLAYTITLE tidy (input: $lg)"] = $doit( $textWithHtmlLg ); + } + if ( !$method || $method === 'removeSomeTags' ) { + $tiny = strlen( $textWithNoHtml ); + $sm = strlen( $textWithHtmlSm ); + $lg = round( strlen( $textWithHtmlLg ) / 1000 ) . 'K'; + $benches["Sanitizer::removeSomeTags (input: $tiny)"] = static function () use ( $textWithNoHtml ) { + Sanitizer::removeSomeTags( $textWithNoHtml ); + }; + $benches["Sanitizer::removeSomeTags (input: $sm)"] = static function () use ( $textWithHtmlSm ) { + Sanitizer::removeSomeTags( $textWithHtmlSm ); + }; + $benches["Sanitizer::removeSomeTags (input: $lg)"] = static function () use ( $textWithHtmlLg ) { + Sanitizer::removeSomeTags( $textWithHtmlLg ); + }; + } if ( !$method || $method === 'stripAllTags' ) { $sm = strlen( $textWithHtmlSm ); $lg = round( strlen( $textWithHtmlLg ) / 1000 ) . 'K'; |