aboutsummaryrefslogtreecommitdiffstats
path: root/maintenance/benchmarks
diff options
context:
space:
mode:
author C. Scott Ananian <cscott@cscott.net> 2022-01-21 17:03:26 -0500
committer C. Scott Ananian <cscott@cscott.net> 2022-03-04 14:06:02 -0500
commit 9f14fbd002713abcf65a5b5cddc5d52dee90a977 (patch)
tree 35553d39a02079a3aa561478f5bb28a629064595 /maintenance/benchmarks
parent ccaeb8368072bb00c6bbf6e2447a42da26acbbf2 (diff)
download mediawikicore-9f14fbd002713abcf65a5b5cddc5d52dee90a977.tar.gz
mediawikicore-9f14fbd002713abcf65a5b5cddc5d52dee90a977.zip
Add Sanitizer::removeSomeTags() which uses Remex to tokenize
The existing Sanitizer::removeHTMLtags() method, in addition to having dodgy capitalization, uses regular expressions to parse the HTML. That produces corner cases like T298401 and T67747 and is not guaranteed to yield balanced or well-formed HTML. Instead, introduce and use a new Sanitizer::removeSomeTags() method which is guaranteed to always return balanced and well-formed HTML. Note that Sanitizer::removeHTMLtags()/::removeSomeTags() take a callback argument which (as far as I can tell) is never used outside core. Mark that argument as @internal, and clean up the version used by ::removeSomeTags(). Use the new ::removeSomeTags() method in the two places where DISPLAYTITLE is handled (following up on T67747). The use by the legacy parser is more difficult to replace (and would have a performance cost), so leave the old ::removeHTMLtags() method in place for that call site for now: when the legacy parser is replaced by Parsoid the need for the old ::removeHTMLtags() will go away. In a follow-up patch we'll rename ::removeHTMLtags() and mark it @internal so that we can deprecate ::removeHTMLtags() for external use. Some benchmarking code added. On my machine, with PHP 7.4, the new method tidies short 30-character title strings at a rate of about 6764/s while the tidy-based method being replaced here managed 6384/s. Sanitizer::removeHTMLtags blazes through short strings 20x faster (120,915/s); some of this difference is due to the setup cost of creating the tag whitelist and the Remex pipeline, so further optimizations could doubtless be done if Sanitizer::removeSomeTags() is more widely used. Bug: T299722 Bug: T67747 Change-Id: Ic864c01471c292f11799c4fbdac4d7d30b8bc50f
Diffstat (limited to 'maintenance/benchmarks')
-rw-r--r--maintenance/benchmarks/benchmarkSanitizer.php41
1 files changed, 40 insertions, 1 deletions
diff --git a/maintenance/benchmarks/benchmarkSanitizer.php b/maintenance/benchmarks/benchmarkSanitizer.php
index 6054e0f563c0..9ba41ba321bf 100644
--- a/maintenance/benchmarks/benchmarkSanitizer.php
+++ b/maintenance/benchmarks/benchmarkSanitizer.php
@@ -31,11 +31,13 @@ class BenchmarkSanitizer extends Benchmarker {
parent::__construct();
$this->addDescription( 'Benchmark for Sanitizer methods.' );
$this->addOption( 'method', 'One of "validateEmail", "encodeAttribute", '
- . '"safeEncodeAttribute", "removeHTMLtags", or "stripAllTags". '
+ . '"safeEncodeAttribute", "removeHTMLtags", "removeSomeTags", "tidy", or "stripAllTags". '
. 'Default: (All)', false, true );
}
public function execute() {
+ # text with no html simulates an interface message string or a title
+ $textWithNoHtml = 'This could be an article title';
$textWithHtmlSm = 'Before <wrap><in>and</in> another <unclose> <in>word</in></wrap>.';
$textWithHtmlLg = str_repeat(
// 28K (28 chars * 1000)
@@ -71,8 +73,12 @@ class BenchmarkSanitizer extends Benchmarker {
};
}
if ( !$method || $method === 'removeHTMLtags' ) {
+ $tiny = strlen( $textWithNoHtml );
$sm = strlen( $textWithHtmlSm );
$lg = round( strlen( $textWithHtmlLg ) / 1000 ) . 'K';
+ $benches["Sanitizer::removeHTMLtags (input: $tiny)"] = static function () use ( $textWithNoHtml ) {
+ Sanitizer::removeHTMLtags( $textWithNoHtml );
+ };
$benches["Sanitizer::removeHTMLtags (input: $sm)"] = static function () use ( $textWithHtmlSm ) {
Sanitizer::removeHTMLtags( $textWithHtmlSm );
};
@@ -80,6 +86,39 @@ class BenchmarkSanitizer extends Benchmarker {
Sanitizer::removeHTMLtags( $textWithHtmlLg );
};
}
+ if ( !$method || $method === 'tidy' ) {
+ # This matches what DISPLAYTITLE was previously doing to sanitize
+ # title strings
+ $tiny = strlen( $textWithNoHtml );
+ $sm = strlen( $textWithHtmlSm );
+ $lg = round( strlen( $textWithHtmlLg ) / 1000 ) . 'K';
+ $doit = static function ( $text ) {
+ return static function () use ( $text ) {
+ $tidy = new \MediaWiki\Tidy\RemexDriver( new \MediaWiki\Config\ServiceOptions( [ 'TidyConfig' ], [
+ 'TidyConfig' => [ 'pwrap' => false ],
+ ] ) );
+ $textWithTags = $tidy->tidy( $text, [ Sanitizer::class, 'armorFrenchSpaces' ] );
+ $textWithTags = Sanitizer::normalizeCharReferences( Sanitizer::removeHTMLtags( $textWithTags ) );
+ };
+ };
+ $benches["DISPLAYTITLE tidy (input: $tiny)"] = $doit( $textWithNoHtml );
+ $benches["DISPLAYTITLE tidy (input: $sm)"] = $doit( $textWithHtmlSm );
+ $benches["DISPLAYTITLE tidy (input: $lg)"] = $doit( $textWithHtmlLg );
+ }
+ if ( !$method || $method === 'removeSomeTags' ) {
+ $tiny = strlen( $textWithNoHtml );
+ $sm = strlen( $textWithHtmlSm );
+ $lg = round( strlen( $textWithHtmlLg ) / 1000 ) . 'K';
+ $benches["Sanitizer::removeSomeTags (input: $tiny)"] = static function () use ( $textWithNoHtml ) {
+ Sanitizer::removeSomeTags( $textWithNoHtml );
+ };
+ $benches["Sanitizer::removeSomeTags (input: $sm)"] = static function () use ( $textWithHtmlSm ) {
+ Sanitizer::removeSomeTags( $textWithHtmlSm );
+ };
+ $benches["Sanitizer::removeSomeTags (input: $lg)"] = static function () use ( $textWithHtmlLg ) {
+ Sanitizer::removeSomeTags( $textWithHtmlLg );
+ };
+ }
if ( !$method || $method === 'stripAllTags' ) {
$sm = strlen( $textWithHtmlSm );
$lg = round( strlen( $textWithHtmlLg ) / 1000 ) . 'K';