From 2650b5af2f8d76095ce134e5af26a251b7bb32d0 Mon Sep 17 00:00:00 2001 From: Edward Chernenko Date: Mon, 30 Dec 2024 22:50:34 +0300 Subject: [PATCH] Use RemexHtml to properly remove tags (without regexes) --- includes/EventCalendar.php | 13 +++---- includes/HtmlSanitizer.php | 53 +++++++++++++++++++++++------ tests/phpunit/EventCalendarTest.php | 8 +---- 3 files changed, 51 insertions(+), 23 deletions(-) diff --git a/includes/EventCalendar.php b/includes/EventCalendar.php index cd073ba..5ab374b 100644 --- a/includes/EventCalendar.php +++ b/includes/EventCalendar.php @@ -230,15 +230,16 @@ public function findEvents( array $opt, Parser $recursiveParser ) { if ( !$snippet ) { // Full text of the page (no more than N first symbols) was requested. // NOTE: we can't use getParserOutput() here, because we are already inside Parser::parse(). - $parsedHtml = $recursiveParser->recursiveTagParseFully( $row->text ); + $snippet = $recursiveParser->recursiveTagParseFully( $row->text ); - // Remove the image tags: in 99,9% of cases they are too wide to be included into the calendar. - // TODO: properly remove
with all contents (currently hidden by CSS). - $parsedHtml = preg_replace( '/]+>/', '', $parsedHtml ); - $snippet = mb_substr( $parsedHtml, 0, $maxSymbols ); + // Remove unwanted HTML tags (if any), such as images (too wide for the calendar). + $snippet = HtmlSanitizer::sanitizeSnippet( $snippet ); + + // Truncate to maximum allowed length. + $snippet = mb_substr( $snippet, 0, $maxSymbols ); // Remove truncated HTML tags (if any). - $snippet = HtmlSanitizer::sanitizeHTML( $snippet ); + $snippet = HtmlSanitizer::sanitizeSnippet( $snippet ); // Store the snippet in cache. // NOTE: the reason why we don't use $dbCache->set() here is that SqlBagOStuff::set() will do diff --git a/includes/HtmlSanitizer.php b/includes/HtmlSanitizer.php index 73f87ff..ae5eaad 100644 --- a/includes/HtmlSanitizer.php +++ b/includes/HtmlSanitizer.php @@ -25,18 +25,58 @@ use Wikimedia\RemexHtml\Serializer\HtmlFormatter; use Wikimedia\RemexHtml\Serializer\Serializer; +use Wikimedia\RemexHtml\Serializer\SerializerNode; use Wikimedia\RemexHtml\Tokenizer\Tokenizer; use Wikimedia\RemexHtml\TreeBuilder\Dispatcher; use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder; class HtmlSanitizer { /** - * Remove invalid/non-matching/truncated HTML tags and return correct (sanitized) HTML. + * Remove unwanted/invalid/non-matching/truncated HTML tags and return correct (sanitized) HTML. * @param string $html * @return string */ - public static function sanitizeHTML( $html ) { - $formatter = new HtmlFormatter; + public static function sanitizeSnippet( $html ) { + $formatter = new class () extends HtmlFormatter { + public function startDocument( $fragmentNamespace, $fragmentName ) { + // Remove DOCTYPE. + return ''; + } + + public function element( SerializerNode $parent, SerializerNode $node, $contents ) { + switch ( $node->name ) { + // Remove everything outside the tag. 
+				case 'head':
+					return '';
+
+				case 'html':
+				case 'body':
+					return $contents;
+
+				case 'img':
+					// Remove the image tags: in 99,9% of cases they are too wide
+					// to be included into the calendar.
+					// Not needed in MediaWiki 1.40+ (already removed with the wrapper span below).
+					return '';
+
+				case 'span':
+					if ( ( $node->attrs['typeof'] ?? '' ) === 'mw:File' ) {
+						// Wrapper around the image. (Parentheses are required: === binds tighter than ??.)
+						return '';
+					}
+					break;
+
+				// TODO: properly remove
with all contents (currently hidden by CSS). + + case 'p': + // Remove trailing newline inside

tags. + $contents = trim( $contents ); + } + + return parent::element( $parent, $node, $contents ); + } + }; + $serializer = new Serializer( $formatter ); $treeBuilder = new TreeBuilder( $serializer ); $dispatcher = new Dispatcher( $treeBuilder ); @@ -45,13 +85,6 @@ public static function sanitizeHTML( $html ) { $tokenizer->execute(); $html = $serializer->getResult(); - // Remove doctype, , etc.: everything outside the tag. - // TODO: this can probably be implemented by subclassing HtmlFormatter class. - $html = preg_replace( '@^.*(.*).*$@s', '$1', $html ); - - // Remove trailing newline inside

tags. - $html = preg_replace( "@\n

@", '

', $html ); - return $html; } } diff --git a/tests/phpunit/EventCalendarTest.php b/tests/phpunit/EventCalendarTest.php index a5cd65a..1831396 100644 --- a/tests/phpunit/EventCalendarTest.php +++ b/tests/phpunit/EventCalendarTest.php @@ -778,13 +778,7 @@ public function testSnippetForCompressedRevision() { public function testSnippetSanitizer() { $filename = 'Testimage.png'; $pageText = "Expected snippet [[File:$filename]]"; - - $isMW39 = version_compare( MW_VERSION, '1.40.0', '<' ); - $expectedSnippet = '

Expected snippet ' . - ( $isMW39 ? '' : '' ) . - "' . ( $isMW39 ? '' : '' ) . '

'; + $expectedSnippet = '

Expected snippet

'; // Upload a test file, so that [[File:]] syntax would create an actual thumbnail, not a redlink. $user = $this->getTestUser()->getUser();