Skip to content

Commit

Permalink
Use RemexHtml to properly remove <img> tags (without regexes)
Browse files Browse the repository at this point in the history
  • Loading branch information
edwardspec committed Dec 30, 2024
1 parent a648959 commit 2650b5a
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 23 deletions.
13 changes: 7 additions & 6 deletions includes/EventCalendar.php
Original file line number Diff line number Diff line change
Expand Up @@ -230,15 +230,16 @@ public function findEvents( array $opt, Parser $recursiveParser ) {
if ( !$snippet ) {
// Full text of the page (no more than N first symbols) was requested.
// NOTE: we can't use getParserOutput() here, because we are already inside Parser::parse().
$parsedHtml = $recursiveParser->recursiveTagParseFully( $row->text );
$snippet = $recursiveParser->recursiveTagParseFully( $row->text );

// Remove the image tags: in 99.9% of cases they are too wide to be included in the calendar.
// TODO: properly remove <div class="thumb"> with all contents (currently hidden by CSS).
$parsedHtml = preg_replace( '/<img[^>]+>/', '', $parsedHtml );
$snippet = mb_substr( $parsedHtml, 0, $maxSymbols );
// Remove unwanted HTML tags (if any), such as images (too wide for the calendar).
$snippet = HtmlSanitizer::sanitizeSnippet( $snippet );

// Truncate to maximum allowed length.
$snippet = mb_substr( $snippet, 0, $maxSymbols );

// Remove truncated HTML tags (if any).
$snippet = HtmlSanitizer::sanitizeHTML( $snippet );
$snippet = HtmlSanitizer::sanitizeSnippet( $snippet );

// Store the snippet in cache.
// NOTE: the reason why we don't use $dbCache->set() here is that SqlBagOStuff::set() will do
Expand Down
53 changes: 43 additions & 10 deletions includes/HtmlSanitizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,58 @@

use Wikimedia\RemexHtml\Serializer\HtmlFormatter;
use Wikimedia\RemexHtml\Serializer\Serializer;
use Wikimedia\RemexHtml\Serializer\SerializerNode;
use Wikimedia\RemexHtml\Tokenizer\Tokenizer;
use Wikimedia\RemexHtml\TreeBuilder\Dispatcher;
use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder;

class HtmlSanitizer {
/**
* Remove invalid/non-matching/truncated HTML tags and return correct (sanitized) HTML.
* Remove unwanted/invalid/non-matching/truncated HTML tags and return correct (sanitized) HTML.
* @param string $html
* @return string
*/
public static function sanitizeHTML( $html ) {
$formatter = new HtmlFormatter;
public static function sanitizeSnippet( $html ) {
$formatter = new class () extends HtmlFormatter {
	/**
	 * Emit nothing at the start of the document, so that the result has no DOCTYPE.
	 * Both parameters are ignored.
	 * @param mixed $fragmentNamespace
	 * @param mixed $fragmentName
	 * @return string Always an empty string.
	 */
	public function startDocument( $fragmentNamespace, $fragmentName ) {
		// Remove DOCTYPE.
		return '';
	}

	/**
	 * Serialize one element, dropping or unwrapping the elements that are unwanted in a snippet:
	 * - <head> is removed together with its contents,
	 * - <html> and <body> are unwrapped (only their inner HTML is kept),
	 * - <img> and <span typeof="mw:File"> are removed together with their contents,
	 * - inner HTML of <p> is stripped of leading/trailing whitespace.
	 * Everything else is passed to the parent formatter unchanged.
	 *
	 * @param SerializerNode $parent
	 * @param SerializerNode $node
	 * @param string $contents Already serialized inner HTML of $node.
	 * @return string
	 */
	public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
		switch ( $node->name ) {
			// Remove everything outside the <body> tag.
			case 'head':
				return '';

			case 'html':
			case 'body':
				return $contents;

			case 'img':
				// Remove the image tags: in 99.9% of cases they are too wide
				// to be included in the calendar.
				// Not needed in MediaWiki 1.40+ (already removed with the <span> below).
				return '';

			case 'span':
				// NOTE: parentheses are required here, because "===" binds tighter than "??".
				// Without them the condition is "typeof ?? ( '' === 'mw:File' )",
				// which is true for ANY non-empty "typeof" attribute, not only for "mw:File".
				if ( ( $node->attrs['typeof'] ?? '' ) === 'mw:File' ) {
					// Wrapper around the image.
					return '';
				}
				break;

			// TODO: properly remove <div class="thumb"> with all contents (currently hidden by CSS).

			case 'p':
				// Strip surrounding whitespace (notably the trailing newline) inside <p> tags.
				$contents = trim( $contents );
				break;
		}

		return parent::element( $parent, $node, $contents );
	}
};

$serializer = new Serializer( $formatter );
$treeBuilder = new TreeBuilder( $serializer );
$dispatcher = new Dispatcher( $treeBuilder );
Expand All @@ -45,13 +85,6 @@ public static function sanitizeHTML( $html ) {
$tokenizer->execute();
$html = $serializer->getResult();

// Remove doctype, <head>, etc.: everything outside the <body> tag.
// TODO: this can probably be implemented by subclassing HtmlFormatter class.
$html = preg_replace( '@^.*<body>(.*)</body>.*$@s', '$1', $html );

// Remove trailing newline inside <p></p> tags.
$html = preg_replace( "@\n</p>@", '</p>', $html );

return $html;
}
}
8 changes: 1 addition & 7 deletions tests/phpunit/EventCalendarTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -778,13 +778,7 @@ public function testSnippetForCompressedRevision() {
public function testSnippetSanitizer() {
$filename = 'Testimage.png';
$pageText = "Expected snippet [[File:$filename]]";

$isMW39 = version_compare( MW_VERSION, '1.40.0', '<' );
$expectedSnippet = '<p>Expected snippet ' .
( $isMW39 ? '' : '<span class="mw-default-size" typeof="mw:File">' ) .
"<a href=\"/wiki/File:$filename\" class=\"" .
( $isMW39 ? 'image' : 'mw-file-description' ) .
'"></a>' . ( $isMW39 ? '' : '</span>' ) . '</p>';
$expectedSnippet = '<p>Expected snippet</p>';

// Upload a test file, so that [[File:]] syntax would create an actual thumbnail, not a redlink.
$user = $this->getTestUser()->getUser();
Expand Down

0 comments on commit 2650b5a

Please sign in to comment.