From 4a55c2a6a19f753eb0d2ade993a0bf8060112eea Mon Sep 17 00:00:00 2001 From: Edward Chernenko Date: Tue, 31 Dec 2024 01:28:28 +0300 Subject: [PATCH] Snippets: don't parse the entire article: 1000+maxSymbols is enough --- ChangeLog | 1 + includes/EventCalendar.php | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/ChangeLog b/ChangeLog index 7a0000b..1aa7008 100644 --- a/ChangeLog +++ b/ChangeLog @@ -4,6 +4,7 @@ List of changes between releases of Extension:JsCalendar. Features: * Support PostgreSQL. +* Removal of images from HTML snippets is now done with a proper HTML parser instead of regexes. == JsCalendar 0.4.0 == diff --git a/includes/EventCalendar.php b/includes/EventCalendar.php index b7a2adc..c3a8527 100644 --- a/includes/EventCalendar.php +++ b/includes/EventCalendar.php @@ -232,6 +232,12 @@ public function findEvents( array $opt, Parser $recursiveParser ) { // NOTE: we can't use getParserOutput() here, because we are already inside Parser::parse(). $snippet = $recursiveParser->recursiveTagParseFully( $row->text ); + // Truncate to the maximum allowed length PLUS some extra (in case some HTML tags get removed), + // so that we don't have to sanitize the whole article (potentially 10-80 KB), + // while still keeping the snippet from becoming shorter than $maxSymbols after the tags are removed. + $extraSymbols = 1000; + $snippet = mb_substr( $snippet, 0, $maxSymbols + $extraSymbols ); + // Remove the image tags: in 99,9% of cases they are too wide to be included into the calendar. $snippet = HtmlSanitizer::sanitizeSnippet( $snippet );