From 1a3a320018d039bc2526a15f23726a25dece3fea Mon Sep 17 00:00:00 2001 From: marwoodandrew Date: Fri, 22 Dec 2023 16:22:25 +1100 Subject: [PATCH] SDAAP-103 Headline length exceeded when publishing to BOB --- .../aap_bulletinbuilder_formatter.py | 6 +- .../aap_bulletinbuilder_formatter_tests.py | 56 +++++++++++++++++++ 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/server/aap/publish/formatters/aap_bulletinbuilder_formatter.py b/server/aap/publish/formatters/aap_bulletinbuilder_formatter.py index 9f88366c5..743b60e7f 100644 --- a/server/aap/publish/formatters/aap_bulletinbuilder_formatter.py +++ b/server/aap/publish/formatters/aap_bulletinbuilder_formatter.py @@ -70,7 +70,7 @@ def format(self, article, subscriber, codes=None): formatted_article['abstract'] = self.get_text_content( to_ascii(formatted_article.get('abstract', '') or '')).strip() formatted_article['headline'] = self.get_text_content( - to_ascii(formatted_article.get('headline', ''))).strip() + to_ascii(formatted_article.get('headline', '')), space_on_elements=False).strip() formatted_article['byline'] = self.get_text_content( to_ascii(formatted_article.get('byline', '') or '')).strip() @@ -116,7 +116,7 @@ def format(self, article, subscriber, codes=None): def can_format(self, format_type, article): return format_type == 'AAP BULLETIN BUILDER' - def get_text_content(self, content): + def get_text_content(self, content, space_on_elements=True): content = content.replace('
', '
').replace('
', '') # remove control chars except \n content = re.sub('[\x00-\x09\x0b-\x1f]', '', content) @@ -125,7 +125,7 @@ def get_text_content(self, content): if content == '': return '' - parsed = parse_html(content, content='html', space_on_elements=True) + parsed = parse_html(content, content='html', space_on_elements=space_on_elements) # breaks are replaced with spaces for br in parsed.xpath('//br'): diff --git a/server/aap/publish/formatters/aap_bulletinbuilder_formatter_tests.py b/server/aap/publish/formatters/aap_bulletinbuilder_formatter_tests.py index b3973de98..accc47a0e 100644 --- a/server/aap/publish/formatters/aap_bulletinbuilder_formatter_tests.py +++ b/server/aap/publish/formatters/aap_bulletinbuilder_formatter_tests.py @@ -786,3 +786,59 @@ def test_embedded_item(self): self.assertGreater(int(seq), 0) test_article = json.loads(item.get('data')) self.assertEqual(test_article['body_html'], '

pre amble

post amble

') + + def test_clean_headline_html(self): + article = { + config.ID_FIELD: '123', + config.VERSION: 2, + 'source': 'AAP', + 'headline': '1234567890123456789012345123456789012345678901234567890', + 'slugline': 'slugline', + 'abstract': '

abstract

', + 'type': 'text', + 'anpa_category': [{'qcode': 'a', 'name': 'Australian General News'}], + 'flags': { + 'marked_for_legal': True + }, + 'body_html': ('

The story

'), + "fields_meta": { + "headline": { + "draftjsState": [ + { + "blocks": [ + { + "key": "2fvvl", + "text": "1234567890123456789012345123456789012345678901234567890", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [ + { + "offset": 0, + "length": 55, + "style": "BOLD" + }, + { + "offset": 54, + "length": 1, + "style": "LIMIT_CHARACTERS_OVERFLOW" + } + ], + "entityRanges": [], + "data": { + "MULTIPLE_HIGHLIGHTS": {} + } + } + ], + "entityMap": {} + } + ] + } + } + } + + subscriber = self.app.data.find('subscribers', None, None)[0][0] + seq, item = self._formatter.format(article, subscriber)[0] + item = json.loads(item) + self.assertGreater(int(seq), 0) + test_article = json.loads(item.get('data')) + self.assertEqual(test_article['headline'], '1234567890123456789012345123456789012345678901234567890')