From 10672e785db777c34093deb7282556caf7ffa16d Mon Sep 17 00:00:00 2001 From: marwoodandrew Date: Fri, 19 Apr 2024 16:48:48 +1000 Subject: [PATCH] SDAAP-113 Create a feeding service and parser for Bang Showbiz --- server/aap/io/__init__.py | 3 +- server/aap/io/feed_parsers/__init__.py | 1 + server/aap/io/feed_parsers/bang_parser.py | 94 +++++++++++++++++++ server/aap/io/feeding_services/bang.py | 82 ++++++++++++++++ .../tests/io/feed_parsers/bang_parser_test.py | 28 ++++++ .../bang_feeding_service_test.py | 61 ++++++++++++ server/aap/tests/io/fixtures/ABC3058248.xml | 40 ++++++++ 7 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 server/aap/io/feed_parsers/bang_parser.py create mode 100644 server/aap/io/feeding_services/bang.py create mode 100644 server/aap/tests/io/feed_parsers/bang_parser_test.py create mode 100644 server/aap/tests/io/feed_services/bang_feeding_service_test.py create mode 100644 server/aap/tests/io/fixtures/ABC3058248.xml diff --git a/server/aap/io/__init__.py b/server/aap/io/__init__.py index 8fc9f5dc3..a74e4ac7a 100644 --- a/server/aap/io/__init__.py +++ b/server/aap/io/__init__.py @@ -10,4 +10,5 @@ import aap.io.iptc_extension # noqa from .feeding_services.ap_media_relay import APMediaRelayFeedingService # noqa -from .feeding_services.cision import CisionFeedingService # noqa \ No newline at end of file +from .feeding_services.cision import CisionFeedingService # noqa +from .feeding_services.bang import BangFeedingService # noqa \ No newline at end of file diff --git a/server/aap/io/feed_parsers/__init__.py b/server/aap/io/feed_parsers/__init__.py index 3abf22696..5fa5afacf 100644 --- a/server/aap/io/feed_parsers/__init__.py +++ b/server/aap/io/feed_parsers/__init__.py @@ -18,3 +18,4 @@ import aap.io.feed_parsers.abs_calendar_csv # NOQA import aap.io.feed_parsers.three_sixty_ninjs # NOQA import aap.io.feed_parsers.globenewswire # NOQA +import aap.io.feed_parsers.bang_parser # NOQA diff --git 
# ---- server/aap/io/feed_parsers/bang_parser.py ----
import re
from datetime import datetime
from superdesk.utc import utc
from superdesk.io.feed_parsers.newsml_1_2 import NewsMLOneFeedParser
from superdesk.io.registry import register_feed_parser
from superdesk.errors import ParserError
from apps.archive.common import format_dateline_to_locmmmddsrc
from superdesk.io.iptc import subject_codes
from aap.io.feeding_services.bang import MUSIC_ID, MOVIES_ID, SHOWBIZ_ID
from flask import current_app as app

# Matches <br>, <br/> and <br /> in any letter case.
_LINE_BREAK = re.compile(r"<br\s*/?>", re.IGNORECASE)

_BODY_PATH = "NewsComponent/ContentItem/DataContent/body/body.content"


class BangShowbizParser(NewsMLOneFeedParser):
    """Feed parser turning Bang Showbiz NewsML documents into ingest items.

    The feeding service stores the id of the feed currently being fetched in
    ``provider["current_id"]``; that id selects the subject code and is also
    embedded in the generated guid.
    """

    NAME = "Bang Showbiz"

    label = "Bang Showbiz Feed Parser"

    # Location used to build the dateline of every item.
    CITY = "London"
    COUNTRY_CODE = "GB"
    STATE_CODE = "GB.ENG"

    # Provider of the document being parsed; set on each call to parse().
    provider = None

    # Map the field/sources entries to appropriate IPTC codes
    subject_map = {MUSIC_ID: "01011000", MOVIES_ID: "01005001", SHOWBIZ_ID: "01021000"}

    def datetime(self, string):
        """Parse a ``YYYY-MM-DD HH:MM:SS`` string and tag the result as UTC."""
        return datetime.strptime(string, "%Y-%m-%d %H:%M:%S").replace(tzinfo=utc)

    def parse(self, xml, provider=None):
        """Build one item per ``NewsItem`` element found directly under *xml*.

        :param xml: root element of the fetched document
        :param provider: ingest provider dict; ``current_id`` must be one of
            the keys of ``subject_map``
        :return: list of populated item dicts
        :raises ParserError: if any NewsItem cannot be converted
        """
        self.provider = provider
        self.root = xml
        items = []
        for news_item in xml.findall("NewsItem"):
            item = {}
            try:
                # The identifier must be parsed first: the dateline built in
                # parse_newslines() reads item["versioncreated"].
                self.parse_news_identifier(item, news_item)
                self.parse_newslines(item, news_item)
                self.parse_news_management(item, news_item)

                qcode = self.subject_map[self.provider.get("current_id")]
                item["subject"] = [{"qcode": qcode, "name": subject_codes[qcode]}]
                item["body_html"] = self._body_html(news_item)

                items.append(self.populate_fields(item))
            except Exception as ex:
                raise ParserError.newsmlOneParserError(ex, provider) from ex
        return items

    def _body_html(self, news_item):
        """Return the story body as paragraphs, one per source line break."""
        content = news_item.find(_BODY_PATH)
        if content is None or content.text is None:
            # Raised inside parse()'s try block, so callers still see ParserError.
            raise ValueError("NewsItem has no {} text".format(_BODY_PATH))
        # NOTE(review): the markup literals were stripped from the copy of the
        # patch this was restored from. <p>/</p> and the <br> replacement are
        # reconstructed from the expected value in the parser test - confirm
        # against the live feed.
        paragraphs = _LINE_BREAK.sub("</p><p>", content.text)
        return ("<p>" + paragraphs + "</p>").replace("\n", "")

    def parse_newslines(self, item, tree):
        """Fill headline, abstract and dateline of *item* from *tree*."""
        parsed_el = self.parse_elements(tree.find("NewsComponent/NewsLines"))
        item["headline"] = parsed_el.get("HeadLine", "").strip()
        item["abstract"] = tree.find(
            "NewsComponent/NewsLines/NewsLine/NewsLineText"
        ).text.strip()

        item.setdefault("dateline", {})
        cities = app.locators.find_cities(
            country_code=self.COUNTRY_CODE, state_code=self.STATE_CODE
        )
        located = next((c for c in cities if c["city"] == self.CITY), None)
        # Without a matching city the dateline is left as an empty dict.
        if located is not None:
            item["dateline"]["located"] = located
            item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                located, item["versioncreated"], self.provider.get("source")
            )
        return True

    def parse_news_identifier(self, item, tree):
        """Set guid/uri and the created timestamps from the NewsIdentifier."""
        parsed_el = self.parse_elements(tree.find("Identification/NewsIdentifier"))
        created = self.datetime(parsed_el["DateId"])
        item["uri"] = item["guid"] = "urn:newsml:{}:{}:{}".format(
            self.provider.get("current_id", ""),
            created.isoformat(),
            parsed_el["NewsItemId"],
        )
        item["versioncreated"] = created
        item["firstcreated"] = created

    def parse_news_management(self, item, tree):
        """Assign the ANPA category; *tree* is not consulted."""
        # It's always entertainment
        item["anpa_category"] = [{"qcode": "e"}]


register_feed_parser(BangShowbizParser.NAME, BangShowbizParser())
# ---- server/aap/io/feeding_services/bang.py ----
# -*- coding: utf-8; -*-
#
# This file is part of Superdesk.
#
# Copyright 2013, 2014 Sourcefabric z.u. and contributors.
#
# For the full copyright and license information, please see the
# AUTHORS and LICENSE files distributed with this source code, or
# at https://www.sourcefabric.org/superdesk/license

import lxml.etree
import requests

from superdesk.errors import AlreadyExistsError
from superdesk.io.feeding_services.http_base_service import HTTPFeedingServiceBase
from superdesk.io.registry import register_feeding_service

# Ids of the provider config fields holding the feed URLs. The feed parser
# imports these to map each feed to a subject code.
MUSIC_ID = "music_url"
MOVIES_ID = "movies_url"
SHOWBIZ_ID = "showbiz_url"


class BangFeedingService(HTTPFeedingServiceBase):
    """Feeding service fetching up to three Bang Showbiz feeds over HTTP.

    Every configured URL is downloaded, parsed as XML and handed to the
    provider's feed parser.
    """

    NAME = "Bang"

    label = "Bang Showbiz"

    HTTP_AUTH = False

    # Seconds allowed for each request, so that a stalled endpoint cannot
    # block the update indefinitely.
    HTTP_TIMEOUT = 30

    # Session in use while _update() runs; None otherwise.
    session = None

    # configuration fields for the source url's
    fields = [
        {
            "id": MUSIC_ID,
            "type": "text",
            "label": "Music URL",
        },
        {
            "id": MOVIES_ID,
            "type": "text",
            "label": "Movies URL",
        },
        {
            "id": SHOWBIZ_ID,
            "type": "text",
            "label": "Showbiz URL",
        },
    ]

    @staticmethod
    def _config_test(provider=None):
        """Accept any configuration; no request is made."""
        return True

    def _update(self, provider, update):
        """Fetch and parse every feed URL present in the provider config.

        ``provider["current_id"]`` is set to the id of the field being
        processed before its document is parsed, because the parser reads it.

        :param provider: ingest provider dict
        :param update: not used by this service
        :return: list holding, per fetched URL, the list returned by the parser
        :raises requests.HTTPError: if a feed answers with an error status
        """
        parser = self.get_feed_parser(provider)
        config = provider.get("config") or {}

        items = []
        self.session = requests.Session()
        try:
            for field in self.fields:
                field_id = field.get("id")
                current_url = config.get(field_id)
                if not current_url:
                    continue
                provider["current_id"] = field_id
                response = self.session.get(current_url, timeout=self.HTTP_TIMEOUT)
                response.raise_for_status()
                xml = lxml.etree.fromstring(response.content)
                items.append(parser.parse(xml, provider=provider))
        finally:
            # Release the connections even when a request or the parser fails,
            # and do not keep a closed session around for the next run.
            self.session.close()
            self.session = None

        return items


try:
    register_feeding_service(BangFeedingService)
except AlreadyExistsError:
    # Presumably guards against the module being imported more than once.
    pass
# ---- server/aap/tests/io/feed_parsers/bang_parser_test.py ----
import os
from superdesk.tests import TestCase
from superdesk.etree import etree
from aap.io.feed_parsers.bang_parser import BangShowbizParser


class BangShowbizParserTestCase(TestCase):
    """Parse the fixture document and check the fields of the first item."""

    filename = "ABC3058248.xml"

    def setUp(self):
        dirname = os.path.dirname(os.path.realpath(__file__))
        fixture = os.path.normpath(os.path.join(dirname, "../fixtures", self.filename))
        provider = {
            "name": "Test",
            "current_id": "showbiz_url",
            "config": {"showbiz_url": "https://url.com/111/aa"},
        }
        # Read as UTF-8 explicitly: the text is re-encoded as UTF-8 below, so
        # decoding with the locale default could corrupt non-ASCII content.
        with open(fixture, encoding="utf-8") as f:
            self.xml = f.read()
        self.item = BangShowbizParser().parse(
            etree.fromstring(self.xml.encode("UTF-8")), provider
        )

    def test_item(self):
        self.assertEqual(self.item[0]["headline"], "Headline text here")
        self.assertNotIn("byline", self.item[0])
        self.assertEqual(self.item[0]["abstract"], "Summary text here")
        # NOTE(review): the markup in this expected value was stripped from the
        # copy of the patch this was restored from; the paragraph tags are a
        # reconstruction - confirm against the parser.
        self.assertEqual(
            self.item[0]["body_html"], "<p>Body here</p><p>more here</p>"
        )


# ---- server/aap/tests/io/feed_services/bang_feeding_service_test.py ----
from httmock import urlmatch, HTTMock
from unittests import AAPTestCase
from aap.io.feeding_services.bang import BangFeedingService


PROVIDER = {
    "_id": "test_provider",
    "config": {
        "showbiz_url": "https://url.com/111/aa",
        "music_url": "https://url.com/222/bb",
        "movies_url": "https://url.com/333/cc",
    },
    "feed_parser": "Bang Showbiz",
}

VOCABULARIES = [
    {
        "_id": "locators",
        "items": [],
    }
]


class bangTestCase(AAPTestCase):
    """Run the feeding service against three mocked feed URLs."""

    _calls = None
    filename = "ABC3058248.xml"

    def setUp(self):
        super().setUp()
        self.setupMock(self)
        self.app.data.insert('vocabularies', VOCABULARIES)
        self.app.data.insert('ingest_providers', [PROVIDER])
        self._calls = 0
        dirname = os.path.dirname(os.path.realpath(__file__))
        fixture = os.path.normpath(os.path.join(dirname, "../fixtures", self.filename))
        with open(fixture, encoding="utf-8") as f:
            self.xml = f.read()

    def setupMock(self, context):
        context.mock = HTTMock(
            self.showbiz_request, self.music_request, self.movies_request
        )
        context.mock.__enter__()
        # Leave the mock context when the test ends so the patched requests
        # layer does not leak into other tests.
        self.addCleanup(context.mock.__exit__, None, None, None)

    @urlmatch(scheme='https', netloc='url.com', path='/111/aa')
    def showbiz_request(self, url, request):
        return {'status_code': 200, 'content': self.xml}

    @urlmatch(scheme='https', netloc='url.com', path='/222/bb')
    def music_request(self, url, request):
        return {'status_code': 200, 'content': self.xml}

    @urlmatch(scheme='https', netloc='url.com', path='/333/cc')
    def movies_request(self, url, request):
        return {'status_code': 200, 'content': self.xml}

    def test_request(self):
        with self.app.app_context():
            provider = PROVIDER.copy()
            service = BangFeedingService()
            service.provider = provider
            items = service._update(provider, {})
            # One entry per configured URL.
            self.assertEqual(len(items), 3)


# ---- server/aap/tests/io/fixtures/ABC3058248.xml (new file, 40 lines) ----
# The XML markup of this fixture did not survive in the copy of the patch this
# was restored from. Text that did survive, in document order:
#   abc.com
#   ABC3058248
#   2024-04-17 13:00:00
#   Headline text here
#   more here]]>