-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
SDAAP-113 Create a feeding service and parser for Bang Showbiz
- Loading branch information
1 parent
4b5a163
commit 10672e7
Showing
7 changed files
with
308 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
from datetime import datetime | ||
from superdesk.utc import utc | ||
from superdesk.io.feed_parsers.newsml_1_2 import NewsMLOneFeedParser | ||
from superdesk.io.registry import register_feed_parser | ||
from superdesk.errors import ParserError | ||
from apps.archive.common import format_dateline_to_locmmmddsrc | ||
from superdesk.io.iptc import subject_codes | ||
from aap.io.feeding_services.bang import MUSIC_ID, MOVIES_ID, SHOWBIZ_ID | ||
from flask import current_app as app | ||
|
||
|
||
class BangShowbizParser(NewsMLOneFeedParser):
    """Feed parser for the Bang Showbiz feeds, built on ``NewsMLOneFeedParser``.

    Every ``NewsItem`` element of a document is converted into an item dict
    carrying a guid, creation timestamps, headline, abstract, a fixed
    London dateline, the entertainment category, one IPTC subject chosen
    from the provider's ``current_id`` and an HTML body.
    """

    NAME = "Bang Showbiz"

    label = "Bang Showbiz Feed Parser"

    # Location looked up in the locators to build the dateline of every item.
    CITY = "London"
    COUNTRY_CODE = "GB"
    STATE_CODE = "GB.ENG"

    # Ingest provider of the document being parsed; assigned by parse().
    provider = None

    # Map the field/sources entries to appropriate IPTC codes
    subject_map = {MUSIC_ID: "01011000", MOVIES_ID: "01005001", SHOWBIZ_ID: "01021000"}

    def datetime(self, string):
        """Parse a ``YYYY-MM-DD HH:MM:SS`` string into a datetime tagged as UTC."""
        return datetime.strptime(string, "%Y-%m-%d %H:%M:%S").replace(tzinfo=utc)

    def parse(self, xml, provider=None):
        """Convert every ``NewsItem`` found under ``xml`` into an item dict.

        :param xml: root element of the fetched document
        :param provider: ingest provider dict; its ``current_id`` entry selects
            the subject code and ``source`` is used in the dateline text
        :return: list of item dicts, each passed through ``populate_fields``
        :raises ParserError: if any item fails to parse (the original
            exception is chained as the cause)
        """
        self.provider = provider
        items = []
        self.root = xml
        for newsItem in xml.findall("NewsItem"):
            item = {}
            try:
                # The identifier must be parsed first: the dateline built in
                # parse_newslines reads item["versioncreated"].
                self.parse_news_identifier(item, newsItem)
                self.parse_newslines(item, newsItem)
                self.parse_news_management(item, newsItem)

                qcode = self.subject_map[provider.get("current_id")]
                item["subject"] = [{"qcode": qcode, "name": subject_codes[qcode]}]

                body = newsItem.find(
                    "NewsComponent/ContentItem/DataContent/body/body.content"
                ).text
                # <BR> markers in the body text become paragraph boundaries.
                item["body_html"] = (
                    "<p>" + body.replace("<BR>", "</p><p>") + "</p>"
                ).replace("\n", "")

                items.append(self.populate_fields(item))
            except Exception as ex:
                raise ParserError.newsmlOneParserError(ex, provider) from ex
        return items

    def parse_newslines(self, item, tree):
        """Fill in headline, abstract and dateline of ``item``.

        A missing or empty headline or summary yields an empty string
        instead of failing the item.

        :param item: item dict being built; must already hold ``versioncreated``
        :param tree: the ``NewsItem`` element
        :return: always ``True``
        """
        parsed_el = self.parse_elements(tree.find("NewsComponent/NewsLines"))
        # "or" also covers a HeadLine entry that is present but None.
        item["headline"] = (parsed_el.get("HeadLine") or "").strip()

        summary = tree.find("NewsComponent/NewsLines/NewsLine/NewsLineText")
        # Compare with None explicitly: an element without children is falsy.
        summary_text = summary.text if summary is not None else None
        item["abstract"] = (summary_text or "").strip()

        item.setdefault("dateline", {})
        cities = app.locators.find_cities(
            country_code=self.COUNTRY_CODE, state_code=self.STATE_CODE
        )
        located = [c for c in cities if c["city"] == self.CITY]
        if located:
            item["dateline"]["located"] = located[0]
            item["dateline"]["text"] = format_dateline_to_locmmmddsrc(
                located[0], item["versioncreated"], self.provider.get("source")
            )
        return True

    def parse_news_identifier(self, item, tree):
        """Set ``uri``/``guid`` and both creation timestamps from ``NewsIdentifier``.

        The guid has the form ``urn:newsml:<current_id>:<ISO date>:<NewsItemId>``.

        :param item: item dict being built
        :param tree: the ``NewsItem`` element
        """
        parsed_el = self.parse_elements(tree.find("Identification/NewsIdentifier"))
        created = self.datetime(parsed_el["DateId"])
        item["uri"] = item["guid"] = "urn:newsml:{}:{}:{}".format(
            self.provider.get("current_id", ""),
            created.isoformat(),
            parsed_el["NewsItemId"],
        )
        item["versioncreated"] = created
        item["firstcreated"] = created

    def parse_news_management(self, item, tree):
        """Assign the category; ``tree`` is not consulted."""
        # It's always entertainment
        item["anpa_category"] = [{"qcode": "e"}]


register_feed_parser(BangShowbizParser.NAME, BangShowbizParser())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
# -*- coding: utf-8; -*- | ||
# | ||
# This file is part of Superdesk. | ||
# | ||
# Copyright 2013, 2014 Sourcefabric z.u. and contributors. | ||
# | ||
# For the full copyright and license information, please see the | ||
# AUTHORS and LICENSE files distributed with this source code, or | ||
# at https://www.sourcefabric.org/superdesk/license | ||
|
||
import lxml.etree | ||
import requests | ||
|
||
from superdesk.errors import AlreadyExistsError | ||
from superdesk.io.feeding_services.http_base_service import HTTPFeedingServiceBase | ||
from superdesk.io.registry import register_feeding_service | ||
|
||
# Ids of the provider config fields holding the URL of each source feed.
MUSIC_ID = "music_url"
MOVIES_ID = "movies_url"
SHOWBIZ_ID = "showbiz_url"


class BangFeedingService(HTTPFeedingServiceBase):
    """Feeding service that fetches the configured Bang Showbiz feed URLs.

    Each configured URL is downloaded, parsed as XML and handed to the
    provider's feed parser.
    """

    NAME = "Bang"

    label = "Bang Showbiz"

    HTTP_AUTH = False

    # requests session, created on the first _update() call.
    session = None

    # configuration fields for the source url's
    fields = [
        {
            "id": MUSIC_ID,
            "type": "text",
            "label": "Music URL",
        },
        {
            "id": MOVIES_ID,
            "type": "text",
            "label": "Movies URL",
        },
        {
            "id": SHOWBIZ_ID,
            "type": "text",
            "label": "Showbiz URL",
        },
    ]

    @staticmethod
    def _config_test(provider=None):
        """Report the configuration as valid without performing any check."""
        return True

    def _update(self, provider, update):
        """Fetch and parse every configured source URL.

        For each entry of ``fields`` with a non-empty URL in the provider
        config, ``provider["current_id"]`` is set to that field's id before
        the parser is called, so the parser can tell the sources apart.

        :param provider: ingest provider dict; its ``config`` maps field ids
            to URLs
        :param update: not used by this method
        :return: list with one parser result per configured URL
        :raises requests.HTTPError: when a URL answers with an error status
        """
        if not self.session:
            self.session = requests.Session()

        parser = self.get_feed_parser(provider)
        # Tolerate a provider without a config: nothing is fetched.
        config = provider.get("config") or {}

        items = []
        try:
            for src in self.fields:
                source_id = src.get("id")
                current_url = config.get(source_id)
                if not current_url:
                    continue
                provider["current_id"] = source_id
                r = self.session.get(current_url)
                r.raise_for_status()
                xml = lxml.etree.fromstring(r.content)
                items.append(parser.parse(xml, provider=provider))
        finally:
            # Close the session even when a fetch or parse fails.
            self.session.close()

        return items


try:
    register_feeding_service(BangFeedingService)
except AlreadyExistsError:
    # The service is already registered; nothing more to do.
    pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import os | ||
from superdesk.tests import TestCase | ||
from superdesk.etree import etree | ||
from aap.io.feed_parsers.bang_parser import BangShowbizParser | ||
|
||
|
||
class BangShowbizParserTestCase(TestCase):
    """Parse the ``ABC3058248.xml`` fixture and check the resulting item."""

    filename = "ABC3058248.xml"

    def setUp(self):
        """Load the fixture and run it through ``BangShowbizParser``."""
        dirname = os.path.dirname(os.path.realpath(__file__))
        fixture = os.path.normpath(os.path.join(dirname, "../fixtures", self.filename))
        provider = {
            "name": "Test",
            "current_id": "showbiz_url",
            "config": {"showbiz_url": "https://url.com/111/aa"},
        }
        # Read explicitly as UTF-8 (the encoding the fixture declares) so the
        # result does not depend on the platform's default encoding.
        with open(fixture, encoding="utf-8") as f:
            self.xml = f.read()
        self.item = BangShowbizParser().parse(
            etree.fromstring(self.xml.encode("UTF-8")), provider
        )

    def test_item(self):
        """The single parsed item carries the expected text fields."""
        self.assertEqual(self.item[0]["headline"], "Headline text here")
        self.assertNotIn("byline", self.item[0])
        self.assertEqual(self.item[0]["abstract"], "Summary text here")
        self.assertEqual(self.item[0]["body_html"], "<p>Body here</p><p>more here</p>")
61 changes: 61 additions & 0 deletions
61
server/aap/tests/io/feed_services/bang_feeding_service_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import os | ||
from httmock import urlmatch, HTTMock | ||
from unittests import AAPTestCase | ||
from aap.io.feeding_services.bang import BangFeedingService | ||
|
||
|
||
# Ingest provider with all three Bang source URLs configured.
PROVIDER = {
    "_id": "test_provider",
    "config": {
        "showbiz_url": "https://url.com/111/aa",
        "music_url": "https://url.com/222/bb",
        "movies_url": "https://url.com/333/cc",
    },
    "feed_parser": "Bang Showbiz",
}

# Empty locators vocabulary inserted before the service runs.
VOCABULARIES = [
    {
        "_id": "locators",
        "items": [],
    }
]


class bangTestCase(AAPTestCase):
    """Run ``BangFeedingService._update`` against mocked HTTP endpoints."""

    _calls = None
    filename = "ABC3058248.xml"

    def setUp(self):
        """Install the HTTP mocks, seed the database and load the fixture."""
        super().setUp()
        self.setupMock(self)
        self.app.data.insert('vocabularies', VOCABULARIES)
        self.app.data.insert('ingest_providers', [PROVIDER])
        self._calls = 0
        dirname = os.path.dirname(os.path.realpath(__file__))
        fixture = os.path.normpath(os.path.join(dirname, "../fixtures", self.filename))
        # Read explicitly as UTF-8 (the encoding the fixture declares) so the
        # result does not depend on the platform's default encoding.
        with open(fixture, encoding="utf-8") as f:
            self.xml = f.read()

    def setupMock(self, context):
        """Activate the three URL mocks on ``context`` for the test's duration."""
        context.mock = HTTMock(
            self.showbiz_request, self.music_request, self.movies_request
        )
        context.mock.__enter__()
        # Undo the patch after the test so it cannot leak into other tests.
        self.addCleanup(context.mock.__exit__, None, None, None)

    @urlmatch(scheme='https', netloc='url.com', path='/111/aa')
    def showbiz_request(self, url, request):
        """Serve the fixture for the showbiz URL."""
        return {'status_code': 200, 'content': self.xml}

    @urlmatch(scheme='https', netloc='url.com', path='/222/bb')
    def music_request(self, url, request):
        """Serve the fixture for the music URL."""
        return {'status_code': 200, 'content': self.xml}

    @urlmatch(scheme='https', netloc='url.com', path='/333/cc')
    def movies_request(self, url, request):
        """Serve the fixture for the movies URL."""
        return {'status_code': 200, 'content': self.xml}

    def test_request(self):
        """One result is returned for each of the three configured URLs."""
        with self.app.app_context():
            provider = PROVIDER.copy()
            service = BangFeedingService()
            service.provider = provider
            items = service._update(provider, {})
            self.assertEqual(len(items), 3)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<NewsML> | ||
<NewsItem> | ||
<Identification> | ||
<NewsIdentifier> | ||
<ProviderId>abc.com</ProviderId> | ||
<NewsItemId>ABC3058248</NewsItemId> | ||
<DateId>2024-04-17 13:00:00</DateId> | ||
</NewsIdentifier> | ||
</Identification> | ||
<NewsComponent> | ||
<NewsLines> | ||
<HeadLine>Headline text here | ||
</HeadLine> | ||
<NewsLine> | ||
<NewsLineType FormalName="Summary"/> | ||
<NewsLineText> | ||
<![CDATA[Summary text here]]> | ||
</NewsLineText> | ||
</NewsLine> | ||
</NewsLines> | ||
<ContentItem> | ||
<MediaType FormalName="Text"/> | ||
<MimeType FormalName="text/vnd.IPTC.NITF"/> | ||
<DataContent> | ||
<body> | ||
<body.content><![CDATA[Body here<BR>more here]]></body.content> | ||
</body> | ||
</DataContent> | ||
</ContentItem> | ||
<ContentItem | ||
Href="/http://somepic.jp"> | ||
<MediaType FormalName="Picture"/> | ||
<MimeType FormalName="image/jpg"/> | ||
<Property FormalName="caption" | ||
value="Caption"/> | ||
</ContentItem> | ||
</NewsComponent> | ||
</NewsItem> | ||
</NewsML> |