Commit c8f95a1
Simplified rss feed parsing
bsuryadevara committed Jan 3, 2024
1 parent 1a9e369 commit c8f95a1
Showing 2 changed files with 39 additions and 58 deletions.
64 changes: 21 additions & 43 deletions morpheus/controllers/rss_controller.py
@@ -116,9 +116,9 @@ def __init__(self,
         })

         self._feed_stats_dict = {
-            input:
+            url:
                 FeedStats(failure_count=0, success_count=0, last_failure=-1, last_success=-1, last_try_result="Unknown")
-            for input in self._feed_input
+            for url in self._feed_input
         }

     @property
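
For context, this comprehension keeps one FeedStats record per configured URL. A minimal standalone sketch of the pattern, assuming FeedStats is the record type suggested by the constructor call above (the field types and sample URLs are illustrative, not Morpheus code):

    # Sketch of the per-URL stats pattern; FeedStats fields are inferred from
    # the constructor call in the diff, and the URLs are placeholders.
    from dataclasses import dataclass

    @dataclass
    class FeedStats:
        failure_count: int
        success_count: int
        last_failure: float
        last_success: float
        last_try_result: str

    feed_input = ["https://example.com/rss.xml", "https://example.org/feed"]
    feed_stats_dict = {
        url: FeedStats(failure_count=0, success_count=0, last_failure=-1,
                       last_success=-1, last_try_result="Unknown")
        for url in feed_input
    }

    # Lookup mirrors get_feed_stats(): unknown URLs should be rejected.
    stats = feed_stats_dict["https://example.com/rss.xml"]
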
@@ -128,7 +128,7 @@ def run_indefinitely(self):

     def get_feed_stats(self, feed_url: str) -> FeedStats:
         """
-        Get feed input stats.
+        Get feed URL stats.

         Parameters
         ----------
@@ -143,34 +143,19 @@ def get_feed_stats(self, feed_url: str) -> FeedStats:
         Raises
         ------
         ValueError
-            If the feed URL is not found in the feed input provided to the constructor.
+            If the feed URL is not found in the feed URLs provided to the constructor.
         """
         if feed_url not in self._feed_stats_dict:
-            raise ValueError("The feed URL is not part of the feed input provided to the constructor.")
+            raise ValueError("The feed URL is not part of the feed URLs provided to the constructor.")

         return self._feed_stats_dict[feed_url]

     def _read_file_content(self, file_path: str) -> str:
         with open(file_path, 'r', encoding="utf-8") as file:
             return file.read()

-    def _fetch_feed_content(self, feed_input: str, is_url: bool) -> str:
-        # If input is an URL.
-        if is_url:
-            if not self._enable_cache:
-                # If cache is not enabled, fetch feed directly using requests.Session.
-                response = self._session.get(feed_input, timeout=self._request_timeout)
-                return response.text
-
-            # If we are here, feed_input is an actual feed content retrieved from the cache.
-            return feed_input
-
-        # If original input is not an URL, then read the content from the file path.
-        return self._read_file_content(feed_input)
-
-    def _try_parse_feed_with_beautiful_soup(self, feed_input: str, is_url: bool) -> "feedparser.FeedParserDict":
+    def _try_parse_feed_with_beautiful_soup(self, feed_input: str) -> "feedparser.FeedParserDict":

-        feed_input = self._fetch_feed_content(feed_input, is_url)
         soup = BeautifulSoup(feed_input, 'xml')

         # Verify whether the given feed has 'item' or 'entry' tags.
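
After this change, _try_parse_feed_with_beautiful_soup always receives raw feed content (an XML string), never a URL or file path. A rough sketch of the fallback technique, assuming feedparser and beautifulsoup4 with an XML parser such as lxml are installed; the tag handling below is illustrative, not the exact Morpheus implementation:

    # Illustrative sketch of manually recovering a malformed feed with
    # BeautifulSoup and handing the cleaned XML back to feedparser.
    import feedparser
    from bs4 import BeautifulSoup

    def try_parse_with_beautiful_soup(feed_content: str) -> "feedparser.FeedParserDict":
        soup = BeautifulSoup(feed_content, "xml")

        # RSS feeds carry <item> tags; Atom feeds carry <entry> tags.
        items = soup.find_all("item") or soup.find_all("entry")
        if not items:
            raise RuntimeError("feed has no 'item' or 'entry' tags")

        # Re-serialize the repaired tree and parse it again.
        feed = feedparser.parse(str(soup))
        if feed["bozo"]:
            raise RuntimeError("feed still fails to parse")
        return feed
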
@@ -211,35 +196,28 @@ def _try_parse_feed(self, url: str) -> "feedparser.FeedParserDict":
         is_url = RSSController.is_url(url)

         fallback = False
-        cache_hit = False
-        use_cache = is_url and self._enable_cache

-        if use_cache:
+        if is_url:
             response = self._session.get(url, timeout=self._request_timeout)
-            cache_hit = response.from_cache
             feed_input = response.text
         else:
             feed_input = url

         feed = feedparser.parse(feed_input)

         if feed["bozo"]:
-            cache_hit = False
-
-            if use_cache:
-                fallback = True
-                logger.info("Parsing the cached feed for URL '%s' failed. Attempting direct parsing with feedparser.",
-                            url)
-                feed = feedparser.parse(url)
-
-            if feed["bozo"]:
-                try:
-                    feed = self._try_parse_feed_with_beautiful_soup(feed_input, is_url)
-                except Exception:
-                    logger.error("Failed to parse the feed manually: %s", url)
-                    raise
+            fallback = True
+            try:
+                if not is_url:
+                    # Read file content
+                    feed_input = self._read_file_content(feed_input)
+                # Parse feed content with beautifulsoup
+                feed = self._try_parse_feed_with_beautiful_soup(feed_input)
+            except Exception:
+                logger.error("Failed to parse the feed manually: %s", url)
+                raise

-        logger.debug("Parsed feed: %s. Cache hit: %s. Fallback: %s", url, cache_hit, fallback)
+        logger.debug("Parsed feed: %s. Fallback: %s", url, fallback)

         return feed
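
For background: `self._session` is a plain `requests.Session` or, when caching is enabled, a cached session from the requests-cache package. The simplification works because a cached session still returns an ordinary response whose `.text` holds the feed body, so the controller no longer needs a separate cache branch or the `from_cache` flag. A hedged sketch of that setup (the cache name and URL are placeholders, not Morpheus defaults):

    # Sketch: with requests-cache, cached and uncached sessions share one code path.
    import requests
    import requests_cache

    enable_cache = True
    if enable_cache:
        session = requests_cache.CachedSession("rss_cache", backend="sqlite")
    else:
        session = requests.Session()

    response = session.get("https://example.com/rss.xml", timeout=2.0)
    content = response.text  # same attribute either way
    print(getattr(response, "from_cache", False))  # only set by cached sessions
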

@@ -319,17 +297,17 @@ def fetch_dataframes(self):
     @classmethod
     def is_url(cls, feed_input: str) -> bool:
         """
-        Check if the provided input is a valid URL.
+        Check if the provided URL string is valid.

         Parameters
         ----------
         feed_input : str
-            The input string to be checked.
+            The URL string to be checked.

         Returns
         -------
         bool
-            True if the input is a valid URL, False otherwise.
+            True if the URL string is valid, False otherwise.
         """
         try:
             parsed_url = urlparse(feed_input)
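
The body of `is_url` is collapsed in the hunk above; a common way to implement such a check with `urlparse`, shown here as an assumed implementation rather than the exact Morpheus code:

    # Sketch of a urlparse-based URL check; the real body is collapsed in the
    # diff above, so this is an assumption.
    from urllib.parse import urlparse

    def is_url(feed_input: str) -> bool:
        try:
            parsed_url = urlparse(feed_input)
            # A usable URL needs at least a scheme (http/https) and a host.
            return bool(parsed_url.scheme and parsed_url.netloc)
        except ValueError:
            return False

    assert is_url("https://example.com/rss.xml")
    assert not is_url("/path/to/feed.xml")
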
33 changes: 18 additions & 15 deletions tests/controllers/test_rss_controller.py
@@ -79,9 +79,13 @@ def test_run_indefinitely_false(feed_input: list[str]):


 @pytest.mark.parametrize("feed_input", test_urls)
-def test_parse_feed_valid_url(feed_input: list[str], mock_feed: feedparser.FeedParserDict):
+def test_parse_feed_valid_url(feed_input: list[str], mock_feed: feedparser.FeedParserDict, mock_get_response: Mock):
     controller = RSSController(feed_input=feed_input)
-    with patch("morpheus.controllers.rss_controller.feedparser.parse") as mock_feedparser_parse:
+
+    patch_feedparser = patch("morpheus.controllers.rss_controller.feedparser.parse")
+    patch_get = patch.object(requests.Session, 'get')
+    with patch_feedparser as mock_feedparser_parse, patch_get as mock_get:
+        mock_get.return_value = mock_get_response
         mock_feedparser_parse.return_value = mock_feed
         feed = list(controller.parse_feeds())[0]
         assert feed.entries
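
One note on the pattern above: a `patch(...)` object only yields the active `Mock` when entered with `as`; setting `return_value` on the patcher object itself silently configures nothing, which is why the rewritten test binds both mocks explicitly. A standalone sketch of the pitfall:

    # Sketch: why the patcher must be entered with 'as'.
    import json
    from unittest.mock import patch

    patcher = patch("json.loads")

    # Wrong: this sets an attribute on the patcher object, not the active mock.
    # patcher.return_value = {"ok": True}

    with patcher as mock_loads:  # 'as' yields the real Mock
        mock_loads.return_value = {"ok": True}
        assert json.loads("anything") == {"ok": True}
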
@@ -113,10 +117,15 @@ def test_is_url_false(feed_input: list[str]):


 @pytest.mark.parametrize("feed_input", [test_urls, test_urls[0]])
-def test_fetch_dataframes_url(feed_input: str | list[str], mock_feed: feedparser.FeedParserDict):
+def test_fetch_dataframes_url(feed_input: str | list[str],
+                              mock_feed: feedparser.FeedParserDict,
+                              mock_get_response: Mock):
     controller = RSSController(feed_input=feed_input)

-    with patch("morpheus.controllers.rss_controller.feedparser.parse") as mock_feedparser_parse:
+    patch_feedparser = patch("morpheus.controllers.rss_controller.feedparser.parse")
+    patch_get = patch.object(requests.Session, 'get')
+    with patch_feedparser as mock_feedparser_parse, patch_get as mock_get:
+        mock_get.return_value = mock_get_response
         mock_feedparser_parse.return_value = mock_feed
         dataframes_generator = controller.fetch_dataframes()
         dataframe = next(dataframes_generator, None)
@@ -143,19 +152,13 @@ def test_batch_size(feed_input: list[str], batch_size: int):
         assert len(df) <= batch_size


-@pytest.mark.parametrize("feed_input, is_url, enable_cache", [(test_file_paths[0], False, False),
-                                                              (test_urls[0], True, True), (test_urls[0], True, False)])
-def test_try_parse_feed_with_beautiful_soup(feed_input: str, is_url: bool, enable_cache: bool, mock_get_response: Mock):
+@pytest.mark.parametrize("feed_input, enable_cache", [(test_file_paths[0], False), (test_urls[0], True),
+                                                      (test_urls[0], False)])
+def test_try_parse_feed_with_beautiful_soup(feed_input: str, enable_cache: bool, mock_get_response: Mock):
     controller = RSSController(feed_input=feed_input, enable_cache=enable_cache)

-    if is_url and not enable_cache:
-        with patch.object(requests.Session, 'get') as mock_get:
-            mock_get.return_value = mock_get_response
-            feed_data = controller._try_parse_feed_with_beautiful_soup(feed_input, is_url)
-    else:
-        # When enable_cache is set to 'True', the feed content is provided as input.
-        feed_data = controller._try_parse_feed_with_beautiful_soup(
-            mock_get_response.text if enable_cache else feed_input, is_url)
+    # The method now always receives raw feed content, regardless of enable_cache.
+    feed_data = controller._try_parse_feed_with_beautiful_soup(mock_get_response.text)

     assert isinstance(feed_data, feedparser.FeedParserDict)
     assert len(feed_data.entries) > 0
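
These tests lean on a `mock_get_response` fixture defined outside this diff. A plausible sketch of its shape (only the name comes from the diff; the fixture body and sample XML are assumptions):

    # Assumed shape of the mock_get_response fixture used above; the real
    # fixture lives outside this diff, so the values here are illustrative.
    from unittest.mock import Mock

    import pytest

    @pytest.fixture(name="mock_get_response")
    def mock_get_response_fixture() -> Mock:
        response = Mock()
        response.text = (
            "<rss version=\"2.0\"><channel><title>demo</title>"
            "<item><title>entry 1</title></item></channel></rss>"
        )
        return response
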
