From 7ef215dc2f4c1f67f448ab131610889b54c42bff Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 1 Nov 2024 16:38:25 +0000 Subject: [PATCH] Check for empty playlists after filtering, and after downloading videos --- scraper/src/youtube2zim/scraper.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/scraper/src/youtube2zim/scraper.py b/scraper/src/youtube2zim/scraper.py index 9fdc9fe0..68678032 100644 --- a/scraper/src/youtube2zim/scraper.py +++ b/scraper/src/youtube2zim/scraper.py @@ -565,11 +565,6 @@ def extract_videos_list(self): # we only return video_ids that we'll use later on. per-playlist JSON stored for playlist in self.playlists: videos_json = get_videos_json(playlist.playlist_id) - if len(videos_json) == 0: - logger.warning( - f"Playlist '{playlist.playlist_id}' is empty, will be ignored" - ) - empty_playlists.append(playlist) # filter in videos within date range and filter away deleted videos skip_outofrange = functools.partial( skip_outofrange_videos, self.dateafter @@ -577,6 +572,12 @@ def extract_videos_list(self): filter_videos = filter(skip_outofrange, videos_json) filter_videos = filter(skip_deleted_videos, filter_videos) filter_videos = filter(skip_non_public_videos, filter_videos) + filter_videos = list(filter_videos) + if len(filter_videos) == 0: + logger.warning( + f"Playlist '{playlist.playlist_id}' is empty, will be ignored" + ) + empty_playlists.append(playlist) all_videos.update( {v["contentDetails"]["videoId"]: v for v in filter_videos} ) @@ -1154,10 +1155,21 @@ def get_playlist_slug(playlist) -> str: home_playlist_list = [] main_playlist_slug = None - if len(self.playlists) > 0: - main_playlist_slug = get_playlist_slug( - self.playlists[0] - ) # set first playlist as main playlist + empty_playlists = list( + filter(lambda playlist: len(get_videos_list(playlist)) == 0, self.playlists) + ) + for empty_playlist in empty_playlists: + logger.warning( + f"Removing finally empty playlist {empty_playlist.playlist_id}" + ) + self.playlists.remove(empty_playlist) + + if len(self.playlists) == 0: + raise Exception("No playlist succeeded to download") + + main_playlist_slug = get_playlist_slug( + self.playlists[0] + ) # set first playlist as main playlist for playlist in self.playlists: playlist_slug = get_playlist_slug(playlist)