From feaa124ea56e258ad51ce60da686b24b39abc335 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Sun, 17 Sep 2023 01:22:25 -0300 Subject: [PATCH] Idempotency: Keep a local "archive" (index) of files successfully uploaded the way yt-dlp keeps track of downloaded items. - Amend #19 by adding optional idempotency between runs: While concurrent instances can still rely on a "clean" download directory (#19), a single upload node can use --use-upload-archive to avoid source file removal. Subsequent runs of `tubeup` will omit files already uploaded. - Fixes #23 (and part of #233) - NB: The ability for upload_ia() to return a None item name paves the way to fix #36 or #109 --- tests/test_tubeup.py | 80 ++++++++++++++++++++++++++++++++++++++++++++ tubeup/TubeUp.py | 37 +++++++++++++++----- tubeup/__main__.py | 17 ++++++++-- 3 files changed, 122 insertions(+), 12 deletions(-) diff --git a/tests/test_tubeup.py b/tests/test_tubeup.py index 9983d56..7603e86 100644 --- a/tests/test_tubeup.py +++ b/tests/test_tubeup.py @@ -590,3 +590,83 @@ def test_archive_urls(self): 'scanner': SCANNER})] self.assertEqual(expected_result, result) + + def test_archive_deletion(self): + root_path = os.path.join(current_path, 'test_tubeup_rootdir') + # Clean up before test + shutil.rmtree(root_path, ignore_errors=True) + + tu = TubeUp(dir_path=root_path, + ia_config_path=get_testfile_path('ia_config_for_test.ini')) + + videobasename = os.path.join( + current_path, 'test_tubeup_rootdir', 'downloads', + 'KdsN9YhkDrY') + + copy_testfiles_to_tubeup_rootdir_test() + dest = os.path.join(root_path, 'downloads', '*') + files_before_upload = glob.glob(dest) + + vid_info = {'mediatype': 'movies', + 'creator': 'RelaxingWorld', + 'channel': 'http://www.youtube.com/channel/UCWpsozCMdAnfI16rZHQ9XDg', + 'collection': 'opensource_movies', + 'title': 'Epic Ramadan - Video Background HD1080p', + 'description': ('If you enjoy my work, please consider Subscribe to my NEW ' + 'channel for more videos:
' + 'https://www.youtube.com/MusicForRelaxation?sub_confirmation=1
' + '▷ If you use this video, please put credits to my channel ' + 'in description:
' + 'Source from RelaxingWorld: https://goo.gl/HsW75m
' + '
' + '▷ Also, do not forget to Subscribe to my channel. Thanks!'), + 'date': '2016-06-25', + 'year': '2016', + 'subject': ('Youtube;video;Film & Animation;Video Background;' + 'Footage;Animation;Cinema;Royalty Free Videos;' + 'Stock Video Footage;Video Backdrops;' + 'Amazing Nature;youtube;HD;1080p;Creative Commons Videos;' + 'relaxing music;Ramadan;'), + 'originalurl': 'https://www.youtube.com/watch?v=KdsN9YhkDrY', + 'licenseurl': '', + 'scanner': SCANNER} + + with requests_mock.Mocker() as m: + # Mock the request to s3.us.archive.org, so it will responds + # a custom json. `internetarchive` library sends GET request to + # that url to check that we don't violate the upload limit. + m.get('https://s3.us.archive.org', + content=b'{"over_limit": 0}', + headers={'content-type': 'application/json'}) + + m.get('https://archive.org/metadata/youtube-KdsN9YhkDrY', + content=b'{}', + headers={'content-type': 'application/json'}) + + # Mock the PUT requests for internetarchive urls that defined + # in mock_upload_response_by_videobasename(), so this test + # doesn't perform upload to the real archive.org server. + mock_upload_response_by_videobasename( + m, 'youtube-KdsN9YhkDrY', videobasename) + + # First upload, this actually get uploaded... + result = list(tu.archive_urls( + ['https://www.youtube.com/watch?v=KdsN9YhkDrY'], use_upload_archive=True)) + + # ... and returns a remote IA item name + expected_result = [('youtube-KdsN9YhkDrY', vid_info)] + self.assertEqual(expected_result, result) + + # ... and no file got deleted + files_after_upload = glob.glob(dest) + self.assertListEqual(files_before_upload, files_after_upload) + # ... and a upload-archive file was created + self.assertTrue(os.path.exists(os.path.join(root_path, '.iauparchive'))) + + # Second upload, nothing was actually uploaded... + result = list(tu.archive_urls( + ['https://www.youtube.com/watch?v=KdsN9YhkDrY'], use_upload_archive=True)) + + # ... 
and no remote IA item name is returned + expected_result = [(None, vid_info)] + self.assertEqual(expected_result, result) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 7dd6801..248db5a 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -309,16 +309,21 @@ def generate_ydl_options(self, return ydl_opts - def upload_ia(self, videobasename, custom_meta=None): + def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): """ Upload video to archive.org. - :param videobasename: A video base name. - :param custom_meta: A custom meta, will be used by internetarchive - library when uploading to archive.org. - :return: A tuple containing item name and metadata used - when uploading to archive.org and whether the item - already exists. + :param videobasename: A video base name. + :param use_upload_archive: Record the video url to the upload archive. + This will upload only videos not listed in + the archive file. Record the IDs of all + uploaded videos in it. + :param custom_meta: A custom meta, will be used by internetarchive + library when uploading to archive.org. + :return: A tuple containing item name and metadata used + when uploading to archive.org and whether the item + already exists. A null item name means the upload + didn't happen. 
""" json_metadata_filepath = videobasename + '.info.json' with open(json_metadata_filepath, 'r', encoding='utf-8') as f: @@ -334,6 +339,12 @@ def upload_ia(self, videobasename, custom_meta=None): metadata = self.create_archive_org_metadata_from_youtubedl_meta( vid_meta) + if use_upload_archive: + ydl = YoutubeDL({'download_archive': os.path.join(self.dir_path['root'], '.iauparchive')}) + if ydl.in_download_archive(vid_meta): + self.logger.debug('Skipping already uploaded video: %s', metadata['title']) + return None, metadata + # Delete empty description file description_file_path = videobasename + '.description' if (os.path.exists(description_file_path) and @@ -375,16 +386,20 @@ def upload_ia(self, videobasename, custom_meta=None): raise Exception(msg) item.upload(files_to_upload, metadata=metadata, retries=9001, - request_kwargs=dict(timeout=9001), delete=True, + request_kwargs=dict(timeout=9001), delete=not use_upload_archive, verbose=self.verbose, access_key=s3_access_key, secret_key=s3_secret_key) + if use_upload_archive: + ydl.record_download_archive(vid_meta) + return itemname, metadata def archive_urls(self, urls, custom_meta=None, cookie_file=None, proxy=None, ydl_username=None, ydl_password=None, use_download_archive=False, + use_upload_archive=False, ignore_existing_item=False): """ Download and upload videos from youtube_dl supported sites to @@ -404,6 +419,10 @@ def archive_urls(self, urls, custom_meta=None, This will download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. + :param use_upload_archive: Record the video url to the upload archive. + This will upload only videos not listed in + the archive file. Record the IDs of all + uploaded videos in it. :param ignore_existing_item: Ignores the check for existing items on archive.org. :return: Tuple containing identifier and metadata of the file that has been uploaded to archive.org. 
@@ -412,7 +431,7 @@ def archive_urls(self, urls, custom_meta=None, urls, cookie_file, proxy, ydl_username, ydl_password, use_download_archive, ignore_existing_item) for basename in downloaded_file_basenames: - identifier, meta = self.upload_ia(basename, custom_meta) + identifier, meta = self.upload_ia(basename, use_upload_archive, custom_meta) yield identifier, meta @staticmethod diff --git a/tubeup/__main__.py b/tubeup/__main__.py index 0841765..4bd199a 100644 --- a/tubeup/__main__.py +++ b/tubeup/__main__.py @@ -24,6 +24,7 @@ [--proxy ] [--quiet] [--debug] [--use-download-archive] + [--use-upload-archive] [--output ] [--ignore-existing-item] tubeup -h | --help @@ -45,6 +46,10 @@ This will download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. + -U --use-upload-archive Record the video url to the upload archive. + This will upload only videos not listed in + the archive file. Record the IDs of all + uploaded videos in it. -q --quiet Just print errors. -d --debug Print all logs to stdout. -o --output Youtube-dlc output template. @@ -75,6 +80,7 @@ def main(): quiet_mode = args['--quiet'] debug_mode = args['--debug'] use_download_archive = args['--use-download-archive'] + use_upload_archive = args['--use-upload-archive'] ignore_existing_item = args['--ignore-existing-item'] if debug_mode: @@ -100,10 +106,15 @@ def main(): cookie_file, proxy_url, username, password, use_download_archive, + use_upload_archive, ignore_existing_item): - print('\n:: Upload Finished. Item information:') - print('Title: %s' % meta['title']) - print('Item URL: https://archive.org/details/%s\n' % identifier) + if identifier: + print('\n:: Upload Finished. Item information:') + print('Title: %s' % meta['title']) + print('Item URL: https://archive.org/details/%s\n' % identifier) + else: + print('\n:: Upload skipped. 
Item information:') + print('Title: %s' % meta['title']) except Exception: print('\n\033[91m' # Start red color text 'An exception just occured, if you found this '